# HG changeset patch
# User bgruening
# Date 1441223746 14400
# Node ID 4084128e7ccaf1ea1d00c5c3ad93a55e9ba35f20
# Parent 047eb877b6f0c8918279e6972509fa3fe6331a76
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit 18df9e67efd4adafcde4eb9b62cd815e4afe9733-dirty
diff -r 047eb877b6f0 -r 4084128e7cca bismark.tar.bz2
Binary file bismark.tar.bz2 has changed
diff -r 047eb877b6f0 -r 4084128e7cca bismark_bowtie2_wrapper.xml
--- a/bismark_bowtie2_wrapper.xml Sun Jun 28 07:23:35 2015 -0400
+++ b/bismark_bowtie2_wrapper.xml Wed Sep 02 15:55:46 2015 -0400
@@ -121,6 +121,8 @@
--output-report-file $report_file
#end if
+ $params.non_directional
+
#end if
##
@@ -221,6 +223,9 @@
+
+
diff -r 047eb877b6f0 -r 4084128e7cca bismark_bowtie_wrapper.xml
--- a/bismark_bowtie_wrapper.xml Sun Jun 28 07:23:35 2015 -0400
+++ b/bismark_bowtie_wrapper.xml Wed Sep 02 15:55:46 2015 -0400
@@ -104,6 +104,9 @@
#if $params.isReportOutput:
--output-report-file $report_file
#end if
+ #if $params.non_directional_option:
+ $params.non_directional_option
+ #end if
#end if
@@ -201,6 +204,15 @@
+
+
+
+
+
+
+
+
read 1 reads overlapping
+ # <------------------------- read 2
+ #
+ # or
+ #
+ # -------------------------> read 1
+ # <----------------------- read 2 read 2 contained within read 1
+ #
+ # or
+ #
+ # -------------------------> read 1 reads 1 and 2 exactly overlapping
+ # <------------------------- read 2
+ #
+
+ # dovetailing of reads is not enabled for Bowtie 2 alignments
+
+ $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
+ $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
+ }
+ elsif ($end_read_2 < $end_read_1){
+
+ # -------------------------> read 1
+ # <----------- read 2 read 2 contained within read 1
+ #
+ # or
+ #
+ # -------------------------> read 1
+ # <------------------------ read 2 read 2 contained within read 1
+
+ # start and end of read 2 are fully contained within read 1, using the length of read 1 for the TLEN variable
+ $tlen_1 = $end_read_1 - $start_read_1 + 1; # Set to length of read 1 Leftmost read has a + sign,
+ $tlen_2 = ($end_read_1 - $start_read_1 + 1) * -1; # Set to length of read 1 Rightmost read has a - sign. well this is debatable. Changed this
+ ### as a request by frozenlyse on SeqAnswers on 24 July 2013
+ }
+
+ }
+
+ elsif ($start_read_2 < $start_read_1){
+
+ if ($end_read_1 >= $end_read_2){
+
+ # Read 2 alignment is leftmost
+
+ # -------------------------> read 2 reads overlapping
+ # <------------------------- read 1
+ #
+ # or
+ #
+ # -------------------------> read 2
+ # <----------------------- read 1 read 1 contained within read 2
+ #
+ #
+
+ $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
+ $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
+ }
+ elsif ($end_read_1 < $end_read_2){
+
+ # -------------------------> read 2
+ # <----------- read 1 read 1 contained within read 2
+ #
+ # or
+ #
+ # -------------------------> read 2
+ # <------------------------ read 1 read 1 contained within read 2
+
+ # start and end of read 1 are fully contained within read 2, using the length of read 2 for the TLEN variable
+ $tlen_1 = ($end_read_2 - $start_read_2 + 1) * -1; # Set to length of read 2 Shorter read receives a - sign,
+ $tlen_2 = $end_read_2 - $start_read_2 + 1; # Set to length of read 2 Longer read receives a +. Well this is debatable. Changed this
+ ### as a request by frozenlyse on SeqAnswers on 24 July 2013
+ }
+ }
+ }
+
+ else{ # Bowtie 1
+
+ if ($end_read_2 >= $end_read_1){
+ # Read 1 alignment is leftmost
+ # -------------------------> read 1
+ # <------------------------- read 2
+ # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
+
+ $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
+ $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
+ }
+ else{
+ # Read 2 alignment is leftmost
+ # -------------------------> read 2
+ # <------------------------- read 1
+ # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
+
+ $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
+ $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
+ }
+ }
+
+ #####
+
+ # adjusting the strand of the sequence before we use them to generate mismatch strings
+ if ($strand_1 eq '-'){
+ $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand
+ $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence
+ if ($cigar_1 =~ /D/){
+ $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1} );
+ }
+ $qual_1 = reverse $qual_1; # we need to reverse the quality string as well
+ }
+ if ($strand_2 eq '-'){
+ $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand
+ $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence
+ if ($cigar_2 =~ /D/){
+ $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2} );
+ }
+ $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well
+ }
+
+ # print "$actual_seq_1\n$ref_seq_1\n\n";
+ # print "$actual_seq_2\n$ref_seq_2\n\n";
+
+ #####
+
+ my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence
+ my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2);
+ if ($bowtie2){
+ $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
+ $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
+ }
+ my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences
+ my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences
+
+ #####
+
+ my $MD_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1,$cigar_1,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1}); # Optional tag MD: String providing mismatched reference bases in the alignment (including indel information)
+ my $MD_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2,$cigar_2,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2});
+
+ # my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!)
+ # my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2);
+
+ #####
+
+ my $XM_tag_1; # Optional tag XM: Methylation call string
+ my $XM_tag_2;
+
+ if ($strand_1 eq '-'){
+ $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented
+ }
+ else{
+ $XM_tag_1 = "XM:Z:$methcall_1";
+ }
+
+ if ($strand_2 eq '-'){
+ $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented
+ }
+ else{
+ $XM_tag_2 = "XM:Z:$methcall_2";
+ }
+
+ #####
+
+ my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state
+ my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state
+
+ #####
+
+ my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads
+
+ #####
+
+ # Optionally calculating number of mismatches for Bowtie 2 alignments
+
+ if ($non_bs_mm) {
+ if ($bowtie2) {
+
+ $number_of_mismatches_1 =~ s/-//; # removing the minus sign
+ $number_of_mismatches_2 =~ s/-//;
+
+ ### if Bowtie 2 was used we need to analyse the CIGAR strings whether the reads contained any indels to determine the number of mismatches
+
+ ### CIGAR 1
+ if ($cigar_1 =~ /(D|I)/) {
+ # warn "$cigar_1\n";
+
+ # parsing CIGAR string
+ my @len = split (/\D+/,$cigar_1); # storing the length per operation
+ my @ops = split (/\d+/,$cigar_1); # storing the operation
+ shift @ops; # remove the empty first element
+ die "CIGAR string '$cigar_1' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+
+ foreach (0..$#len) {
+ if ($ops[$_] eq 'M') {
+ # warn "skipping\n";
+ next; # irrelevant
+ }
+ elsif ($ops[$_] eq 'I') { # insertion in the read sequence
+ $number_of_mismatches_1 -= $insertion_open;
+ $number_of_mismatches_1 -= $len[$_] * $insertion_extend;
+ # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n";
+ }
+ elsif ($ops[$_] eq 'D') { # deletion in the read sequence
+ $number_of_mismatches_1 -= $deletion_open;
+ $number_of_mismatches_1 -= $len[$_] * $deletion_extend;
+ # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n";
+ }
+ elsif ($cigar_1 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die
+ die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
+ }
+ else {
+ die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
+ }
+ }
+
+ # warn "Alignment score $number_of_mismatches_1\n";
+ # print "Mismatches $number_of_mismatches_1\n\n";
+ }
+
+ ### CIGAR 2
+ if ($cigar_2 =~ /(D|I)/) {
+ # warn "$cigar_2\n";
+
+ # parsing CIGAR string
+ my @len = split (/\D+/,$cigar_2); # storing the length per operation
+ my @ops = split (/\d+/,$cigar_2); # storing the operation
+ shift @ops; # remove the empty first element
+ die "CIGAR string '$cigar_2' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+
+ foreach (0..$#len) {
+ if ($ops[$_] eq 'M') {
+ # warn "skipping\n";
+ next; #irrelevant
+ }
+ elsif ($ops[$_] eq 'I') { # insertion in the read sequence
+ $number_of_mismatches_2 -= $insertion_open;
+ $number_of_mismatches_2 -= $len[$_] * $insertion_extend;
+ # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n";
+ }
+ elsif ($ops[$_] eq 'D') { # deletion in the read sequence
+ $number_of_mismatches_2 -= $deletion_open;
+ $number_of_mismatches_2 -= $len[$_] * $deletion_extend;
+ # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n";
+ }
+ elsif ($cigar_2 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die
+ die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
+ }
+ else {
+ die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
+ }
+ }
+ }
+
+ ### Now we have InDel corrected Alignment scores
+
+ ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the
+ ### sequence contained more than 5 Ns, but this should occur close to never
+
+ my $seq_1_N_count = $number_of_mismatches_1 % 6; # modulo 6 will return the integer rest after the division
+ my $seq_2_N_count = $number_of_mismatches_2 % 6;
+ # warn "N count 1: $seq_1_N_count\n";
+ # warn "N count 2: $seq_2_N_count\n";
+
+ $number_of_mismatches_1 = int ($number_of_mismatches_1 / 6) + $seq_1_N_count;
+ $number_of_mismatches_2 = int ($number_of_mismatches_2 / 6) + $seq_2_N_count;
+
+ # warn "MM1 $number_of_mismatches_1 \n";
+ # warn "MM2 $number_of_mismatches_2 \n";
+ }
+ }
+
+ ####
+
+ my $XA_tag = "XA:Z:$number_of_mismatches_1";
+ my $XB_tag = "XB:Z:$number_of_mismatches_2";
+
+
+ # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
+ ### optionally print number of non-bisulfite mismatches
+ if ($non_bs_mm){
+ print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$XA_tag)), "\n";
+ print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$XB_tag)), "\n";
+ }
+ else{ # default
+ print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n";
+ print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n";
+ }
+}
+
+sub revcomp{
+ my $seq = shift or die "Missing seq to reverse complement\n";
+ $seq = reverse $seq;
+ $seq =~ tr/ACTGactg/TGACTGAC/;
+ return $seq;
+}
+
+sub hemming_dist{
+ my $matches = 0;
+ my @actual_seq = split //,(shift @_);
+ my @ref_seq = split //,(shift @_);
+
+ foreach (0..$#actual_seq){
+ ++$matches if ($actual_seq[$_] eq $ref_seq[$_]);
+ }
+ return my $hd = scalar @actual_seq - $matches;
+}
+
+
+### Getting rid of the bitwise comparison because even though the initial comparison is nice and quick, the regex loop looking for non-null bytes characters isn't. We might
+### as well do a substring loop to start with, which enables us to generate proper MD:Z: flags that also take proper care of InDels
+### 05 June 2014
+
+
+sub make_mismatch_string{
+ my ($actual_seq,$ref_seq,$cigar,$md_sequence) = @_;
+
+ my $MD_tag = "MD:Z:";
+ my $prev_matching = 0;
+ my $last_char;
+
+ my $ref_base;
+ my $actual_base;
+
+ foreach my $pos ( 0..(length$actual_seq) - 1 ){
+
+ $actual_base = substr($actual_seq,$pos,1);
+ $ref_base = substr($ref_seq,$pos,1);
+ # if ($verbose){ warn "reference: $ref_base\tseen base: $actual_base\n";}
+
+ if ( $actual_base eq $ref_base ){
+ ++$prev_matching;
+ }
+ else{
+ # If the mismatch is due to an insertion we simply move on, else we print the previously matching bases as well as the mismatching genomic base
+ if ($ref_base eq 'X'){
+ # if ($verbose){ warn "The genome base was an artificually padded '$ref_base' due to an insertion in the read at this position. Just ignoring it for the MD tag\n"; sleep(1);}
+ }
+ else{
+ # if ($verbose){ warn "previous matching bases: $prev_matching\n";}
+
+ ### There is a mismatch between the sequence and the genome. First we need to write out how may bases matched until now
+ if ($prev_matching == 0){
+ # if ($verbose){ warn "Got a mismatch either at the very start or next to another mismatch. Need to add a padding 0 as well as the mismatch\n";}
+ # if ($verbose){ warn "${prev_matching}$ref_base\n";}
+ $MD_tag .= $prev_matching;
+ $MD_tag .= $ref_base;
+ }
+ else{
+ # if ($verbose){ warn "${prev_matching}$ref_base\n";}
+ $MD_tag .= $prev_matching;
+ $MD_tag .= $ref_base;
+ }
+
+ $prev_matching = 0; # resetting $prev_matching
+ }
+
+ }
+
+ }
+ ### appending the number of matches one last time
+ $MD_tag .= $prev_matching;
+
+
+ ### If the read contains deletion(s) we need to take care of these in the MD-tag as well
+ if ($cigar =~ /D/){
+ my $deletions_total = 0;
+ while ($cigar =~ /D/g){
+ ++$deletions_total;
+ }
+ if ($verbose){ warn "Read contains $deletions_total deletions in total\n\n";}
+
+ if ($verbose){ warn "There was a deletion in the read!\n";}
+ if ($verbose){ warn "actual:\t$actual_seq\nref:\t$ref_seq\nMD-seq:\t$md_sequence\nMD-tag: $MD_tag\n";}
+
+ # parsing CIGAR string
+ my @len = split (/\D+/,$cigar); # storing the length per operation
+ my @ops = split (/\d+/,$cigar); # storing the operation
+ shift @ops; # remove the empty first element
+ die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+
+ my $MD_pos_so_far = 0;
+ my $deletions_processed = 0;
+ my $del_pos = 0;
+ my $deleted_bases = '';
+ my $new_MD = $1 if ($MD_tag =~ /MD:Z:(.*)/);
+ my $md_index_already_processed;
+
+ my @md = split //,$new_MD;
+
+ if ($verbose){ warn "New MD-tag: $new_MD\n\n";}
+ $MD_tag = "MD:Z:"; ### reconstituting a new MD-tag
+ $new_MD = ''; # using this to build up a new string that will replace the old \@md
+
+ if ($verbose){ warn "CIGAR string; $cigar\n";}
+ ### determining end position of a read
+ foreach my $index(0..$#len){
+
+ if ($ops[$index] eq 'M'){ # matching bases
+ $del_pos += $len[$index];
+ if ($verbose){ warn "Operation is 'M', adding $len[$index] bp\n";}
+ }
+ elsif($ops[$index] eq 'I'){ # insertion
+ $del_pos += $len[$index];
+ ### need to add insertions in the read to MD pos so far!
+ $MD_pos_so_far += $len[$index];
+ if ($verbose){ warn "Operation is 'I', adding $len[$index] bp\n";}
+ }
+ elsif($ops[$index] eq 'D'){ # deletion
+ if ($verbose){ warn "Operation is 'D', extracting $len[$index] bp\n";}
+ $deleted_bases = substr($md_sequence,$del_pos,$len[$index]);
+ if ($verbose){ warn "Deleted bases: $deleted_bases\n\n";}
+
+ ### Now we need to process the MD-tag so far and write out everything up until this point, inlcuding the deletion
+ if ($verbose){ warn "Now processing the MD-tag\n";}
+ my $op;
+
+ my $this_deletion_processed;
+ my $md_processed_so_far;
+ my $current_md_index;
+
+ foreach my $el (@md){
+
+ unless (defined $current_md_index){
+ $current_md_index = 0; # first element = index 0
+ }
+ else{
+ ++$current_md_index;
+ }
+
+ if ($md_index_already_processed and ($current_md_index <= $md_index_already_processed)){
+ if ($verbose){ warn "This has to be another deletion within the same read. Currently processing index $current_md_index, but have already processed $md_index_already_processed indexes previously\n";}
+ $new_MD .= $el;
+ next;
+ }
+
+ if ($verbose){ warn "Current element: $el\n";}
+ unless (defined $op){ # initialize
+ $op = $el;
+ if ($verbose){ warn "Initializing \$op as $op\n";}
+ next;
+ }
+
+ if ($deletions_processed == $deletions_total){
+ if ($verbose){ warn "Processed $deletions_processed in the read so far, out of $deletions_total total. Just appending elements until the end of the string: here $el\n";}
+ $MD_tag .= $el;
+ $new_MD .= $el;
+ next;
+ }
+ # this only occurs when there are more deletions in the read but we want to regenerate a new MD tag
+ if ($this_deletion_processed){
+ $new_MD .= $el;
+ next;
+ }
+
+ if ($op =~ /^\d+$/){
+ if ($verbose){ warn "Operation so far was a digit: $op\n";}
+ if ($el =~ /\d/){
+ $op .= $el;
+ if ($verbose){ warn "Appending current operation $el. New operation is: $op\n";}
+ next;
+ }
+ else{
+ if ($verbose){ warn "current element is a word character: $el\n";}
+
+ ### Need to determine if the matching operation length includes the deletion position
+ if ($verbose){ warn "Processing operation $op and adding it to MD pos which is so far: $MD_pos_so_far; deletion pos is $del_pos.\n";}
+ $MD_pos_so_far += $op;
+ if ($verbose){ warn "MD pos so far: $MD_pos_so_far\n";}
+ if ($MD_pos_so_far < $del_pos){
+ if ($verbose){ warn "Doesn't cover the deletion yet. Writing back out.\n";}
+ $MD_tag .= $op;
+ $new_MD .= $op;
+ if ($verbose){ warn "Setting new operation to: $el\n";}
+ $op = $el; # setting new $op
+ }
+ else{
+ if ($verbose){ warn "Here we go, this operation covers the deletion position!!\n";}
+ ### splitting up the number of matching bases in number before and after the deletion
+
+ my $pos_after_deletion = $MD_pos_so_far - $del_pos;
+ my $pos_before_deletion = $op - $pos_after_deletion;
+ if ($verbose){ warn "Splitting up previous operation '$op' into pos before deletion: ${pos_before_deletion} and pos_after_deletion: $pos_after_deletion\n";}
+ $MD_tag .= "${pos_before_deletion}^${deleted_bases}";
+ $new_MD .= "${pos_before_deletion}^${deleted_bases}${pos_after_deletion}";
+ if ($verbose){ warn "\$newMD after adjusting for the current deletion: $new_MD\n";}
+
+ #adjusting the MD_position by the number of bases after the deletion
+ $MD_pos_so_far -= $pos_after_deletion;
+ if ($verbose){ warn "MD after adjusting for deletion: $MD_pos_so_far\n"; }
+ ### also appending the current element because we are writing out the rest of the MD-string unchanged to $new_MD
+ $new_MD .= $el;
+
+ $deletions_processed += 1;
+ $this_deletion_processed = 1;
+
+ if ($deletions_processed == $deletions_total){ # this was the last deletion of the read
+ if ($verbose){ warn "This was the last deletion in the read ($deletions_processed out of $deletions_total total). Continuing to append \$pos_after_deletion (${pos_after_deletion})..\n";}
+ $MD_tag .= "${pos_after_deletion}";
+
+ ### also appending the current element because we are writing out the rest of the MD-string unchanged
+ if ($verbose){ warn "also appending the current element $el\n";}
+ $MD_tag .= $el;
+ ### Finally also adding the length of the deletion to $del_pos
+ $del_pos += $len[$index];
+ if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";}
+ }
+ else{
+ if ($verbose){ warn "This wasn't the last deletion in the read. Substituting the last operation with the current deletion and reconstituting \@md\n";}
+ if ($verbose){ warn "Adding length of deletion string '${pos_before_deletion}^${deleted_bases}' (",length("${pos_before_deletion}^${deleted_bases}")," - length of current operation (",length$op,") to current_md_index\n";}
+
+
+ ### This migh need looking at!!
+
+ $current_md_index = $current_md_index + length("${pos_before_deletion}^${deleted_bases}") - length$op;
+ if ($verbose){ warn "Current index = $current_md_index\n";}
+
+ if ($verbose){ warn "Setting \$md_index_already_processed to ",$current_md_index-1,"\n";}
+ $md_index_already_processed = $current_md_index - 1;
+
+ if ($verbose){ warn "Exiting now and waiting for the next deletion\n";}
+
+ ### Finally also adding the length of the deletion to $del_pos
+ $del_pos += $len[$index];
+ $MD_pos_so_far += $len[$index];
+ if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";}
+ if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
+ #setting $op to en empty string so it is not being processed as the last element
+ $op = '';
+ # last; # exiting the loop and processing the CIGAR string further until we hit the next deletion
+ }
+ }
+ }
+ if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
+ }
+ else{
+ if ($verbose){ warn "Operation so far was a word character: $op\n";}
+ if ($el =~ /\d+/){
+ # processing the previous mismatch position
+ $MD_tag .= $op;
+ $new_MD .= $op;
+ $MD_pos_so_far += length($op);
+ if ($verbose){ warn "Writing out mismatching base $op and adding length ",length($op),"\n";}
+ }
+ else{
+ # this should never occur since mismatches are followed by a 0 or another digit
+ die "current element is a another word character: $el. This should never happen!\n";
+ }
+ if ($verbose){ warn "Setting new operation to: $el\n";}
+ $op = $el; # setting new $op
+ if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
+ }
+ }
+
+ ### need to consider last element if it was a digit or number and we are expecting the deletion in the last element of the MD-tag
+ if ($op =~ /\d+/ and $deletions_processed < $deletions_total){
+ if ($verbose){ warn "\n\nlast operation was $op\n";}
+ if ($verbose){ warn "Processing operation $op; deletion pos is $del_pos. MD so far was: $MD_pos_so_far\n";}
+
+ $MD_pos_so_far += $op;
+ if ($verbose){ warn "Adding $op to MD pos so far: $MD_pos_so_far\n";}
+ if ($verbose){ warn "Deletions already processed: $deletions_processed, del total: $deletions_total\n\n";}
+ if ($MD_pos_so_far >= $del_pos){
+ if ($verbose){ warn "Here we go, this operation covers the deletion position!!\n";}
+ ### splitting up the number of matching bases in number before and after the deletion
+
+ my $pos_after_deletion = $MD_pos_so_far - $del_pos;
+ my $pos_before_deletion = $op - $pos_after_deletion;
+ if ($verbose){ warn "Splitting up previous operation '$op' into pos before deletion: ${pos_before_deletion} and pos_after_deletion: $pos_after_deletion\n";}
+
+ $MD_tag .= "${pos_before_deletion}^${deleted_bases}";
+ $new_MD .= "${pos_before_deletion}^${deleted_bases}${pos_after_deletion}";
+
+ #adjusting the MD_position by the number of bases after the deletion
+ $MD_pos_so_far -= $pos_after_deletion;
+ if ($verbose){ warn "MD after adjusting for deletion: $MD_pos_so_far\n"; }
+
+ $deletions_processed += 1;
+ $this_deletion_processed = 1;
+
+ if ($deletions_processed == $deletions_total){ # this was the last deletion of the read
+ if ($verbose){ warn "This was the last deletion in the read ($deletions_processed out of $deletions_total total). Continuing to append \$pos_after_deletion (${pos_after_deletion})..\n";}
+ $MD_tag .= "${pos_after_deletion}";
+
+ }
+ else{
+ if ($verbose){ warn "This wasn't the last deletion in the read. Substituting the last operation with the current deletion and reconstituting \@md\n";}
+ if ($verbose){ warn "Adding length of deletion string '${pos_before_deletion}^${deleted_bases}' (",length("${pos_before_deletion}^${deleted_bases}")," - length of current operation (",length$op,") to current_md_index\n";}
+
+ $current_md_index = $current_md_index + length("${pos_before_deletion}^${deleted_bases}") - length$op;
+ if ($verbose){ warn "Current index = $current_md_index\n";}
+
+ if ($verbose){ warn "Setting \$md_index_already_processed to ",$current_md_index-1,"\n";}
+ # since we are no longer in the loop we don't have to subtract 1 from $current_md_index (tit hasn't been incremented in the first place...)
+ $md_index_already_processed = $current_md_index;
+
+ if ($verbose){ warn "Exiting now and waiting for the next deletion\n";}
+
+ $MD_pos_so_far += $len[$index];
+ if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
+ }
+ ### Finally also adding the length of the deletion to $del_pos
+ $del_pos += $len[$index];
+ if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";}
+ }
+ else{
+ die "Something went wrong, we haven't seen a deletion so far even though we should have...\n\n";
+ }
+ }
+
+ # forming a new @md
+ @md = split //,$new_MD;
+ $new_MD = '';
+ if ($verbose){ warn "New \@md array: @md\n\n";}
+ if ($verbose){ warn "MD-tag so far: $MD_tag ~~\nnew_MD so far: $new_MD\n\n";}
+
+ }
+ else{
+ die "Found CIGAR operations other than M, I, D or N: '$ops[$index]'. Not allowed at the moment\n";
+ }
+ }
+
+ }
+ if ($verbose){ warn "Returning MD-tag: $MD_tag\n";}
+ return $MD_tag;
+
+}
+
+### Getting rid of the bitwise comparison because even though the initial comparison is nice and quick, the regex loop looking for non-null bytes characters isn't. We might
+### as well do a substring loop to start with, which enables us to generate proper MD:Z: flags that also take proper care of InDels
+# sub make_mismatch_string{
+# my $actual_seq = shift or die "Missing actual sequence\n";
+# my $ref_seq = shift or die "Missing reference sequence\n";
+# my $XX_tag = "XX:Z:";
+
+# my $tmp = ($actual_seq ^ $ref_seq); # Bitwise comparison
+
+# warn "'$tmp'\n"; sleep(1);
+# my $prev_mm_pos = 0;
+
+# while($tmp =~ /[^\0]/g){ # Where bitwise comparison showed a difference
+# my $nuc_match = pos($tmp) - $prev_mm_pos - 1; # Generate number of nucleotide that matches since last mismatch
+# my $nuc_mm = substr($ref_seq, pos($tmp) - 1, 1) if pos($tmp) <= length($ref_seq); # Obtain reference nucleotide that was different from the actual read
+# $XX_tag .= "$nuc_match" if $nuc_match > 0; # Ignore if mismatches are adjacent to each other
+# $XX_tag .= "$nuc_mm" if defined $nuc_mm; # Ignore if there is no mismatch (prevents uninitialized string concatenation)
+# $prev_mm_pos = pos($tmp); # Position of last mismatch
+# }
+# my $end_matches = length($ref_seq) - $prev_mm_pos; # Provides number of matches from last mismatch till end of sequence
+# $XX_tag .= "$end_matches" if $end_matches > 0; # Ignore if mismatch is at the end of sequence
+# return $XX_tag;
+# }
+
+
+
+sub print_helpfile{
+ print << "HOW_TO";
+
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+
+
+
+DESCRIPTION
+
+
+The following is a brief description of command line options and arguments to control the Bismark
+bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the
+reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand
+version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
+Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome
+(C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
+forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2)
+are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original
+sequence from the genome and determine if there were any protected C's present or not.
+
+As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
+re-enabled by using --non_directional.
+
+The final output of Bismark is in SAM format by default. For Bowtie 1 one can alos choose to report the old
+'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best
+alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below.
+
+
+USAGE: bismark [options] {-1 -2 | }
+
+
+ARGUMENTS:
+
+ The path to the folder containing the unmodified reference genome
+ as well as the subfolders created by the Bismark_Genome_Preparation
+ script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/).
+ Bismark expects one or more fastA files in this folder (file extension: .fa
+ or .fasta). The path can be relative or absolute.
+
+-1 Comma-separated list of files containing the #1 mates (filename usually includes
+ "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
+ correspond file-for-file and read-for-read with those specified in .
+ Reads may be a mix of different lengths. Bismark will produce one mapping result
+ and one report file per paired-end input file pair.
+
+-2 Comma-separated list of files containing the #2 mates (filename usually includes
+ "_2"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
+ correspond file-for-file and read-for-read with those specified in .
+ Reads may be a mix of different lengths.
+
+ A comma- or space-separated list of files containing the reads to be aligned (e.g.
+ lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
+ produce one mapping result and one report file per input file.
+
+
+OPTIONS:
+
+
+Input:
+
+-q/--fastq The query input files (specified as , or are FASTQ
+ files (usually having extension .fg or .fastq). This is the default. See also
+ --solexa-quals.
+
+-f/--fasta The query input files (specified as , or are FASTA
+ files (usually havin extension .fa, .mfa, .fna or similar). All quality values
+ are assumed to be 40 on the Phred scale. FASTA files are expected to contain both
+ the read name and the sequence on a single line (and not spread over several lines).
+
+-s/--skip Skip (i.e. do not align) the first reads or read pairs from the input.
+
+-u/--upto Only aligns the first reads or read pairs from the input. Default: no limit.
+
+--phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on.
+
+--phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off.
+
+--solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled
+ (which can't). The formula for conversion is:
+ phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This
+ is usually the right option for use with (unconverted) reads emitted by the GA
+ Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off.
+
+--solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted)
+ reads emitted by GA Pipeline version 1.3 or later. Default: off.
+
+--path_to_bowtie The full path to the Bowtie (1 or 2) installation on your system. If not
+ specified it is assumed that Bowtie (1 or 2) is in the PATH.
+
+
+Alignment:
+
+-n/--seedmms The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
+ of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the
+ default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N).
+
+-l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to
+ which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for
+ larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L).
+
+-e/--maqerr Maximum permitted total of quality values at all mismatched read positions throughout
+ the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds
+ quality values to the nearest 10 and saturates at 30. This value is not relevant for
+ Bowtie 2.
+
+--chunkmbs The number of megabytes of memory a given thread is given to store path descriptors in
+ --best mode. Best-first search must keep track of many paths at once to ensure it is
+ always extending the path with the lowest cumulative cost. Bowtie tries to minimize the
+ memory impact of the descriptors, but they can still grow very large in some cases. If
+ you receive an error message saying that chunk memory has been exhausted in --best mode,
+ try adjusting this parameter up to dedicate more memory to the descriptors. This value
+ is not relevant for Bowtie 2. Default: 512.
+
+-I/--minins The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and
+ a paired-end alignment consists of two 20-bp alignments in the appropriate orientation
+ with a 20-bp gap between them, that alignment is considered valid (as long as -X is also
+ satisfied). A 19-bp gap would not be valid in that case. Default: 0.
+
+-X/--maxins The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and
+ a paired-end alignment consists of two 20-bp alignments in the proper orientation with a
+ 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied).
+ A 61-bp gap would not be valid in that case. Default: 500.
+
+--multicore Sets the number of parallel instances of Bismark to be run concurrently. This forks the
+ Bismark alignment step very early on so that each individual Spawn of Bismark processes
+ only every n-th sequence (n being set by --multicore). Once all processes have completed,
+ the individual BAM files, mapping reports, unmapped or ambiguous FastQ files are merged
+ into single files in very much the same way as they would have been generated running Bismark
+ conventionally with only a single instance.
+
+ If system resources are plentiful this is a viable option to speed up the alignment process
+ (we observed a near linear speed increase for up to --multicore 8 tested). However, please note
+ that a typical Bismark run will use several cores already (Bismark itself, 2 or 4 threads of
+ Bowtie/Bowtie2, Samtools, gzip etc...) and ~10-16GB of memory depending on the choice of aligner
+ and genome. WARNING: Bismark Parallel (BP?) is resource hungry! Each value of --multicore specified
+ will effectively lead to a linear increase in compute and memory requirements, so --multicore 4 for
+ e.g. the GRCm38 mouse genome will probably use ~20 cores and eat ~40GB or RAM, but at the same time
+ reduce the alignment time to ~25-30%. You have been warned.
+
+
+
+Bowtie 1 Reporting:
+
+-k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option
+ will be used by default.
+
+--best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum
+ (i.e. number of mismatches, or mismatches in the seed in the case if -n mode) and in
+ terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred
+ quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both
+ have Phred quality 10. When --best is not specified, Bowtie may report alignments that
+ are sub-optimal in terms of stratum and/or quality (though an effort is made to report
+ the best alignment). --best mode also removes all strand bias. Note that --best does not
+ affect which alignments are considered "valid" by Bowtie, only which valid alignments
+ are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified.
+ Default: on.
+
+--no_best Disables the --best option which is on by default. This can speed up the alignment process,
+ e.g. for testing purposes, but for credible results it is not recommended to disable --best.
+
+
+Output:
+
+--non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four
+ bisulfite strands will be reported. Default: OFF.
+
+ (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
+ to the original strands are merely theoretical and should not exist in reality. Specifying directional
+ alignments (which is the default) will only run 2 alignment threads to the original top (OT)
+ or bottom (OB) strands in parallel and report these alignments. This is the recommended option
+ for sprand-specific libraries).
+
+--pbat This options may be used for PBAT-Seq libraries (Post-Bisulfite Adapter Tagging; Kobayashi et al.,
+ PLoS Genetics, 2012). This is essentially the exact opposite of alignments in 'directional' mode,
+ as it will only launch two alignment threads to the CTOT and CTOB strands instead of the normal OT
+ and OB ones. Use this option only if you are certain that your libraries were constructed following
+ a PBAT protocol (if you don't know what PBAT-Seq is you should not specify this option). The option
+ --pbat works only for FastQ files (in both Bowtie and Bowtie 2 mode) and using uncompressed
+ temporary files only).
+
+--sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are
+ split up into several smaller files to run concurrently and the output files are to be merged.
+
+--quiet Print nothing besides alignments.
+
+--vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead
+ of SAM format output.
+
+-un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will
+ appear as they did in the input, without any translation of quality values that may have
+ taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1
+ and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and unmapped_reads_2.txt. Reads
+ with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping)
+ are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well.
+
+--ambiguous Write all reads which produce more than one valid alignment with the same number of lowest
+ mismatches or other reads that fail to align uniquely to a file in the output directory.
+ Written reads will appear as they did in the input, without any of the translation of quality
+ values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two
+ parallel files with _1 and _2 inserted in theit filenames, i.e. _ambiguous_reads_1.txt and
+ _ambiguous_reads_2.txt. These reads are not written to the file specified with --un.
+
+-o/--output_dir Write all output files into this directory. By default the output files will be written into
+ the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt
+ to create it first. The path to the output folder can be either relative or absolute.
+
+--temp_dir Write temporary files to this directory instead of into the same directory as the input files. If
+ the specified folder does not exist, Bismark will attempt to create it first. The path to the
+ temporary folder can be either relative or absolute.
+
+--non_bs_mm Optionally outputs an extra column specifying the number of non-bisulfite mismatches a read during the
+ alignment step. This option is only available for SAM format. In Bowtie 2 context, this value is
+ just the number of actual non-bisulfite mismatches and ignores potential insertions or deletions.
+ The format for single-end reads and read 1 of paired-end reads is 'XA:Z:number of mismatches'
+ and 'XB:Z:number of mismatches' for read 2 of paired-end reads.
+
+--gzip Temporary bisulfite conversion files will be written out in a GZIP compressed form to save disk
+ space. This option is available for most alignment modes but is not available for paired-end FastA
+ files. This option might be somewhat slower than writing out uncompressed files, but this awaits
+ further testing.
+
+--sam The output will be written out in SAM format instead of the default BAM format. Bismark will
+ attempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't
+ been specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found,
+ the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file).
+
+--samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
+ explicitly if Samtools is in the PATH already.
+
+--prefix Prefixes to the output filenames. Trailing dots will be replaced by a single one. For
+ example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc.
+
+-B/--basename Write all output to files starting with this base file name. For example, '--basename foo'
+ would result in the files 'foo.sam' and 'foo_SE_report.txt' (or its paired-end equivalent). Takes
+ precedence over --prefix.
+
+--old_flag Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition,
+ this options appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since
+ both the appended read IDs and custom FLAG values may cause problems with some downstream tools
+ such as Picard, new defaults were implemented as of version 0.8.3.
+
+
+ default old_flag
+ =================== ===================
+ Read 1 Read 2 Read 1 Read 2
+
+ OT: 99 147 67 131
+
+ OB: 83 163 115 179
+
+ CTOT: 99 147 67 131
+
+ CTOB: 83 163 115 179
+
+
+Other:
+
+-h/--help Displays this help file.
+
+-v/--version Displays version information.
+
+
+BOWTIE 2 SPECIFIC OPTIONS
+
+--bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
+ alignments, i.e. searches for alignments involving all read characters (also called
+ untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
+ and/or quality trimmed where appropriate. Both small (.bt2) and large (.bt2l) Bowtie 2
+ indexes are supported. Default: off.
+
+Bowtie 2 alignment options:
+
+-N Sets the number of mismatches to allowed in a seed alignment during multiseed alignment.
+ Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower)
+ but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for
+ Bowtie 1 see -n).
+
+-L Sets the length of the seed substrings to align during multiseed alignment. Smaller values
+ make alignment slower but more senstive. Default: the --sensitive preset of Bowtie 2 is
+ used by default, which sets -L to 20. This option is only available for Bowtie 2 (for
+ Bowtie 1 see -l).
+
+--ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched
+ position to be the highest possible, regardless of the actual value. I.e. input is treated
+ as though all quality values are high. This is also the default behavior when the input
+ doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default.
+
+
+Bowtie 2 paired-end options:
+
+--no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if
+ it cannot find a concordant or discordant alignment for a pair. This option is invariable and
+ and on by default.
+
+--no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments.
+ A discordant alignment is an alignment where both mates align uniquely, but that does not
+ satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior
+ and it is on by default.
+
+
+Bowtie 2 effort options:
+
+-D Up to consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using
+ the alignments found so far. A seed extension "fails" if it does not yield a new best or a
+ new second-best alignment. Default: 15.
+
+-R is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds.
+ When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of
+ mismatches allowed) at different offsets and searches for more alignments. A read is considered
+ to have repetitive seeds if the total number of seed hits divided by the number of seeds
+ that aligned at least once is greater than 300. Default: 2.
+
+Bowtie 2 parallelization options:
+
+
+-p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores
+ and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
+ parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
+ E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
+ by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
+ library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
+ automatically use the option '--reorder', which guarantees that output SAM records are printed in
+ an order corresponding to the order of the reads in the original input file, even when -p is set
+ greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
+ setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory then
+ if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
+ correspond to input order in that case.
+
+Bowtie 2 Scoring options:
+
+--score_min Sets a function governing the minimum alignment score needed for an alignment to be considered
+ "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying
+ L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length.
+ See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is
+ L,0,-0.2.
+
+--rdg , Sets the read gap open () and extend () penalties. A read gap of length N gets a penalty
+ of + N * . Default: 5, 3.
+
+--rfg , Sets the reference gap open () and extend () penalties. A reference gap of length N gets
+ a penalty of + N * . Default: 5, 3.
+
+
+Bowtie 2 Reporting options:
+
+-most_valid_alignments This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is
+ deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the
+ default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the
+ effort expended to find valid alignments.
+
+ For reference, this used to be the old (now deprecated) description of -M:
+ Bowtie 2 searches for at most +1 distinct, valid alignments for each read. The search terminates when it
+ can't find more distinct valid alignments, or when it finds +1 distinct alignments, whichever
+ happens first. Only the best alignment is reported. Information from the other alignments is used to
+ estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes
+ Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that
+ aligns many places. For reads that have more than +1 distinct, valid alignments, Bowtie 2 does not
+ guarantee that the alignment reported is the best possible in terms of alignment score. -M is
+ always used and its default value is set to 10.
+
+
+'VANILLA' Bismark OUTPUT:
+
+Single-end output format (tab-separated):
+
+ (1)
+ (2)
+ (3)
+ (4)
+ (5)
+ (6)
+ (7)
+ (8)
+ (9)
+(11)
+
+
+Paired-end output format (tab-separated):
+ (1)
+ (2)
+ (3)
+ (4)
+ (5)
+ (6)
+ (7)
+ (8)
+ (9)
+(10)
+(11)
+(12)
+(14)
+(15)
+
+
+Bismark SAM OUTPUT (default):
+
+ (1) QNAME (seq-ID)
+ (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!))
+ (3) RNAME (chromosome)
+ (4) POS (start position)
+ (5) MAPQ (always 255 for use with Bowtie)
+ (6) CIGAR
+ (7) RNEXT
+ (8) PNEXT
+ (9) TLEN
+(10) SEQ
+(11) QUAL (Phred33 scale)
+(12) NM-tag (edit distance to the reference)
+(13) MD-tag (base-by-base mismatches to the reference (handles indels)
+(14) XM-tag (methylation call string)
+(15) XR-tag (read conversion state for the alignment)
+(16) XG-tag (genome conversion state for the alignment)
+(17) XA/XB-tag (non-bisulfite mismatches) (optional!)
+
+Each read of paired-end alignments is written out in a separate line in the above format.
+
+
+Last edited on 06 May 2015.
+
+HOW_TO
+}
diff -r 047eb877b6f0 -r 4084128e7cca new/bismark_genome_preparation
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/new/bismark_genome_preparation Wed Sep 02 15:55:46 2015 -0400
@@ -0,0 +1,468 @@
+#!/usr/bin/perl --
+use strict;
+use warnings;
+use Cwd;
+# use File::Path qw(rmtree);
+$|++;
+
+
+## This program is Copyright (C) 2010-15, Felix Krueger (felix.krueger@babraham.ac.uk)
+
+## This program is free software: you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation, either version 3 of the License, or
+## (at your option) any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+
+## You should have received a copy of the GNU General Public License
+## along with this program. If not, see .
+
+use Getopt::Long;
+use Cwd;
+
+my $verbose;
+my $help;
+my $version;
+my $man;
+my $path_to_bowtie;
+my $multi_fasta;
+my $single_fasta;
+my $bowtie2;
+
+my $bismark_version = 'v0.14.3';
+
+GetOptions ('verbose' => \$verbose,
+ 'help' => \$help,
+ 'man' => \$man,
+ 'version' => \$version,
+ 'path_to_bowtie:s' => \$path_to_bowtie,
+ 'single_fasta' => \$single_fasta,
+ 'bowtie2' => \$bowtie2,
+ );
+
+if ($help or $man){
+ print_helpfile();
+ exit;
+}
+
+if ($version){
+ print << "VERSION";
+
+ Bismark - Bisulfite Mapper and Methylation Caller.
+
+ Bismark Genome Preparation Version: $bismark_version
+ Copyright 2010-15 Felix Krueger, Babraham Bioinformatics
+ www.bioinformatics.babraham.ac.uk/projects/
+
+VERSION
+ exit;
+}
+
+my $genome_folder = shift @ARGV; # mandatory
+my %chromosomes; # checking if chromosome names are unique (required)
+
+# Ensuring a genome folder has been specified
+if ($genome_folder){
+ unless ($genome_folder =~ /\/$/){
+ $genome_folder =~ s/$/\//;
+ }
+ $verbose and print "Path to genome folder specified as: $genome_folder\n";
+ chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
+
+ # making the genome folder path abolsolute so it won't break if the path was specified relative
+ $genome_folder = getcwd;
+ unless ($genome_folder =~ /\/$/){
+ $genome_folder =~ s/$/\//;
+ }
+}
+else{
+ die "Please specify a genome folder to be used for bisulfite conversion\n\n";
+}
+
+
+my $CT_dir;
+my $GA_dir;
+
+
+if ($single_fasta){
+ print "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n";
+ $multi_fasta = 0;
+}
+else{
+ print "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n";
+ $single_fasta = 0;
+ $multi_fasta = 1;
+}
+
+my @filenames = create_bisulfite_genome_folders();
+
+process_sequence_files ();
+
+launch_bowtie_indexer();
+
+sub launch_bowtie_indexer{
+ if ($bowtie2){
+ print "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n";
+ }
+ else{
+ print "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n";
+ }
+ print "Please be aware that this process can - depending on genome size - take up to several hours!\n";
+ sleep(5);
+
+ ### if the path to bowtie was specfified explicitely
+ if ($path_to_bowtie){
+ if ($bowtie2){
+ $path_to_bowtie =~ s/$/bowtie2-build/;
+ }
+ else{
+ $path_to_bowtie =~ s/$/bowtie-build/;
+ }
+ }
+ ### otherwise we assume that bowtie-build is in the path
+ else{
+ if ($bowtie2){
+ $path_to_bowtie = 'bowtie2-build';
+ }
+ else{
+ $path_to_bowtie = 'bowtie-build';
+ }
+ }
+
+ $verbose and print "\n";
+
+ ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer)
+ my $pid = fork();
+
+ # parent process
+ if ($pid){
+ sleep(1);
+ chdir $CT_dir or die "Unable to change directory: $!\n";
+ $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n";
+ my @fasta_files = <*.fa>;
+ my $file_list = join (',',@fasta_files);
+ $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n";
+ $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n";
+
+ sleep (11);
+ exec ("$path_to_bowtie","-f","$file_list","BS_CT");
+ }
+
+ # child process
+ elsif ($pid == 0){
+ sleep(2);
+ chdir $GA_dir or die "Unable to change directory: $!\n";
+ $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n";
+ my @fasta_files = <*.fa>;
+ my $file_list = join (',',@fasta_files);
+ $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n";
+ $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n";
+ $verbose and print "(starting in 10 seconds)\n";
+ sleep(10);
+ exec ("$path_to_bowtie","-f","$file_list","BS_GA");
+ }
+
+ # if the platform doesn't support the fork command we will run the indexing processes one after the other
+ else{
+ print "Forking process was not successful, therefore performing the indexing sequentially instead\n";
+ sleep(10);
+
+ ### moving to CT genome folder
+ $verbose and warn "Preparing to index CT converted genome in $CT_dir\n";
+ chdir $CT_dir or die "Unable to change directory: $!\n";
+ my @fasta_files = <*.fa>;
+ my $file_list = join (',',@fasta_files);
+ $verbose and print "$file_list\n\n";
+ sleep(2);
+ system ("$path_to_bowtie","-f","$file_list","BS_CT");
+ @fasta_files=();
+ $file_list= '';
+
+ ### moving to GA genome folder
+ $verbose and warn "Preparing to index GA converted genome in $GA_dir\n";
+ chdir $GA_dir or die "Unable to change directory: $!\n";
+ @fasta_files = <*.fa>;
+ $file_list = join (',',@fasta_files);
+ $verbose and print "$file_list\n\n";
+ sleep(2);
+ exec ("$path_to_bowtie","-f","$file_list","BS_GA");
+ }
+}
+
+
+sub process_sequence_files {
+
+ my ($total_CT_conversions,$total_GA_conversions) = (0,0);
+ $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n";
+ sleep (3);
+
+ $verbose and print "conversions performed:\n";
+ $verbose and print join("\t",'chromosome','C->T','G->A'),"\n";
+
+
+ ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file
+ ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle
+ ### This is now the default option
+
+ if ($multi_fasta){
+ ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files
+ my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa";
+ open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
+
+ my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa";
+ open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
+ }
+
+ foreach my $filename(@filenames){
+ my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
+ open (IN,$filename) or die "Failed to read from sequence file $filename $!\n";
+ # warn "Reading chromosome information from $filename\n\n";
+
+ ### first line needs to be a fastA header
+ my $first_line = ;
+ chomp $first_line;
+
+ ### Extracting chromosome name from the FastA header
+ my $chromosome_name = extract_chromosome_name($first_line);
+
+ ### Exiting if a chromosome with the same name was present already
+ if (exists $chromosomes{$chromosome_name}){
+ die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
+ }
+ else{
+ $chromosomes{$chromosome_name}++;
+ }
+
+ ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
+ unless ($multi_fasta){
+ my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
+ $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
+ open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
+
+ my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
+ $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
+ open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
+ }
+
+ print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry
+ print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry
+
+
+ while (){
+
+ ### in case the line is a new fastA header
+ if ($_ =~ /^>/){
+ ### printing out the stats for the previous chromosome
+ $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
+ ### resetting the chromosome transliteration counters
+ ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0);
+
+ ### Extracting chromosome name from the additional FastA header
+ $chromosome_name = extract_chromosome_name($_);
+
+ ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes.
+ unless ($multi_fasta){
+ my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name";
+ $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/;
+ open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n";
+
+ my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name";
+ $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/;
+ open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n";
+ }
+
+ print CT_CONVERT ">",$chromosome_name,"_CT_converted\n";
+ print GA_CONVERT ">",$chromosome_name,"_GA_converted\n";
+ }
+
+ else{
+ my $sequence = uc$_;
+
+ ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group)
+
+ $sequence =~ s/[^ATCGN\n\r]/N/g;
+
+ ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion)
+
+ my $CT_sequence = $sequence;
+ my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts
+ $total_CT_conversions += $CT_transliterations_performed;
+ $chromosome_CT_conversions += $CT_transliterations_performed;
+
+ print CT_CONVERT $CT_sequence;
+
+ ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse-
+ ### complementing the forward strand and then C->T converting it)
+
+ my $GA_sequence = $sequence;
+ my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand
+ $total_GA_conversions += $GA_transliterations_performed;
+ $chromosome_GA_conversions += $GA_transliterations_performed;
+
+ print GA_CONVERT $GA_sequence;
+
+ }
+ }
+ $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n";
+ }
+ close (CT_CONVERT) or die "Failed to close filehandle: $!\n";
+ close (GA_CONVERT) or die "Failed to close filehandle: $!\n";
+
+
+ print "\nTotal number of conversions performed:\n";
+ print "C->T:\t$total_CT_conversions\n";
+ print "G->A:\t$total_GA_conversions\n";
+
+ warn "\nStep II - Genome bisulfite conversions - completed\n\n\n";
+}
+
+sub extract_chromosome_name {
+
+ my $header = shift;
+
+ ## Bowtie extracts the first string after the initial > in the FASTA file, so we are doing this as well
+
+ if ($header =~ s/^>//){
+ my ($chromosome_name) = split (/\s+/,$header);
+ return $chromosome_name;
+ }
+ else{
+ die "The specified chromosome file doesn't seem to be in FASTA format as required! $!\n";
+ }
+}
+
+sub create_bisulfite_genome_folders{
+
+ $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n";
+
+ if ($path_to_bowtie){
+ unless ($path_to_bowtie =~ /\/$/){
+ $path_to_bowtie =~ s/$/\//;
+ }
+ if (chdir $path_to_bowtie){
+ if ($bowtie2){
+ $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n";
+ }
+ else{
+ $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n";
+ }
+ }
+ else{
+ die "There was an error with the path to bowtie: $!\n";
+ }
+ }
+
+ chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!";
+
+
+ # Exiting unless there are fastA files in the folder
+ my @filenames = <*.fa>;
+
+ ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
+ unless (@filenames){
+ @filenames = <*.fasta>;
+ }
+
+ unless (@filenames){
+ die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n";
+ }
+
+ warn "Bisulfite Genome Indexer version $bismark_version (last modified 19 Sept 2013)\n\n";
+ sleep (3);
+
+ # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists
+ my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/";
+ unless (-d $bisulfite_dir){
+ mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n";
+ $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n";
+ }
+ else{
+ print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indices will be overwritten!\n\n";
+ sleep(5);
+ }
+
+ chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n";
+ $CT_dir = "${bisulfite_dir}CT_conversion/";
+ $GA_dir = "${bisulfite_dir}GA_conversion/";
+
+ # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion)
+ # converted version of the genome
+ unless (-d $CT_dir){
+ mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n";
+ $verbose and print "Created Bisulfite Genome folder $CT_dir\n";
+ }
+ unless (-d $GA_dir){
+ mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n";
+ $verbose and print "Created Bisulfite Genome folder $GA_dir\n";
+ }
+
+ # moving back to the original genome folder
+ chdir $genome_folder or die "Could't move to directory $genome_folder $!";
+ # $verbose and print "Moved back to genome folder folder $genome_folder\n";
+ warn "\nStep I - Prepare genome folders - completed\n\n\n";
+ return @filenames;
+}
+
+sub print_helpfile{
+ print << 'HOW_TO';
+
+
+DESCRIPTION
+
+This script is supposed to convert a specified reference genome into two different bisulfite
+converted versions and index them for alignments with Bowtie 1 (default), or Bowtie 2. The first
+bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs
+converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference
+genome folder. Once the bisulfite conversion has been completed the program will fork and launch
+two simultaneous instances of the Bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware
+that the indexing process can take up to several hours; this will mainly depend on genome size
+and system resources.
+
+
+
+The following is a brief description of command line options and arguments to control the
+Bismark Genome Preparation:
+
+
+USAGE: bismark_genome_preparation [options]
+
+
+OPTIONS:
+
+--help/--man Displays this help filea and exits.
+
+--version Displays version information and exits.
+
+--verbose Print verbose output for more details or debugging.
+
+--path_to_bowtie The full path to the Bowtie 1 or Bowtie 2 installation on your system
+ (depending on which aligner/indexer you intend to use). Unless this path
+ is specified it is assumed that Bowtie is in the PATH.
+
+--bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: Bowtie 1).
+
+--single_fasta Instruct the Bismark Indexer to write the converted genomes into
+ single-entry FastA files instead of making one multi-FastA file (MFA)
+ per chromosome. This might be useful if individual bisulfite converted
+ chromosomes are needed (e.g. for debugging), however it can cause a
+ problem with indexing if the number of chromosomes is vast (this is likely
+ to be in the range of several thousand files; the operating system can
+ only handle lists up to a certain length, and some newly assembled
+ genomes may contain 20000-50000 contigs of scaffold files which do exceed
+ this list length limit).
+
+
+ARGUMENTS:
+
+ The path to the folder containing the genome to be bisulfite converted.
+ The Bismark Genome Preparation expects one or more fastA files in the folder
+ (with the file extension: .fa or .fasta). Specifying this path is mandatory.
+
+
+This script was last modified on 16 Oct 2014.
+HOW_TO
+}
diff -r 047eb877b6f0 -r 4084128e7cca new/bismark_methylation_extractor
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/new/bismark_methylation_extractor Wed Sep 02 15:55:46 2015 -0400
@@ -0,0 +1,5875 @@
+#!/usr/bin/perl
+use warnings;
+use strict;
+$|++;
+use Getopt::Long;
+use Cwd;
+use Carp;
+use FindBin qw($Bin);
+use lib "$Bin/../lib";
+
+## This program is Copyright (C) 2010-15, Felix Krueger (felix.krueger@babraham.ac.uk)
+
+## This program is free software: you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation, either version 3 of the License, or
+## (at your option) any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+
+## You should have received a copy of the GNU General Public License
+## along with this program. If not, see .
+
+my @filenames; # input files
+my %counting;
+my $parent_dir = getcwd();
+
+my %fhs;
+
+my $version = 'v0.14.3';
+my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore) = process_commandline();
+
+
+### only needed for bedGraph output
+my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files
+my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total
+my @bedfiles;
+
+### only needed for genome-wide cytosine methylation report
+my %chromosomes;
+
+my %mbias_1;
+my %mbias_2;
+
+
+##############################################################################################
+### Summarising Run Parameters
+##############################################################################################
+
+### METHYLATION EXTRACTOR
+
+warn "Summarising Bismark methylation extractor parameters:\n";
+warn '='x63,"\n";
+
+if ($single){
+ if ($vanilla){
+ warn "Bismark single-end vanilla format specified\n";
+ }
+ else{
+ warn "Bismark single-end SAM format specified (default)\n"; # default
+ }
+}
+elsif ($paired){
+ if ($vanilla){
+ warn "Bismark paired-end vanilla format specified\n";
+ }
+ else{
+ warn "Bismark paired-end SAM format specified (default)\n"; # default
+ }
+}
+
+warn "Number of cores to be used: $multicore\n";
+
+if ($single){
+ if ($ignore){
+ warn "First $ignore bp will be disregarded when processing the methylation call string\n";
+ }
+ if ($ignore_3prime){
+ warn "Last $ignore_3prime bp will be disregarded when processing the methylation call string\n";
+ }
+
+}
+else{ ## paired-end
+ if ($ignore){
+ warn "First $ignore bp will be disregarded when processing the methylation call string of Read 1\n";
+ }
+ if ($ignore_r2){
+ warn "First $ignore_r2 bp will be disregarded when processing the methylation call string of Read 2\n";
+ }
+
+ if ($ignore_3prime){
+ warn "Last $ignore_3prime bp will be disregarded when processing the methylation call string of Read 1\n";
+ }
+ if ($ignore_3prime_r2){
+ warn "Last $ignore_3prime_r2 bp will be disregarded when processing the methylation call string of Read 2\n";
+ }
+
+
+}
+
+
+if ($full){
+ warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n";
+}
+if ($merge_non_CpG){
+ warn "Merge CHG and CHH context to non-CpG context specified\n";
+}
+### output directory
+if ($output_dir eq ''){
+ warn "Output will be written to the current directory ('$parent_dir')\n";
+}
+else{
+ warn "Output path specified as: $output_dir\n";
+}
+
+
+sleep (1);
+
+### BEDGRAPH
+
+if ($bedGraph){
+ warn "\n\nSummarising bedGraph parameters:\n";
+ warn '='x63,"\n";
+
+ if ($counts){
+ warn "Generating additional output in bedGraph and coverage format\nbedGraph format:\t\ncoverage format:\t\n\n";
+ }
+ else{
+ warn "Generating additional sorted output in bedGraph format (output format: )\n";
+ }
+
+ ### Zero-based coordinates
+ if ($zero){
+ warn "Writing out an additional coverage file (ending in zero.cov) with 0-based start and 1-based end genomic coordinates (zero-based, half-open; user-defined)\n";
+ }
+
+ warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n";
+
+ if ($CX_context){
+ warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n";
+ }
+ else{ # default
+ $CpG_only = 1;
+ warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n";
+ }
+
+ if ($remove){
+ warn "White spaces in read ID names will be removed prior to sorting\n";
+ }
+
+ if ($ample_mem){
+ warn "Sorting chromosomal postions for the bedGraph step using arrays instead of using UNIX sort\n";
+ }
+ elsif (defined $sort_size){
+ warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n";
+ }
+ else{
+ warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n";
+ }
+
+
+
+ sleep (1);
+
+ if ($cytosine_report){
+ warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n";
+ warn '='x63,"\n";
+ warn "Generating comprehensive genome-wide cytosine report\n(output format: )\n";
+
+
+ if ($CX_context){
+ warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n";
+ }
+ else{ # default
+ $CpG_only = 1;
+ warn "Reporting cytosine methylation in CpG context only (default)\n";
+ }
+
+ if ($split_by_chromosome){
+ warn "Splitting the cytosine report output up into individual files for each chromosome\n";
+ }
+
+ ### Zero-based coordinates
+ if ($zero){
+ warn "Using zero-based start and 1-based end genomic coordinates (zero-based, half-open; user-defined)\n";
+ }
+ else{ # default, 1-based coords
+ warn "Using 1-based genomic coordinates (default)\n";
+ }
+
+ ### GENOME folder
+ if ($genome_folder){
+ unless ($genome_folder =~/\/$/){
+ $genome_folder =~ s/$/\//;
+ }
+ warn "Genome folder was specified as $genome_folder\n";
+ }
+ else{
+ $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/';
+ warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n";
+ }
+ sleep (1);
+ }
+}
+
+warn "\n";
+sleep (1);
+
+######################################################
+### PROCESSING FILES
+######################################################
+
+foreach my $filename (@filenames){
+ # resetting counters and filehandles
+ %fhs = ();
+ %counting =(
+ total_meCHG_count => 0,
+ total_meCHH_count => 0,
+ total_meCpG_count => 0,
+ total_unmethylated_CHG_count => 0,
+ total_unmethylated_CHH_count => 0,
+ total_unmethylated_CpG_count => 0,
+ sequences_count => 0,
+ methylation_call_strings => 0,
+ );
+
+ @sorting_files = ();
+ @bedfiles = ();
+
+ %mbias_1 = ();
+ %mbias_2 = ();
+
+
+ ### performing a quick check to see if a paired-end SAM file has been sorted by positions which does interfere with the logic used by the extractor
+ unless ($vanilla){
+ if ($paired){
+ test_positional_sorting($filename);
+ }
+ }
+
+ my ($pid,$pids,$report_basename) = process_Bismark_results_file($filename);
+
+ if ($pid == 0){
+ warn "Finished processing child process. Exiting..\n";
+
+ # ### Closing all filehandles of the child process so that the Bismark methylation extractor output doesn't get truncated due to buffering issues
+ # foreach my $fh (keys %fhs) {
+ # if ($fh =~ /^[1230]$/) {
+ # foreach my $context (keys %{$fhs{$fh}}) {
+ # $fhs{$fh}->{$context}->flush;
+ # }
+ # }
+ # else{
+ # $fhs{$fh}->flush;
+ # }
+ # }
+ exit 0;
+ }
+
+ ###
+ if ($pid and $multicore > 1){
+ warn "Now waiting for all child processes to complete\n";
+ sleep(1);
+
+ ### we need to ensure that we wait for all child processes to be finished before continuing
+ # warn "here are the child IDs: @$pids\n";
+ # warn "Looping through the child process IDs:\n";
+
+ foreach my $id (@$pids){
+ # print "$id\t";
+ my $kid = waitpid ($id,0);
+ # print "Returned: $kid\nExit status: $?\n";
+ unless ($? == 0){
+ warn "\nChild process terminated with exit signal: '$?'\n\n";
+ }
+ }
+ }
+
+ ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues
+ foreach my $fh (keys %fhs) {
+ if ($fh =~ /^[1230]$/) {
+ foreach my $context (keys %{$fhs{$fh}}) {
+ close $fhs{$fh}->{$context} or die $!;
+ }
+ }
+ else{
+ close $fhs{$fh} or die $!;
+ }
+ }
+
+ ### We need to stitch together a main splitting report from all individual parent/child processes
+ if ($multicore > 1){
+ merge_individual_splitting_reports($report_basename);
+ print_splitting_report();
+
+ merge_individual_mbias_reports($report_basename); # this updates the main %mbias_1 and %mbias_2 data structures so we can proceed normally
+
+ }
+
+ unless ($mbias_off){
+ ### printing out all M-Bias data
+ produce_mbias_plots ($filename); produce_mbias_plots ($filename);
+
+ }
+
+ unless ($mbias_only){
+ delete_unused_files();
+ }
+
+ if ($bedGraph){
+
+ my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
+ $out =~ s/gz$//;
+ $out =~ s/sam$//;
+ $out =~ s/bam$//;
+ $out =~ s/txt$//;
+ $out =~ s/$/bedGraph/;
+
+ my $bedGraph_output = $out;
+ my @args;
+
+ if ($remove){
+ push @args, '--remove';
+ }
+ if ($CX_context){
+ push @args, '--CX_context';
+ }
+ if ($no_header){
+ push @args, '--no_header';
+ }
+ if ($gazillion){
+ push @args, '--gazillion';
+ }
+ if ($ample_mem){
+ push @args, '--ample_memory';
+ }
+ if ($zero){
+ push @args, "--zero";
+ }
+
+ # if ($counts){
+ # push @args, "--counts";
+ # }
+
+ push @args, "--buffer_size $sort_size";
+ push @args, "--cutoff $coverage_threshold";
+ push @args, "--output $bedGraph_output";
+ push @args, "--dir '$output_dir'";
+
+ ### adding all files to be sorted to @args
+ foreach my $f (@sorting_files){
+ push @args, $f;
+ }
+
+ # print join "\t",@args,"\n";
+
+ system ("$Bin/bismark2bedGraph @args");
+
+ warn "Finished BedGraph conversion ...\n\n";
+ sleep(1);
+
+ # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!";
+ # warn "Writing bedGraph to file: $bedGraph_output\n";
+ # process_bedGraph_output();
+ # close OUT or die $!;
+
+ ### genome-wide cytosine methylation report requires bedGraph processing anyway
+ if ($cytosine_report){
+
+ @args = (); # resetting @args
+ my $cytosine_out = $out;
+ $cytosine_out =~ s/bedGraph$//;
+
+ if ($CX_context){
+ $cytosine_out =~ s/$/CX_report.txt/;
+ }
+ else{
+ $cytosine_out =~ s/$/CpG_report.txt/;
+ }
+
+ push @args, "--output $cytosine_out";
+ push @args, "--dir '$output_dir'";
+ push @args, "--genome '$genome_folder'";
+ push @args, "--parent_dir '$parent_dir'";
+
+ if ($zero){
+ push @args, "--zero";
+ }
+ if ($CX_context){
+ push @args, '--CX_context';
+ }
+ if ($split_by_chromosome){
+ push @args, '--split_by_chromosome';
+ }
+
+ my $coverage_output = $bedGraph_output;
+ $coverage_output =~ s/bedGraph$/bismark.cov/;
+
+ push @args, $output_dir . $coverage_output; # this will be the infile
+
+ system ("$Bin/coverage2cytosine @args");
+ # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out);
+ warn "\n\nFinished generating genome-wide cytosine report\n\n";
+ }
+ }
+}
+
+sub merge_individual_splitting_reports{
+
+ my $report_basename = shift;
+
+ my @splitting_reports; # only needed in multi-core mode to generate an overall report
+ foreach my $ext (1..$multicore){
+ push @splitting_reports, "$report_basename.$ext";
+ }
+ warn "\nMerging individual splitting reports into overall report: '$report_basename'\n";
+ warn "Merging from these individual files:\n";
+ print join ("\n",@splitting_reports),"\n\n";
+ sleep(1);
+
+ ##########
+ # resetting the counter first
+ %counting =(
+ total_meCHG_count => 0,
+ total_meCHH_count => 0,
+ total_meCpG_count => 0,
+ total_unmethylated_CHG_count => 0,
+ total_unmethylated_CHH_count => 0,
+ total_unmethylated_CpG_count => 0,
+ sequences_count => 0,
+ methylation_call_strings => 0,
+ );
+
+ # repopulating the merged counter
+ foreach my $file (@splitting_reports){
+ open (IR,$file) or die $!;
+ while (){
+ chomp;
+ my ($context,$count) = (split /\t/);
+ if ($context){
+ if ($context =~ /^Total C to T conversions in CpG context/){
+ # warn "context: $context\ncount: $count\n";
+ $counting{total_unmethylated_CpG_count} += $count;
+ }
+ elsif ($context =~ /^Total C to T conversions in CHG context/){
+ # warn "context: $context\ncount: $count\n";
+ $counting{total_unmethylated_CHG_count} += $count;
+ }
+ elsif ($context =~ /^Total C to T conversions in CHH context/){
+ # warn "context: $context\ncount: $count\n";
+ $counting{total_unmethylated_CHH_count} += $count;
+ }
+ elsif ($context =~ /^Total methylated C\'s in CpG context/){
+ # warn "context: $context\ncount: $count\n";
+ $counting{total_meCpG_count} += $count;
+ }
+ elsif ($context =~ /^Total methylated C\'s in CHG context/){
+ # warn "context: $context\ncount: $count\n";
+ $counting{total_meCHG_count} += $count;
+ }
+ elsif ($context =~ /^Total methylated C\'s in CHH context/){
+ # warn "context: $context\ncount: $count\n";
+ $counting{total_meCHH_count} += $count;
+ }
+ elsif ($context =~ /^line count/){
+ # warn "Line count\ncount: $count\n";
+ $counting{sequences_count} = $count; # always the same
+ }
+ elsif ($context =~ /^meth call strings/){
+ # warn "Meth call strings\ncount: $count\n";
+ $counting{methylation_call_strings} += $count;
+ }
+ }
+ }
+ }
+
+ # deleting the individual reports afterwards
+ foreach my $file (@splitting_reports){
+ unlink $file;
+ }
+}
+
+
+sub merge_individual_mbias_reports{
+
+ my $report_basename = shift;
+
+ my @mbias_reports; # only needed in multi-core mode to generate an overall report
+ foreach my $ext (1..$multicore){
+ push @mbias_reports, "$report_basename.${ext}.mbias";
+ }
+ warn "\nMerging individual M-bias reports into overall M-bias statistics from these $multicore individual files:\n";
+ print join ("\n",@mbias_reports),"\n\n";
+
+
+ ##########
+ # resetting the counters first, then repopulating them
+ %mbias_1 = ();
+ %mbias_2 = ();
+
+ # repopulating the merged counter
+ foreach my $file (@mbias_reports){
+ open (IR,$file) or die $!;
+
+ my $context;
+ my $read;
+
+ while (){
+ chomp;
+ # warn "$_\n"; sleep(1);
+ if ($_ =~ /context/){
+ $context = $1 if ($_ =~ /(\D{3}) context/);
+ # warn "Context set as $context\n";
+
+ if ($_ =~ /R2/){
+ $read = 'R2';
+ }
+ else{
+ $read = 'R1';
+ }
+ # warn "Setting read identity to '$read'\n";
+
+ # reading in 2 additional lines (===========, and header line)
+ $_ = ;
+ #warn "discarding line $_\n";
+ $_ = ;
+ #warn "discarding line $_\n";
+ next;
+ }
+ else{
+
+ if ($_ eq ''){
+ # empty line, only occurs after a context has finished and before a new context starts
+ next;
+ }
+
+ my ($pos,$meth,$unmeth) = (split /\t/);
+ # warn "$pos\t$meth\t$unmeth\n"; sleep(1);
+ if ($read eq 'R1'){
+ $mbias_1{$context}->{$pos}->{meth} += $meth;
+ $mbias_1{$context}->{$pos}->{un} += $unmeth;
+ }
+ elsif ($read eq 'R2'){
+ $mbias_2{$context}->{$pos}->{meth} += $meth;
+ $mbias_2{$context}->{$pos}->{un} += $unmeth;
+ }
+ }
+ }
+ close IR or warn "Had trouble closing filehandle for $file: $!\n";
+ }
+
+ # deleting the individual reports afterwards
+ foreach my $file (@mbias_reports){
+ unlink $file;
+ }
+}
+
+
+sub delete_unused_files{
+
+ warn "Deleting unused files ...\n\n"; sleep(1);
+
+ my $index = 0;
+
+ while ($index <= $#sorting_files){
+ if ($sorting_files[$index] =~ /gz$/){
+ open (USED,"zcat $sorting_files[$index] |") or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n";
+ }
+ else{
+ open (USED,$sorting_files[$index]) or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n";
+ }
+
+ my $used = 0;
+
+ while (){
+ next if (/^Bismark/);
+ if ($_){
+ $used = 1;
+ last;
+ }
+ }
+
+ if ($used){
+ warn "$sorting_files[$index] contains data ->\tkept\n";
+ ++$index;
+ }
+ else{
+
+ my $delete = unlink $sorting_files[$index];
+
+ if ($delete){
+ warn "$sorting_files[$index] was empty ->\tdeleted\n";
+ }
+ else{
+ warn "$sorting_files[$index] was empty, however deletion was unsuccessful: $!\n"
+ }
+
+ ### we also need to remove the element from @sorting_files
+ splice @sorting_files, $index, 1;
+ }
+ }
+ warn "\n\n"; ## can't close the piped filehandles at this point because it will die (unfortunately)
+}
+
+sub produce_mbias_plots{
+
+ my $filename = shift;
+
+ my $mbias = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
+ $mbias =~ s/gz$//;
+ $mbias =~ s/sam$//;
+ $mbias =~ s/bam$//;
+ $mbias =~ s/txt$//;
+ my $mbias_graph_1 = my $mbias_graph_2 = $mbias;
+ $mbias_graph_1 = $output_dir . $mbias_graph_1 . 'M-bias_R1.png';
+ $mbias_graph_2 = $output_dir . $mbias_graph_2 . 'M-bias_R2.png';
+
+ $mbias =~ s/$/M-bias.txt/;
+
+ open (MBIAS,'>',"$output_dir$mbias") or die "Failed to open file for the M-bias data\n\n";
+
+ # determining maximum read length
+ my $max_length_1 = 0;
+ my $max_length_2 = 0;
+
+ foreach my $context (keys %mbias_1){
+ foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){
+ $max_length_1 = $pos unless ($max_length_1 >= $pos);
+ }
+ }
+ if ($paired){
+ foreach my $context (keys %mbias_2){
+ foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){
+ $max_length_2 = $pos unless ($max_length_2 >= $pos);
+ }
+ }
+ }
+
+ if ($single){
+ warn "Determining maximum read length for M-Bias plot\n";
+ warn "Maximum read length of Read 1: $max_length_1\n\n";
+ }
+ else{
+ warn "Determining maximum read lengths for M-Bias plots\n";
+ warn "Maximum read length of Read 1: $max_length_1\n";
+ warn "Maximum read length of Read 2: $max_length_2\n\n";
+ }
+ # sleep(3);
+
+ my @mbias_read1;
+ my @mbias_read2;
+
+ #Check whether the module GD::Graph:lines is installed
+ my $gd_graph_installed = 0;
+ eval{
+ require GD::Graph::lines;
+ GD::Graph::lines->import();
+ };
+
+ unless($@) { # syntax or routine error variable, set if something goes wrong in the last eval{ require ...}
+ $gd_graph_installed = 1;
+
+ #Check whether the module GD::Graph::colour is installed
+ eval{
+ require GD::Graph::colour;
+ GD::Graph::colour->import(qw(:colours :lists :files :convert));
+ };
+
+ if ($@) {
+ warn "Perl module GD::Graph::colour not found, skipping drawing M-bias plots (only writing out M-bias plot table)\n";
+ sleep(2);
+ $gd_graph_installed = 0;
+ }
+
+
+ }
+ else{
+ warn "Perl module GD::Graph::lines is not installed, skipping drawing M-bias plots (only writing out M-bias plot table)\n";
+ sleep(2);
+ }
+
+
+ my $graph_title;
+ my $graph1;
+ my $graph2;
+
+ if ( $gd_graph_installed){
+ $graph1 = GD::Graph::lines->new(800,600);
+ if ($paired){
+ $graph2 = GD::Graph::lines->new(800,600);
+ }
+ }
+
+ foreach my $context (qw(CpG CHG CHH)){
+ @{$mbias_read1[0]} = ();
+
+ if ($paired){
+ print MBIAS "$context context (R1)\n================\n";
+ $graph_title = 'M-bias (Read 1)';
+ }
+ else{
+ print MBIAS "$context context\n===========\n";
+ $graph_title = 'M-bias';
+ }
+ print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
+
+ foreach my $pos (1..$max_length_1){
+
+ unless (defined $mbias_1{$context}->{$pos}->{meth}){
+ $mbias_1{$context}->{$pos}->{meth} = 0;
+ }
+ unless (defined $mbias_1{$context}->{$pos}->{un}){
+ $mbias_1{$context}->{$pos}->{un} = 0;
+ }
+
+ my $percent = '';
+ if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){
+ $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) );
+ }
+ my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth};
+
+ print MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n";
+ push @{$mbias_read1[0]},$pos;
+
+ if ($context eq 'CpG'){
+ push @{$mbias_read1[1]},$percent;
+ push @{$mbias_read1[4]},$coverage;
+ }
+ elsif ($context eq 'CHG'){
+ push @{$mbias_read1[2]},$percent;
+ push @{$mbias_read1[5]},$coverage;
+ }
+ elsif ($context eq 'CHH'){
+ push @{$mbias_read1[3]},$percent;
+ push @{$mbias_read1[6]},$coverage;
+ }
+ }
+ print MBIAS "\n";
+ }
+
+ if ( $gd_graph_installed){
+
+ add_colour(nice_blue => [31,120,180]);
+ add_colour(nice_orange => [255,127,0]);
+ add_colour(nice_green => [51,160,44]);
+ add_colour(pale_blue => [153,206,227]);
+ add_colour(pale_orange => [253,204,138]);
+ add_colour(pale_green => [191,230,207]);
+
+ $graph1->set(
+ x_label => 'position (bp)',
+ y1_label => '% methylation',
+ y2_label => '# methylation calls',
+ title => $graph_title,
+ line_width => 2,
+ x_max_value => $max_length_1,
+ x_min_value => 0,
+ y_tick_number => 10,
+ y_label_skip => 2,
+ y1_max_value => 100,
+ y1_min_value => 0,
+ y_label_skip => 2,
+ y2_min_value => 0,
+ x_label_skip => 5,
+ x_label_position => 0.5,
+ x_tick_offset => -1,
+ bgclr => 'white',
+ transparent => 0,
+ two_axes => 1,
+ use_axis => [1,1,1,2,2,2],
+ legend_placement => 'RC',
+ legend_spacing => 6,
+ legend_marker_width => 24,
+ legend_marker_height => 18,
+ dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)],
+ ) or die $graph1->error;
+
+ $graph1->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls');
+
+ ### Failure to plot the MBIAS graph will now generate a warning instead of dieing (previous version below. Suggested by Andrew DeiRossi, 05 June 2014)
+ if (my $gd1 = $graph1->plot(\@mbias_read1)) {
+ open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n";
+ binmode MBIAS_G1;
+ print MBIAS_G1 $gd1->png;
+ }
+ else {
+ warn "WARNING: Cannot generate read 1 M-bias plot: " , $graph1->error , "\n\n";
+ }
+
+ # my $gd1 = $graph1->plot(\@mbias_read1) or die $graph1->error;
+ # open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n";
+ # binmode MBIAS_G1;
+ # print MBIAS_G1 $gd1->png;
+ }
+
+ if ($paired){
+
+ foreach my $context (qw(CpG CHG CHH)){
+ @{$mbias_read2[0]} = ();
+
+ print MBIAS "$context context (R2)\n================\n";
+ print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
+ foreach my $pos (1..$max_length_2){
+
+ unless (defined $mbias_2{$context}->{$pos}->{meth}){
+ $mbias_2{$context}->{$pos}->{meth} = 0;
+ }
+ unless (defined $mbias_2{$context}->{$pos}->{un}){
+ $mbias_2{$context}->{$pos}->{un} = 0;
+ }
+
+ my $percent = '';
+ if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){
+ $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) );
+ }
+ my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth};
+
+ print MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n";
+
+ push @{$mbias_read2[0]},$pos;
+
+ if ($context eq 'CpG'){
+ push @{$mbias_read2[1]},$percent;
+ push @{$mbias_read2[4]},$coverage;
+ }
+ elsif ($context eq 'CHG'){
+ push @{$mbias_read2[2]},$percent;
+ push @{$mbias_read2[5]},$coverage;
+ }
+ elsif ($context eq 'CHH'){
+ push @{$mbias_read2[3]},$percent;
+ push @{$mbias_read2[6]},$coverage;
+ }
+ }
+ print MBIAS "\n";
+ }
+
+ if ( $gd_graph_installed){
+
+ add_colour(nice_blue => [31,120,180]);
+ add_colour(nice_orange => [255,127,0]);
+ add_colour(nice_green => [51,160,44]);
+ add_colour(pale_blue => [153,206,227]);
+ add_colour(pale_orange => [253,204,138]);
+ add_colour(pale_green => [191,230,207]);
+
+ $graph2->set(
+ x_label => 'position (bp)',
+ line_width => 2,
+ x_max_value => $max_length_1,
+ x_min_value => 0,
+ y_tick_number => 10,
+ y_label_skip => 2,
+ y1_max_value => 100,
+ y1_min_value => 0,
+ y_label_skip => 2,
+ y2_min_value => 0,
+ x_label_skip => 5,
+ x_label_position => 0.5,
+ x_tick_offset => -1,
+ bgclr => 'white',
+ transparent => 0,
+ two_axes => 1,
+ use_axis => [1,1,1,2,2,2],
+ legend_placement => 'RC',
+ legend_spacing => 6,
+ legend_marker_width => 24,
+ legend_marker_height => 18,
+ dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)],
+ x_label => 'position (bp)',
+ y1_label => '% methylation',
+ y2_label => '# calls',
+ title => 'M-bias (Read 2)',
+ ) or die $graph2->error;
+
+ $graph2->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls');
+
+ ### Failure to plot the MBIAS graph will now generate a warning instead of dieing (previous version below. Suggested by Andrew DeiRossi, 05 June 2014)
+ if (my $gd2 = $graph2->plot(\@mbias_read2)) {
+ open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n";
+ binmode MBIAS_G2;
+ print MBIAS_G2 $gd2->png;
+ }
+ else {
+ warn "WARNING: Cannot generate Read 2 M-bias plot: " , $graph2->error , "\n\n";
+ }
+
+ # my $gd2 = $graph2->plot(\@mbias_read2) or die $graph2->error;
+ # open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n";
+ # binmode MBIAS_G2;
+ # print MBIAS_G2 $gd2->png;
+
+ }
+ }
+}
+
+sub process_commandline{
+ my $help;
+ my $single_end;
+ my $paired_end;
+ my $ignore;
+ my $ignore_r2;
+ my $genomic_fasta;
+ my $full;
+ my $report;
+ my $extractor_version;
+ my $no_overlap;
+ my $merge_non_CpG;
+ my $vanilla;
+ my $output_dir;
+ my $no_header;
+ my $bedGraph;
+ my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status
+ my $remove;
+ my $counts;
+ my $cytosine_report;
+ my $genome_folder;
+ my $zero;
+ my $CpG_only;
+ my $CX_context;
+ my $split_by_chromosome;
+ my $sort_size;
+ my $samtools_path;
+ my $gzip;
+ my $mbias_only;
+ my $mbias_off;
+ my $gazillion;
+ my $ample_mem;
+ my $include_overlap;
+ my $ignore_3prime;
+ my $ignore_3prime_r2;
+ my $multicore;
+
+ my $command_line = GetOptions ('help|man' => \$help,
+ 'p|paired-end' => \$paired_end,
+ 's|single-end' => \$single_end,
+ 'fasta' => \$genomic_fasta,
+ 'ignore=i' => \$ignore,
+ 'ignore_r2=i' => \$ignore_r2,
+ 'comprehensive' => \$full,
+ 'report' => \$report,
+ 'version' => \$extractor_version,
+ 'no_overlap' => \$no_overlap,
+ 'merge_non_CpG' => \$merge_non_CpG,
+ 'vanilla' => \$vanilla,
+ 'o|output=s' => \$output_dir,
+ 'no_header' => \$no_header,
+ 'bedGraph' => \$bedGraph,
+ "cutoff=i" => \$coverage_threshold,
+ "remove_spaces" => \$remove,
+ "counts" => \$counts,
+ "cytosine_report" => \$cytosine_report,
+ 'g|genome_folder=s' => \$genome_folder,
+ "zero_based" => \$zero,
+ "CX|CX_context" => \$CX_context,
+ "split_by_chromosome" => \$split_by_chromosome,
+ "buffer_size=s" => \$sort_size,
+ 'samtools_path=s' => \$samtools_path,
+ 'gzip' => \$gzip,
+ 'mbias_only' => \$mbias_only,
+ 'mbias_off' => \$mbias_off,
+ 'gazillion|scaffolds' => \$gazillion,
+ 'ample_memory' => \$ample_mem,
+ 'include_overlap' => \$include_overlap,
+ 'ignore_3prime=i' => \$ignore_3prime,
+ 'ignore_3prime_r2=i' => \$ignore_3prime_r2,
+ 'multicore=i' => \$multicore,
+ );
+
+ ### EXIT ON ERROR if there were errors with any of the supplied options
+ unless ($command_line){
+ die "Please respecify command line options\n";
+ }
+
+ ### HELPFILE
+ if ($help){
+ print_helpfile();
+ exit;
+ }
+
+ if ($extractor_version){
+ print << "VERSION";
+
+
+ Bismark Methylation Extractor
+
+ Bismark Extractor Version: $version
+ Copyright 2010-15 Felix Krueger, Babraham Bioinformatics
+ www.bioinformatics.babraham.ac.uk/projects/bismark/
+
+
+VERSION
+ exit;
+ }
+
+
+ ### no files provided
+ unless (@ARGV){
+ die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n";
+ }
+ @filenames = @ARGV;
+
+ warn "\n *** Bismark methylation extractor version $version ***\n\n";
+
+ ### M-BIAS ONLY
+ if ($mbias_only){
+
+ if ($mbias_off){
+ die "Options '--mbias_only' and '--mbias_off' are not compatible. Just pick one, mkay?\n";
+ }
+ if ($bedGraph){
+ die "Option '--mbias_only' skips all sorts of methylation extraction, including the bedGraph generation. Please respecify!\n";
+ }
+ if ($cytosine_report){
+ die "Option '--mbias_only' skips all sorts of methylation extraction, including the genome-wide cytosine methylation report generation. Please respecify!\n";
+ }
+ if ($merge_non_CpG){
+ warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--merge' won't have any effect\n";
+ }
+ if ($full){
+ warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--comprehensive' won't have any effect\n";
+ }
+ sleep(3);
+ }
+
+ ### PRINT A REPORT
+ unless ($report){
+ $report = 1; # making this the default
+ }
+
+ ### OUTPUT DIR PATH
+ if ($output_dir){
+ unless ($output_dir =~ /\/$/){
+ $output_dir =~ s/$/\//;
+ }
+ }
+ else{
+ $output_dir = '';
+ }
+
+ ### NO HEADER
+ unless ($no_header){
+ $no_header = 0;
+ }
+
+ ### OLD (VANILLA) OUTPUT FORMAT
+ unless ($vanilla){
+ $vanilla = 0;
+ }
+
+ if ($single_end){
+ $paired_end = 0; ### SINGLE END ALIGNMENTS
+ }
+ elsif ($paired_end){
+ $single_end = 0; ### PAIRED-END ALIGNMENTS
+ }
+ else{
+
+ ### we will try to determine whether the input file was a single-end or paired-end sequencing run from the SAM header
+
+ if ($vanilla){
+ die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format with '-s' or '-p'\n\n";
+ }
+ else{ # SAM/BAM format
+
+ my $file = $filenames[0];
+ warn "Trying to determine the type of mapping from the SAM header line of file $file\n"; sleep(1);
+
+ ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file
+ if ($file =~ /\.gz$/){
+ open (DETERMINE,"zcat $file |") or die "Unable to read from gzipped file $file: $!\n";
+ }
+ elsif ($file =~ /\.bam$/ || isBam($file) ){ ### this would allow to read BAM files that do not end in *.bam
+ open (DETERMINE,"samtools view -h $file |") or die "Unable to read from BAM file $file: $!\n";
+ }
+ else{
+ open (DETERMINE,$file) or die "Unable to read from $file: $!\n";
+ }
+
+ while (){
+ last unless (/^\@/);
+ if ($_ =~ /^\@PG/){
+ # warn "found the \@PG line:\n";
+ # warn "$_";
+
+ if ($_ =~ /\s+-1\s+/ and $_ =~ /\s+-2\s+/){
+ warn "Treating file(s) as paired-end data (as extracted from \@PG line)\n\n"; sleep(1);
+ $paired_end = 1;
+ $single_end = 0;
+ }
+ else{
+ warn "Treating file(s) as single-end data (as extracted from \@PG line)\n\n"; sleep(1);
+ $paired_end = 0;
+ $single_end = 1;
+ }
+ }
+ }
+
+ # close DETERMINE or warn $!; # this always throws an error anyway...
+
+ }
+ }
+
+ ### IGNORING 5' END
+ # bases at the start of the read when processing the methylation call string
+ unless ($ignore){
+ $ignore = 0;
+ }
+
+ if (defined $ignore_r2){
+ die "You can only specify --ignore_r2 for paired-end result files\n" unless ($paired_end);
+ }
+ else{
+ $ignore_r2 = 0;
+ }
+
+ ### IGNORING 3' END
+ # bases at the end of the read when processing the methylation call string
+ unless ($ignore_3prime){
+ $ignore_3prime = 0;
+ }
+
+ if (defined $ignore_3prime_r2){
+ die "You can only specify --ignore_3prime_r2 for paired-end result files\n" unless ($paired_end);
+ }
+ else{
+ $ignore_3prime_r2 = 0;
+ }
+
+
+ ### NO OVERLAP
+ ### --no_overlap is the default (as of version 0.12.6), unless someone explicitly asks to include overlaps
+ if ($include_overlap){
+ die "The option '--include_overlap' can only be specified for paired-end input!\n" unless ($paired_end);
+ warn "Setting option '--inlcude_overlap' for paired-end data (user-defined)\n\n";
+ $no_overlap = 0;
+ }
+ else{ # default
+ if ($paired_end){
+ warn "Setting option '--no_overlap' since this is (normally) the right thing to do for paired-end data\n\n";
+ $no_overlap = 1;
+ }
+ }
+
+ ### COMPREHENSIVE OUTPUT
+ unless ($full){
+ $full = 0;
+ }
+
+ ### MERGE NON-CpG context
+ unless ($merge_non_CpG){
+ $merge_non_CpG = 0;
+ }
+
+ ### remove white spaces in read ID (needed for sorting using the sort command
+ unless ($remove){
+ $remove = 0;
+ }
+
+ ### COVERAGE THRESHOLD FOR bedGraph OUTPUT
+ if (defined $coverage_threshold){
+ unless ($coverage_threshold > 0){
+ die "Please select a coverage greater than 0 (positive integers only)\n";
+ }
+ }
+ else{
+ $coverage_threshold = 1;
+ }
+
+ ### SORT buffer size
+ if (defined $sort_size){
+ unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){
+ die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n";
+ }
+ }
+ else{
+ $sort_size = '2G';
+ }
+
+ if ($zero){
+ die "Option '--zero' is only available if '--bedGraph' or '--cytosine_report' are specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph);
+ }
+
+ if ($CX_context){
+ die "Option '--CX_context' is only available if '--bedGraph' or '--cytosine_report' are specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph);
+ }
+ else{
+ $CX_context = 0;
+ }
+
+ unless ($counts){
+ $counts = 1; # counts will always be set
+ }
+
+ if ($cytosine_report){
+
+ ### GENOME folder
+ if ($genome_folder){
+ unless ($genome_folder =~/\/$/){
+ $genome_folder =~ s/$/\//;
+ }
+ }
+ else{
+ die "Please specify a genome folder to proceed (full path only)\n";
+ }
+
+ unless ($bedGraph){
+ warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n";
+ $bedGraph = 1;
+ }
+ unless ($counts){
+ # warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n";
+ $counts = 1;
+ }
+ warn "\n";
+ }
+
+ ### PATH TO SAMTOOLS
+ if (defined $samtools_path){
+ # if Samtools was specified as full command
+ if ($samtools_path =~ /samtools$/){
+ if (-e $samtools_path){
+ # Samtools executable found
+ }
+ else{
+ die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
+ }
+ }
+ else{
+ unless ($samtools_path =~ /\/$/){
+ $samtools_path =~ s/$/\//;
+ }
+ $samtools_path .= 'samtools';
+ if (-e $samtools_path){
+ # Samtools executable found
+ }
+ else{
+ die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
+ }
+ }
+ }
+ # Check whether Samtools is in the PATH if no path was supplied by the user
+ else{
+ if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH
+ $samtools_path = `which samtools`;
+ chomp $samtools_path;
+ }
+ }
+
+ unless (defined $samtools_path){
+ $samtools_path = '';
+ }
+
+
+ if ($gazillion){
+ if ($ample_mem){
+ die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n";
+ }
+ }
+
+ if (defined $multicore){
+ unless ($multicore > 0){
+ die "Core usage needs to be set to 1 or more (currently selected $multicore). Please respecify!\n";
+ }
+ if ($multicore > 20){
+ warn "Core usage currently set to more than 20 threads. Let's see how this goes... (set value: $multicore)\n\n";
+ }
+ }
+ else{
+ $multicore = 1; # default. Single-thread mode
+ warn "Setting core usage to single-threaded (default). Consider using --multicore to speed up the extraction process.\n\n";
+ }
+
+ return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore);
+}
+
+
+sub test_positional_sorting{
+
+ my $filename = shift;
+
+ print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t";
+ sleep(1);
+
+ if ($filename =~ /\.gz$/) {
+ open (TEST,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
+ }
+ elsif ($filename =~ /bam$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam
+ if ($samtools_path){
+ open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
+ }
+ else{
+ die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
+ }
+ }
+ else {
+ open (TEST,$filename) or die "Can't open file $filename: $!\n";
+ }
+
+ my $count = 0;
+
+ while () {
+ if (/^\@/) { # testing header lines if they contain the @SO flag (for being sorted)
+ if (/^\@SO/) {
+ die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead\n\n";
+ }
+ next;
+ }
+ $count++;
+
+ last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID
+
+ my ($id_1) = (split (/\t/));
+
+ ### reading the next line which should be read 2
+ $_ = ;
+ my ($id_2) = (split (/\t/));
+ last unless ($id_2);
+ ++$count;
+
+ if ($id_1 eq $id_2){
+ ### ids are the same
+ next;
+ }
+ else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first
+ my $id_1_trunc = $id_1;
+ $id_1_trunc =~ s/\/1$//;
+ my $id_2_trunc = $id_2;
+ $id_2_trunc =~ s/\/2$//;
+
+ unless ($id_1_trunc eq $id_2_trunc){
+ die "The IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead\n\n";
+ }
+ }
+ }
+ # close TEST or die $!; somehow fails on our cluster...
+ ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other)
+ warn "...passed!\n";
+ sleep(1);
+
+}
+
+sub process_Bismark_results_file{
+
+ my $filename = shift;
+ my $report_filename = open_output_filehandles($filename);
+
+ ### disabling buffering so we don't run into problems with half written out lines...
+ foreach my $fh (keys %fhs){
+ if ($fh =~ /^[1230]$/) {
+ foreach my $context (keys %{$fhs{$fh}}) {
+ select($fhs{$fh}->{$context});
+ $|++;
+ }
+ }
+ else{
+ select($fhs{$fh});
+ $|++;
+ }
+ }
+ select(STDOUT);
+
+ ################################################
+ ################################################
+ ### multi-process handling
+ ###
+
+ my $offset = 1;
+ my $process_id;
+ my @pids;
+ if ($multicore > 1){
+
+ until ($offset == $multicore){
+ # warn "multicore: $multicore\noffset: $offset\n";
+ my $fork = fork;
+
+ if (defined $fork){
+ if ($fork != 0){
+ $process_id = $fork;
+ push @pids, $process_id;
+ if ($offset < $multicore){
+ ++$offset;
+ # warn "I am the parent process, child pid: $fork\nIncrementing offset counter to: $offset\n\n";
+ }
+ else{
+ # warn "Reached the number of maximum multicores. Proceeeding to processing...\n";
+ }
+ }
+ elsif ($fork == 0){
+ # warn "I am a child process, pid: $fork\nOffset counter is: $offset\nProceeding to processing...\n";
+ $process_id = $fork;
+ last;
+ }
+ }
+ else{
+ die "Forking unsuccessful. Proceeding using a single thread only\n";
+ }
+ }
+
+ # warn "\nThe thread identity\n===================\n";
+ if ($process_id){
+ # print "I am the parent process. My children are called:\n";
+ # print join ("\t",@pids),"\n";
+ # print "I am going to process the following line count: $offset\n\n";
+ }
+ elsif($process_id == 0){
+ # warn "I am a child process: Process ID: $process_id\n";
+ # warn "I am going to process the following line count: $offset\n\n";
+ }
+ else{
+ die "Process ID was: $process_id\n";
+ }
+ }
+ else{
+ # warn "Single-core mode: setting pid to 1\n";
+ $process_id = 1;
+ }
+
+ ################################################
+ ################################################
+ if ($process_id){
+ warn "Now reading in Bismark result file $filename\n";
+ }
+ else{
+ warn "\nNow reading in Bismark result file $filename\n\n";
+ }
+
+ if ($filename =~ /\.gz$/) {
+ open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
+ }
+ elsif ($filename =~ /bam$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam
+ if ($samtools_path){
+ open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
+ }
+ else{
+ die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
+ }
+ }
+ else {
+ open (IN,$filename) or die "Can't open file $filename: $!\n";
+ }
+
+ ### Vanilla and SAM output need to read different numbers of header lines
+ if ($vanilla) {
+ my $bismark_version = ; ## discarding the Bismark version info
+ chomp $bismark_version;
+ $bismark_version =~ s/\r//; # replaces \r line feed
+ $bismark_version =~ s/Bismark version: //;
+ if ($bismark_version =~ /^\@/) {
+ warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n";
+ sleep (1);
+ }
+
+ unless ($version eq $bismark_version){
+ die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n";
+ }
+ } else {
+ # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly.
+ # We are reading from it further down
+ }
+
+ my $methylation_call_strings_processed = 0;
+ my $line_count = 0;
+
+ ### proceeding differently now for single-end or paired-end Bismark files
+
+ ### PROCESSING SINGLE-END RESULT FILES
+ if ($single) {
+
+ ### also proceeding differently now for SAM format or vanilla Bismark format files
+ if ($vanilla) { # old vanilla Bismark output format
+ while () {
+ ++$line_count;
+ warn "Processed lines: $line_count\n" if ($line_count%500000==0);
+
+ if ( ($line_count - $offset)%$multicore == 0){
+ # warn "line count: $line_count\noffset: $offset\n";
+ # warn "Modulus: ",($line_count - $offset)%$multicore,"\n";
+ # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n";
+ }
+ else{
+ # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n";
+ next;
+ }
+
+ ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
+ my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9];
+
+ ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or
+ ### last position
+ chomp $genome_conversion;
+
+ my $index;
+ if ($meth_call) {
+
+ if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
+ $index = 0;
+ } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
+ $index = 1;
+ } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
+ $index = 3;
+ } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
+ $index = 2;
+ } else {
+ die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
+ }
+
+ ### Clipping off the first number of bases from the methylation call string as specified with --ignore
+ if ($ignore) {
+ $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
+
+ ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
+ if ($strand eq '+') {
+ $start += $ignore;
+ }
+ elsif ($strand eq '-') {
+ $start += length($meth_call)-1; ## $meth_call is already shortened!
+ }
+ else {
+ die "Alignment did not have proper strand information: $strand\n";
+ }
+ }
+
+ ### Clipping off the last number of bases from the methylation call string as specified with --ignore_3prime
+ if ($ignore_3prime) {
+
+ $meth_call = substr($meth_call,0, (length($meth_call)) - $ignore_3prime);
+
+ ### If we are clipping off some bases at the end we need to adjust the end position of the alignments accordingly
+ if ($strand eq '+') {
+ # clipping the 3' end does not affect the starting position # ignore 5' has already been taken care of, if relevant at all
+ }
+ elsif ($strand eq '-') {
+ # here we need to discriminate if the start has been adjusted because of --ignore or not
+ if ($ignore){
+ # position adjusted already, and because of this 3' trimming is irrelevant for the start position
+ }
+ else{
+ # Here we need to add the length ignore_3prime to the read starting position, adjustment of the start position will take place later in the methylation extraction step
+ $start += $ignore_3prime;
+ }
+ }
+ else {
+ die "Alignment did not have proper strand information: $strand\n";
+ }
+ }
+ ### just as a comment, if --ignore has not been specified the starting position of reverse reads is adjusted later at the methylation extraction stage
+
+ ### printing out the methylation state of every C in the read
+ print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index);
+
+ ++$methylation_call_strings_processed; # 1 per single-end result
+ }
+ }
+ } else { # processing single-end SAM format (default)
+ while () {
+ chomp;
+ ### SAM format can either start with header lines (starting with @) or start with alignments directly
+ if (/^\@/) { # skipping header lines (starting with @)
+ warn "skipping SAM header line:\t$_\n" unless ($process_id == 0);
+ next;
+ }
+
+ ++$line_count;
+ # warn "$line_count\n";
+ warn "Processed lines: $line_count\n" if ($line_count%500000 == 0);
+
+ if ( ($line_count - $offset)%$multicore == 0){
+ # warn "line count: $line_count\noffset: $offset\n";
+ # warn "Modulus: ",($line_count - $offset)%$multicore,"\n";
+ # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n";
+ }
+ else{
+ # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n";
+ next;
+ }
+
+ # example read in SAM format
+ # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
+ ###
+
+ # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
+ # < 0.7.6 $meth_call =~ s/^XM:Z://;
+ # < 0.7.6 $read_conversion =~ s/^XR:Z://;
+ # < 0.7.6 $genome_conversion =~ s/^XG:Z://;
+
+ my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5];
+
+ ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
+ my $meth_call; ### Thanks to Zachary Zeno for this solution
+ my $read_conversion;
+ my $genome_conversion;
+
+ while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
+ my $tag = $1;
+ my $value = $2;
+
+ if ($tag eq "XM") {
+ $meth_call = $value;
+ $meth_call =~ s/\r//;
+ } elsif ($tag eq "XR") {
+ $read_conversion = $value;
+ $read_conversion =~ s/\r//;
+ } elsif ($tag eq "XG") {
+ $genome_conversion = $value;
+ $genome_conversion =~ s/\r//;
+ }
+ }
+
+ my $strand;
+ # warn "$meth_call\n$read_conversion\n$genome_conversion\n";
+
+ my $index;
+ if ($meth_call) {
+ if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
+ $index = 0;
+ $strand = '+';
+ } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
+ $index = 1;
+ $strand = '-';
+ } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
+ $index = 2;
+ $strand = '+';
+ } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
+ $index = 3;
+ $strand = '-';
+ } else {
+ die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
+ }
+
+ ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output
+ if ($strand eq '-') {
+ $meth_call = reverse $meth_call;
+ }
+ # warn "\n$meth_call\n";
+
+ ### IGNORE 5 PRIME: Clipping off the first number of bases from the methylation call string as specified with --ignore
+ if ($ignore) {
+ # warn "\n\n$meth_call\n";
+ $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
+ # warn "$meth_call\n";sleep(1);
+
+ ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
+
+ my @len = split (/\D+/,$cigar); # storing the length per operation
+ my @ops = split (/\d+/,$cigar); # storing the operation
+ shift @ops; # remove the empty first element
+ die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+
+ my @comp_cigar; # building an array with all CIGAR operations
+ foreach my $index (0..$#len) {
+ foreach (1..$len[$index]) {
+ # print "$ops[$index]";
+ push @comp_cigar, $ops[$index];
+ }
+ }
+ # print "original CIGAR: $cigar\n";
+ # print "original CIGAR: @comp_cigar\n";
+
+ ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
+ if ($strand eq '+') {
+
+ my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions
+ my $I_count = 0;
+
+ for (1..$ignore) {
+ my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start
+ # print "$_ deleted $op\n";
+
+ while ($op eq 'D') { # repeating this for deletions (D)
+ $D_count++;
+ $op = shift @comp_cigar;
+ # print "$_ deleted $op\n";
+ }
+ if ($op eq 'I') { # adjusting the genomic position for insertions (I)
+ $I_count++;
+ }
+ }
+ $start += $ignore + $D_count - $I_count;
+ # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n";
+ }
+ elsif ($strand eq '-') {
+
+ for (1..$ignore) {
+ my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
+ while ($op eq 'D') { # repeating this for deletions (D)
+ $op = pop @comp_cigar;
+ }
+ }
+
+ ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR
+ ### string to be able to work out the starting position of the read which is on the 3' end of the sequence
+ my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position
+ foreach (@comp_cigar) {
+ ++$MD_count if ($_ eq 'M' or $_ eq 'D');
+ }
+ $start += $MD_count - 1;
+ }
+
+ ### reconstituting shortened CIGAR string
+ my $new_cigar;
+ my $count = 0;
+ my $last_op;
+ # print "ignore adjusted: @comp_cigar\n";
+ foreach my $op (@comp_cigar) {
+ unless (defined $last_op){
+ $last_op = $op;
+ ++$count;
+ next;
+ }
+ if ($last_op eq $op) {
+ ++$count;
+ } else {
+ $new_cigar .= "$count$last_op";
+ $last_op = $op;
+ $count = 1;
+ }
+ }
+ $new_cigar .= "$count$last_op"; # appending the last operation and count
+ $cigar = $new_cigar;
+ # print "ignore adjusted scalar: $cigar\n";
+ }
+
+ #######################
+ ### INGORE 3' END ###
+ #######################
+
+ # Clipping off the last number of bases from the methylation call string as specified with --ignore_3prime
+ if ($ignore_3prime) {
+ # warn "$meth_call\n";
+ $meth_call = substr($meth_call,0, (length($meth_call)) - $ignore_3prime);
+ # warn "$meth_call\n";sleep(1);
+
+ ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
+
+ my @len = split (/\D+/,$cigar); # storing the length per operation
+ my @ops = split (/\d+/,$cigar); # storing the operation
+ shift @ops; # remove the empty first element
+ die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+
+ my @comp_cigar; # building an array with all CIGAR operations
+ foreach my $index (0..$#len) {
+ foreach (1..$len[$index]) {
+ # print "$ops[$index]";
+ push @comp_cigar, $ops[$index];
+ }
+ }
+
+ # print "original CIGAR: $cigar\n";
+ # print join ("",@comp_cigar),"\n";
+
+ ### If we are clipping off some bases at the end we might have to adjust the start position of the alignments accordingly
+ if ($strand eq '+') {
+
+ ### clipping the 3' end does not affect the starting position of forward strand alignments
+ # ignore 5' has already been taken care of at this stage, if relevant at all
+
+ for (1..$ignore_3prime) {
+ my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore_3prime operations from the end
+ # print join ("",@comp_cigar),"\n";
+ # print "$_ deleted $op from 3' end\n";
+ while ($op eq 'D') { # repeating this for deletions (D)
+ $op = pop @comp_cigar;
+ # print join ("",@comp_cigar),"\n";
+ # print "$_ deleted $op from 3' end\n";
+ }
+ }
+ # print "Final truncated CIGAR string:\n";
+ # print join ("",@comp_cigar),"\n";
+ # $start += $ignore + $D_count - $I_count;
+ # print "start $start\t ignore_3prime: $ignore_3prime\t D count: $D_count I_count: $I_count\n";
+ }
+ elsif ($strand eq '-') {
+
+ my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions
+ my $I_count = 0;
+
+ for (1..$ignore_3prime) {
+ my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore_3prime operations, here the first value of the array
+ while ($op eq 'D') { # repeating this for deletions (D)
+ $D_count++;
+ $op = shift @comp_cigar;
+ }
+ if ($op eq 'I') { # adjusting the genomic position for insertions (I)
+ $I_count++;
+ }
+
+ }
+
+ # here we need to discriminate if the start has been adjusted because of --ignore or not
+ if ($ignore){
+ # the start position has already been modified for --ignore already, so we don't have to adjust the position again now
+ }
+ else{
+ # Here we need to add the length ignore_3prime to the read starting position
+ # adjustment of the true starting position of this reverse read will take place later in the methylation extraction step
+ $start += $ignore_3prime + $D_count - $I_count;
+ }
+
+ }
+
+ ### reconstituting shortened CIGAR string
+ my $new_cigar;
+ my $count = 0;
+ my $last_op;
+ # print "ignore_3prime adjusted:\n"; print join ("",@comp_cigar),"\n";
+ foreach my $op (@comp_cigar) {
+ unless (defined $last_op){
+ $last_op = $op;
+ ++$count;
+ next;
+ }
+ if ($last_op eq $op) {
+ ++$count;
+ } else {
+ $new_cigar .= "$count$last_op";
+ $last_op = $op;
+ $count = 1;
+ }
+ }
+ $new_cigar .= "$count$last_op"; # appending the last operation and count
+ $cigar = $new_cigar;
+ # print "ignore_3prime adjusted scalar: $cigar\n";
+ }
+ }
+ ### printing out the methylation state of every C in the read
+ print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar);
+
+ ++$methylation_call_strings_processed; # 1 per single-end result
+ }
+ }
+ }
+
+ ### PROCESSING PAIRED-END RESULT FILES
+ elsif ($paired) {
+
+ ### proceeding differently now for SAM format or vanilla Bismark format files
+ if ($vanilla) { # old vanilla Bismark paired-end output format
+ while () {
+ ++$line_count;
+ warn "processed line: $line_count\n" if ($line_count%500000 == 0);
+
+ if ( ($line_count - $offset)%$multicore == 0){
+ # warn "line count: $line_count\noffset: $offset\n";
+ # warn "Modulus: ",($line_count - $offset)%$multicore,"\n";
+ # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n";
+ }
+ else{
+ # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n";
+ next;
+ }
+
+ ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
+ my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13];
+
+ my $index;
+ chomp $genome_conversion;
+
+ if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
+ $index = 0; ## this is OT
+ } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
+ $index = 2; ## this is CTOB!!!
+ } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
+ $index = 1; ## this is CTOT!!!
+ } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
+ $index = 3; ## this is OB
+ } else {
+ die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
+ }
+
+ if ($meth_call_1 and $meth_call_2) {
+ ### Clipping off the first number of bases from the methylation call strings as specified with '--ignore '
+
+ ### IGNORE FROM 5' END
+ if ($ignore) {
+ $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
+
+ ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified
+ $start_read_1 += $ignore;
+ }
+ if ($ignore_r2) {
+ $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2);
+
+ ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore_r2' was specified
+ $end_read_2 -= $ignore_r2;
+ }
+
+ ### IGNORE 3' END
+
+ ### Clipping off the last number of bases from the methylation call string of Read 1 as specified with --ignore_3prime
+ if ($ignore_3prime) {
+ $meth_call_1 = substr($meth_call_1,0, length($meth_call_1) - $ignore_3prime);
+ # we don't have to adjust the position now since the shortened methylation call will be fine, see below
+ }
+ ### Clipping off the last number of bases from the methylation call string of Read 2 as specified with --ignore_3prime_r2
+ if ($ignore_3prime_r2) {
+ $meth_call_2 = substr($meth_call_2,0, length($meth_call_2) - $ignore_3prime_r2);
+ # we don't have to adjust the position now since the shortened methylation call will be fine, see below
+ }
+
+ my $end_read_1;
+ my $start_read_2;
+
+ if ($strand eq '+') {
+
+ $end_read_1 = $start_read_1 + length($meth_call_1) - 1;
+ $start_read_2 = $end_read_2 - length($meth_call_2) + 1;
+
+ ## we first pass the first read which is in + orientation on the forward strand
+ print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0,undef,1); # the last two values are CIGAR string and read identity
+
+ # we next pass the second read which is in - orientation on the reverse strand
+ ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
+ print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1,undef,2);
+ }
+ else {
+
+ $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read!
+ $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read!
+
+ ## we first pass the first read which is in - orientation on the reverse strand
+ print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0,undef,1);
+
+ # we next pass the second read which is in + orientation on the forward strand
+ ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
+ print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2,undef,2);
+ }
+
+ $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
+ }
+ }
+ }
+ else { # Bismark paired-end BAM/SAM output format (default)
+ while () {
+ chomp;
+ ### SAM format can either start with header lines (starting with @) or start with alignments directly
+ if (/^\@/) { # skipping header lines (starting with @)
+ warn "skipping SAM header line:\t$_\n" unless ($process_id == 0); # no additional warnings for child procesess
+ next;
+ }
+
+ ++$line_count;
+ warn "Processed lines: $line_count\n" if ($line_count%500000==0);
+
+ if ( ($line_count - $offset)%$multicore == 0){
+ # warn "line count: $line_count\noffset: $offset\n";
+ # warn "Modulus: ",($line_count - $offset)%$multicore,"\n";
+ # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n";
+ }
+ else{
+ # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n";
+ $_ = ; # reading and skipping another line since this is the paired-end read
+ next;
+ }
+
+ # example paired-end reads in SAM format (2 consecutive lines)
+ # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
+ # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT
+
+ my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
+ my $meth_call_1;
+ my $first_read_conversion;
+ my $genome_conversion;
+
+ while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
+ my $tag = $1;
+ my $value = $2;
+
+ if ($tag eq "XM") {
+ $meth_call_1 = $value;
+ $meth_call_1 =~ s/\r//;
+ } elsif ($tag eq "XR") {
+ $first_read_conversion = $value;
+ $first_read_conversion =~ s/\r//;
+ } elsif ($tag eq "XG") {
+ $genome_conversion = $value;
+ $genome_conversion =~ s/\r//;
+ }
+ }
+
+ $_ = ; # reading in the paired read
+ chomp;
+
+ my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
+
+ my $meth_call_2;
+ my $second_read_conversion;
+
+ while ( /(XM|XR):Z:([^\t]+)/g ) {
+ my $tag = $1;
+ my $value = $2;
+
+ if ($tag eq "XM") {
+ $meth_call_2 = $value;
+ $meth_call_2 =~ s/\r//;
+ } elsif ($tag eq "XR") {
+ $second_read_conversion = $value;
+ $second_read_conversion = s/\r//;
+ }
+ }
+
+ # < version 0.7.6 $genome_conversion =~ s/^XG:Z://;
+ # chomp $genome_conversion; # in case it captured a new line character # already removed
+
+ # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n";
+
+ my $index;
+ my $strand;
+
+ if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
+ $index = 0; ## this is OT
+ $strand = '+';
+ } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
+ $index = 1; ## this is CTOT
+ $strand = '-';
+ } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
+ $index = 2; ## this is CTOB
+ $strand = '+';
+ } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
+ $index = 3; ## this is OB
+ $strand = '-';
+ } else {
+ die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
+ }
+
+ ### reversing the methylation call of the read that was reverse-complemented
+ if ($strand eq '+') {
+ $meth_call_2 = reverse $meth_call_2;
+ } else {
+ $meth_call_1 = reverse $meth_call_1;
+ }
+ # warn "\n$meth_call_1\n$meth_call_2\n";
+
+ if ($meth_call_1 and $meth_call_2) {
+
+ my $end_read_1;
+
+ ### READ 1
+ my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
+ my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
+ shift @ops_1; # remove the empty first element
+
+ die "CIGAR string contained a non-matching number of lengths and operations: $cigar_1\n".join(" ",@len_1)."\n".join(" ",@ops_1)."\n" unless (scalar @len_1 == scalar @ops_1);
+
+ my @comp_cigar_1; # building an array with all CIGAR operations
+ foreach my $index (0..$#len_1) {
+ foreach (1..$len_1[$index]) {
+ # print "$ops_1[$index]";
+ push @comp_cigar_1, $ops_1[$index];
+ }
+ }
+ # print "original CIGAR read 1: $cigar_1\n";
+ # print "original CIGAR read 1: @comp_cigar_1\n";
+
+ ### READ 2
+ my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
+ my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
+ shift @ops_2; # remove the empty first element
+ die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
+ my @comp_cigar_2; # building an array with all CIGAR operations for read 2
+ foreach my $index (0..$#len_2) {
+ foreach (1..$len_2[$index]) {
+ # print "$ops_2[$index]";
+ push @comp_cigar_2, $ops_2[$index];
+ }
+ }
+ # print "original CIGAR read 2: $cigar_2\n";
+ # print "original CIGAR read 2: @comp_cigar_2\n";
+
+ ##################################
+ ### IGNORE BASES FROM 5' END ###
+ ##################################
+
+ if ($ignore) {
+ ### Clipping off the first number of bases from the methylation call string as specified with '--ignore ' for read 1
+ ### the methylation calls have already been reversed where necessary
+
+ if ( (length($meth_call_1) - $ignore) <= 0){
+ # next; # skipping this read entirely if the read is shorter than the portion to be ignored
+ $meth_call_1 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored
+ }
+ else {
+ $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
+
+ if ($strand eq '+') {
+
+ ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start
+ my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
+ my $I_count_1 = 0;
+
+ for (1..$ignore) {
+ my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start
+ # print "$_ deleted $op\n";
+
+ while ($op eq 'D') { # repeating this for deletions (D)
+ $D_count_1++;
+ $op = shift @comp_cigar_1;
+ # print "$_ deleted $op\n";
+ }
+ if ($op eq 'I') { # adjusting the genomic position for insertions (I)
+ $I_count_1++;
+ }
+ }
+
+ $start_read_1 += $ignore + $D_count_1 - $I_count_1;
+ # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n";
+
+ # the start position of reads mapping to the reverse strand is being adjusted further below
+ }
+ elsif ($strand eq '-') {
+
+ ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back
+ for (1..$ignore) {
+ my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
+ while ($op eq 'D') { # repeating this for deletions (D)
+ $op = pop @comp_cigar_1;
+ }
+ }
+ # the start position of reads mapping to the reverse strand is being adjusted further below
+
+ }
+ }
+ }
+
+ if ($ignore_r2) {
+ ### Clipping off the first number of bases from the methylation call string as specified with '--ignore_r2 ' for read 2
+ ### the methylation calls have already been reversed where necessary
+
+ if ( (length($meth_call_2) - $ignore_r2) <= 0){
+ # next; # skipping this read entirely if the read is shorter than the portion to be ignored # this would skip the entire read pair!
+ $meth_call_2 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored
+ }
+ else {
+ $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2);
+
+ ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
+
+ if ($strand eq '+') {
+
+ ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back
+
+ for (1..$ignore_r2) {
+ my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
+ while ($op eq 'D') { # repeating this for deletions (D)
+ $op = pop @comp_cigar_2;
+ }
+ }
+ # the start position of reads mapping to the reverse strand is being adjusted further below
+ }
+ elsif ($strand eq '-') {
+
+ ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start
+ my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
+ my $I_count_2 = 0;
+
+ for (1..$ignore_r2) {
+ my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
+ # print "$_ deleted $op\n";
+
+ while ($op eq 'D') { # repeating this for deletions (D)
+ $D_count_2++;
+ $op = shift @comp_cigar_2;
+ # print "$_ deleted $op\n";
+ }
+ if ($op eq 'I') { # adjusting the genomic position for insertions (I)
+ $I_count_2++;
+ }
+ }
+
+ $start_read_2 += $ignore_r2 + $D_count_2 - $I_count_2;
+ # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
+ }
+ }
+ }
+
+ if ($ignore and $meth_call_1){ # if the methylation call string is undefined at this position we don't need any new CIGAR string
+
+ ### reconstituting shortened CIGAR string 1
+ my $new_cigar_1;
+ my $count_1 = 0;
+ my $last_op_1;
+ # print "ignore adjusted CIGAR 1: @comp_cigar_1\n";
+ foreach my $op (@comp_cigar_1) {
+ unless (defined $last_op_1){
+ $last_op_1 = $op;
+ ++$count_1;
+ next;
+ }
+ if ($last_op_1 eq $op) {
+ ++$count_1;
+ } else {
+ $new_cigar_1 .= "$count_1$last_op_1";
+ $last_op_1 = $op;
+ $count_1 = 1;
+ }
+ }
+ $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
+ $cigar_1 = $new_cigar_1;
+ # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n";
+ }
+
+ if ($ignore_r2 and $meth_call_2){ # if the methylation call string is undefined at this point we don't need any new CIGAR string
+
+ ### reconstituting shortened CIGAR string 2
+ my $new_cigar_2;
+ my $count_2 = 0;
+ my $last_op_2;
+ # print "ignore adjusted CIGAR 2: @comp_cigar_2\n";
+ foreach my $op (@comp_cigar_2) {
+ unless (defined $last_op_2){
+ $last_op_2 = $op;
+ ++$count_2;
+ next;
+ }
+ if ($last_op_2 eq $op) {
+ ++$count_2;
+ }
+ else {
+ $new_cigar_2 .= "$count_2$last_op_2";
+ $last_op_2 = $op;
+ $count_2 = 1;
+ }
+ }
+ $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
+ $cigar_2 = $new_cigar_2;
+ # print "ignore_r2 adjusted CIGAR 2 scalar: $cigar_2\n";
+ }
+
+ ###########################
+ ### END IGNORE 5' END ###
+ ###########################
+
+ ##################################
+ ### IGNORE BASES FROM 3' END ###
+ ##################################
+
+ # print "CIGAR string before truncating 3' end (Read 1)\n";
+ # print join ("",@comp_cigar_1),"\n";
+
+ if ($ignore_3prime and $meth_call_1) { # if the methylation call string is undefined at this point we don't need to process the read any further
+
+ ### Clipping off the last number of bases from the methylation call string as specified with '--ignore_3prime ' for read 1
+ ### the methylation calls have already been reversed where necessary
+
+ if ( (length($meth_call_1) - $ignore_3prime) <= 0){
+ $meth_call_1 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored
+ }
+ else {
+ $meth_call_1 = substr($meth_call_1,0,length($meth_call_1) - $ignore_3prime);
+ # warn "truncated meth_call 1:\n$meth_call_1\n";
+
+ if ($strand eq '+') {
+
+ ### if the (read 1) strand information is '+', clipping the 3' end does not affect the starting position of forward strand alignments
+ # ignore 5' has already been taken care of at this stage, if relevant at all
+
+ for (1..$ignore_3prime) {
+ my $op = pop @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore_3prime operations from the end
+ # print "$_ deleted $op from 3' end\n";
+ # print join ("",@comp_cigar_1),"\n";
+
+ while ($op eq 'D') { # repeating this for deletions (D)
+ $op = pop @comp_cigar_1;
+ # print join ("",@comp_cigar_1),"\n";
+ # print "$_ deleted $op from 3' end\n";
+ }
+ }
+
+ # print "Final truncated CIGAR string (Read 1):\n";
+ # print join ("",@comp_cigar_1),"\n";
+
+ }
+ elsif ($strand eq '-') {
+
+ my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
+ my $I_count_1 = 0;
+
+ ### if the (read 1) strand information is '-', the read 1 CIGAR string needs to be trimmed from the start
+ for (1..$ignore_3prime) {
+ my $op = shift @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore_3prime operations, here the first value of the array
+ # print join ("",@comp_cigar_1),"\n";
+
+ while ($op eq 'D') { # repeating this for deletions (D)
+ $D_count_1++;
+ $op = shift @comp_cigar_1;
+ # print join ("",@comp_cigar_1),"\n";
+ }
+
+ if ($op eq 'I') { # adjusting the genomic position for insertions (I)
+ $I_count_1++;
+ }
+ }
+
+ # print "Final truncated CIGAR string reverse_read:\n";
+ # print join ("",@comp_cigar_1),"\n";
+
+ # Here we need to add the length ignore_3prime to the read starting position
+ # adjustment of the true start position of this reverse read will take place later in the methylation extraction step
+ $start_read_1 += $ignore_3prime + $D_count_1 - $I_count_1;
+
+ }
+ }
+ }
+
+ if ($ignore_3prime_r2 and $meth_call_2) { # if the methylation call string is undefined at this point we don't need to process the read any further
+
+ ### Clipping off the last number of bases from the methylation call string as specified with '--ignore_3prime_r2 ' for read 2
+ ### the methylation calls have already been reversed where necessary
+
+ if ( (length($meth_call_2) - $ignore_3prime_r2) <= 0){
+ $meth_call_2 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored
+ }
+ else {
+ $meth_call_2 = substr($meth_call_2,0,length($meth_call_2) - $ignore_3prime_r2);
+ # warn "truncated meth_call 2:\n$meth_call_2\n";
+
+ ### If we are ignoring a part of the sequence we also need to adjust the cigar string and the positions accordingly
+
+ if ($strand eq '+') {
+
+ ### if the (read 1) strand information is '+', clipping the 3' end of read 2 does potentially affect the starting position of read 2 (reverse strand alignment)
+ ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the start
+
+ my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
+ my $I_count_2 = 0;
+
+ for (1..$ignore_3prime_r2) {
+ my $op = shift @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the first value of the array
+ # print "$_ deleted $op from 3' end\n";
+ # print join ("",@comp_cigar_2),"\n";
+
+ while ($op eq 'D') { # repeating this for deletions (D)
+ $D_count_2++;
+ $op = shift @comp_cigar_2;
+ # print "$_ deleted $op from 3' end\n";
+ # print join ("",@comp_cigar_2),"\n";
+ }
+
+ if ($op eq 'I') { # adjusting the genomic position for insertions (I)
+ $I_count_2++;
+ }
+ }
+
+ # print "Final truncated CIGAR string 2 (+ alignment):\n";
+ # print join ("",@comp_cigar_2),"\n";
+
+ # Here we need to add the length ignore_3prime_r2 to the read starting position
+ # adjustment of the true start position of this reverse read will take place later in the methylation extraction step
+ $start_read_2 += $ignore_3prime_r2 + $D_count_2 - $I_count_2;
+
+ # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
+ }
+ elsif ($strand eq '-') {
+ ### if the (read 1) strand information is '-', clipping the 3' end of read 2 does not affect its starting position (forward strand alignment)
+ ### ignore_r2 5' has already been taken care of at this stage, if relevant at all
+
+ ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the end
+
+ for (1..$ignore_3prime_r2) {
+ my $op = pop @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
+ # print "$_ deleted $op\n";
+ # print join ("",@comp_cigar_2),"\n";
+
+ while ($op eq 'D') { # repeating this for deletions (D)
+ $op = pop @comp_cigar_2;
+ # print "$_ deleted $op\n";
+ # print join ("",@comp_cigar_2),"\n";
+ }
+ }
+
+ # print "Final truncated CIGAR string 2 (- alignment):\n";
+ # print join ("",@comp_cigar_2),"\n";
+
+ }
+ }
+ }
+
+ if ($ignore_3prime and $meth_call_1){ # if the methylation call string is undefined at this point we don't need any new CIGAR string
+
+ ### reconstituting shortened CIGAR string 1
+ my $new_cigar_1;
+ my $count_1 = 0;
+ my $last_op_1;
+ # print "ignore_3prime adjusted CIGAR 1: @comp_cigar_1\n";
+ foreach my $op (@comp_cigar_1) {
+ unless (defined $last_op_1){
+ $last_op_1 = $op;
+ ++$count_1;
+ next;
+ }
+ if ($last_op_1 eq $op) {
+ ++$count_1;
+ }
+ else {
+ $new_cigar_1 .= "$count_1$last_op_1";
+ $last_op_1 = $op;
+ $count_1 = 1;
+ }
+ }
+ $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
+ $cigar_1 = $new_cigar_1;
+ # warn "ignore_3prime adjusted CIGAR 1 scalar: $cigar_1\n";
+ }
+
+ if ($ignore_3prime_r2 and $meth_call_2){ # if the methylation call string is undefined at this point we don't need any new CIGAR string
+
+ ### reconstituting shortened CIGAR string 2
+ my $new_cigar_2;
+ my $count_2 = 0;
+ my $last_op_2;
+ # print "ignore_3prime_r2 adjusted CIGAR 2: @comp_cigar_2\n";
+ foreach my $op (@comp_cigar_2) {
+ unless (defined $last_op_2){
+ $last_op_2 = $op;
+ ++$count_2;
+ next;
+ }
+ if ($last_op_2 eq $op) {
+ ++$count_2;
+ }
+ else {
+ $new_cigar_2 .= "$count_2$last_op_2";
+ $last_op_2 = $op;
+ $count_2 = 1;
+ }
+ }
+ $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
+ $cigar_2 = $new_cigar_2;
+ # warn "ignore_3prime_r2 adjusted CIGAR 2 scalar: $cigar_2\n";
+ }
+
+ ###########################
+ ### END IGNORE 3' END ###
+ ###########################
+
+
+ ### Adjusting CIGAR string and starting position of reads in reverse orientation which we will pass to the extraction subroutine later on
+
+ if ($strand eq '+') {
+ ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2
+ @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
+ # print "reverse: @comp_cigar_2\n";
+
+ my $MD_count_1 = 0;
+ foreach (@comp_cigar_1) {
+ ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
+ }
+
+ my $MD_count_2 = 0;
+ foreach (@comp_cigar_2) {
+ ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
+ }
+
+ $end_read_1 = $start_read_1 + $MD_count_1 - 1;
+ $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand
+ }
+ else {
+ ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1
+
+ @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
+ # print "reverse: @comp_cigar_1\n";
+
+ my $MD_count_1 = 0;
+ foreach (@comp_cigar_1) {
+ ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
+ }
+
+ $end_read_1 = $start_read_1;
+ $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand
+ }
+
+ if ($strand eq '+') {
+ ## we first pass the first read which is in + orientation on the forward strand; the last value is the read identity
+ print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1,1);
+
+ # we next pass the second read which is in - orientation on the reverse strand
+ ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
+ print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2,2);
+ }
+ else {
+ ## we first pass the first read which is in - orientation on the reverse strand
+ print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1,1);
+
+ # we next pass the second read which is in + orientation on the forward strand
+ ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
+ print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2,2);
+ }
+
+ $methylation_call_strings_processed += 2; # paired-end = 2 methylation call strings
+ }
+ }
+ }
+ } else {
+ die "Single-end or paired-end reads not specified properly\n";
+ }
+
+ $counting{sequences_count} = $line_count;
+ $counting{methylation_call_strings} = $methylation_call_strings_processed;
+
+ if ($multicore == 1){
+ print_splitting_report ();
+ }
+ elsif ($multicore > 1){
+ print_splitting_report_multicore($report_filename,$offset,$line_count,$methylation_call_strings_processed);
+ print_mbias_report_multicore($report_filename,$offset,$line_count,$methylation_call_strings_processed);
+ }
+
+ return ($process_id,\@pids,$report_filename);
+
+}
+
+
+
+sub print_splitting_report{
+
+ ### Calculating methylation percentages if applicable
+ warn "\nProcessed $counting{sequences_count} lines in total\n";
+ warn "Total number of methylation call strings processed: $counting{methylation_call_strings}\n\n";
+ if ($report) {
+ print REPORT "\nProcessed $counting{sequences_count} lines in total\n";
+ print REPORT "Total number of methylation call strings processed: $counting{methylation_call_strings}\n\n";
+ }
+
+ my $percent_meCpG;
+ if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
+ $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
+ }
+
+ my $percent_meCHG;
+ if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
+ $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
+ }
+
+ my $percent_meCHH;
+ if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
+ $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
+ }
+
+ my $percent_non_CpG_methylation;
+ if ($merge_non_CpG){
+ if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
+ $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) );
+ }
+ }
+
+ if ($report){
+ ### detailed information about Cs analysed
+ print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
+
+ my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
+ print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
+
+ print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
+ print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
+ print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
+
+ print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+ print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+ print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
+
+ ### calculating methylated CpG percentage if applicable
+ if ($percent_meCpG){
+ print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
+ }
+ else{
+ print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
+ }
+
+ ### 2-Context Output
+ if ($merge_non_CpG){
+ if ($percent_non_CpG_methylation){
+ print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
+ }
+ else{
+ print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
+ }
+ }
+
+ ### 3 Context Output
+ else{
+ ### calculating methylated CHG percentage if applicable
+ if ($percent_meCHG){
+ print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
+ }
+ else{
+ print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
+ }
+
+ ### calculating methylated CHH percentage if applicable
+ if ($percent_meCHH){
+ print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
+ }
+ else{
+ print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
+ }
+ }
+ }
+
+ ### detailed information about Cs analysed for on-screen report
+ warn "Final Cytosine Methylation Report\n",'='x33,"\n";
+
+ my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
+ warn "Total number of C's analysed:\t$total_number_of_C\n\n";
+
+ warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
+ warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
+ warn"Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
+
+ warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+ warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+ warn"Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
+
+ ### printing methylated CpG percentage if applicable
+ if ($percent_meCpG){
+ warn "C methylated in CpG context:\t${percent_meCpG}%\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
+ }
+
+ ### 2-Context Output
+ if ($merge_non_CpG){
+ if ($percent_non_CpG_methylation){
+ warn "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
+ }
+ }
+
+ ### 3-Context Output
+ else{
+ ### printing methylated CHG percentage if applicable
+ if ($percent_meCHG){
+ warn "C methylated in CHG context:\t${percent_meCHG}%\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
+ }
+
+ ### printing methylated CHH percentage if applicable
+ if ($percent_meCHH){
+ warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
+ }
+ }
+}
+
+###
+
+sub print_splitting_report_multicore{
+
+ my ($report_filename,$offset,$line_count,$meth_call_strings) = @_;
+
+ # warn "\$report_filename is $report_filename\n";
+ my $special_report = $report_filename.".$offset";
+
+ open (SPECIAL_REPORT,'>',$special_report) or die $!;
+ # warn "line count\t$line_count\n";
+ # warn "meth call strings\t$meth_call_strings\n";
+
+ print SPECIAL_REPORT "line count\t$line_count\n";
+ print SPECIAL_REPORT "meth call strings\t$meth_call_strings\n";
+
+ ### Calculating methylation percentages if applicable
+ my $percent_meCpG;
+ if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
+ $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
+ }
+
+ my $percent_meCHG;
+ if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
+ $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
+ }
+
+ my $percent_meCHH;
+ if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
+ $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
+ }
+
+ my $percent_non_CpG_methylation;
+ if ($merge_non_CpG){
+ if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
+ $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) );
+ }
+ }
+
+ if ($report){
+
+ ### detailed information about Cs analysed
+ print SPECIAL_REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
+
+ my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
+ print SPECIAL_REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
+
+ print SPECIAL_REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
+ print SPECIAL_REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
+ print SPECIAL_REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
+
+ print SPECIAL_REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+ print SPECIAL_REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+ print SPECIAL_REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
+
+ ### calculating methylated CpG percentage if applicable
+ if ($percent_meCpG){
+ print SPECIAL_REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
+ }
+ else{
+ print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
+ }
+
+ ### 2-Context Output
+ if ($merge_non_CpG){
+ if ($percent_non_CpG_methylation){
+ print SPECIAL_REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
+ }
+ else{
+ print SPECIAL_REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
+ }
+ }
+
+ ### 3 Context Output
+ else{
+ ### calculating methylated CHG percentage if applicable
+ if ($percent_meCHG){
+ print SPECIAL_REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
+ }
+ else{
+ print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
+ }
+
+ ### calculating methylated CHH percentage if applicable
+ if ($percent_meCHH){
+ print SPECIAL_REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
+ }
+ else{
+ print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
+ }
+ }
+ }
+ close SPECIAL_REPORT or warn "Failed to close filehandle for individual report $special_report\n";
+}
+
+
+###
+
+### INDIVIDUAL M-BIAS REPORTS
+
+sub print_mbias_report_multicore{
+
+ my ($report_filename,$offset,$line_count,$meth_call_strings) = @_;
+
+ # warn "\$report_filename is $report_filename\n";
+ my $special_mbias_report = $report_filename.".${offset}.mbias";
+
+ open (SPECIAL_MBIAS,'>',$special_mbias_report) or die $!;
+
+ # determining maximum read length
+ my $max_length_1 = 0;
+ my $max_length_2 = 0;
+
+ foreach my $context (keys %mbias_1){
+ foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){
+ $max_length_1 = $pos unless ($max_length_1 >= $pos);
+ }
+ }
+ if ($paired){
+ foreach my $context (keys %mbias_2){
+ foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){
+ $max_length_2 = $pos unless ($max_length_2 >= $pos);
+ }
+ }
+ }
+
+ if ($single){
+ # warn "Determining maximum read length for M-Bias plot\n";
+ # warn "Maximum read length of Read 1: $max_length_1\n\n";
+ }
+ else{
+ # warn "Determining maximum read lengths for M-Bias plots\n";
+ # warn "Maximum read length of Read 1: $max_length_1\n";
+ # warn "Maximum read length of Read 2: $max_length_2\n\n";
+ }
+
+ foreach my $context (qw(CpG CHG CHH)){
+
+ if ($paired){
+ print SPECIAL_MBIAS "$context context (R1)\n================\n";
+ }
+ else{
+ print SPECIAL_MBIAS "$context context\n===========\n";
+ }
+ print SPECIAL_MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
+
+ foreach my $pos (1..$max_length_1){
+
+ unless (defined $mbias_1{$context}->{$pos}->{meth}){
+ $mbias_1{$context}->{$pos}->{meth} = 0;
+ }
+ unless (defined $mbias_1{$context}->{$pos}->{un}){
+ $mbias_1{$context}->{$pos}->{un} = 0;
+ }
+
+ my $percent = '';
+ if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){
+ $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) );
+ }
+ my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth};
+
+ print SPECIAL_MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n";
+ }
+ print SPECIAL_MBIAS "\n";
+ }
+
+ if ($paired){
+
+ foreach my $context (qw(CpG CHG CHH)){
+
+ print SPECIAL_MBIAS "$context context (R2)\n================\n";
+ print SPECIAL_MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
+
+ foreach my $pos (1..$max_length_2){
+
+ unless (defined $mbias_2{$context}->{$pos}->{meth}){
+ $mbias_2{$context}->{$pos}->{meth} = 0;
+ }
+ unless (defined $mbias_2{$context}->{$pos}->{un}){
+ $mbias_2{$context}->{$pos}->{un} = 0;
+ }
+
+ my $percent = '';
+ if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){
+ $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) );
+ }
+ my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth};
+
+ print SPECIAL_MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n";
+ }
+ }
+ }
+
+ close SPECIAL_MBIAS or warn "Failed to close filehandle for individual M-bias report $special_mbias_report\n";
+}
+
+
+###
+
+
+sub print_individual_C_methylation_states_paired_end_files{
+
+ my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar,$read_identity) = @_;
+
+ unless (defined $meth_call) {
+ return; # skip this read
+ }
+
+ ### we will use the read identity for the M-bias plot to discriminate read 1 and read 2
+ die "Read identity was neither 1 nor 2: $read_identity\n\n" unless ($read_identity == 1 or $read_identity == 2);
+
+ my @methylation_calls = split(//,$meth_call);
+
+ #################################################################
+ ### . for bases not involving cytosines ###
+ ### X for methylated C in CHG context (was protected) ###
+ ### x for not methylated C in CHG context (was converted) ###
+ ### H for methylated C in CHH context (was protected) ###
+ ### h for not methylated C in CHH context (was converted) ###
+ ### Z for methylated C in CpG context (was protected) ###
+ ### z for not methylated C in CpG context (was converted) ###
+ ### U for methylated C in Unknown context (was protected) ###
+ ### u for not methylated C in Unknown context (was converted) ###
+ #################################################################
+
+ my $methyl_CHG_count = 0;
+ my $methyl_CHH_count = 0;
+ my $methyl_CpG_count = 0;
+ my $unmethylated_CHG_count = 0;
+ my $unmethylated_CHH_count = 0;
+ my $unmethylated_CpG_count = 0;
+
+ my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
+ my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
+ my @comp_cigar;
+
+ ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing
+ if ($cigar =~ /^\d+M$/){
+ # this check speeds up the extraction process by up to 60%!!!
+ }
+ else{ # parsing CIGAR string
+ my @len;
+ my @ops;
+ @len = split (/\D+/,$cigar); # storing the length per operation
+ @ops = split (/\d+/,$cigar); # storing the operation
+ shift @ops; # remove the empty first element
+
+ die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+
+ foreach my $index (0..$#len){
+ foreach (1..$len[$index]){
+ # print "$ops[$index]";
+ push @comp_cigar, $ops[$index];
+ }
+ }
+ # warn "\nDetected CIGAR string: $cigar\n";
+ # warn "Length of methylation call: ",length $meth_call,"\n";
+ # warn "number of operations: ",scalar @ops,"\n";
+ # warn "number of length digits: ",scalar @len,"\n\n";
+ # print @comp_cigar,"\n";
+ # print "$meth_call\n\n";
+ # sleep (1);
+ }
+
+ if ($strand eq '-') {
+
+ ### the CIGAR string needs to be reversed, the methylation call has already been reversed above
+ if (@comp_cigar){
+ @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
+ }
+ # print "reverse CIGAR string: @comp_cigar\n";
+
+ ### the start position of paired-end files has already been corrected, see above
+ }
+
+ ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified
+
+ if ($merge_non_CpG) {
+ if ($no_overlap) { # this has to be read 2...
+
+ ### single-file CpG and non-CpG context output
+ if ($full) {
+ if ($strand eq '+') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ ### Returning as soon as the methylation calls start overlapping
+ if ($start+$index+$pos_offset >= $end_read_1) {
+ return;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.'){}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
+ }
+ }
+ }
+ elsif ($strand eq '-') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ ### Returning as soon as the methylation calls start overlapping
+ if ($start-$index+$pos_offset <= $end_read_1) {
+ return;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
+ }
+ }
+ } else {
+ die "The read orientation was neither + nor -: '$strand'\n";
+ }
+ }
+
+ ### strand-specific methylation output
+ else {
+ if ($strand eq '+') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ ### Returning as soon as the methylation calls start overlapping
+ if ($start+$index+$pos_offset >= $end_read_1) {
+ return;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ } elsif ($strand eq '-') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ ### Returning as soon as the methylation calls start overlapping
+ if ($start-$index+$pos_offset <= $end_read_1) {
+ return;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ } else {
+ die "The strand orientation was neither + nor -: '$strand'/n";
+ }
+ }
+ }
+
+ ### this is the default paired-end procedure allowing overlaps and using every single C position
+ ### Still within the 2-CONTEXT ONLY optional section
+ else {
+ ### single-file CpG and non-CpG context output
+ if ($full) {
+ if ($strand eq '+') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
+ }
+ }
+ } elsif ($strand eq '-') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
+ }
+ }
+ } else {
+ die "The strand orientation as neither + nor -: '$strand'\n";
+ }
+ }
+
+ ### strand-specific methylation output
+ ### still within the 2-CONTEXT optional section
+ else {
+ if ($strand eq '+') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ } elsif ($strand eq '-') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ } else {
+ die "The strand orientation as neither + nor -: '$strand'\n";
+ }
+ }
+ }
+ }
+
+ ############################################
+ ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ###
+ ############################################
+
+ elsif ($no_overlap) {
+ ### single-file CpG, CHG and CHH context output
+ if ($full) {
+ if ($strand eq '+') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ ### Returning as soon as the methylation calls start overlapping
+ if ($start+$index+$pos_offset >= $end_read_1) {
+ return;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ } elsif ($strand eq '-') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ ### Returning as soon as the methylation calls start overlapping
+ if ($start-$index+$pos_offset <= $end_read_1) {
+ return;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ } else {
+ die "The strand orientation as neither + nor -: '$strand'\n";
+ }
+ }
+
+ ### strand-specific methylation output
+ else {
+ if ($strand eq '+') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ ### Returning as soon as the methylation calls start overlapping
+ if ($start+$index+$pos_offset >= $end_read_1) {
+ return;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ } elsif ($strand eq '-') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ ### Returning as soon as the methylation calls start overlapping
+ if ($start-$index+$pos_offset <= $end_read_1) {
+ return;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ } else {
+ die "The strand orientation as neither + nor -: '$strand'\n";
+ }
+ }
+ }
+
+ ### this is the paired-end procedure allowing overlaps and using every single C position
+ else {
+ ### single-file CpG, CHG and CHH context output
+ if ($full) {
+ if ($strand eq '+') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ } elsif ($strand eq '-') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ } else {
+ die "The strand orientation as neither + nor -: '$strand'\n";
+ }
+ }
+
+ ### strand-specific methylation output
+ else {
+ if ($strand eq '+') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ } elsif ($strand eq '-') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CpG}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{meth}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ if ($read_identity == 1){
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ else{
+ $mbias_2{CHH}->{$index+1}->{un}++;
+ }
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ } else {
+ die "The strand orientation as neither + nor -: '$strand'\n";
+ }
+ }
+ }
+}
+
+sub check_cigar_string {
+ my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
+ # print "$index\t$cigar_offset\t$pos_offset\t$strand\t";
+ my ($new_cigar_offset,$new_pos_offset) = (0,0);
+
+ if ($strand eq '+') {
+ # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
+
+ if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
+ # warn "position needs no adjustment\n";
+ }
+
+ elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
+ $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
+ # warn "adjusted genomic position by -1 bp (insertion)\n";
+ }
+
+ elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
+ $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
+ $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
+ # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
+
+ while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
+ if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
+ # warn "position needs no adjustment\n";
+ last;
+ }
+ elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
+ $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
+ # warn "adjusted genomic position by another -1 bp (insertion)\n";
+ last;
+ }
+ elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
+ $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
+ $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
+ # warn "adjusted genomic position by another +1 bp (deletion)\n";
+ }
+ else{
+ die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
+ }
+ }
+ }
+ else{
+ die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
+ }
+ }
+
+ elsif ($strand eq '-') {
+ # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
+
+ if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
+ # warn "position needs no adjustment\n";
+ }
+
+ elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
+ $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position
+ # warn "adjusted genomic position by +1 bp (insertion)\n";
+ }
+
+ elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
+ $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
+ $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
+ # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
+
+ while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
+ if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
+ # warn "Found new 'M' operation; position needs no adjustment\n";
+ last;
+ }
+ elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
+ $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position
+ # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n";
+ last;
+ }
+ elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
+ $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
+ $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
+ # warn "adjusted genomic position by another -1 bp (deletion)\n";
+ }
+ else{
+ die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
+ }
+ }
+ }
+ else{
+ die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
+ }
+ }
+ # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n";
+ return ($new_cigar_offset,$new_pos_offset);
+}
+
+sub print_individual_C_methylation_states_single_end{
+
+ my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_;
+ my @methylation_calls = split(//,$meth_call);
+
+ #################################################################
+ ### . for bases not involving cytosines ###
+ ### X for methylated C in CHG context (was protected) ###
+ ### x for not methylated C in CHG context (was converted) ###
+ ### H for methylated C in CHH context (was protected) ###
+ ### h for not methylated C in CHH context (was converted) ###
+ ### Z for methylated C in CpG context (was protected) ###
+ ### z for not methylated C in CpG context (was converted) ###
+ #################################################################
+
+ my $methyl_CHG_count = 0;
+ my $methyl_CHH_count = 0;
+ my $methyl_CpG_count = 0;
+ my $unmethylated_CHG_count = 0;
+ my $unmethylated_CHH_count = 0;
+ my $unmethylated_CpG_count = 0;
+
+ my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
+ my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
+
+ my @comp_cigar;
+
+ if ($cigar){ # parsing CIGAR string
+
+ ### Checking whether the CIGAR string is a linear genomic match or whether it requires indel processing
+ if ($cigar =~ /^\d+M$/){
+ # warn "See!? I told you so! $cigar\n";
+ # sleep(1);
+ }
+ else{
+
+ my @len;
+ my @ops;
+
+ @len = split (/\D+/,$cigar); # storing the length per operation
+ @ops = split (/\d+/,$cigar); # storing the operation
+ shift @ops; # remove the empty first element
+ # die "CIGAR string contained a non-matching number of lengths and operations: id: $id\nmeth call: $meth_call\nCIGAR: $cigar\n".join(" ",@len)."\n".join(" ",@ops)."\n" unless (scalar @len == scalar @ops);
+ die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+
+ foreach my $index (0..$#len){
+ foreach (1..$len[$index]){
+ # print "$ops[$index]";
+ push @comp_cigar, $ops[$index];
+ }
+ }
+ }
+ # warn "\nDetected CIGAR string: $cigar\n";
+ # warn "Length of methylation call: ",length $meth_call,"\n";
+ # warn "number of operations: ",scalar @ops,"\n";
+ # warn "number of length digits: ",scalar @len,"\n\n";
+ # print @comp_cigar,"\n";
+ # print "$meth_call\n\n";
+ # sleep (1);
+ }
+
+ ### adjusting the start position for all reads mapping to the reverse strand
+ if ($strand eq '-') {
+
+ if (@comp_cigar){ # only needed for SAM reads with InDels
+ @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
+ # print @comp_cigar,"\n";
+ }
+
+ unless ($ignore){ ### if --ignore was specified the start position has already been corrected
+
+ if ($cigar){ ### SAM format
+ if ($cigar =~ /^(\d+)M$/){ # linear match
+ $start += $1 - 1;
+ }
+ else{ # InDel read
+ my $MD_count = 0;
+ foreach (@comp_cigar){
+ ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
+ }
+ $start += $MD_count - 1;
+ }
+ }
+ else{ ### vanilla format
+ $start += length($meth_call)-1;
+ }
+ }
+ }
+
+ ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL)
+
+ ### single-file CpG and other-context output
+ if ($full and $merge_non_CpG) {
+ if ($strand eq '+') {
+ for my $index (0..$#methylation_calls) {
+
+ if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ ### methylated Cs (any context) will receive a forward (+) orientation
+ ### not methylated Cs (any context) will receive a reverse (-) orientation
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
+ print {$fhs{other_context}} $line unless($mbias_only);
+ # print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
+ print {$fhs{other_context}} $line unless($mbias_only);
+ # print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
+ print {$fhs{CpG_context}} $line unless($mbias_only);
+ # print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
+ print {$fhs{CpG_context}} $line unless($mbias_only);
+ # print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
+ print {$fhs{other_context}} $line unless($mbias_only);
+ # print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
+ print {$fhs{other_context}} $line unless($mbias_only);
+ # print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ }
+ elsif ($strand eq '-') {
+
+ for my $index (0..$#methylation_calls) {
+ ### methylated Cs (any context) will receive a forward (+) orientation
+ ### not methylated Cs (any context) will receive a reverse (-) orientation
+
+ if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels
+ # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
+ print {$fhs{other_context}} $line unless($mbias_only);
+ #print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
+ print {$fhs{other_context}} $line unless($mbias_only);
+ #print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
+ print {$fhs{CpG_context}} $line unless($mbias_only);
+ #print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
+ print {$fhs{CpG_context}} $line unless($mbias_only);
+ #print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
+ print {$fhs{other_context}} $line unless($mbias_only);
+ #print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
+ print {$fhs{other_context}} $line unless($mbias_only);
+ #print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq '.'){}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ }
+ else {
+ die "The strand information was neither + nor -: $strand\n";
+ }
+ }
+
+ ### strand-specific methylation output
+ elsif ($merge_non_CpG) {
+ if ($strand eq '+') {
+ for my $index (0..$#methylation_calls) {
+ ### methylated Cs (any context) will receive a forward (+) orientation
+ ### not methylated Cs (any context) will receive a reverse (-) orientation
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ }
+ elsif ($strand eq '-') {
+
+ for my $index (0..$#methylation_calls) {
+ ### methylated Cs (any context) will receive a forward (+) orientation
+ ### not methylated Cs (any context) will receive a reverse (-) orientation
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ }
+ else {
+ die "The strand information was neither + nor -: $strand\n";
+ }
+ }
+
+ ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION
+
+ elsif ($full) {
+ if ($strand eq '+') {
+ for my $index (0..$#methylation_calls) {
+ ### methylated Cs (any context) will receive a forward (+) orientation
+ ### not methylated Cs (any context) will receive a reverse (-) orientation
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
+ }
+ }
+ }
+ elsif ($strand eq '-') {
+
+ for my $index (0..$#methylation_calls) {
+ ### methylated Cs (any context) will receive a forward (+) orientation
+ ### not methylated Cs (any context) will receive a reverse (-) orientation
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ }
+ else {
+ die "The read had a strand orientation which was neither + nor -: $strand\n";
+ }
+ }
+
+ ### strand-specific methylation output
+ else {
+ if ($strand eq '+') {
+ for my $index (0..$#methylation_calls) {
+ ### methylated Cs (any context) will receive a forward (+) orientation
+ ### not methylated Cs (any context) will receive a reverse (-) orientation
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ }
+ elsif ($strand eq '-') {
+
+ for my $index (0..$#methylation_calls) {
+ ### methylated Cs (any context) will receive a forward (+) orientation
+ ### not methylated Cs (any context) will receive a reverse (-) orientation
+
+ if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
+ my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ $cigar_offset += $cigar_mod;
+ $pos_offset += $pos_mod;
+ }
+
+ if ($methylation_calls[$index] eq 'X') {
+ $counting{total_meCHG_count}++;
+ print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'x') {
+ $counting{total_unmethylated_CHG_count}++;
+ print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'Z') {
+ $counting{total_meCpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'z') {
+ $counting{total_unmethylated_CpG_count}++;
+ print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CpG}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq 'H') {
+ $counting{total_meCHH_count}++;
+ print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{meth}++;
+ }
+ elsif ($methylation_calls[$index] eq 'h') {
+ $counting{total_unmethylated_CHH_count}++;
+ print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
+ $mbias_1{CHH}->{$index+1}->{un}++;
+ }
+ elsif ($methylation_calls[$index] eq '.') {}
+ elsif (lc$methylation_calls[$index] eq 'u'){}
+ else{
+ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ }
+ }
+ }
+ else {
+ die "The strand information was neither + nor -: $strand\n";
+ }
+ }
+}
+
+sub open_output_filehandles{
+
+ my $filename = shift;
+
+ my $output_filename = (split (/\//,$filename))[-1];
+ my $report_filename = $output_filename;
+
+ ### OPENING OUTPUT-FILEHANDLES
+ if ($report) {
+ $report_filename =~ s/\.sam$//;
+ $report_filename =~ s/\.txt$//;
+ $report_filename =~ s/$/_splitting_report.txt/;
+ $report_filename = $output_dir . $report_filename;
+ open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n";
+ }
+
+ if ($report) {
+
+ print REPORT "$output_filename\n\n";
+ print REPORT "Parameters used to extract methylation information:\n";
+ print REPORT "Bismark Extractor Version: $version\n";
+
+ if ($paired) {
+ if ($vanilla) {
+ print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n";
+ } else {
+ print REPORT "Bismark result file: paired-end (SAM format)\n"; # default
+ }
+ }
+
+ if ($single) {
+ if ($vanilla) {
+ print REPORT "Bismark result file: single-end (vanilla Bismark format)\n";
+ } else {
+ print REPORT "Bismark result file: single-end (SAM format)\n"; # default
+ }
+ }
+ if ($single){
+ if ($ignore) {
+ print REPORT "Ignoring first $ignore bp\n";
+ }
+ if ($ignore_3prime) {
+ print REPORT "Ignoring last $ignore_3prime bp\n";
+ }
+ }
+ else{ # paired-end
+ if ($ignore) {
+ print REPORT "Ignoring first $ignore bp of Read 1\n";
+ }
+ if ($ignore_r2){
+ print REPORT "Ignoring first $ignore_r2 bp of Read 2\n";
+ }
+
+ if ($ignore_3prime) {
+ print REPORT "Ignoring last $ignore_3prime bp of Read 1\n";
+ }
+ if ($ignore_3prime_r2){
+ print REPORT "Ignoring last $ignore_3prime_r2 bp of Read 2\n";
+ }
+
+ }
+
+ if ($full) {
+ print REPORT "Output specified: comprehensive\n";
+ } else {
+ print REPORT "Output specified: strand-specific (default)\n";
+ }
+
+ if ($no_overlap) {
+ print REPORT "No overlapping methylation calls specified\n";
+ }
+ if ($genomic_fasta) {
+ print REPORT "Genomic equivalent sequences will be printed out in FastA format\n";
+ }
+ if ($merge_non_CpG) {
+ print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n";
+ }
+
+ print REPORT "\n";
+ }
+
+ ##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
+
+ ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL
+ ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file
+ if ($full and $merge_non_CpG) {
+ my $cpg_output = my $other_c_output = $output_filename;
+ ### C in CpG context
+ $cpg_output =~ s/^/CpG_context_/;
+ $cpg_output =~ s/sam$/txt/;
+ $cpg_output =~ s/bam$/txt/;
+ $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
+ $cpg_output = $output_dir . $cpg_output;
+
+ if ($gzip){
+ $cpg_output .= '.gz';
+ open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
+ }
+ else{ ### disclaimer: I am aware of "The Useless Use of Cat Awards", but I saw no other option...
+ open ($fhs{CpG_context},"| cat > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
+ # open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only);
+ push @sorting_files,$cpg_output;
+
+ unless ($no_header) {
+ print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ ### C in any other context than CpG
+ $other_c_output =~ s/^/Non_CpG_context_/;
+ $other_c_output =~ s/sam$/txt/;
+ $other_c_output =~ s/bam$/txt/;
+ $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/);
+ $other_c_output = $output_dir . $other_c_output;
+
+ if ($gzip){
+ $other_c_output .= '.gz';
+ open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{other_context},"| cat > $other_c_output") or die "Failed to write to $other_c_output $!\n" unless($mbias_only);
+ # open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in any other context to $other_c_output\n" unless($mbias_only);
+ push @sorting_files,$other_c_output;
+
+
+ unless ($no_header) {
+ print {$fhs{other_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+ }
+
+ ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found
+ elsif ($merge_non_CpG) {
+
+ my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
+
+ ### For cytosines in CpG context
+ $cpg_ot =~ s/^/CpG_OT_/;
+ $cpg_ot =~ s/sam$/txt/;
+ $cpg_ot =~ s/bam$/txt/;
+ $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
+ $cpg_ot = $output_dir . $cpg_ot;
+
+ if ($gzip){
+ $cpg_ot .= '.gz';
+ open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{0}->{CpG},"| cat > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
+ # open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only);
+ push @sorting_files,$cpg_ot;
+
+ unless($no_header){
+ print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ $cpg_ctot =~ s/^/CpG_CTOT_/;
+ $cpg_ctot =~ s/sam$/txt/;
+ $cpg_ctot =~ s/bam$/txt/;
+ $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
+ $cpg_ctot = $output_dir . $cpg_ctot;
+
+ if ($gzip){
+ $cpg_ctot .= '.gz';
+ open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{1}->{CpG},"| cat > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
+ # open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only);
+ push @sorting_files,$cpg_ctot;
+
+ unless($no_header){
+ print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ $cpg_ctob =~ s/^/CpG_CTOB_/;
+ $cpg_ctob =~ s/sam$/txt/;
+ $cpg_ctob =~ s/bam$/txt/;
+ $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
+ $cpg_ctob = $output_dir . $cpg_ctob;
+
+ if ($gzip){
+ $cpg_ctob .= '.gz';
+ open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{2}->{CpG},"| cat > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
+ # open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only);
+ push @sorting_files,$cpg_ctob;
+
+ unless($no_header){
+ print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ $cpg_ob =~ s/^/CpG_OB_/;
+ $cpg_ob =~ s/sam$/txt/;
+ $cpg_ob =~ s/bam$/txt/;
+ $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
+ $cpg_ob = $output_dir . $cpg_ob;
+
+ if ($gzip){
+ $cpg_ob .= '.gz';
+ open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{3}->{CpG},"| cat > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
+ # open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only);
+ push @sorting_files,$cpg_ob;
+
+ unless($no_header){
+ print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ ### For cytosines in Non-CpG (CC, CT or CA) context
+ my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename;
+
+ $other_c_ot =~ s/^/Non_CpG_OT_/;
+ $other_c_ot =~ s/sam$/txt/;
+ $other_c_ot =~ s/bam$/txt/;
+ $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/);
+ $other_c_ot = $output_dir . $other_c_ot;
+
+ if ($gzip){
+ $other_c_ot .= '.gz';
+ open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{0}->{other_c},"| cat > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
+ # open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n" unless($mbias_only);
+ push @sorting_files,$other_c_ot;
+
+ unless($no_header){
+ print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ $other_c_ctot =~ s/^/Non_CpG_CTOT_/;
+ $other_c_ctot =~ s/sam$/txt/;
+ $other_c_ctot =~ s/bam$/txt/;
+ $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/);
+ $other_c_ctot = $output_dir . $other_c_ctot;
+
+ if ($gzip){
+ $other_c_ctot .= '.gz';
+ open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{1}->{other_c},"| cat > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
+ # open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n" unless($mbias_only);
+ push @sorting_files,$other_c_ctot;
+
+ unless($no_header){
+ print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ $other_c_ctob =~ s/^/Non_CpG_CTOB_/;
+ $other_c_ctob =~ s/sam$/txt/;
+ $other_c_ctob =~ s/bam$/txt/;
+ $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/);
+ $other_c_ctob = $output_dir . $other_c_ctob;
+
+ if ($gzip){
+ $other_c_ctob .= '.gz';
+ open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{2}->{other_c},"| cat > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
+ # open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n" unless($mbias_only);
+ push @sorting_files,$other_c_ctob;
+
+ unless($no_header){
+ print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ $other_c_ob =~ s/^/Non_CpG_OB_/;
+ $other_c_ob =~ s/sam$/txt/;
+ $other_c_ob =~ s/sam$/txt/;
+ $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/);
+ $other_c_ob = $output_dir . $other_c_ob;
+
+ if ($gzip){
+ $other_c_ob .= '.gz';
+ open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{3}->{other_c},"| cat > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
+ # open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n" unless($mbias_only);
+ push @sorting_files,$other_c_ob;
+
+ unless($no_header){
+ print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+ }
+ ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context)
+
+ ### if --comprehensive was specified we are only writing one file per context
+ elsif ($full) {
+ my $cpg_output = my $chg_output = my $chh_output = $output_filename;
+ ### C in CpG context
+ $cpg_output =~ s/^/CpG_context_/;
+ $cpg_output =~ s/sam$/txt/;
+ $cpg_output =~ s/bam$/txt/;
+ $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
+ $cpg_output = $output_dir . $cpg_output;
+
+ if ($gzip){
+ $cpg_output .= '.gz';
+ open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{CpG_context},"| cat > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
+ # open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only);
+ push @sorting_files,$cpg_output;
+
+ unless($no_header){
+ print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ ### C in CHG context
+ $chg_output =~ s/^/CHG_context_/;
+ $chg_output =~ s/sam$/txt/;
+ $chg_output =~ s/bam$/txt/;
+ $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/);
+ $chg_output = $output_dir . $chg_output;
+
+ if ($gzip){
+ $chg_output .= '.gz';
+ open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{CHG_context},"| cat > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only);
+ # open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CHG context to $chg_output\n" unless($mbias_only);
+ push @sorting_files,$chg_output;
+
+ unless($no_header){
+ print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ ### C in CHH context
+ $chh_output =~ s/^/CHH_context_/;
+ $chh_output =~ s/sam$/txt/;
+ $chh_output =~ s/bam$/txt/;
+ $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/);
+ $chh_output = $output_dir . $chh_output;
+
+ if ($gzip){
+ $chh_output .= '.gz';
+ open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{CHH_context},"| cat > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only);
+ # open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CHH context to $chh_output\n" unless($mbias_only);
+ push @sorting_files, $chh_output;
+
+ unless($no_header){
+ print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+ }
+ ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found
+ else {
+ my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
+
+ ### For cytosines in CpG context
+ $cpg_ot =~ s/^/CpG_OT_/;
+ $cpg_ot =~ s/sam$/txt/;
+ $cpg_ot =~ s/bam$/txt/;
+ $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
+ $cpg_ot = $output_dir . $cpg_ot;
+
+ if ($gzip){
+ $cpg_ot .= '.gz';
+ open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{0}->{CpG},"| cat > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
+ # open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only);
+ push @sorting_files,$cpg_ot;
+
+ unless($no_header){
+ print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ $cpg_ctot =~ s/^/CpG_CTOT_/;
+ $cpg_ctot =~ s/sam$/txt/;
+ $cpg_ctot =~ s/bam$/txt/;
+ $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
+ $cpg_ctot = $output_dir . $cpg_ctot;
+
+ if ($gzip){
+ $cpg_ctot .= '.gz';
+ open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{1}->{CpG},"| cat > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
+ # open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only);
+ push @sorting_files,$cpg_ctot;
+
+ unless($no_header){
+ print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ $cpg_ctob =~ s/^/CpG_CTOB_/;
+ $cpg_ctob =~ s/sam$/txt/;
+ $cpg_ctob =~ s/bam$/txt/;
+ $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
+ $cpg_ctob = $output_dir . $cpg_ctob;
+
+ if ($gzip){
+ $cpg_ctob .= '.gz';
+ open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{2}->{CpG},"| cat > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
+ # open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only);
+ push @sorting_files,$cpg_ctob;
+
+ unless($no_header){
+ print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ $cpg_ob =~ s/^/CpG_OB_/;
+ $cpg_ob =~ s/sam$/txt/;
+ $cpg_ob =~ s/bam$/txt/;
+ $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
+ $cpg_ob = $output_dir . $cpg_ob;
+
+ if ($gzip){
+ $cpg_ob .= '.gz';
+ open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{3}->{CpG},"| cat > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
+ # open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only);
+ push @sorting_files,$cpg_ob;
+
+ unless($no_header){
+ print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ ### For cytosines in CHG context
+ my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename;
+
+ $chg_ot =~ s/^/CHG_OT_/;
+ $chg_ot =~ s/sam$/txt/;
+ $chg_ot =~ s/bam$/txt/;
+ $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/);
+ $chg_ot = $output_dir . $chg_ot;
+
+ if ($gzip){
+ $chg_ot .= '.gz';
+ open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{0}->{CHG},"| cat > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
+ # open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n" unless($mbias_only);
+ push @sorting_files,$chg_ot;
+
+ unless($no_header){
+ print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ $chg_ctot =~ s/^/CHG_CTOT_/;
+ $chg_ctot =~ s/sam$/txt/;
+ $chg_ctot =~ s/bam$/txt/;
+ $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/);
+ $chg_ctot = $output_dir . $chg_ctot;
+
+ if ($gzip){
+ $chg_ctot .= '.gz';
+ open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{1}->{CHG},"| cat > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
+ # open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n" unless($mbias_only);
+ push @sorting_files,$chg_ctot;
+
+ unless($no_header){
+ print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ $chg_ctob =~ s/^/CHG_CTOB_/;
+ $chg_ctob =~ s/sam$/txt/;
+ $chg_ctob =~ s/bam$/txt/;
+ $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/);
+ $chg_ctob = $output_dir . $chg_ctob;
+
+ if ($gzip){
+ $chg_ctob .= '.gz';
+ open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{2}->{CHG},"| cat > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
+ # open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n" unless($mbias_only);
+ push @sorting_files,$chg_ctob;
+
+ unless($no_header){
+ print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ $chg_ob =~ s/^/CHG_OB_/;
+ $chg_ob =~ s/sam$/txt/;
+ $chg_ob =~ s/bam$/txt/;
+ $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/);
+ $chg_ob = $output_dir . $chg_ob;
+
+ if ($gzip){
+ $chg_ob .= '.gz';
+ open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{3}->{CHG},"| cat > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
+ # open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n" unless($mbias_only);
+ push @sorting_files,$chg_ob;
+
+ unless($no_header){
+ print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ ### For cytosines in CHH context
+ my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename;
+
+ $chh_ot =~ s/^/CHH_OT_/;
+ $chh_ot =~ s/sam$/txt/;
+ $chh_ot =~ s/bam$/txt/;
+ $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/);
+ $chh_ot = $output_dir . $chh_ot;
+
+ if ($gzip){
+ $chh_ot .= '.gz';
+ open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{0}->{CHH},"| cat > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
+ # open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n" unless($mbias_only);
+ push @sorting_files,$chh_ot;
+
+ unless($no_header){
+ print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ $chh_ctot =~ s/^/CHH_CTOT_/;
+ $chh_ctot =~ s/sam$/txt/;
+ $chh_ctot =~ s/bam$/txt/;
+ $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/);
+ $chh_ctot = $output_dir . $chh_ctot;
+
+ if ($gzip){
+ $chh_ctot .= '.gz';
+ open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{1}->{CHH},"| cat > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
+ # open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n" unless($mbias_only);
+ push @sorting_files,$chh_ctot;
+
+ unless($no_header){
+ print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ $chh_ctob =~ s/^/CHH_CTOB_/;
+ $chh_ctob =~ s/sam$/txt/;
+ $chh_ctob =~ s/bam$/txt/;
+ $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/);
+ $chh_ctob = $output_dir . $chh_ctob;
+
+ if ($gzip){
+ $chh_ctob .= '.gz';
+ open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{2}->{CHH},"| cat > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
+ # open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n" unless($mbias_only);
+ push @sorting_files,$chh_ctob;
+
+ unless($no_header){
+ print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+
+ $chh_ob =~ s/^/CHH_OB_/;
+ $chh_ob =~ s/sam$/txt/;
+ $chh_ob =~ s/bam$/txt/;
+ $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/);
+ $chh_ob = $output_dir . $chh_ob;
+
+ if ($gzip){
+ $chh_ob .= '.gz';
+ open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
+ }
+ else{
+ open ($fhs{3}->{CHH},"| cat > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
+ # open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
+ }
+
+ warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n" unless($mbias_only);
+ push @sorting_files,$chh_ob;
+
+ unless($no_header){
+ print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
+ }
+ }
+ return $report_filename;
+}
+
+sub isBam{
+
+ my $filename = shift;
+
+ # reading the first line of the input file to see if it is a BAM file in disguise (i.e. a BAM file that does not end in *.bam which may be produced by Galaxy)
+ open (DISGUISE,"zcat $filename |") or die "Failed to open filehandle DISGUISE for $filename\n\n";
+
+ ### when BAM files read through a zcat stream they start with BAM...
+ my $bam_in_disguise = ;
+ # warn "BAM in disguise: $bam_in_disguise\n\n";
+
+ if ($bam_in_disguise){
+ if ($bam_in_disguise =~ /^BAM/){
+ close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
+ return 1;
+ }
+ else{
+ close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
+ return 0;
+ }
+ }
+ else{
+ close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
+ return 0;
+ }
+}
+
+
+sub print_helpfile{
+
+ print << 'HOW_TO';
+
+
+DESCRIPTION
+
+The following is a brief description of all options to control the Bismark
+methylation extractor. The script reads in a bisulfite read alignment results file
+produced by the Bismark bisulfite mapper and extracts the methylation information
+for individual cytosines. This information is found in the methylation call field
+which can contain the following characters:
+
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ~~~ X for methylated C in CHG context ~~~
+ ~~~ x for not methylated C CHG ~~~
+ ~~~ H for methylated C in CHH context ~~~
+ ~~~ h for not methylated C in CHH context ~~~
+ ~~~ Z for methylated C in CpG context ~~~
+ ~~~ z for not methylated C in CpG context ~~~
+ ~~~ U for methylated C in Unknown context (CN or CHN ~~~
+ ~~~ u for not methylated C in Unknown context (CN or CHN) ~~~
+ ~~~ . for any bases not involving cytosines ~~~
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The methylation extractor outputs result files for cytosines in CpG, CHG and CHH
+context (this distinction is actually already made in Bismark itself). As the methylation
+information for every C analysed can produce files which easily have tens or even hundreds of
+millions of lines, file sizes can become very large and more difficult to handle. The C
+methylation info additionally splits cytosine methylation calls up into one of the four possible
+strands a given bisulfite read aligned against:
+
+ OT original top strand
+ CTOT complementary to original top strand
+
+ OB original bottom strand
+ CTOB complementary to original bottom strand
+
+Thus, by default twelve individual output files are being generated per input file (unless
+--comprehensive is specified, see below). The output files can be imported into a genome
+viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact
+unless the bisulfite reads were generated preserving directionality it doesn't make any
+sense to look at the data in a strand-specific manner). Strand-specific output files can
+optionally be skipped, in which case only three output files for CpG, CHG or CHH context
+will be generated. For both the strand-specific and comprehensive outputs there is also
+the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context.
+
+
+The output files are in the following format (tab delimited):
+
+
+
+
+USAGE: methylation_extractor [options]
+
+
+ARGUMENTS:
+==========
+
+ A space-separated list of Bismark result files in SAM format from
+ which methylation information is extracted for every cytosine in
+ the reads. For alignment files in the older custom Bismark output
+ see option '--vanilla'.
+
+OPTIONS:
+
+-s/--single-end Input file(s) are Bismark result file(s) generated from single-end
+ read data. Specifying either --single-end or --paired-end is
+ mandatory.
+
+-p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end
+ read data. Specifying either --paired-end or --single-end is
+ mandatory.
+
+--vanilla The Bismark result input file(s) are in the old custom Bismark format
+ (up to version 0.5.x) and not in SAM format which is the default as
+ of Bismark version 0.6.x or higher. Default: OFF.
+
+--no_overlap For paired-end reads it is theoretically possible that read_1 and
+ read_2 overlap. This option avoids scoring overlapping methylation
+ calls twice (only methylation calls of read 1 are used for in the process
+ since read 1 has historically higher quality basecalls than read 2).
+ Whilst this option removes a bias towards more methylation calls
+ in the center of sequenced fragments it may de facto remove a sizable
+ proportion of the data. This option is on by default for paired-end data
+ but can be disabled using --include_overlap. Default: ON.
+
+--include_overlap For paired-end data all methylation calls will be extracted irrespective of
+ of whether they overlap or not. Default: OFF.
+
+--ignore Ignore the first bp from the 5' end of Read 1 (or single-end alignment
+ files) when processing the methylation call string. This can remove e.g. a
+ restriction enzyme site at the start of each read or any other source of
+ bias (such as PBAT-Seq data).
+
+--ignore_r2 Ignore the first bp from the 5' end of Read 2 of paired-end sequencing
+ results only. Since the first couple of bases in Read 2 of BS-Seq experiments
+ show a severe bias towards non-methylation as a result of end-repairing
+ sonicated fragments with unmethylated cytosines (see M-bias plot), it is
+ recommended that the first couple of bp of Read 2 are removed before
+ starting downstream analysis. Please see the section on M-bias plots in the
+ Bismark User Guide for more details.
+
+--ignore_3prime Ignore the last bp from the 3' end of Read 1 (or single-end alignment
+ files) when processing the methylation call string. This can remove unwanted
+ biases from the end of reads.
+
+--ignore_3prime_r2 Ignore the last bp from the 3' end of Read 2 of paired-end sequencing
+ results only. This can remove unwanted biases from the end of reads.
+
+--comprehensive Specifying this option will merge all four possible strand-specific
+ methylation info into context-dependent output files. The default
+
+ contexts are:
+ - CpG context
+ - CHG context
+ - CHH context
+
+--merge_non_CpG This will produce two output files (in --comprehensive mode) or eight
+ strand-specific output files (default) for Cs in
+ - CpG context
+ - non-CpG context
+
+--report Prints out a short methylation summary as well as the paramaters used to run
+ this script. Default: ON.
+
+--no_header Suppresses the Bismark version header line in all output files for more convenient
+ batch processing.
+
+-o/--output DIR Allows specification of a different output directory (absolute or relative
+ path). If not specified explicitely, the output will be written to the current directory.
+
+--samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
+ explicitly if Samtools is in the PATH already.
+
+--gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in
+ a GZIP compressed form to save disk space. This option does not work on bedGraph and
+ genome-wide cytosine reports as they are 'tiny' anyway.
+
+--version Displays version information.
+
+-h/--help Displays this help file and exits.
+
+--mbias_only The methylation extractor will read the entire file but only output the M-bias table and plots as
+ well as a report (optional) and then quit. Default: OFF.
+
+--mbias_off The methylation extractor will process the entire file as usual but doesn't write out any M-bias report.
+ Only recommended for users who deliberately want to keep an earlier version of the M-bias report.
+ Default: OFF.
+
+--multicore Sets the number of cores to be used for the methylation extraction process. If system resources
+ are plentiful this is a viable option to speed up the extraction process (we observed a near linear
+ speed increase for up to 10 cores used). Please note that a typical process of extracting a BAM file
+ and writing out '.gz' output streams will in fact use ~3 cores per value of --multicore
+ specified (1 for the methylation extractor itself, 1 for a Samtools stream, 1 for GZIP stream), so
+ --multicore 10 is likely to use around 30 cores of system resources. This option has no bearing
+ on the bismark2bedGraph or genome-wide cytosine report processes.
+
+
+
+
+bedGraph specific options:
+==========================
+
+--bedGraph After finishing the methylation extraction, the methylation output is written into a
+ sorted bedGraph file that reports the position of a given cytosine and its methylation
+ state (in %, see details below). The methylation extractor output is temporarily split up into
+ temporary files, one per chromosome (written into the current directory or folder
+ specified with -o/--output); these temp files are then used for sorting and deleted
+ afterwards. By default, only cytosines in CpG context will be sorted. The option
+ '--CX_context' may be used to report all cytosines irrespective of sequence context
+ (this will take MUCH longer!). The default folder for temporary files during the sorting
+ process is the output directory. The bedGraph conversion step is performed by the external
+ module 'bismark2bedGraph'; this script needs to reside in the same folder as the
+ bismark_methylation_extractor itself.
+
+--zero_based Write out an additional coverage file (ending in .zero.cov) that uses 0-based genomic start
+ and 1-based genomic end coordinates (zero-based, half-open), like used in the bedGraph file,
+ instead of using 1-based coordinates throughout. Default: OFF.
+
+
+--cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide
+ before its methylation percentage is reported. Default: 1.
+
+--remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting.
+
+--CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered
+ in the experiment irrespective of its sequence context. This applies to both forward and
+ reverse strands. Please be aware that this option may generate large temporary and output files
+ and may take a long time to sort (up to many hours). Default: OFF.
+ (i.e. Default = CpG context only).
+
+--buffer_size This allows you to specify the main memory sort buffer when sorting the methylation information.
+ Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or
+ a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc.
+ (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line.
+ Defaults to 2G.
+
+--scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of
+ scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to
+ individual chromosome files. These errors were caused by the operating system's limit
+ of the number of filehandle that can be written to at any one time (typically 1024; to
+ find out this limit on Linux, type: ulimit -a).
+ To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort
+ methylation calls into individual chromosome files. Instead, all input files are
+ temporarily merged into a single file (unless there is only a single file), and this
+ file will then be sorted by both chromosome AND position using the Unix sort command.
+ Please be aware that this option might take a looooong time to complete, depending on
+ the size of the input files, and the memory you allocate to this process (see --buffer_size).
+ Nevertheless, it seems to be working.
+
+--ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will
+ instead use two arrays to sort methylated and unmethylated calls. This may result in a faster
+ sorting process of very large files, but this comes at the cost of a larger memory footprint
+ (two arrays of the length of the largest human chromosome 1 (~250M bp) consume around 16GB
+ of RAM). Due to overheads in creating and looping through these arrays it seems that it will
+ actually be *slower* for small files (few million alignments), and we are currently testing at
+ which point it is advisable to use this option. Note that --ample_memory is not compatible
+ with options '--scaffolds/--gazillion' (as it requires pre-sorted files to begin with).
+
+
+
+Genome-wide cytosine methylation report specific options:
+=========================================================
+
+--cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a
+ genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based
+ chromosome coordinates (zero-based start coords are optional) and reports CpG context only (all
+ cytosine context is optional). The output considers all Cs on both forward and reverse strands and
+ reports their position, strand, trinucleotide content and methylation state (counts are 0 if not
+ covered). The cytosine report conversion step is performed by the external module
+ 'coverage2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor
+ itself.
+
+--CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of
+ its context. This applies to both forward and reverse strands. Please be aware that this will
+ generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse.
+ Default: OFF (i.e. Default = CpG context only).
+
+--zero_based Uses 0-based genomic coordinates instead of 1-based coordinates. Default: OFF.
+
+--genome_folder Enter the genome folder you wish to use to extract sequences from (full path only). Accepted
+ formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory.
+
+--split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files
+ will be named to include the input filename and the chromosome number.
+
+
+
+OUTPUT:
+
+The bismark_methylation_extractor output is in the form:
+========================================================
+
+
+* Methylated cytosines receive a '+' orientation,
+* Unmethylated cytosines receive a '-' orientation.
+
+
+
+The bedGraph output (optional) looks like this (tab-delimited; 0-based start coords, 1-based end coords):
+=========================================================================================================
+
+track type=bedGraph (header line)
+
+
+
+
+
+The coverage output looks like this (tab-delimited, 1-based genomic coords; zero-based half-open coordinates available with '--zero_based'):
+============================================================================================================================================
+
+
+
+
+
+The genome-wide cytosine methylation output file is tab-delimited in the following format:
+==========================================================================================
+
+
+
+
+This script was last modified on 22 April 2015.
+
+HOW_TO
+}
diff -r 047eb877b6f0 -r 4084128e7cca old/bismark
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/old/bismark Wed Sep 02 15:55:46 2015 -0400
@@ -0,0 +1,7959 @@
+#!/usr/bin/perl --
+use strict;
+use warnings;
+use IO::Handle;
+use Cwd;
+$|++;
+use Getopt::Long;
+
+
+## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
+
+## This program is free software: you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation, either version 3 of the License, or
+## (at your option) any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+
+## You should have received a copy of the GNU General Public License
+## along with this program. If not, see .
+
+
+my $parent_dir = getcwd;
+my $bismark_version = 'v0.10.0';
+my $command_line = join (" ",@ARGV);
+
+### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail
+foreach my $arg (@ARGV){
+ if ($arg eq '--solexa1.3-quals'){
+ $arg = '--phred64-quals';
+ }
+}
+my @filenames; # will be populated by processing the command line
+
+my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag) = process_command_line();
+
+my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
+my %chromosomes; # stores the chromosome sequences of the mouse genome
+my %counting; # counting various events
+
+my $seqID_contains_tabs;
+
+foreach my $filename (@filenames){
+
+ chdir $parent_dir or die "Unable to move to initial working directory $!\n";
+ ### resetting the counting hash and fhs
+ reset_counters_and_fhs($filename);
+ $seqID_contains_tabs = 0;
+
+ ### PAIRED-END ALIGNMENTS
+ if ($filename =~ ','){
+ my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file
+
+ $fhs[0]->{name} = 'CTread1GAread2CTgenome';
+ $fhs[1]->{name} = 'GAread1CTread2GAgenome';
+ $fhs[2]->{name} = 'GAread1CTread2CTgenome';
+ $fhs[3]->{name} = 'CTread1GAread2GAgenome';
+
+ warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n";
+
+ my ($filename_1,$filename_2) = (split (/,/,$filename));
+ warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";
+
+ ### additional variables only for paired-end alignments
+ my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file
+
+ ### FastA format
+ if ($sequence_file_format eq 'FASTA'){
+ warn "Input files are in FastA format\n";
+
+ if ($directional){
+ ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
+ ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
+
+ $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
+ $fhs[1]->{inputfile_1} = undef;
+ $fhs[1]->{inputfile_2} = undef;
+ $fhs[2]->{inputfile_1} = undef;
+ $fhs[2]->{inputfile_2} = undef;
+ $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
+ }
+ else{
+ ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
+ ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
+
+ $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
+ $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
+ $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
+ $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
+ $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
+ $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
+ }
+
+ if ($bowtie2){
+ paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+ }
+ else{
+ paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+ }
+ }
+
+ ### FastQ format
+ else{
+ warn "Input files are in FastQ format\n";
+ if ($directional){
+ if ($bowtie2){
+ ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
+ ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
+
+ $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
+ $fhs[1]->{inputfile_1} = undef;
+ $fhs[1]->{inputfile_2} = undef;
+ $fhs[2]->{inputfile_1} = undef;
+ $fhs[2]->{inputfile_2} = undef;
+ $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
+ }
+ else{ # Bowtie 1 alignments
+ if ($gzip){
+ ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time
+
+ $fhs[0]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format
+ $fhs[0]->{inputfile_2} = undef; # no longer needed
+ $fhs[1]->{inputfile_1} = undef;
+ $fhs[1]->{inputfile_2} = undef;
+ $fhs[2]->{inputfile_1} = undef;
+ $fhs[2]->{inputfile_2} = undef;
+ $fhs[3]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format
+ $fhs[3]->{inputfile_2} = undef; # no longer needed
+ }
+ else{
+ ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
+ ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
+
+ $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
+ $fhs[1]->{inputfile_1} = undef;
+ $fhs[1]->{inputfile_2} = undef;
+ $fhs[2]->{inputfile_1} = undef;
+ $fhs[2]->{inputfile_2} = undef;
+ $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
+ }
+ }
+ }
+ elsif($pbat){ # PBAT-Seq
+ ### At the moment we are only performing uncompressed FastQ alignments with Bowtie1
+ ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
+ ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
+
+ $fhs[0]->{inputfile_1} = undef;
+ $fhs[0]->{inputfile_2} = undef;
+ $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
+ $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
+ $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
+ $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
+ $fhs[3]->{inputfile_1} = undef;
+ $fhs[3]->{inputfile_2} = undef;
+ }
+ else{
+ if ($bowtie2){
+ ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
+ ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
+
+ $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
+ $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
+ $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
+ $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
+ $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
+ $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
+ }
+ else{ # Bowtie 1 alignments
+ if ($gzip){
+ ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time
+
+ $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[0]->{inputfile_2} = undef; # not needed for compressed temp files
+ $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
+ $fhs[1]->{inputfile_2} = undef;
+ $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
+ $fhs[2]->{inputfile_2} = undef;
+ $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[3]->{inputfile_2} = undef; # not needed for compressed temp files
+ }
+ else{ #uncompressed temp files
+ ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
+ ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
+
+ $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
+ $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
+ $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
+ $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
+ $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
+ $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
+ $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
+ }
+ }
+ }
+ if ($bowtie2){
+ paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+ }
+ else{
+ paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+ }
+ }
+ start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+ }
+
+ ### Else we are performing SINGLE-END ALIGNMENTS
+ else{
+ warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n";
+ ### Initialising bisulfite conversion filenames
+ my ($C_to_T_infile,$G_to_A_infile);
+
+
+ ### FastA format
+ if ($sequence_file_format eq 'FASTA'){
+ warn "Inut file is in FastA format\n";
+ if ($directional){
+ ($C_to_T_infile) = biTransformFastAFiles ($filename);
+ $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
+ }
+ else{
+ ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename);
+ $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
+ $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
+ }
+
+ ### Creating 4 different bowtie filehandles and storing the first entry
+ if ($bowtie2){
+ single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile);
+ }
+ else{
+ single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile);
+ }
+ }
+
+ ## FastQ format
+ else{
+ warn "Input file is in FastQ format\n";
+ if ($directional){
+ ($C_to_T_infile) = biTransformFastQFiles ($filename);
+ $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
+ }
+ elsif($pbat){
+ ($G_to_A_infile) = biTransformFastQFiles ($filename);
+ $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile; # PBAT-Seq only uses the G to A converted files
+ }
+ else{
+ ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename);
+ $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
+ $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
+ }
+
+ ### Creating up to 4 different bowtie filehandles and storing the first entry
+ if ($bowtie2){
+ single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile);
+ }
+ elsif ($pbat){
+ single_end_align_fragments_to_bisulfite_genome_fastQ (undef,$G_to_A_infile);
+ }
+ else{
+ single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile);
+ }
+ }
+
+ start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile);
+
+ }
+}
+
+sub start_methylation_call_procedure_single_ends {
+ my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
+ my ($dir,$filename);
+
+ if ($sequence_file =~ /\//){
+ ($dir,$filename) = $sequence_file =~ m/(.*\/)(.*)$/;
+ }
+ else{
+ $filename = $sequence_file;
+ }
+
+ ### printing all alignments to a results file
+ my $outfile = $filename;
+ if ($prefix){
+ $outfile = "$prefix.$outfile";
+ }
+
+
+ if ($bowtie2){ # SAM format is the default for Bowtie 2
+ $outfile =~ s/$/_bismark_bt2.sam/;
+ }
+ elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X)
+ $outfile =~ s/$/_bismark.txt/;
+ }
+ else{ # SAM is the default output
+ $outfile =~ s/$/_bismark.sam/;
+ }
+
+ $bam = 0 unless (defined $bam);
+
+ if ($bam == 1){ ### Samtools is installed, writing out BAM directly
+ $outfile =~ s/sam/bam/;
+ open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
+ }
+ elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
+ $outfile .= '.gz';
+ open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
+ }
+ else{ # uncompressed ouput, default
+ open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
+ }
+
+ warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n";
+ sleep(1);
+
+ if ($vanilla){
+ print OUT "Bismark version: $bismark_version\n";
+ }
+
+ ### printing alignment and methylation call summary to a report file
+ my $reportfile = $filename;
+ if ($prefix){
+ $reportfile = "$prefix.$reportfile";
+ }
+
+ if ($bowtie2){
+ $reportfile =~ s/$/_bismark_bt2_SE_report.txt/;
+ }
+ else{
+ $reportfile =~ s/$/_bismark_SE_report.txt/;
+ }
+
+ open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
+ print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";
+
+ if ($unmapped){
+ my $unmapped_file = $filename;
+ if ($prefix){
+ $unmapped_file = "$prefix.$unmapped_file";
+ }
+
+ $unmapped_file =~ s/$/_unmapped_reads.txt/;
+ open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
+ print "Unmapped sequences will be written to $output_dir$unmapped_file\n";
+ }
+ if ($ambiguous){
+ my $ambiguous_file = $filename;
+ if ($prefix){
+ $ambiguous_file = "$prefix.$ambiguous_file";
+ }
+ $ambiguous_file =~ s/$/_ambiguous_reads.txt/;
+ open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
+ print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
+ }
+
+ if ($directional){
+ print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n";
+ }
+ print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+
+
+ ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
+ unless (%chromosomes){
+ my $cwd = getcwd; # storing the path of the current working directory
+ print "Current working directory is: $cwd\n\n";
+ read_genome_into_memory($cwd);
+ }
+
+ unless ($vanilla or $sam_no_hd){
+ generate_SAM_header();
+ }
+
+ ### Input file is in FastA format
+ if ($sequence_file_format eq 'FASTA'){
+ process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
+ }
+ ### Input file is in FastQ format
+ else{
+ process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
+ }
+}
+
+sub start_methylation_call_procedure_paired_ends {
+ my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+
+ my ($dir_1,$filename_1);
+
+ if ($sequence_file_1 =~ /\//){
+ ($dir_1,$filename_1) = $sequence_file_1 =~ m/(.*\/)(.*)$/;
+ }
+ else{
+ $filename_1 = $sequence_file_1;
+ }
+
+ my ($dir_2,$filename_2);
+
+ if ($sequence_file_2 =~ /\//){
+ ($dir_2,$filename_2) = $sequence_file_2 =~ m/(.*\/)(.*)$/;
+ }
+ else{
+ $filename_2 = $sequence_file_2;
+ }
+
+ ### printing all alignments to a results file
+ my $outfile = $filename_1;
+
+ if ($prefix){
+ $outfile = "$prefix.$outfile";
+ }
+
+ if ($bowtie2){ # SAM format is the default Bowtie 2 output
+ $outfile =~ s/$/_bismark_bt2_pe.sam/;
+ }
+ elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
+ $outfile =~ s/$/_bismark_pe.txt/;
+ }
+ else{ # SAM format is the default Bowtie 1 output
+ $outfile =~ s/$/_bismark_pe.sam/;
+ }
+
+ $bam = 0 unless (defined $bam);
+
+ if ($bam == 1){ ### Samtools is installed, writing out BAM directly
+ $outfile =~ s/sam/bam/;
+ open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
+ }
+ elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
+ $outfile .= '.gz';
+ open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
+ }
+ else{ # uncompressed ouput, default
+ open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
+ }
+
+ warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n";
+ sleep(1);
+
+ if ($vanilla){
+ print OUT "Bismark version: $bismark_version\n";
+ }
+
+ ### printing alignment and methylation call summary to a report file
+ my $reportfile = $filename_1;
+ if ($prefix){
+ $reportfile = "$prefix.$reportfile";
+ }
+
+ if ($bowtie2){
+ $reportfile =~ s/$/_bismark_bt2_PE_report.txt/;
+ }
+ else{
+ $reportfile =~ s/$/_bismark_PE_report.txt/;
+ }
+
+ open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
+ print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";
+ print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+
+
+ ### Unmapped read output
+ if ($unmapped){
+ my $unmapped_1 = $filename_1;
+ my $unmapped_2 = $filename_2;
+ if ($prefix){
+ $unmapped_1 = "$prefix.$unmapped_1";
+ $unmapped_2 = "$prefix.$unmapped_2";
+ }
+ $unmapped_1 =~ s/$/_unmapped_reads_1.txt/;
+ $unmapped_2 =~ s/$/_unmapped_reads_2.txt/;
+ open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
+ open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
+ print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
+ }
+
+ if ($ambiguous){
+ my $amb_1 = $filename_1;
+ my $amb_2 = $filename_2;
+ if ($prefix){
+ $amb_1 = "$prefix.$amb_1";
+ $amb_2 = "$prefix.$amb_2";
+ }
+
+ $amb_1 =~ s/$/_ambiguous_reads_1.txt/;
+ $amb_2 =~ s/$/_ambiguous_reads_2.txt/;
+ open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
+ open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
+ print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
+ }
+
+ if ($directional){
+ print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n";
+ }
+
+ ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time
+ unless (%chromosomes){
+ my $cwd = getcwd; # storing the path of the current working directory
+ print "Current working directory is: $cwd\n\n";
+ read_genome_into_memory($cwd);
+ }
+
+ unless ($vanilla or $sam_no_hd){
+ generate_SAM_header();
+ }
+
+ ### Input files are in FastA format
+ if ($sequence_file_format eq 'FASTA'){
+ process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+ }
+ ### Input files are in FastQ format
+ else{
+ process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+ }
+}
+
+sub print_final_analysis_report_single_end{
+ my ($C_to_T_infile,$G_to_A_infile) = @_;
+ ### All sequences from the original sequence file have been analysed now
+ ### deleting temporary C->T or G->A infiles
+
+ if ($directional){
+ my $deletion_successful = unlink "$temp_dir$C_to_T_infile";
+ if ($deletion_successful == 1){
+ warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n";
+ }
+ else{
+ warn "Could not delete temporary file $C_to_T_infile properly $!\n";
+ }
+ }
+ elsif ($pbat){
+ my $deletion_successful = unlink "$temp_dir$G_to_A_infile";
+ if ($deletion_successful == 1){
+ warn "\nSuccessfully deleted the temporary file $temp_dir$G_to_A_infile\n\n";
+ }
+ else{
+ warn "Could not delete temporary file $G_to_A_infile properly $!\n";
+ }
+ }
+ else{
+ my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile";
+ if ($deletion_successful == 2){
+ warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n";
+ }
+ else{
+ warn "Could not delete temporary files properly $!\n";
+ }
+ }
+
+ ### printing a final report for the alignment procedure
+ print REPORT "Final Alignment report\n",'='x22,"\n";
+ warn "Final Alignment report\n",'='x22,"\n";
+ # foreach my $index (0..$#fhs){
+ # print "$fhs[$index]->{name}\n";
+ # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
+ # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
+ # }
+
+ ### printing a final report for the methylation call procedure
+ warn "Sequences analysed in total:\t$counting{sequences_count}\n";
+ print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n";
+ my $percent_alignable_sequences;
+
+ if ($counting{sequences_count} == 0){
+ $percent_alignable_sequences = 0;
+ }
+ else{
+ $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
+ }
+
+ warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n";
+ print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";
+
+ ### percentage of low complexity reads overruled because of low complexity (thereby creating a bias for highly methylated reads),
+ ### only calculating the percentage if there were any overruled alignments
+ if ($counting{low_complexity_alignments_overruled_count}){
+ my $percent_overruled_low_complexity_alignments = sprintf ("%.1f",$counting{low_complexity_alignments_overruled_count}*100/$counting{sequences_count});
+ # print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n";
+ }
+
+ print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
+ print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
+ print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
+ print "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
+ print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
+
+ print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
+ print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
+ print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
+ print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
+ print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
+
+ if ($directional){
+ print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
+ print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
+ }
+
+ ### detailed information about Cs analysed
+ warn "Final Cytosine Methylation Report\n",'='x33,"\n";
+ my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};
+ warn "Total number of C's analysed:\t$total_number_of_C\n\n";
+ warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
+ warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
+ warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
+ if ($bowtie2){
+ warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
+ }
+ warn "\n";
+
+ warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+ warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+ warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
+ if ($bowtie2){
+ warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
+ }
+ warn "\n";
+
+ print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
+ print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
+
+ print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
+ print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
+ print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
+ if ($bowtie2){
+ print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
+ }
+ print REPORT "\n";
+
+ print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+ print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+ print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
+ if ($bowtie2){
+ print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
+ }
+ print REPORT "\n";
+
+ my $percent_meCHG;
+ if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
+ $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
+ }
+
+ my $percent_meCHH;
+ if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
+ $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
+ }
+
+ my $percent_meCpG;
+ if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
+ $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
+ }
+
+ my $percent_meC_unknown;
+ if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){
+ $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}));
+ }
+
+
+ ### printing methylated CpG percentage if applicable
+ if ($percent_meCpG){
+ warn "C methylated in CpG context:\t${percent_meCpG}%\n";
+ print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
+ print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
+ }
+
+ ### printing methylated C percentage (CHG context) if applicable
+ if ($percent_meCHG){
+ warn "C methylated in CHG context:\t${percent_meCHG}%\n";
+ print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
+ print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
+ }
+
+ ### printing methylated C percentage (CHH context) if applicable
+ if ($percent_meCHH){
+ warn "C methylated in CHH context:\t${percent_meCHH}%\n";
+ print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
+ print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
+ }
+
+ ### printing methylated C percentage (Unknown C context) if applicable
+ if ($bowtie2){
+ if ($percent_meC_unknown){
+ warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
+ print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
+ print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
+ }
+ }
+ print REPORT "\n\n";
+ warn "\n\n";
+
+ if ($seqID_contains_tabs){
+ warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
+ print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
+ }
+
+
+ ###########################################################################################################################################
+ ### create pie-chart with mapping stats
+ ###########################################################################################################################################
+
+
+ my $filename;
+ if ($pbat){
+ $filename = $G_to_A_infile;
+ }
+ else{
+ $filename = $C_to_T_infile;
+ }
+
+ my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
+ $pie_chart =~ s/gz$//;
+ $pie_chart =~ s/_C_to_T\.fastq$//;
+ $pie_chart =~ s/_G_to_A\.fastq$//;
+
+ # if ($prefix){
+ # $pie_chart = "$prefix.$pie_chart"; # this is now being taken care of in file transformation
+ # }
+ $pie_chart = "${output_dir}${pie_chart}_bismark_SE.alignment_overview.png";
+
+
+ #Check whether the module GD::Graph is installed
+ my $gd_graph_installed = 0;
+ eval{
+ require GD::Graph::pie;
+ GD::Graph::pie->import();
+ };
+
+ unless($@) {
+ $gd_graph_installed = 1;
+ }
+ else{
+ warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n";
+ sleep(2);
+ }
+
+ if ($gd_graph_installed){
+ warn "Generating pie chart\n\n";
+ sleep(1);
+ my $graph = GD::Graph::pie->new(600,600);
+
+ my $percent_unaligned;
+ my $percent_multiple;
+ my $percent_unextractable;
+
+ if ($counting{sequences_count}){
+ $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count});
+ $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count});
+ $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count});
+ }
+ else{
+ $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A';
+ }
+
+ my @aln_stats = (
+ ["Uniquely aligned $percent_alignable_sequences%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"],
+ [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}],
+ );
+
+ $graph->set(
+ start_angle => 180,
+ '3d' => 0,
+ label => 'Alignment stats (single-end)',
+ suppress_angle => 2, # Only label slices of sufficient size
+ transparent => 0,
+ dclrs => [ qw(red lorange dgreen cyan) ],
+ ) or die $graph->error;
+
+ my $gd = $graph->plot(\@aln_stats) or die $graph->error;
+
+ open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n";
+ binmode PIE;
+ print PIE $gd->png;
+ }
+
+ warn "====================\nBismark run complete\n====================\n\n";
+
+}
+
+
+sub print_final_analysis_report_paired_ends{
+ my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+ ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
+ if ($directional){
+ if ($G_to_A_infile_2){
+ my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2";
+ if ($deletion_successful == 2){
+ warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n";
+ }
+ else{
+ warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
+ }
+ }
+ else{ # for paired-end FastQ infiles with Bowtie1 there is only one file to delete
+ my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1";
+ if ($deletion_successful == 1){
+ warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile_1\n\n";
+ }
+ else{
+ warn "Could not delete temporary file $temp_dir$C_to_T_infile_1 properly: $!\n";
+ }
+ }
+ }
+ else{
+ if ($G_to_A_infile_2 and $C_to_T_infile_2){
+ my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2";
+ if ($deletion_successful == 4){
+ warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n";
+ }
+ else{
+ warn "Could not delete temporary files properly: $!\n";
+ }
+ }
+ else{ # for paired-end FastQ infiles with Bowtie1 there are only two files to delete
+ my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1";
+ if ($deletion_successful == 2){
+ warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_1\n\n";
+ }
+ else{
+ warn "Could not delete temporary files properly: $!\n";
+ }
+ }
+ }
+
+ ### printing a final report for the alignment procedure
+ warn "Final Alignment report\n",'='x22,"\n";
+ print REPORT "Final Alignment report\n",'='x22,"\n";
+ # foreach my $index (0..$#fhs){
+ # print "$fhs[$index]->{name}\n";
+ # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
+ # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
+ # }
+
+ ### printing a final report for the methylation call procedure
+ warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
+ print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
+
+ my $percent_alignable_sequence_pairs;
+ if ($counting{sequences_count} == 0){
+ $percent_alignable_sequence_pairs = 0;
+ }
+ else{
+ $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
+ }
+ print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n";
+ print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n";
+
+ print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
+ print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
+ print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
+ print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
+ print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
+
+
+ print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
+ print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
+ print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
+ print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
+ print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
+ ### detailed information about Cs analysed
+
+ if ($directional){
+ print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
+ print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
+ }
+
+ warn "Final Cytosine Methylation Report\n",'='x33,"\n";
+ print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
+
+ my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
+ warn "Total number of C's analysed:\t$total_number_of_C\n\n";
+ warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
+ warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
+ warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
+ if ($bowtie2){
+ warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
+ }
+ warn "\n";
+
+ warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+ warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+ warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
+ if ($bowtie2){
+ warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
+ }
+ warn "\n";
+
+ print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
+ print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
+ print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
+ print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
+ if ($bowtie2){
+ print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n";
+ }
+ print REPORT "\n";
+
+ print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+ print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+ print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
+ if ($bowtie2){
+ print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n";
+ }
+ print REPORT "\n";
+
+ my $percent_meCHG;
+ if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
+ $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
+ }
+
+ my $percent_meCHH;
+ if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
+ $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
+ }
+
+ my $percent_meCpG;
+ if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
+ $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
+ }
+
+ my $percent_meC_unknown;
+ if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){
+ $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}));
+ }
+
+
+ ### printing methylated CpG percentage if applicable
+ if ($percent_meCpG){
+ warn "C methylated in CpG context:\t${percent_meCpG}%\n";
+ print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
+ print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
+ }
+
+ ### printing methylated C percentage in CHG context if applicable
+ if ($percent_meCHG){
+ warn "C methylated in CHG context:\t${percent_meCHG}%\n";
+ print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
+ print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
+ }
+
+ ### printing methylated C percentage in CHH context if applicable
+ if ($percent_meCHH){
+ warn "C methylated in CHH context:\t${percent_meCHH}%\n";
+ print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
+ print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
+ }
+
+ ### printing methylated C percentage (Unknown C context) if applicable
+ if ($bowtie2){
+ if ($percent_meC_unknown){
+ warn "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
+ print REPORT "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
+ }
+ else{
+ warn "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
+ print REPORT "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
+ }
+ }
+ print REPORT "\n\n";
+ warn "\n\n";
+
+
+ ############################################################################################################################################
+ ### create pie-chart with mapping stats
+ ###########################################################################################################################################
+
+ my $filename;
+ if ($pbat){
+ $filename = $G_to_A_infile_1;
+ }
+ else{
+ $filename = $C_to_T_infile_1;
+ }
+
+ my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
+ $pie_chart =~ s/gz$//;
+ $pie_chart =~ s/_C_to_T.fastq$//;
+ $pie_chart =~ s/_G_to_A.fastq$//;
+ ### special format for gzipped PE Bowtie1 files
+ $pie_chart =~ s/\.CT_plus_GA\.fastq\.$//;
+ $pie_chart =~ s/\.GA_plus_CT\.fastq\.$//;
+
+ if ($prefix){
+ # prefix is now being prepended to the temp files already
+ # $pie_chart = "$prefix.$pie_chart";
+ }
+ $pie_chart = "${output_dir}${pie_chart}_bismark_PE.alignment_overview.png";
+
+ #Check whether the module GD::Graph is installed
+ my $gd_graph_installed = 0;
+ eval{
+ require GD::Graph::pie;
+ GD::Graph::pie->import();
+ };
+
+ unless($@) {
+ $gd_graph_installed = 1;
+ }
+ else{
+ warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n";
+ sleep(2);
+ }
+
+ if ($gd_graph_installed){
+ warn "Generating pie chart\n\n";
+ sleep(1);
+ my $graph = GD::Graph::pie->new(600,600);
+
+ my $percent_unaligned;
+ my $percent_multiple;
+ my $percent_unextractable;
+
+ if ($counting{sequences_count}){
+ $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count});
+ $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count});
+ $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count});
+ }
+ else{
+ $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A';
+ }
+
+ my @aln_stats = (
+ ["Uniquely aligned pairs $percent_alignable_sequence_pairs%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"],
+ [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}],
+ );
+
+ # push @{$mbias_read1[0]},$pos;
+
+ $graph->set(
+ start_angle => 180,
+ '3d' => 0,
+ label => 'Alignment stats (paired-end)',
+ suppress_angle => 2, # Only label slices of sufficient size
+ transparent => 0,
+ dclrs => [ qw(red lorange dgreen cyan) ],
+ ) or die $graph->error;
+
+ my $gd = $graph->plot(\@aln_stats) or die $graph->error;
+
+ open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n";
+ binmode PIE;
+ print PIE $gd->png;
+ }
+
+ warn "====================\nBismark run complete\n====================\n\n";
+
+}
+
+sub process_single_end_fastA_file_for_methylation_call{
+ my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
+ ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
+ ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
+ ### the C->T or G->A version
+
+ ### gzipped version of the infile
+ if ($sequence_file =~ /\.gz$/){
+ open (IN,"zcat $sequence_file |") or die $!;
+ }
+ else{
+ open (IN,$sequence_file) or die $!;
+ }
+
+ my $count = 0;
+
+ warn "\nReading in the sequence file $sequence_file\n";
+ while (1) {
+ # last if ($counting{sequences_count} > 100);
+ my $identifier = ;
+ my $sequence = ;
+ last unless ($identifier and $sequence);
+
+ $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
+
+ ++$count;
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $counting{sequences_count}++;
+ if ($counting{sequences_count}%1000000==0) {
+ warn "Processed $counting{sequences_count} sequences so far\n";
+ }
+ chomp $sequence;
+ chomp $identifier;
+
+ $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers
+
+ my $return;
+ if ($bowtie2){
+ $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier);
+ }
+ else{
+ $return = check_bowtie_results_single_end(uc$sequence,$identifier); # default Bowtie 1
+ }
+
+ unless ($return){
+ $return = 0;
+ }
+
+ # print the sequence to ambiguous.out if --ambiguous was specified
+ if ($ambiguous and $return == 2){
+ print AMBIG ">$identifier\n";
+ print AMBIG "$sequence\n";
+ }
+
+ # print the sequence to file if --un was specified
+ elsif ($unmapped and $return == 1){
+ print UNMAPPED ">$identifier\n";
+ print UNMAPPED "$sequence\n";
+ }
+ }
+ print "Processed $counting{sequences_count} sequences in total\n\n";
+
+ print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);
+
+}
+
+sub process_single_end_fastQ_file_for_methylation_call{
+ my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
+ ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
+ ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
+ ### the C->T or G->A version
+
+ ### gzipped version of the infile
+ if ($sequence_file =~ /\.gz$/){
+ open (IN,"zcat $sequence_file |") or die $!;
+ }
+ else{
+ open (IN,$sequence_file) or die $!;
+ }
+
+ my $count = 0;
+
+ warn "\nReading in the sequence file $sequence_file\n";
+ while (1) {
+ my $identifier = ;
+ my $sequence = ;
+ my $identifier_2 = ;
+ my $quality_value = ;
+ last unless ($identifier and $sequence and $identifier_2 and $quality_value);
+
+ $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
+
+ ++$count;
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $counting{sequences_count}++;
+
+ if ($counting{sequences_count}%1000000==0) {
+ warn "Processed $counting{sequences_count} sequences so far\n";
+ }
+ chomp $sequence;
+ chomp $identifier;
+ chomp $quality_value;
+
+ $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumin FastQ headers
+
+ my $return;
+ if ($bowtie2){
+ $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier,$quality_value);
+ }
+ else{
+ $return = check_bowtie_results_single_end(uc$sequence,$identifier,$quality_value); # default Bowtie 1
+ }
+
+ unless ($return){
+ $return = 0;
+ }
+
+ # print the sequence to ambiguous.out if --ambiguous was specified
+ if ($ambiguous and $return == 2){
+ print AMBIG "\@$identifier\n";
+ print AMBIG "$sequence\n";
+ print AMBIG $identifier_2;
+ print AMBIG "$quality_value\n";
+ }
+
+ # print the sequence to file if --un was specified
+ elsif ($unmapped and $return == 1){
+ print UNMAPPED "\@$identifier\n";
+ print UNMAPPED "$sequence\n";
+ print UNMAPPED $identifier_2;
+ print UNMAPPED "$quality_value\n";
+ }
+ }
+ print "Processed $counting{sequences_count} sequences in total\n\n";
+
+ print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);
+
+}
+
+sub process_fastA_files_for_paired_end_methylation_calls{
+ my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+ ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to
+ ### make a methylation call. The sequence idetifier per definition needs to be the same for a sequence pair used for paired-end mapping.
+ ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the
+ ### converted genomes (either the C->T or G->A version)
+
+ ### gzipped version of the infiles
+ if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){
+ open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
+ open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
+ }
+ else{
+ open (IN1,$sequence_file_1) or die $!;
+ open (IN2,$sequence_file_2) or die $!;
+ }
+
+ warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
+ ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
+
+ my $count = 0;
+
+ while (1) {
+ # reading from the first input file
+ my $identifier_1 = ;
+ my $sequence_1 = ;
+ # reading from the second input file
+ my $identifier_2 = ;
+ my $sequence_2 = ;
+ last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2);
+
+ $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
+ $identifier_2 = fix_IDs($identifier_2);
+
+ ++$count;
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $counting{sequences_count}++;
+ if ($counting{sequences_count}%1000000==0) {
+ warn "Processed $counting{sequences_count} sequence pairs so far\n";
+ }
+ my $orig_identifier_1 = $identifier_1;
+ my $orig_identifier_2 = $identifier_2;
+
+ chomp $sequence_1;
+ chomp $identifier_1;
+ chomp $sequence_2;
+ chomp $identifier_2;
+
+ $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers
+
+ my $return;
+ if ($bowtie2){
+ $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1);
+ }
+ else{
+ $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1);
+ }
+
+ unless ($return){
+ $return = 0;
+ }
+
+ # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
+ if ($ambiguous and $return == 2){
+ print AMBIG_1 $orig_identifier_1;
+ print AMBIG_1 "$sequence_1\n";
+ print AMBIG_2 $orig_identifier_2;
+ print AMBIG_2 "$sequence_2\n";
+ }
+
+ # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
+ elsif ($unmapped and $return == 1){
+ print UNMAPPED_1 $orig_identifier_1;
+ print UNMAPPED_1 "$sequence_1\n";
+ print UNMAPPED_2 $orig_identifier_2;
+ print UNMAPPED_2 "$sequence_2\n";
+ }
+ }
+
+ warn "Processed $counting{sequences_count} sequences in total\n\n";
+
+ close OUT or die $!;
+
+ print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+
+}
+
+sub process_fastQ_files_for_paired_end_methylation_calls{
+ my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+ ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to
+ ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments.
+ ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both)
+ ### of the converted genomes (either C->T or G->A version)
+
+ ### gzipped version of the infiles
+ if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){
+ open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
+ open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
+ }
+ else{
+ open (IN1,$sequence_file_1) or die $!;
+ open (IN2,$sequence_file_2) or die $!;
+ }
+
+ my $count = 0;
+
+ warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
+ ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
+ while (1) {
+ # reading from the first input file
+ my $identifier_1 = ;
+ my $sequence_1 = ;
+ my $ident_1 = ; # not needed
+ my $quality_value_1 = ; # not needed
+ # reading from the second input file
+ my $identifier_2 = ;
+ my $sequence_2 = ;
+ my $ident_2 = ; # not needed
+ my $quality_value_2 = ; # not needed
+ last unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2);
+
+ $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
+ $identifier_2 = fix_IDs($identifier_2);
+
+ ++$count;
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $counting{sequences_count}++;
+ if ($counting{sequences_count}%1000000==0) {
+ warn "Processed $counting{sequences_count} sequence pairs so far\n";
+ }
+
+ my $orig_identifier_1 = $identifier_1;
+ my $orig_identifier_2 = $identifier_2;
+
+ chomp $sequence_1;
+ chomp $identifier_1;
+ chomp $sequence_2;
+ chomp $identifier_2;
+ chomp $quality_value_1;
+ chomp $quality_value_2;
+
+ $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID
+
+ my $return;
+ if ($bowtie2){
+ $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
+ }
+ else{
+ $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
+ }
+
+ unless ($return){
+ $return = 0;
+ }
+
+ # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
+ if ($ambiguous and $return == 2){
+ # seq_1
+ print AMBIG_1 $orig_identifier_1;
+ print AMBIG_1 "$sequence_1\n";
+ print AMBIG_1 $ident_1;
+ print AMBIG_1 "$quality_value_1\n";
+ # seq_2
+ print AMBIG_2 $orig_identifier_2;
+ print AMBIG_2 "$sequence_2\n";
+ print AMBIG_2 $ident_2;
+ print AMBIG_2 "$quality_value_2\n";
+ }
+
+ # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
+ elsif ($unmapped and $return == 1){
+ # seq_1
+ print UNMAPPED_1 $orig_identifier_1;
+ print UNMAPPED_1 "$sequence_1\n";
+ print UNMAPPED_1 $ident_1;
+ print UNMAPPED_1 "$quality_value_1\n";
+ # seq_2
+ print UNMAPPED_2 $orig_identifier_2;
+ print UNMAPPED_2 "$sequence_2\n";
+ print UNMAPPED_2 $ident_2;
+ print UNMAPPED_2 "$quality_value_2\n";
+ }
+ }
+
+ warn "Processed $counting{sequences_count} sequences in total\n\n";
+
+ close OUT or die $!;
+
+ print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
+
+}
+
+sub check_bowtie_results_single_end{
+ my ($sequence,$identifier,$quality_value) = @_;
+
+ unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
+ $quality_value = 'I'x(length$sequence);
+ }
+
+ my %mismatches = ();
+ ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome
+ foreach my $index (0..$#fhs){
+
+ ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
+ next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
+ ### if the sequence we are currently looking at produced an alignment we are doing various things with it
+ if ($fhs[$index]->{last_seq_id} eq $identifier) {
+ ###############################################################
+ ### STEP I Now processing the alignment stored in last_line ###
+ ###############################################################
+ my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier);
+ ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation
+ ### we only continue to extract useful information about this alignment if 1 was returned
+ if ($valid_alignment_found_1 == 1){
+ ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself
+ ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
+ my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
+
+ unless($mismatch_info){
+ $mismatch_info = '';
+ }
+
+ chomp $mismatch_info;
+ my $chromosome;
+ if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
+ $chromosome = $mapped_chromosome;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome\n";
+ }
+ ### Now extracting the number of mismatches to the converted genome
+ my $number_of_mismatches;
+ if ($mismatch_info eq ''){
+ $number_of_mismatches = 0;
+ }
+ elsif ($mismatch_info =~ /^\d/){
+ my @mismatches = split (/,/,$mismatch_info);
+ $number_of_mismatches = scalar @mismatches;
+ }
+ else{
+ die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n";
+ }
+ ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
+ my $alignment_location = join (":",$chromosome,$position);
+ ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
+ ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
+ ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
+ ### number for the found alignment)
+ unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
+ }
+ $number_of_mismatches = undef;
+ ##################################################################################################################################################
+ ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a
+ ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will
+ ### be returned as $valid_alignment_found and it will then be processed in the next round only.
+ ##################################################################################################################################################
+ my $newline = $fhs[$index]->{fh}-> getline();
+ if ($newline){
+ my ($seq_id) = split (/\t/,$newline);
+ $fhs[$index]->{last_seq_id} = $seq_id;
+ $fhs[$index]->{last_line} = $newline;
+ }
+ else {
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ next;
+ }
+ my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier);
+ ### we only continue to extract useful information about this second alignment if 1 was returned
+ if ($valid_alignment_found_2 == 1){
+ ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself
+ ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
+ my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
+ unless($mismatch_info){
+ $mismatch_info = '';
+ }
+ chomp $mismatch_info;
+
+ my $chromosome;
+ if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
+ $chromosome = $mapped_chromosome;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome\n";
+ }
+
+ ### Now extracting the number of mismatches to the converted genome
+ my $number_of_mismatches;
+ if ($mismatch_info eq ''){
+ $number_of_mismatches = 0;
+ }
+ elsif ($mismatch_info =~ /^\d/){
+ my @mismatches = split (/,/,$mismatch_info);
+ $number_of_mismatches = scalar @mismatches;
+ }
+ else{
+ die "Something weird is going on with the mismatch field\n";
+ }
+ ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
+ ### extracting the chromosome number from the bowtie output (see above)
+ my $alignment_location = join (":",$chromosome,$position);
+ ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position
+ ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this
+ ### case we are not writing the same entry out a second time.
+ unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
+ $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
+ }
+ ####################################################################################################################################
+ #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ###
+ ####################################################################################################################################
+ $newline = $fhs[$index]->{fh}-> getline();
+ if ($newline){
+ my ($seq_id) = split (/\t/,$newline);
+ die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier);
+ $fhs[$index]->{last_seq_id} = $seq_id;
+ $fhs[$index]->{last_line} = $newline;
+ next;
+ }
+ else {
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ next;
+ }
+ ### still within the 2nd sequence in correct orientation found
+ }
+ ### still withing the 1st sequence in correct orientation found
+ }
+ ### still within the if (last_seq_id eq identifier) condition
+ }
+ ### still within foreach index loop
+ }
+ ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file
+ unless(%mismatches){
+ $counting{no_single_alignment_found}++;
+ if ($unmapped){
+ return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
+ }
+ else{
+ return;
+ }
+ }
+ #######################################################################################################################################################
+ #######################################################################################################################################################
+ ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ###
+ ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ###
+ ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ###
+ #######################################################################################################################################################
+ #######################################################################################################################################################
+ ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
+ my $sequence_fails = 0;
+ ### Declaring an empty hash reference which will store all information we need for the methylation call
+ my $methylation_call_params; # hash reference!
+ ### sorting in ascending order
+ foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
+
+ ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment
+ if (scalar keys %{$mismatches{$mismatch_number}} == 1){
+ for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
+ $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence};
+ $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
+ $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position};
+ $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
+ $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
+ }
+ }
+ elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){
+ ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and
+ ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a
+ ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome
+ ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite
+ ### reaction. E.g.
+ ### CAGTCACGCGCGCGCG will become
+ ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition.
+ ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave
+ ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!:
+ ### G->A conversion:
+ ### highly methylated: CAATCACACACACACA
+ ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce
+ ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as the
+ ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted
+ ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts.
+ ### We do not want any bias, whatsover. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions
+ ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of
+ ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment.
+ ### In the above example the number of transliterations required to transform the actual sequence
+ ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment)
+ ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments)
+ ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments
+ ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed
+ ### transliterations. As a threshold which does scale we will start with the number of tranliterations of the lowest best match x 2 must still be
+ ### smaller than the number of tranliterations of the second best sequence. Everything will be flagged with $sequence_fails = 1 and discarded.
+ my @three_candidate_seqs;
+ foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){
+ my $transliterations_performed;
+ if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){
+ $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT');
+ }
+ elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){
+ $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA');
+ }
+ else{
+ die "unexpected index number range $!\n";
+ }
+ push @three_candidate_seqs,{
+ index =>$mismatches{$mismatch_number}->{$composite_location}->{index},
+ bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence},
+ mismatch_number => $mismatch_number,
+ chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome},
+ position => $mismatches{$mismatch_number}->{$composite_location}->{position},
+ seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id},
+ transliterations_performed => $transliterations_performed,
+ };
+ }
+ ### sorting in ascending order for the lowest number of transliterations performed
+ @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs;
+ my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed};
+ my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed};
+ my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed};
+ # print "$first_array_element\t$second_array_element\t$third_array_element\n";
+ if (($first_array_element*2) < $second_array_element){
+ $counting{low_complexity_alignments_overruled_count}++;
+ ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits
+ $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence};
+ $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome};
+ $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position};
+ $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index};
+ $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
+ # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n";
+ }
+ else{
+ $sequence_fails = 1;
+ }
+ }
+ else{
+ $sequence_fails = 1;
+ }
+ ### after processing the alignment with the lowest number of mismatches we exit
+ last;
+ }
+ ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
+ if ($sequence_fails == 1){
+ $counting{unsuitable_sequence_count}++;
+ if ($ambiguous){
+ return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified
+ }
+ if ($unmapped){
+ return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified
+ }
+ else{
+ return 0; # => exits to next sequence (default)
+ }
+ }
+
+ ### --DIRECTIONAL
+ ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
+ ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
+ if ($directional){
+ if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
+ # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
+ $counting{alignments_rejected_count}++;
+ return 0;
+ }
+ }
+
+ ### If the sequence has not been rejected so far it will have a unique best alignment
+ $counting{unique_best_alignment_count}++;
+ if ($pbat){
+ extract_corresponding_genomic_sequence_single_end_pbat($identifier,$methylation_call_params);
+ }
+ else{
+ extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params);
+ }
+
+ ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
+ if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
+ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
+ $counting{genomic_sequence_could_not_be_extracted_count}++;
+ return 0;
+ }
+
+ ### otherwise we are set to perform the actual methylation call
+ $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
+
+ print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value);
+ return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out
+}
+
+sub check_bowtie_results_single_end_bowtie2{
+ my ($sequence,$identifier,$quality_value) = @_;
+
+
+ unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
+ $quality_value = 'I'x(length$sequence);
+ }
+
+ # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name.
+ # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs
+ # print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n";
+
+ my $alignment_ambiguous = 0;
+
+ my %alignments = ();
+
+ ### reading from the Bowtie 2 output filehandles
+ foreach my $index (0..$#fhs){
+ # print "Index: $index\n";
+ # print "$fhs[$index]->{last_line}\n";
+ # print "$fhs[$index]->{last_seq_id}\n";
+ # sleep (1);
+ ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
+ next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
+
+ ### if the sequence we are currently looking at produced an alignment we are doing various things with it
+ # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n";
+
+ if ($fhs[$index]->{last_seq_id} eq $identifier) {
+ # SAM format specifications for Bowtie 2
+ # (1) Name of read that aligned
+ # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
+ # 1 The read is one of a pair
+ # 2 The alignment is one end of a proper paired-end alignment
+ # 4 The read has no reported alignments
+ # 8 The read is one of a pair and has no reported alignments
+ # 16 The alignment is to the reverse reference strand
+ # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
+ # 64 The read is mate 1 in a pair
+ # 128 The read is mate 2 in a pair
+ # 256 The read has multiple mapping states
+ # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
+ # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
+ # (5) Mapping quality (255 means MAPQ is not available)
+ # (6) CIGAR string representation of alignment (* if unavailable)
+ # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
+ # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
+ # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
+ # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
+ # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
+ # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
+ # AS:i: Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
+ # XS:i: Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
+ # YS:i: Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
+ # XN:i: The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
+ # XM:i: The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
+ # XO:i: The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
+ # XG:i: The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
+ # NM:i: The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
+ # YF:Z: String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
+ # MD:Z: A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
+
+ my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10];
+
+ ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance
+ if ($flag == 4){
+ ## reading in the next alignment, which must be the next sequence
+ my $newline = $fhs[$index]->{fh}-> getline();
+ if ($newline){
+ chomp $newline;
+ my ($seq_id) = split (/\t/,$newline);
+ $fhs[$index]->{last_seq_id} = $seq_id;
+ $fhs[$index]->{last_line} = $newline;
+ if ($seq_id eq $identifier){
+ die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
+ }
+ next; # next instance
+ }
+ else{
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ next;
+ }
+ }
+
+ # if there are one or more proper alignments we can extract the chromosome number
+ my $chromosome;
+ if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
+ $chromosome = $mapped_chromosome;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome\n";
+ }
+
+ ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string
+ my ($alignment_score,$second_best,$MD_tag);
+ my @fields = split (/\t/,$fhs[$index]->{last_line});
+
+ foreach (11..$#fields){
+ if ($fields[$_] =~ /AS:i:(.*)/){
+ $alignment_score = $1;
+ }
+ elsif ($fields[$_] =~ /XS:i:(.*)/){
+ $second_best = $1;
+ }
+ elsif ($fields[$_] =~ /MD:Z:(.*)/){
+ $MD_tag = $1;
+ }
+ }
+
+ # warn "First best alignment_score is: '$alignment_score'\n";
+ # warn "MD tag is: '$MD_tag'\n";
+ die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag);
+
+ if (defined $second_best){
+ # warn "second best alignment_score is: '$second_best'\n\n";
+
+ # If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether
+ if ($alignment_score == $second_best){
+ $alignment_ambiguous = 1;
+ ## need to read and discard all additional ambiguous reads until we reach the next sequence
+ until ($fhs[$index]->{last_seq_id} ne $identifier){
+ my $newline = $fhs[$index]->{fh}-> getline();
+ if ($newline){
+ chomp $newline;
+ my ($seq_id) = split (/\t/,$newline);
+ $fhs[$index]->{last_seq_id} = $seq_id;
+ $fhs[$index]->{last_line} = $newline;
+ }
+ else{
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ last; # break free in case we have reached the end of the alignment output
+ }
+ }
+ # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
+ }
+ else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
+
+ my $alignment_location = join (":",$chromosome,$position);
+
+ ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
+ ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
+ ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
+ ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
+
+ unless (exists $alignments{$alignment_location}){
+ $alignments{$alignment_location}->{seq_id} = $id;
+ $alignments{$alignment_location}->{alignment_score} = $alignment_score;
+ $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
+ $alignments{$alignment_location}->{index} = $index;
+ $alignments{$alignment_location}->{chromosome} = $chromosome;
+ $alignments{$alignment_location}->{position} = $position;
+ $alignments{$alignment_location}->{CIGAR} = $cigar;
+ $alignments{$alignment_location}->{MD_tag} = $MD_tag;
+ }
+
+ ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence
+ until ($fhs[$index]->{last_seq_id} ne $identifier){
+ my $newline = $fhs[$index]->{fh}-> getline();
+ if ($newline){
+ chomp $newline;
+ my ($seq_id) = split (/\t/,$newline);
+ $fhs[$index]->{last_seq_id} = $seq_id;
+ $fhs[$index]->{last_line} = $newline;
+ }
+ else{
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ last; # break free in case we have reached the end of the alignment output
+ }
+ }
+ # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
+ }
+ }
+ else{ # there is no second best hit, so we can just store this one and read in the next sequence
+
+ my $alignment_location = join (":",$chromosome,$position);
+
+ ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
+ ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
+ ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
+ ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
+
+ unless (exists $alignments{$alignment_location}){
+ $alignments{$alignment_location}->{seq_id} = $id;
+ $alignments{$alignment_location}->{alignment_score} = $alignment_score;
+ $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
+ $alignments{$alignment_location}->{index} = $index;
+ $alignments{$alignment_location}->{chromosome} = $chromosome;
+ $alignments{$alignment_location}->{position} = $position;
+ $alignments{$alignment_location}->{MD_tag} = $MD_tag;
+ $alignments{$alignment_location}->{CIGAR} = $cigar;
+ }
+
+ my $newline = $fhs[$index]->{fh}-> getline();
+ if ($newline){
+ chomp $newline;
+ my ($seq_id) = split (/\t/,$newline);
+ $fhs[$index]->{last_seq_id} = $seq_id;
+ $fhs[$index]->{last_line} = $newline;
+ if ($seq_id eq $identifier){
+ die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
+ }
+ }
+ else{
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ }
+ }
+ }
+ }
+
+ ### if the read produced several ambiguous alignments already now can returning already now. If --ambiguous or --unmapped was specified the read sequence will be printed out.
+ if ($alignment_ambiguous == 1){
+ $counting{unsuitable_sequence_count}++;
+ ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
+ # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
+ # print "$ambiguous_read_output\n";
+
+ if ($ambiguous){
+ return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified
+ }
+ elsif ($unmapped){
+ return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
+ }
+ else{
+ return 0;
+ }
+ }
+
+ ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file
+ unless(%alignments){
+ $counting{no_single_alignment_found}++;
+ # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value);
+ # print "$unmapped_read_output\n";
+ if ($unmapped){
+ return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified
+ }
+ else{
+ return 0; # default
+ }
+ }
+
+ #######################################################################################################################################################
+
+ ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
+ ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
+ ### alignment score we are discarding the sequence altogether.
+ ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for
+ ### opening (5) and extending (3 per bp) the gap.
+
+ #######################################################################################################################################################
+
+ my $methylation_call_params; # hash reference which will store all information we need for the methylation call
+ my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
+
+ ### print contents of %alignments for debugging
+ # if (scalar keys %alignments > 1){
+ # print "\n******\n";
+ # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
+ # print "Loc: $alignment_location\n";
+ # print "ID: $alignments{$alignment_location}->{seq_id}\n";
+ # print "AS: $alignments{$alignment_location}->{alignment_score}\n";
+ # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n";
+ # print "Index $alignments{$alignment_location}->{index}\n";
+ # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
+ # print "pos: $alignments{$alignment_location}->{position}\n";
+ # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n";
+ # }
+ # print "\n******\n";
+ # }
+
+ ### if there is only 1 entry in the hash with we accept it as the best alignment
+ if (scalar keys %alignments == 1){
+ for my $unique_best_alignment (keys %alignments){
+ $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence};
+ $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
+ $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position};
+ $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
+ $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score};
+ $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag};
+ $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR};
+ }
+ }
+
+ ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
+ ### we boot the sequence altogether
+ elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
+ my $best_alignment_score;
+ my $best_alignment_location;
+ foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){
+ # print "$alignments{$alignment_location}->{alignment_score}\n";
+ unless (defined $best_alignment_score){
+ $best_alignment_score = $alignments{$alignment_location}->{alignment_score};
+ $best_alignment_location = $alignment_location;
+ # print "setting best alignment score: $best_alignment_score\n";
+ }
+ else{
+ ### if the second best alignment has the same alignment score as the first one, the sequence will get booted
+ if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){
+ # warn "Same alignment score, the sequence will get booted!\n";
+ $sequence_fails = 1;
+ last; # exiting after the second alignment since we know that the sequence has ambiguous alignments
+ }
+ ### else we are going to store the best alignment for further processing
+ else{
+ $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence};
+ $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
+ $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position};
+ $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
+ $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score};
+ $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag};
+ $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR};
+ last; # exiting after processing the second alignment since the sequence produced a unique best alignment
+ }
+ }
+ }
+ }
+ else{
+ die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";;
+ }
+
+ ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions
+ if ($sequence_fails == 1){
+ $counting{unsuitable_sequence_count}++;
+
+ ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
+ # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
+ # print OUT "$ambiguous_read_output\n";
+
+ if ($ambiguous){
+ return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified
+ }
+ elsif ($unmapped){
+ return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
+ }
+ else{
+ return 0; # => exits to next sequence (default)
+ }
+ }
+
+ ### --DIRECTIONAL
+ ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
+ ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
+ if ($directional){
+ if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
+ # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
+ $counting{alignments_rejected_count}++;
+ return 0;
+ }
+ }
+
+ ### If the sequence has not been rejected so far it has a unique best alignment
+ $counting{unique_best_alignment_count}++;
+
+ ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well
+ extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params);
+
+ ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
+ if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
+ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
+ $counting{genomic_sequence_could_not_be_extracted_count}++;
+ return 0;
+ }
+
+
+ ### otherwise we are set to perform the actual methylation call
+ $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
+ print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value);
+ return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out
+}
+
+
+sub determine_number_of_transliterations_performed{
+ my ($sequence,$read_conversion) = @_;
+ my $number_of_transliterations;
+ if ($read_conversion eq 'CT'){
+ $number_of_transliterations = $sequence =~ tr/C/T/;
+ }
+ elsif ($read_conversion eq 'GA'){
+ $number_of_transliterations = $sequence =~ tr/G/A/;
+ }
+ else{
+ die "Read conversion mode of the read was not specified $!\n";
+ }
+ return $number_of_transliterations;
+}
+
+sub decide_whether_single_end_alignment_is_valid{
+ my ($index,$identifier) = @_;
+
+ # extracting from Bowtie 1 format
+ my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];
+
+ ### ensuring that the entry is the correct sequence
+ if (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier)){
+ ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically
+ ### sensible alignments
+ my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
+ ### If the orientation was correct can we move on
+ if ($orientation == 1){
+ return 1; ### 1st possibility for a sequence to pass
+ }
+ ### If the alignment was in the wrong orientation we need to read in a new line
+ elsif($orientation == 0){
+ my $newline = $fhs[$index]->{fh}->getline();
+ if ($newline){
+ ($id,$strand) = (split (/\t/,$newline))[0,1];
+
+ ### ensuring that the next entry is still the correct sequence
+ if ($id eq $identifier){
+ ### checking orientation again
+ $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
+ ### If the orientation was correct can we move on
+ if ($orientation == 1){
+ $fhs[$index]->{last_seq_id} = $id;
+ $fhs[$index]->{last_line} = $newline;
+ return 1; ### 2nd possibility for a sequence to pass
+ }
+ ### If the alignment was in the wrong orientation again we need to read in yet another new line and store it in @fhs
+ elsif ($orientation == 0){
+ $newline = $fhs[$index]->{fh}->getline();
+ if ($newline){
+ my ($seq_id) = split (/\t/,$newline);
+ ### check if the next line still has the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with
+ ### the same fields of the just read next entry
+ die "Same seq ID 3 or more times in a row!(should be 2 max) $!" if ($seq_id eq $identifier);
+ $fhs[$index]->{last_seq_id} = $seq_id;
+ $fhs[$index]->{last_line} = $newline;
+ return 0; # not processing anything this round as the alignment currently stored in last_line was in the wrong orientation
+ }
+ else{
+ # assigning undef to last_seq_id and last_line (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation
+ }
+ }
+ else{
+ die "The orientation of the alignment must be either correct or incorrect\n";
+ }
+ }
+ ### the sequence we just read in is already the next sequence to be analysed -> store it in @fhs
+ else{
+ $fhs[$index]->{last_seq_id} = $id;
+ $fhs[$index]->{last_line} = $newline;
+ return 0; # processing the new alignment result only in the next round
+ }
+ }
+ else {
+ # assigning undef to last_seq_id and last_line (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line} = undef;
+ return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation
+ }
+ }
+ else{
+ die "The orientation of the alignment must be either correct or incorrect\n";
+ }
+ }
+ ### the sequence stored in @fhs as last_line is already the next sequence to be analysed -> analyse next round
+ else{
+ return 0;
+ }
+}
+#########################
+### BOWTIE 1 | PAIRED-END
+#########################
+
+sub check_bowtie_results_paired_ends{
+ my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
+
+ ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
+ unless ($quality_value_1){
+ $quality_value_1 = 'I'x(length$sequence_1);
+ }
+ unless ($quality_value_2){
+ $quality_value_2 = 'I'x(length$sequence_2);
+ }
+
+ # warn "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
+ # sleep (1);
+ my %mismatches = ();
+ ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome
+
+
+ ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way.
+ ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
+ ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
+ ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary
+ ### strands are not being reported by specifying --directional
+
+ foreach my $index (0,3,1,2){
+ ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
+ next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
+ ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
+ if ($fhs[$index]->{last_seq_id} eq $identifier) {
+ # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n";
+
+ ##################################################################################
+ ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ###
+ ##################################################################################
+ my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
+ ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 aligments both in the wrong
+ ### orientation. We only continue to extract useful information about this alignment if 1 was returned
+ if ($valid_alignment_found == 1){
+ ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself.
+ ### we store the useful information in %mismatches
+ my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7];
+ my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7];
+ chomp $mismatch_info_1;
+ chomp $mismatch_info_2;
+
+ ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted
+ my ($chromosome_1,$chromosome_2);
+ if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
+ $chromosome_1 = $mapped_chromosome_1;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome_1\n";
+ }
+ if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
+ $chromosome_2 = $mapped_chromosome_2;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome_2\n";
+ }
+
+ ### Now extracting the number of mismatches to the converted genome
+ my $number_of_mismatches_1;
+ my $number_of_mismatches_2;
+ if ($mismatch_info_1 eq ''){
+ $number_of_mismatches_1 = 0;
+ }
+ elsif ($mismatch_info_1 =~ /^\d/){
+ my @mismatches = split (/,/,$mismatch_info_1);
+ $number_of_mismatches_1 = scalar @mismatches;
+ }
+ else{
+ die "Something weird is going on with the mismatch field\n";
+ }
+ if ($mismatch_info_2 eq ''){
+ $number_of_mismatches_2 = 0;
+ }
+ elsif ($mismatch_info_2 =~ /^\d/){
+ my @mismatches = split (/,/,$mismatch_info_2);
+ $number_of_mismatches_2 = scalar @mismatches;
+ }
+ else{
+ die "Something weird is going on with the mismatch field\n";
+ }
+ ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
+ my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
+ ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
+ die "Position 1 is higher than position 2" if ($position_1 > $position_2);
+ die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
+ my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
+ ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
+ ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
+ ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
+ ### number for the found alignment)
+ unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
+ }
+ ###################################################################################################################################################
+ ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ###
+ ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line _2. ###
+ ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ###
+ ### this round ###
+ ###################################################################################################################################################
+ my $newline_1 = $fhs[$index]->{fh}-> getline();
+ my $newline_2 = $fhs[$index]->{fh}-> getline();
+
+ if ($newline_1 and $newline_2){
+ my ($seq_id_1) = split (/\t/,$newline_1);
+ my ($seq_id_2) = split (/\t/,$newline_2);
+
+ if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
+ $fhs[$index]->{last_seq_id} = $seq_id_1;
+ }
+ elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
+ $fhs[$index]->{last_seq_id} = $seq_id_2;
+ }
+ else{
+ die "Either read 1 or read 2 needs to end on '/1'\n";
+ }
+
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+ }
+ else {
+ # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ next; # jumping to the next index
+ }
+ ### Now processing the entry we just stored in last_line_1 and last_line_2
+ $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
+ ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to
+ ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation
+ if ($valid_alignment_found == 1){
+ ### we store the useful information in %mismatches
+ ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7];
+ ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7];
+ chomp $mismatch_info_1;
+ chomp $mismatch_info_2;
+ ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted)
+ if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
+ $chromosome_1 = $mapped_chromosome_1;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome_1\n";
+ }
+ if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
+ $chromosome_2 = $mapped_chromosome_2;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome_2\n";
+ }
+
+ $number_of_mismatches_1='';
+ $number_of_mismatches_2='';
+ ### Now extracting the number of mismatches to the converted genome
+ if ($mismatch_info_1 eq ''){
+ $number_of_mismatches_1 = 0;
+ }
+ elsif ($mismatch_info_1 =~ /^\d/){
+ my @mismatches = split (/,/,$mismatch_info_1);
+ $number_of_mismatches_1 = scalar @mismatches;
+ }
+ else{
+ die "Something weird is going on with the mismatch field\n";
+ }
+ if ($mismatch_info_2 eq ''){
+ $number_of_mismatches_2 = 0;
+ }
+ elsif ($mismatch_info_2 =~ /^\d/){
+ my @mismatches = split (/,/,$mismatch_info_2);
+ $number_of_mismatches_2 = scalar @mismatches;
+ }
+ else{
+ die "Something weird is going on with the mismatch field\n";
+ }
+ ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
+ $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
+ ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
+ die "position 1 is greater than position 2" if ($position_1 > $position_2);
+ die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
+ $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
+ ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
+ ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
+ ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
+ ### number for the found alignment)
+ unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
+ $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
+ }
+ ###############################################################################################################################################
+ ### STEP III Now reading in two more lines. These have to be the next entry and we will just add assign them to last_line_1 and last_line_2 ###
+ ###############################################################################################################################################
+ $newline_1 = $fhs[$index]->{fh}-> getline();
+ $newline_2 = $fhs[$index]->{fh}-> getline();
+
+ if ($newline_1 and $newline_2){
+ my ($seq_id_1) = split (/\t/,$newline_1);
+ my ($seq_id_2) = split (/\t/,$newline_2);
+
+ if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
+ $fhs[$index]->{last_seq_id} = $seq_id_1;
+ }
+ if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
+ $fhs[$index]->{last_seq_id} = $seq_id_2;
+ }
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+ }
+ else {
+ # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ next; # jumping to the next index
+ }
+ ### within the 2nd sequence pair alignment in correct orientation found
+ }
+ ### within the 1st sequence pair alignment in correct orientation found
+ }
+ ### still within the (last_seq_id eq identifier) condition
+ }
+ ### still within foreach index loop
+ }
+ ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file
+ unless(%mismatches){
+ $counting{no_single_alignment_found}++;
+ return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
+ }
+ ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
+ my $sequence_pair_fails = 0;
+ ### Declaring an empty hash reference which will store all information we need for the methylation call
+ my $methylation_call_params; # hash reference!
+ ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the
+ ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the
+ ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether
+ foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
+ #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n";
+ foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){
+ #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n";
+ # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n";
+ }
+ if (scalar keys %{$mismatches{$mismatch_number}} == 1){
+ # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n";
+ for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
+ $methylation_call_params->{$identifier}->{seq_id} = $identifier;
+ $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1};
+ $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2};
+ $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
+ $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1};
+ $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2};
+ $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}));
+ $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
+ $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1};
+ $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2};
+ }
+ }
+ else{
+ $sequence_pair_fails = 1;
+ }
+ ### after processing the alignment with the lowest number of mismatches we exit
+ last;
+ }
+ ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
+ if ($sequence_pair_fails == 1){
+ $counting{unsuitable_sequence_count}++;
+ if ($ambiguous){
+ return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified
+ }
+ if ($unmapped){
+ return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified
+ }
+ else{
+ return 0; # => exits to next sequence (default)
+ }
+ }
+
+ ### --DIRECTIONAL
+ ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
+ ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
+ if ($directional){
+ if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
+ # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
+ $counting{alignments_rejected_count}++;
+ return 0;
+ }
+ }
+
+ ### If the sequence has not been rejected so far it does have a unique best alignment
+ $counting{unique_best_alignment_count}++;
+ extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params);
+
+ ### check test to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call
+ if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
+ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
+ $counting{genomic_sequence_could_not_be_extracted_count}++;
+ return 0;
+ }
+ if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
+ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
+ $counting{genomic_sequence_could_not_be_extracted_count}++;
+ return 0;
+ }
+
+ ### otherwise we are set to perform the actual methylation call
+ $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
+ $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
+
+ print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
+ return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
+}
+
+#########################
+### BOWTIE 2 | PAIRED-END
+#########################
+
+sub check_bowtie_results_paired_ends_bowtie2{
+ my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
+
+ ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
+ unless ($quality_value_1){
+ $quality_value_1 = 'I'x(length$sequence_1);
+ }
+
+ unless ($quality_value_2){
+ $quality_value_2 = 'I'x(length$sequence_2);
+ }
+
+
+ # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
+
+
+ my %alignments;
+ my $alignment_ambiguous = 0;
+
+ ### reading from the Bowtie 2 output filehandles
+
+ ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way.
+ ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
+ ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
+ ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary
+ ### strands are not being reported when '--directional' is specified
+
+ foreach my $index (0,3,1,2){
+ ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
+ next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
+
+ ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
+ if ($fhs[$index]->{last_seq_id} eq $identifier) {
+
+ my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10];
+ my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10];
+ # print "Index: $index\t$fhs[$index]->{last_line_1}\n";
+ # print "Index: $index\t$fhs[$index]->{last_line_2}\n";
+ # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n";
+ # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n";
+ $id_1 =~ s/\/1$//;
+ $id_2 =~ s/\/2$//;
+
+ # SAM format specifications for Bowtie 2
+ # (1) Name of read that aligned
+ # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
+ # 1 The read is one of a pair
+ # 2 The alignment is one end of a proper paired-end alignment
+ # 4 The read has no reported alignments
+ # 8 The read is one of a pair and has no reported alignments
+ # 16 The alignment is to the reverse reference strand
+ # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
+ # 64 The read is mate 1 in a pair
+ # 128 The read is mate 2 in a pair
+ # 256 The read has multiple mapping states
+ # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
+ # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
+ # (5) Mapping quality (255 means MAPQ is not available)
+ # (6) CIGAR string representation of alignment (* if unavailable)
+ # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
+ # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
+ # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
+ # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
+ # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
+ # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
+ # AS:i: Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
+ # XS:i: Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
+ # YS:i: Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
+ # XN:i: The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
+ # XM:i: The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
+ # XO:i: The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
+ # XG:i: The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
+ # NM:i: The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
+ # YF:Z: String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
+ # MD:Z: A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
+
+ ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128).
+ ### We can store the next alignment and move on to the next Bowtie 2 instance
+ if ($flag_1 == 77 and $flag_2 == 141){
+ ## reading in the next alignment, which must be the next sequence
+ my $newline_1 = $fhs[$index]->{fh}-> getline();
+ my $newline_2 = $fhs[$index]->{fh}-> getline();
+
+ if ($newline_1 and $newline_2){
+ chomp $newline_1;
+ chomp $newline_2;
+ my ($seq_id_1) = split (/\t/,$newline_1);
+ my ($seq_id_2) = split (/\t/,$newline_2);
+ $seq_id_1 =~ s/\/1$//;
+ $seq_id_2 =~ s/\/2$//;
+ $fhs[$index]->{last_seq_id} = $seq_id_1;
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+
+ # print "current sequence ($identifier) did not map, reading in next sequence\n";
+ # print "$index\t$fhs[$index]->{last_seq_id}\n";
+ # print "$index\t$fhs[$index]->{last_line_1}\n";
+ # print "$index\t$fhs[$index]->{last_line_2}\n";
+ next; # next instance
+ }
+ else{
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ next;
+ }
+ }
+
+ ### If there are one or more proper alignments we can extract the chromosome number
+ my ($chromosome_1,$chromosome_2);
+ if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
+ $chromosome_1 = $mapped_chromosome_1;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome_1\n";
+ }
+ if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
+ $chromosome_2 = $mapped_chromosome_2;
+ }
+ else{
+ die "Chromosome number extraction failed for $mapped_chromosome_2\n";
+ }
+
+ die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
+
+ ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string
+ my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2);
+
+ my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1});
+ my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2});
+
+ foreach (11..$#fields_1){
+ if ($fields_1[$_] =~ /AS:i:(.*)/){
+ $alignment_score_1 = $1;
+ }
+ elsif ($fields_1[$_] =~ /XS:i:(.*)/){
+ $second_best_1 = $1;
+ }
+ elsif ($fields_1[$_] =~ /MD:Z:(.*)/){
+ $MD_tag_1 = $1;
+ }
+ }
+
+ foreach (11..$#fields_2){
+ if ($fields_2[$_] =~ /AS:i:(.*)/){
+ $alignment_score_2 = $1;
+ }
+ elsif ($fields_2[$_] =~ /XS:i:(.*)/){
+ $second_best_2 = $1;
+ }
+ elsif ($fields_2[$_] =~ /MD:Z:(.*)/){
+ $MD_tag_2 = $1;
+ }
+ }
+
+ die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1);
+ die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2);
+
+ # warn "First read 1 alignment score is: '$alignment_score_1'\n";
+ # warn "First read 2 alignment score is: '$alignment_score_2'\n";
+ # warn "MD tag 1 is: '$MD_tag_1'\n";
+ # warn "MD tag 2 is: '$MD_tag_2'\n";
+
+ ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments
+ my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ;
+ # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n";
+
+ if (defined $second_best_1 and defined $second_best_2){
+ my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2;
+ # warn "Second best alignment_score_1 is: '$second_best_1'\n";
+ # warn "Second best alignment_score_2 is: '$second_best_2'\n";
+ # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n";
+
+ # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we are going to boot this sequence pair altogether
+ if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){
+ $alignment_ambiguous = 1;
+ # print "This read will be chucked (AS==XS detected)!\n";
+
+ ## need to read and discard all additional ambiguous reads until we reach the next sequence
+ until ($fhs[$index]->{last_seq_id} ne $identifier){
+ my $newline_1 = $fhs[$index]->{fh}-> getline();
+ my $newline_2 = $fhs[$index]->{fh}-> getline();
+ if ($newline_1 and $newline_2){
+ chomp $newline_1;
+ chomp $newline_2;
+ my ($seq_id_1) = split (/\t/,$newline_1);
+ my ($seq_id_2) = split (/\t/,$newline_2);
+ $seq_id_1 =~ s/\/1$//;
+ $seq_id_2 =~ s/\/2$//;
+ # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
+
+ $fhs[$index]->{last_seq_id} = $seq_id_1;
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+ }
+ else{
+ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ last; # break free if the end of the alignment output was reached
+ }
+ }
+ # if ($fhs[$index]->{last_seq_id}){
+ # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
+ # }
+ }
+ else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
+
+ my $alignment_location;
+ if ($position_1 <= $position_2){
+ $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
+ }
+ elsif($position_2 < $position_1){
+ $alignment_location = join(":",$chromosome_1,$position_2,$position_1);
+ }
+
+ ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
+ ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
+ ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
+ ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
+
+ unless (exists $alignments{$alignment_location}){
+ $alignments{$alignment_location}->{seq_id} = $id_1;
+ $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
+ $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
+ $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
+ $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
+ $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
+ $alignments{$alignment_location}->{index} = $index;
+ $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
+ $alignments{$alignment_location}->{position_1} = $position_1;
+ $alignments{$alignment_location}->{position_2} = $position_2;
+ $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
+ $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
+ $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
+ $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
+ $alignments{$alignment_location}->{flag_1} = $flag_1;
+ $alignments{$alignment_location}->{flag_2} = $flag_2;
+ }
+ # warn "added best of several alignments to \%alignments hash\n";
+
+ ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence
+ until ($fhs[$index]->{last_seq_id} ne $identifier){
+ my $newline_1 = $fhs[$index]->{fh}-> getline();
+ my $newline_2 = $fhs[$index]->{fh}-> getline();
+ if ($newline_1 and $newline_2){
+ chomp $newline_1;
+ chomp $newline_2;
+ my ($seq_id_1) = split (/\t/,$newline_1);
+ my ($seq_id_2) = split (/\t/,$newline_2);
+ $seq_id_1 =~ s/\/1$//;
+ $seq_id_2 =~ s/\/2$//;
+ # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
+
+ $fhs[$index]->{last_seq_id} = $seq_id_1;
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+ }
+ else{
+ # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ last; # break free if the end of the alignment output was reached
+ }
+ }
+ # if($fhs[$index]->{last_seq_id}){
+ # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n";
+ # }
+ }
+ }
+ else{ # there is no second best hit, so we can just store this one and read in the next sequence
+
+ my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
+ # print "$alignment_location\n";
+ ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
+ ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
+ ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
+ ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
+
+ unless (exists $alignments{$alignment_location}){
+ $alignments{$alignment_location}->{seq_id} = $id_1;
+ $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
+ $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
+ $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
+ $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
+ $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
+ $alignments{$alignment_location}->{index} = $index;
+ $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
+ $alignments{$alignment_location}->{position_1} = $position_1;
+ $alignments{$alignment_location}->{position_2} = $position_2;
+ $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
+ $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
+ $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
+ $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
+ $alignments{$alignment_location}->{flag_1} = $flag_1;
+ $alignments{$alignment_location}->{flag_2} = $flag_2;
+ }
+
+ # warn "added unique alignment to \%alignments hash\n";
+
+ # Now reading and storing the next read pair
+ my $newline_1 = $fhs[$index]->{fh}-> getline();
+ my $newline_2 = $fhs[$index]->{fh}-> getline();
+ if ($newline_1 and $newline_2){
+ chomp $newline_1;
+ chomp $newline_2;
+ # print "$newline_1\n";
+ # print "$newline_2\n";
+ my ($seq_id_1) = split (/\t/,$newline_1);
+ my ($seq_id_2) = split (/\t/,$newline_2);
+ $seq_id_1 =~ s/\/1$//;
+ $seq_id_2 =~ s/\/2$//;
+ # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
+
+ $fhs[$index]->{last_seq_id} = $seq_id_1;
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+
+ if ($seq_id_1 eq $identifier){
+ die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
+ }
+ }
+ else{
+ # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ }
+ }
+ }
+ }
+
+ ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format
+ if ($alignment_ambiguous == 1){
+ $counting{unsuitable_sequence_count}++;
+ ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
+ # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
+ # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
+ # print "$ambiguous_read_1\n";
+ # print "$ambiguous_read_2\n";
+
+ if ($ambiguous){
+ return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
+ }
+ elsif ($unmapped){
+ return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
+ }
+ else{
+ return 0;
+ }
+ }
+
+ ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file
+ unless (%alignments){
+ $counting{no_single_alignment_found}++;
+
+ # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
+ # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
+ # print "$unmapped_read_1\n";
+ # print "$unmapped_read_2\n";
+ if ($unmapped){
+ return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified
+ }
+ else{
+ return 0;
+ }
+ }
+
+ #######################################################################################################################################################
+
+ ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
+ ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
+ ### alignment score we are discarding the sequence pair altogether.
+ ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5)
+ ### and extending (3 per bp) the gap.
+
+ #######################################################################################################################################################
+
+ ### Declaring an empty hash reference which will store all information we need for the methylation call
+ my $methylation_call_params; # hash reference
+ my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
+
+ ### print contents of %alignments for debugging
+ ## if (scalar keys %alignments >= 1){
+ # print "\n******\n";
+ # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
+ # print "Loc: $alignment_location\n";
+ # print "ID: $alignments{$alignment_location}->{seq_id}\n";
+ # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n";
+ # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n";
+ # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n";
+ # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n";
+ # print "Index $alignments{$alignment_location}->{index}\n";
+ # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
+ # print "Pos_1: $alignments{$alignment_location}->{position_1}\n";
+ # print "Pos_2: $alignments{$alignment_location}->{position_2}\n";
+ # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n";
+ # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n";
+ # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n";
+ # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n";
+ # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n";
+ # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n";
+ # }
+ # print "\n******\n";
+ # }
+
+ ### if there is only 1 entry in the %alignments hash we accept it as the best alignment
+ if (scalar keys %alignments == 1){
+ for my $unique_best_alignment (keys %alignments){
+ $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1};
+ $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2};
+ $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
+ $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1};
+ $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2};
+ $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
+ $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1};
+ $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2};
+ $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores};
+ $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1};
+ $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2};
+ $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1};
+ $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2};
+ $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1};
+ $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2};
+ }
+ }
+
+ ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
+ ### we boot the sequence pair altogether)
+ elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
+ my $best_sum_of_alignment_scores;
+ my $best_alignment_location;
+ foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){
+ # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n";
+ unless (defined $best_sum_of_alignment_scores){
+ $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores};
+ $best_alignment_location = $alignment_location;
+ # print "setting best alignment score to: $best_sum_of_alignment_scores\n";
+ }
+ else{
+ ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted
+ if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){
+ # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n";
+ $sequence_pair_fails = 1;
+ last; # exiting since we know that the sequence has ambiguous alignments
+ }
+ ### else we are going to store the best alignment for further processing
+ else{
+ $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1};
+ $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2};
+ $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
+ $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1};
+ $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2};
+ $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
+ $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1};
+ $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2};
+ $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores};
+ $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1};
+ $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2};
+ $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1};
+ $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2};
+ $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1};
+ $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2};
+ last; # exiting since the sequence produced a unique best alignment
+ }
+ }
+ }
+ }
+ else{
+ die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";;
+ }
+
+ ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions
+ if ($sequence_pair_fails == 1){
+ $counting{unsuitable_sequence_count}++;
+
+ ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
+ # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
+ # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
+ # print "$ambiguous_read_1\n";
+ # print "$ambiguous_read_2\n";
+
+ if ($ambiguous){
+ return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
+ }
+ elsif ($unmapped){
+ return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
+ }
+ else{
+ return 0; # => exits to next sequence pair (default)
+ }
+ }
+
+ ### --DIRECTIONAL
+ ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
+ ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
+ if ($directional){
+ if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
+ # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
+ $counting{alignments_rejected_count}++;
+ return 0;
+ }
+ }
+
+ ### If the sequence pair has not been rejected so far it does have a unique best alignment
+ $counting{unique_best_alignment_count}++;
+ extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params);
+
+ ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call
+ if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
+ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n";
+ $counting{genomic_sequence_could_not_be_extracted_count}++;
+ return 0;
+ }
+ if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
+ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n";
+ $counting{genomic_sequence_could_not_be_extracted_count}++;
+ return 0;
+ }
+
+ ### now we are set to perform the actual methylation call
+ $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
+ $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
+ # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n";
+ # print " $sequence_2\n";
+ # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n";
+ # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n";
+
+ print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
+ return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
+}
+
+###
+
+sub decide_whether_paired_end_alignment_is_valid{
+ my ($index,$identifier) = @_;
+ my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7];
+ my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7];
+ chomp $mismatch_info_1;
+ chomp $mismatch_info_2;
+ my $seq_id_1 = $id_1;
+ my $seq_id_2 = $id_2;
+ $seq_id_1 =~ s/\/1$//; # removing the read /1
+ $seq_id_2 =~ s/\/1$//; # removing the read /1
+
+ ### ensuring that the current entry is the correct sequence
+ if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
+ ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically
+ ### sensible alignments
+ my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
+ ### If the orientation was correct can we move on
+ if ($orientation == 1){
+ return 1; ### 1st possibility for A SEQUENCE-PAIR TO PASS
+ }
+ ### If the alignment was in the wrong orientation we need to read in two new lines
+ elsif($orientation == 0){
+ my $newline_1 = $fhs[$index]->{fh}->getline();
+ my $newline_2 = $fhs[$index]->{fh}->getline();
+ if ($newline_1 and $newline_2){
+ ### extract detailed information about the alignment again (from $newline_1 and $newline_2 this time)
+ ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
+ ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];
+
+ my $seqid;
+ $seq_id_1 = $id_1;
+ $seq_id_2 = $id_2;
+ # we need to capture the first read (ending on /1)
+ if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
+ $seqid = $seq_id_1;
+ }
+ elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
+ $seqid = $seq_id_2;
+ }
+ else{
+ die "One of the two reads needs to end on /1!!";
+ }
+
+ ### ensuring that the next entry is still the correct sequence
+ if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
+ ### checking orientation again
+ $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
+ ### If the orientation was correct can we move on
+ if ($orientation == 1){
+ ### Writing the current sequence to last_line_1 and last_line_2
+ $fhs[$index]->{last_seq_id} = $seqid;
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+ return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS
+ }
+ ### If the alignment was in the wrong orientation again we need to read in yet another 2 new lines and store them in @fhs (this must be
+ ### the next entry)
+ elsif ($orientation == 0){
+ $newline_1 = $fhs[$index]->{fh}->getline();
+ $newline_2 = $fhs[$index]->{fh}->getline();
+ if ($newline_1 and $newline_2){
+ ($seq_id_1) = split (/\t/,$newline_1);
+ ($seq_id_2) = split (/\t/,$newline_2);
+
+ $seqid = '';
+ if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
+ $seqid = $seq_id_1;
+ }
+ elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
+ $seqid = $seq_id_2;
+ }
+ else{
+ die "One of the two reads needs to end on /1!!";
+ }
+
+ ### check if the next 2 lines still have the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with
+ ### the same fields of the just read next entry
+ die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);
+ $fhs[$index]->{last_seq_id} = $seqid;
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+ return 0; # not processing anything this round as the alignment currently stored in last_line_1 and _2 was in the wrong orientation
+ }
+ else {
+ ### assigning undef to last_seq_id and last_line (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation
+ }
+ }
+ else{
+ die "The orientation of the alignment must be either correct or incorrect\n";
+ }
+ }
+ ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs
+ else{
+ $fhs[$index]->{last_seq_id} = $seqid;
+ $fhs[$index]->{last_line_1} = $newline_1;
+ $fhs[$index]->{last_line_2} = $newline_2;
+ return 0; # processing the new alignment result only in the next round
+ }
+ }
+ else {
+ # assigning undef to last_seq_id and both last_lines (end of bowtie output)
+ $fhs[$index]->{last_seq_id} = undef;
+ $fhs[$index]->{last_line_1} = undef;
+ $fhs[$index]->{last_line_2} = undef;
+ return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation
+ }
+ }
+ else{
+ die "The orientation of the alignment must be either correct or incorrect\n";
+ }
+ }
+ ### the sequence pair stored in @fhs as last_line_1 and last_line_2 is already the next sequence pair to be analysed -> analyse next round
+ else{
+ return 0;
+ }
+}
+
+### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END
+
+sub extract_corresponding_genomic_sequence_paired_ends {
+ my ($sequence_identifier,$methylation_call_params) = @_;
+ ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
+ ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
+ my $alignment_read_1;
+ my $alignment_read_2;
+ my $read_conversion_info_1;
+ my $read_conversion_info_2;
+ my $genome_conversion;
+
+ ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at oone of the ends so that we can also make a CpG, CHG or CHH methylation call
+ ### if the C happens to be at the first or last position of the actually observed sequence
+ my $non_bisulfite_sequence_1;
+ my $non_bisulfite_sequence_2;
+
+ ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or read 2 was
+ ### the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment we need to swap the extracted genomic
+ ### sequences around!
+ ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
+ if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
+ ### [Index 0, sequence originated from (converted) forward strand]
+ $counting{CT_GA_CT_count}++;
+ $alignment_read_1 = '+';
+ $alignment_read_2 = '-';
+ $read_conversion_info_1 = 'CT';
+ $read_conversion_info_2 = 'GA';
+ $genome_conversion = 'CT';
+ ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
+ ### for hits on the forward strand we need to capture 2 extra bases at the 3' end
+
+ $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ##CHH change
+
+ ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
+ ### As the second conversion is GA we need to capture 1 base 3', so that it is a 5' base after reverse complementation
+ if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{start_seq_2}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+1){ ## CHH change to +1
+
+ $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2);
+ ### the reverse strand sequence needs to be reverse complemented
+ $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
+ }
+ else{
+ $non_bisulfite_sequence_2 = '';
+ }
+ }
+
+ ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
+ ### [Index 1, sequence originated from complementary to (converted) reverse strand]
+ $counting{GA_CT_GA_count}++;
+ $alignment_read_1 = '+';
+ $alignment_read_2 = '-';
+ $read_conversion_info_1 = 'GA';
+ $read_conversion_info_2 = 'CT';
+ $genome_conversion = 'GA';
+
+ ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
+ ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we need to capture 2 extra bases at the 5' end
+ if ($methylation_call_params->{$sequence_identifier}->{start_seq_1}-1 > 0){ ## CHH change to -1
+ $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH change to -2/+2
+ }
+ else{
+ $non_bisulfite_sequence_1 = '';
+ }
+
+ ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
+ ### As we are doing a CT comparison for the reverse strand we are taking 2 bases extra at the 5' end, so it is a 3' base after reverse complementation
+ $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to -2/+2
+ ### the reverse strand sequence needs to be reverse complemented
+ $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
+ }
+
+ ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
+ ### [Index 2, sequence originated from the complementary to (converted) forward strand]
+ $counting{GA_CT_CT_count}++;
+ $alignment_read_1 = '-';
+ $alignment_read_2 = '+';
+ $read_conversion_info_1 = 'GA';
+ $read_conversion_info_2 = 'CT';
+ $genome_conversion = 'CT';
+
+ ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
+ ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand
+ ### As read 1 is GA converted we need to capture 2 extra 3' bases which will be 2 extra 5' base after reverse complementation
+ $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to +2
+ ### the reverse strand sequence needs to be reverse complemented
+ $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
+
+ ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1)
+ ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
+ ### Read 2 is CT converted so we need to capture 2 extra 3' bases
+ if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > ($methylation_call_params->{$sequence_identifier}->{start_seq_1})+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+1){ ## CHH change to +1
+ $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ## CHH changed from +1 to +2
+ }
+ else{
+ $non_bisulfite_sequence_2 = '';
+ }
+ }
+
+ ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
+ ### [Index 3, sequence originated from the (converted) reverse strand]
+ $counting{CT_GA_GA_count}++;
+ $alignment_read_1 = '-';
+ $alignment_read_2 = '+';
+ $read_conversion_info_1 = 'CT';
+ $read_conversion_info_2 = 'GA';
+ $genome_conversion = 'GA';
+
+ ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
+ ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand
+ ### As read 1 is CT converted we need to capture 2 extra 5' bases which will be 2 extra 3' base after reverse complementation
+ if ( ($methylation_call_params->{$sequence_identifier}->{start_seq_2}-1) > 0){ ## CHH changed to -1
+ $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH changed to -2/+2
+ ### the reverse strand sequence needs to be reverse complemented
+ $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
+ }
+ else{
+ $non_bisulfite_sequence_1 = '';
+ }
+
+ ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1)
+ ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
+ ### Read 2 is GA converted so we need to capture 2 extra 5' bases
+ $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH changed to -2/+2
+ }
+ else{
+ die "Too many bowtie result filehandles\n";
+ }
+ ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
+ ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
+
+ $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1;
+ $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2;
+ $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
+ $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1;
+ $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2;
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
+}
+
+### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END
+
+sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{
+ my ($sequence_identifier,$methylation_call_params) = @_;
+ ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
+ ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
+
+ my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1};
+ my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2};
+ my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1};
+ my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2};
+ # print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n";
+ # sleep(10);
+ ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
+ ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
+
+ ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
+ ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
+ my $alignment_read_1;
+ my $alignment_read_2;
+ my $read_conversion_info_1;
+ my $read_conversion_info_2;
+ my $genome_conversion;
+
+ ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call
+ ### if the C happens to be at the last position of the actually observed sequence
+ my $non_bisulfite_sequence_1 = '';
+ my $non_bisulfite_sequence_2 = '';
+
+ ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings
+ my $pos_1 = $methylation_call_params->{$sequence_identifier}->{position_1}-1;
+ my $pos_2 = $methylation_call_params->{$sequence_identifier}->{position_2}-1;
+
+ # parsing CIGAR 1 string
+ my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
+ my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
+ shift @ops_1; # remove the empty first element
+ die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
+ # parsing CIGAR 2 string
+ my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
+ my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
+ shift @ops_2; # remove the empty first element
+ die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
+
+ my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output
+ my $indels_2 = 0;
+
+ ### Extracting read 1 genomic sequence ###
+
+ # extracting 2 additional bp at the 5' end (read 1)
+ if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
+ # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ unless ( ($pos_1-2) > 0){# exiting with en empty genomic sequence otherwise
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
+ return;
+ }
+ $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1-2,2);
+ }
+
+ foreach (0..$#len_1){
+ if ($ops_1[$_] eq 'M'){
+ # extracting genomic sequence
+ $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]);
+ # warn "$non_bisulfite_sequence_1\n";
+ # adjusting position
+ $pos_1 += $len_1[$_];
+ }
+ elsif ($ops_1[$_] eq 'I'){ # insertion in the read sequence
+ # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
+ $non_bisulfite_sequence_1 .= 'N' x $len_1[$_];
+ # warn "$non_bisulfite_sequence_1\n";
+ # position doesn't need adjusting
+ $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
+ }
+ elsif ($ops_1[$_] eq 'D'){ # deletion in the read sequence
+ # we do not add any genomic sequence but only adjust the position
+ # warn "Just adjusting the position by: ",$len_1[$_],"bp\n";
+ $pos_1 += $len_1[$_];
+ $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
+ }
+ elsif($cigar_1 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
+ die "The CIGAR 1 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
+ }
+ else{
+ die "The CIGAR 1 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
+ }
+ }
+
+ ### 3' end of read 1
+ if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
+ return;
+ }
+
+ $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2);
+ }
+
+
+ ### Extracting read 2 genomic sequence ###
+
+ ### 5' end of read 2
+ if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ unless ( ($pos_2-2) >= 0){# exiting with en empty genomic sequence otherwise
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
+ return;
+ }
+ $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2-2,2);
+ }
+
+ foreach (0..$#len_2){
+ if ($ops_2[$_] eq 'M'){
+ # extracting genomic sequence
+ $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]);
+ # warn "$non_bisulfite_sequence_2\n";
+ # adjusting position
+ $pos_2 += $len_2[$_];
+ }
+ elsif ($ops_2[$_] eq 'I'){ # insertion in the read sequence
+ # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
+ $non_bisulfite_sequence_2 .= 'N' x $len_2[$_];
+ # warn "$non_bisulfite_sequence_2\n";
+ # position doesn't need adjusting
+ $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
+ }
+ elsif ($ops_2[$_] eq 'D'){ # deletion in the read sequence
+ # we do not add any genomic sequence but only adjust the position
+ # warn "Just adjusting the position by: ",$len_2[$_],"bp\n";
+ $pos_2 += $len_2[$_];
+ $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
+ }
+ elsif($cigar_2 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
+ die "The CIGAR 2 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
+ }
+ else{
+ die "The CIGAR 2 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
+ }
+ }
+
+ ### 3' end of read 2
+ if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise
+ # need to set read 1 as well now to prevent warning
+ # warn "'$non_bisulfite_sequence_1'\n'$non_bisulfite_sequence_2'\n\n";
+ # sleep(5);
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
+ return;
+ }
+ $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2);
+ }
+
+ ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was
+ ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly
+
+ ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
+ if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
+ ### [Index 0, sequence originated from (converted) forward strand]
+ $counting{CT_GA_CT_count}++;
+ $alignment_read_1 = '+';
+ $alignment_read_2 = '-';
+ $read_conversion_info_1 = 'CT';
+ $read_conversion_info_2 = 'GA';
+ $genome_conversion = 'CT';
+ ### Read 1 is always the forward hit
+ ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented
+ $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
+ }
+
+ ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
+ ### [Index 1, sequence originated from complementary to (converted) bottom strand]
+ $counting{GA_CT_GA_count}++;
+ $alignment_read_1 = '+';
+ $alignment_read_2 = '-';
+ $read_conversion_info_1 = 'GA';
+ $read_conversion_info_2 = 'CT';
+ $genome_conversion = 'GA';
+ ### Read 1 is always the forward hit
+ ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented
+ $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
+ }
+
+ ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
+ ### [Index 2, sequence originated from the complementary to (converted) top strand]
+ $counting{GA_CT_CT_count}++;
+ $alignment_read_1 = '-';
+ $alignment_read_2 = '+';
+ $read_conversion_info_1 = 'GA';
+ $read_conversion_info_2 = 'CT';
+ $genome_conversion = 'CT';
+
+ ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
+ $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
+ }
+
+ ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
+ ### [Index 3, sequence originated from the (converted) reverse strand]
+ $counting{CT_GA_GA_count}++;
+ $alignment_read_1 = '-';
+ $alignment_read_2 = '+';
+ $read_conversion_info_1 = 'CT';
+ $read_conversion_info_2 = 'GA';
+ $genome_conversion = 'GA';
+ ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
+ $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
+ }
+ else{
+ die "Too many bowtie result filehandles\n";
+ }
+ ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
+ ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
+
+ $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1;
+ $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2;
+ $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
+ $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1;
+ $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2;
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
+ ## the end position of a read is stored in $pos
+ $methylation_call_params->{$sequence_identifier}->{end_position_1} = $pos_1;
+ $methylation_call_params->{$sequence_identifier}->{end_position_2} = $pos_2;
+ $methylation_call_params->{$sequence_identifier}->{indels_1} = $indels_1;
+ $methylation_call_params->{$sequence_identifier}->{indels_2} = $indels_2;
+}
+
+##########################################
+### PRINT SINGLE END RESULTS: Bowtie 1 ###
+##########################################
+
+sub print_bisulfite_mapping_result_single_end{
+ my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;
+
+ ### we will output the FastQ quality in Sanger encoding (Phred 33 scale)
+ if ($phred64){
+ $quality_value = convert_phred64_quals_to_phred33($quality_value);
+ }
+ elsif ($solexa){
+ $quality_value = convert_solexa_quals_to_phred33($quality_value);
+ }
+
+ ### We will add +1 bp to the starting position of single-end reads, as Bowtie 1 reports the index and not the bp position.
+ $methylation_call_params->{$identifier}->{position} += 1;
+
+ ### writing every uniquely mapped read and its methylation call to the output file
+ if ($vanilla){
+ my $bowtie1_output = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_strand},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{position},$methylation_call_params->{$identifier}->{end_position},$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{methylation_call},$methylation_call_params->{$identifier}->{read_conversion},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value);
+ print OUT "$bowtie1_output\n";
+ }
+ else{ # SAM output, default since Bismark v1.0.0
+ single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script
+ }
+}
+
+##########################################
+### PRINT SINGLE END RESULTS: Bowtie 2 ###
+##########################################
+
+sub print_bisulfite_mapping_result_single_end_bowtie2{
+ my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;
+
+ ### we will output the FastQ quality in Sanger encoding (Phred 33 scale)
+ if ($phred64){
+ $quality_value = convert_phred64_quals_to_phred33($quality_value);
+ }
+ elsif ($solexa){
+ $quality_value = convert_solexa_quals_to_phred33($quality_value);
+ }
+
+ ### writing every mapped read and its methylation call to the SAM output file (unmapped and ambiguous reads were already printed)
+ single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script
+}
+
+##########################################
+### PRINT PAIRED END ESULTS: Bowtie 1 ###
+##########################################
+
+sub print_bisulfite_mapping_results_paired_ends{
+ my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;
+
+ ### we will output the FastQ quality in Sanger encoding (Phred 33 scale)
+ if ($phred64){
+ $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1);
+ $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2);
+ }
+ elsif ($solexa){
+ $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1);
+ $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2);
+ }
+
+ ### We will add +1 bp to the start position of paired-end reads, as Bowtie 1 reports the index and not the bp position. (End position is already 1-based)
+ $methylation_call_params->{$identifier}->{start_seq_1} += 1;
+
+ ### writing every single aligned read and its methylation call to the output file
+ if ($vanilla){
+ my $bowtie1_output_paired_end = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_read_1},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{start_seq_1},$methylation_call_params->{$identifier}->{alignment_end},$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{methylation_call_1},$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{methylation_call_2},$methylation_call_params->{$identifier}->{read_conversion_1},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value_1,$quality_value_2);
+ print OUT "$bowtie1_output_paired_end\n";
+ }
+ else{ # SAM output, default since Bismark v1.0.0
+ paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
+ }
+
+}
+
+##########################################
+### PRINT PAIRED END ESULTS: Bowtie 2 ###
+##########################################
+
+sub print_bisulfite_mapping_results_paired_ends_bowtie2{
+ my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;
+
+ ### we will output the FastQ quality in Sanger encoding (Phred 33 scale)
+ if ($phred64){
+ $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1);
+ $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2);
+ }
+ elsif ($solexa){
+ $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1);
+ $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2);
+ }
+
+ ### writing every single aligned read and its methylation call to the output file (unmapped and ambiguous reads were already printed)
+ paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
+
+}
+
+
+sub convert_phred64_quals_to_phred33{
+
+ my $qual = shift;
+ my @quals = split (//,$qual);
+ my @new_quals;
+
+ foreach my $index (0..$#quals){
+ my $phred_score = convert_phred64_quality_string_into_phred_score ($quals[$index]);
+ my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score);
+ $new_quals[$index] = $phred33_quality_string;
+ }
+
+ my $phred33_quality = join ("",@new_quals);
+ return $phred33_quality;
+}
+
+sub convert_solexa_quals_to_phred33{
+
+ my $qual = shift;
+ my @quals = split (//,$qual);
+ my @new_quals;
+
+ foreach my $index (0..$#quals){
+ my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score ($quals[$index]);
+ my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score);
+ $new_quals[$index] = $phred33_quality_string;
+ }
+
+ my $phred33_quality = join ("",@new_quals);
+ return $phred33_quality;
+}
+
+sub convert_phred_score_into_phred33_quality_string{
+ my $qual = shift;
+ $qual = chr($qual+33);
+ return $qual;
+}
+
+sub convert_phred64_quality_string_into_phred_score{
+ my $string = shift;
+ my $qual = ord($string)-64;
+ return $qual;
+}
+
+sub convert_solexa_pre1_3_quality_string_into_phred_score{
+ ### We will just use 59 as the offset here as all Phred Scores between 10 and 40 look exactly the same, there is only a minute difference for values between 0 and 10
+ my $string = shift;
+ my $qual = ord($string)-59;
+ return $qual;
+}
+
+
+sub extract_corresponding_genomic_sequence_single_end {
+ my ($sequence_identifier,$methylation_call_params) = @_;
+ ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
+ ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
+
+ ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
+ ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
+ my $alignment_strand;
+ my $read_conversion_info;
+ my $genome_conversion;
+ ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and
+ ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation,
+ ### if the C happens to be at the last position of the actually observed sequence
+ my $non_bisulfite_sequence;
+ ### depending on the conversion we want to make need to capture 1 extra base at the 3' end
+
+ ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
+ if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
+ ### [Index 0, sequence originated from (converted) forward strand]
+ $counting{CT_CT_count}++;
+ $alignment_strand = '+';
+ $read_conversion_info = 'CT';
+ $genome_conversion = 'CT';
+
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1
+ ### + 2 extra base at the 3' end
+ $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2
+ }
+ else{
+ $non_bisulfite_sequence = '';
+ }
+ }
+
+ ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
+ ### [Index 1, sequence originated from (converted) reverse strand]
+ $counting{CT_GA_count}++;
+ $alignment_strand = '-';
+ $read_conversion_info = 'CT';
+ $genome_conversion = 'GA';
+
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from >
+ ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation
+ $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2
+ ## reverse complement!
+ $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
+ }
+ else{
+ $non_bisulfite_sequence = '';
+ }
+ }
+
+ ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
+ ### [Index 2, sequence originated from complementary to (converted) forward strand]
+ $counting{GA_CT_count}++;
+ $alignment_strand = '-';
+ $read_conversion_info = 'GA';
+ $genome_conversion = 'CT';
+
+ ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012
+ $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2
+ ## reverse complement!
+ $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
+ }
+ else{
+ $non_bisulfite_sequence = '';
+ }
+ }
+
+ ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
+ ### [Index 3, sequence originated from complementary to (converted) reverse strand]
+ $counting{GA_GA_count}++;
+ $alignment_strand = '+';
+ $read_conversion_info = 'GA';
+ $genome_conversion = 'GA';
+
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from >
+ ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand
+ $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2
+ }
+ else{
+ $non_bisulfite_sequence = '';
+ }
+ }
+ else{
+ die "Too many bowtie result filehandles\n";
+ }
+
+ $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand;
+ $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info;
+ $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
+
+ ### at this point we can also determine the end position of a read
+ $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence});
+}
+
+sub extract_corresponding_genomic_sequence_single_end_pbat {
+ my ($sequence_identifier,$methylation_call_params) = @_;
+ ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
+ ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
+
+ ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
+ ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
+ my $alignment_strand;
+ my $read_conversion_info;
+ my $genome_conversion;
+ ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and
+ ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation,
+ ### if the C happens to be at the last position of the actually observed sequence
+ my $non_bisulfite_sequence;
+ ### depending on the conversion we want to make need to capture 1 extra base at the 3' end
+
+ my $pbat_index = $methylation_call_params->{$sequence_identifier}->{index} + 2; # (we are simply not running indexes 0 or 1!
+
+ ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
+ if ($pbat_index == 0){
+ ### [Index 0, sequence originated from (converted) forward strand]
+ $counting{CT_CT_count}++;
+ $alignment_strand = '+';
+ $read_conversion_info = 'CT';
+ $genome_conversion = 'CT';
+
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1
+ ### + 2 extra base at the 3' end
+ $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2
+ }
+ else{
+ $non_bisulfite_sequence = '';
+ }
+ }
+
+ ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
+ elsif ($pbat_index == 1){
+ ### [Index 1, sequence originated from (converted) reverse strand]
+ $counting{CT_GA_count}++;
+ $alignment_strand = '-';
+ $read_conversion_info = 'CT';
+ $genome_conversion = 'GA';
+
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from >
+ ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation
+ $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2
+ ## reverse complement!
+ $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
+ }
+ else{
+ $non_bisulfite_sequence = '';
+ }
+ }
+
+ ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
+ elsif ($pbat_index == 2){
+ ### [Index 2, sequence originated from complementary to (converted) forward strand]
+ $counting{GA_CT_count}++;
+ $alignment_strand = '-';
+ $read_conversion_info = 'GA';
+ $genome_conversion = 'CT';
+
+ ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012
+ $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2
+ ## reverse complement!
+ $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
+ }
+ else{
+ $non_bisulfite_sequence = '';
+ }
+ }
+
+ ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
+ elsif ($pbat_index == 3){
+ ### [Index 3, sequence originated from complementary to (converted) reverse strand]
+ $counting{GA_GA_count}++;
+ $alignment_strand = '+';
+ $read_conversion_info = 'GA';
+ $genome_conversion = 'GA';
+
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from >
+ ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand
+ $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2
+ }
+ else{
+ $non_bisulfite_sequence = '';
+ }
+ }
+ else{
+ die "Too many bowtie result filehandles\n";
+ }
+
+ $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand;
+ $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info;
+ $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
+
+ ### at this point we can also determine the end position of a read
+ $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence});
+}
+
+
+sub extract_corresponding_genomic_sequence_single_end_bowtie2{
+ my ($sequence_identifier,$methylation_call_params) = @_;
+
+ my $MD_tag = $methylation_call_params->{$sequence_identifier}->{mismatch_info};
+ my $cigar = $methylation_call_params->{$sequence_identifier}->{CIGAR};
+
+ ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
+ ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
+
+ ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
+ ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
+ my $alignment_strand;
+ my $read_conversion_info;
+ my $genome_conversion;
+ ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
+ ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
+ my $non_bisulfite_sequence = '';
+
+ ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings
+ my $pos = $methylation_call_params->{$sequence_identifier}->{position}-1;
+
+ # parsing CIGAR string
+ my @len = split (/\D+/,$cigar); # storing the length per operation
+ my @ops = split (/\d+/,$cigar); # storing the operation
+ shift @ops; # remove the empty first element
+ die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+
+ ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3)
+ if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ unless ( ($pos-2) >= 0){ # exiting with en empty genomic sequence otherwise
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
+ return;
+ }
+ $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos-2,2);
+ }
+ my $indels = 0;
+
+ foreach (0..$#len){
+ if ($ops[$_] eq 'M'){
+ #extracting genomic sequence
+ $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]);
+ # adjusting position
+ $pos += $len[$_];
+ }
+ elsif ($ops[$_] eq 'I'){ # insertion in the read sequence
+ # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
+ $non_bisulfite_sequence .= 'N' x $len[$_];
+ # warn "$non_bisulfite_sequence\n";
+ # position doesn't need to be adjusting
+ $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions)
+ }
+ elsif ($ops[$_] eq 'D'){ # deletion in the read sequence
+ # we do not add any genomic sequence but only adjust the position
+ $pos += $len[$_];
+ $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions)
+ }
+ elsif($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
+ die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
+ }
+ else{
+ die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
+ }
+ }
+
+ ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2)
+ if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
+ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
+ unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos+2){ # exiting with en empty genomic sequence otherwise
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
+ return;
+ }
+ $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,2);
+ # print "$methylation_call_params->{$sequence_identifier}->{bowtie_sequence}\n$non_bisulfite_sequence\n";
+ }
+
+
+
+ ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
+ if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
+ ### [Index 0, sequence originated from (converted) forward strand]
+ $counting{CT_CT_count}++;
+ $alignment_strand = '+';
+ $read_conversion_info = 'CT';
+ $genome_conversion = 'CT';
+ }
+
+ ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
+ ### [Index 1, sequence originated from (converted) reverse strand]
+ $counting{CT_GA_count}++;
+ $alignment_strand = '-';
+ $read_conversion_info = 'CT';
+ $genome_conversion = 'GA';
+
+ ### reverse complement!
+ $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
+ }
+
+ ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
+ ### [Index 2, sequence originated from complementary to (converted) forward strand]
+ $counting{GA_CT_count}++;
+ $alignment_strand = '-';
+ $read_conversion_info = 'GA';
+ $genome_conversion = 'CT';
+
+ ### reverse complement!
+ $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
+ }
+
+ ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
+ elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
+ ### [Index 3, sequence originated from complementary to (converted) reverse strand]
+ $counting{GA_GA_count}++;
+ $alignment_strand = '+';
+ $read_conversion_info = 'GA';
+ $genome_conversion = 'GA';
+
+ }
+ else{
+ die "Too many Bowtie 2 result filehandles\n";
+ }
+
+ $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand;
+ $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info;
+ $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
+ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
+
+ ### the end position of a read is stored in $pos
+ $methylation_call_params->{$sequence_identifier}->{end_position} = $pos;
+ $methylation_call_params->{$sequence_identifier}->{indels} = $indels;
+}
+
+### METHYLATION CALL
+
+sub methylation_call{
+ my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;
+ ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one
+ my @seq = split(//,$sequence_actually_observed);
+ my @genomic = split(//,$genomic_sequence);
+ # print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n";
+ ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either
+ ### CpG, CHH or CHG context
+
+ #################################################################
+ ### . for bases not involving cytosines ###
+ ### X for methylated C in CHG context (was protected) ###
+ ### x for not methylated C in CHG context (was converted) ###
+ ### H for methylated C in CHH context (was protected) ###
+ ### h for not methylated C in CHH context (was converted) ###
+ ### Z for methylated C in CpG context (was protected) ###
+ ### z for not methylated C in CpG context (was converted) ###
+ ### U for methylated C in unknown context (was protected) ###
+ ### u for not methylated C in unknwon context (was converted) ###
+ #################################################################
+
+ my @match =();
+ warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2
+ my $methyl_CHH_count = 0;
+ my $methyl_CHG_count = 0;
+ my $methyl_CpG_count = 0;
+ my $methyl_C_unknown_count = 0;
+ my $unmethylated_CHH_count = 0;
+ my $unmethylated_CHG_count = 0;
+ my $unmethylated_CpG_count = 0;
+ my $unmethylated_C_unknown_count = 0;
+
+ if ($read_conversion eq 'CT'){
+ for my $index (0..$#seq) {
+ if ($seq[$index] eq $genomic[$index]) {
+ ### The residue can only be a C if it was not converted to T, i.e. protected my methylation
+ if ($genomic[$index] eq 'C') {
+ ### If the residue is a C we want to know if it was in CpG context or in any other context
+ my $downstream_base = $genomic[$index+1];
+
+ if ($downstream_base eq 'G'){
+ ++$methyl_CpG_count;
+ push @match,'Z'; # protected C, methylated, in CpG context
+ }
+ elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
+ ++$methyl_C_unknown_count;
+ push @match,'U'; # protected C, methylated, in Unknown context
+ }
+ else {
+ ### C in not in CpG-context, determining the second downstream base context
+ my $second_downstream_base = $genomic[$index+2];
+
+ if ($second_downstream_base eq 'G'){
+ ++$methyl_CHG_count;
+ push @match,'X'; # protected C, methylated, in CHG context
+ }
+ elsif ($second_downstream_base eq 'N'){
+ ++$methyl_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
+ push @match,'U'; # protected C, methylated, in Unknown context
+ }
+ else{
+ ++$methyl_CHH_count;
+ push @match,'H'; # protected C, methylated, in CHH context
+ }
+ }
+ }
+ else {
+ push @match, '.';
+ }
+ }
+ elsif ($seq[$index] ne $genomic[$index]) {
+ ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted into Ts
+ ### in the actually observed sequence
+ if ($genomic[$index] eq 'C' and $seq[$index] eq 'T') {
+ ### If the residue was converted to T we want to know if it was in CpG, CHG or CHH context
+ my $downstream_base = $genomic[$index+1];
+
+ if ($downstream_base eq 'G'){
+ ++$unmethylated_CpG_count;
+ push @match,'z'; # converted C, not methylated, in CpG context
+ }
+ elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
+ ++$unmethylated_C_unknown_count;
+ push @match,'u'; # converted C, not methylated, in Unknown context
+ }
+ else{
+ ### C in not in CpG-context, determining the second downstream base context
+ my $second_downstream_base = $genomic[$index+2];
+
+ if ($second_downstream_base eq 'G'){
+ ++$unmethylated_CHG_count;
+ push @match,'x'; # converted C, not methylated, in CHG context
+ }
+ elsif ($second_downstream_base eq 'N'){
+ ++$unmethylated_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
+ push @match,'u'; # converted C, not methylated, in Unknown context
+ }
+ else{
+ ++$unmethylated_CHH_count;
+ push @match,'h'; # converted C, not methylated, in CHH context
+ }
+ }
+ }
+ ### all other mismatches are not of interest for a methylation call
+ else {
+ push @match,'.';
+ }
+ }
+ else{
+ die "There can be only 2 possibilities\n";
+ }
+ }
+ }
+ elsif ($read_conversion eq 'GA'){
+ # print join ("\n",'***',$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion,'***'),"\n";
+
+ for my $index (0..$#seq) {
+ if ($seq[$index] eq $genomic[$index+2]) {
+ ### The residue can only be a G if the C on the other strand was not converted to T, i.e. protected my methylation
+ if ($genomic[$index+2] eq 'G') {
+ ### If the residue is a G we want to know if the C on the other strand was in CpG, CHG or CHH context, therefore we need
+ ### to look if the base upstream is a C
+
+ my $upstream_base = $genomic[$index+1];
+
+ if ($upstream_base eq 'C'){
+ ++$methyl_CpG_count;
+ push @match,'Z'; # protected C on opposing strand, methylated, in CpG context
+ }
+ elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
+ ++$methyl_C_unknown_count;
+ push @match,'U'; # protected C on opposing strand, methylated, in Unknown context
+ }
+ else{
+ ### C in not in CpG-context, determining the second upstream base context
+ my $second_upstream_base = $genomic[$index];
+
+ if ($second_upstream_base eq 'C'){
+ ++$methyl_CHG_count;
+ push @match,'X'; # protected C on opposing strand, methylated, in CHG context
+ }
+ elsif ($second_upstream_base eq 'N'){
+ ++$methyl_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
+ push @match,'U'; # protected C, methylated, in Unknown context
+ }
+ else{
+ ++$methyl_CHH_count;
+ push @match,'H'; # protected C on opposing strand, methylated, in CHH context
+ }
+ }
+ }
+ else{
+ push @match, '.';
+ }
+ }
+ elsif ($seq[$index] ne $genomic[$index+2]) {
+ ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted to Ts
+ ### on the opposing strand, so G to A conversions in the actually observed sequence
+ if ($genomic[$index+2] eq 'G' and $seq[$index] eq 'A') {
+ ### If the C residue on the opposing strand was converted to T then we will see an A in the currently observed sequence. We want to know if
+ ### the C on the opposing strand was it was in CpG, CHG or CHH context, therefore we need to look one (or two) bases upstream!
+
+ my $upstream_base = $genomic[$index+1];
+
+ if ($upstream_base eq 'C'){
+ ++$unmethylated_CpG_count;
+ push @match,'z'; # converted C on opposing strand, not methylated, in CpG context
+ }
+ elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
+ ++$unmethylated_C_unknown_count;
+ push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context
+ }
+ else{
+ ### C in not in CpG-context, determining the second upstream base context
+ my $second_upstream_base = $genomic[$index];
+
+ if ($second_upstream_base eq 'C'){
+ ++$unmethylated_CHG_count;
+ push @match,'x'; # converted C on opposing strand, not methylated, in CHG context
+ }
+ elsif ($second_upstream_base eq 'N'){
+ ++$unmethylated_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
+ push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context
+ }
+ else{
+ ++$unmethylated_CHH_count;
+ push @match,'h'; # converted C on opposing strand, not methylated, in CHH context
+ }
+ }
+ }
+ ### all other mismatches are not of interest for a methylation call
+ else {
+ push @match,'.';
+ }
+ }
+ else{
+ die "There can be only 2 possibilities\n";
+ }
+ }
+ }
+ else{
+ die "Strand conversion info is required to perform a methylation call\n";
+ }
+
+ my $methylation_call = join ("",@match);
+
+ $counting{total_meCHH_count} += $methyl_CHH_count;
+ $counting{total_meCHG_count} += $methyl_CHG_count;
+ $counting{total_meCpG_count} += $methyl_CpG_count;
+ $counting{total_meC_unknown_count} += $methyl_C_unknown_count;
+ $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count;
+ $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count;
+ $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count;
+ $counting{total_unmethylated_C_unknown_count} += $unmethylated_C_unknown_count;
+
+ # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n";
+ return $methylation_call;
+}
+
+sub read_genome_into_memory{
+ ## working directoy
+ my $cwd = shift;
+ ## reading in and storing the specified genome in the %chromosomes hash
+ chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
+ print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";
+
+ my @chromosome_filenames = <*.fa>;
+
+ ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
+ unless (@chromosome_filenames){
+ @chromosome_filenames = <*.fasta>;
+ }
+
+ unless (@chromosome_filenames){
+ die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
+ }
+
+ foreach my $chromosome_filename (@chromosome_filenames){
+
+ open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";
+ ### first line needs to be a fastA header
+ my $first_line = ;
+ chomp $first_line;
+ $first_line =~ s/\r//;
+
+ ### Extracting chromosome name from the FastA header
+ my $chromosome_name = extract_chromosome_name($first_line);
+
+ my $sequence;
+ while (){
+ chomp;
+ $_ =~ s/\r//;
+ if ($_ =~ /^>/){
+ ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
+ if (exists $chromosomes{$chromosome_name}){
+ print "chr $chromosome_name (",length $sequence ," bp)\n";
+ die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
+ }
+ else {
+ if (length($sequence) == 0){
+ warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
+ }
+ print "chr $chromosome_name (",length $sequence ," bp)\n";
+ $chromosomes{$chromosome_name} = $sequence;
+ }
+ ### resetting the sequence variable
+ $sequence = '';
+ ### setting new chromosome name
+ $chromosome_name = extract_chromosome_name($_);
+ }
+ else{
+ $sequence .= uc$_;
+ }
+ }
+
+ if (exists $chromosomes{$chromosome_name}){
+ print "chr $chromosome_name (",length $sequence ," bp)\t";
+ die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
+ }
+ else{
+ if (length($sequence) == 0){
+ warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
+ }
+ print "chr $chromosome_name (",length $sequence ," bp)\n";
+ $chromosomes{$chromosome_name} = $sequence;
+ }
+ }
+ print "\n";
+ chdir $cwd or die "Failed to move to directory $cwd\n";
+}
+
+sub extract_chromosome_name {
+ ## Bowtie seems to extract the first string after the inition > in the FASTA file, so we are doing this as well
+ my $fasta_header = shift;
+ if ($fasta_header =~ s/^>//){
+ my ($chromosome_name) = split (/\s+/,$fasta_header);
+ return $chromosome_name;
+ }
+ else{
+ die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
+ }
+}
+
+sub reverse_complement{
+ my $sequence = shift;
+ $sequence =~ tr/CATG/GTAC/;
+ $sequence = reverse($sequence);
+ return $sequence;
+}
+
+sub biTransformFastAFiles {
+ my $file = shift;
+ my ($dir,$filename);
+ if ($file =~ /\//){
+ ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
+ }
+ else{
+ $filename = $file;
+ }
+
+ ### gzipped version of the infile
+ if ($file =~ /\.gz$/){
+ open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n";
+ }
+ else{
+ open (IN,$file) or die "Couldn't read from file $file: $!\n";
+ }
+
+ if ($skip){
+ warn "Skipping the first $skip reads from $file\n";
+ sleep (1);
+ }
+ if ($upto){
+ warn "Processing reads up to sequence no. $upto from $file\n";
+ sleep (1);
+ }
+
+ my $C_to_T_infile = my $G_to_A_infile = $filename;
+
+ if ($gzip){
+ $C_to_T_infile =~ s/$/_C_to_T.fa.gz/;
+ $G_to_A_infile =~ s/$/_G_to_A.fa.gz/;
+ }
+ else{
+ $C_to_T_infile =~ s/$/_C_to_T.fa/;
+ $G_to_A_infile =~ s/$/_G_to_A.fa/;
+ }
+
+ if ($prefix){
+ # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n";
+ $C_to_T_infile = "$prefix.$C_to_T_infile";
+ $G_to_A_infile = "$prefix.$G_to_A_infile";
+ # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n";
+ }
+
+ warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
+
+ if ($gzip){
+ open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
+ }
+ else{
+ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
+ }
+
+ unless ($directional){
+ warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
+ if ($gzip){
+ open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
+ }
+ else{
+ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
+ }
+ }
+
+ my $count = 0;
+
+ while (1){
+ my $header = ;
+ my $sequence= ;
+ last unless ($header and $sequence);
+
+ $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces
+
+ ++$count;
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $sequence = uc$sequence; # make input file case insensitive
+
+ # detecting if the input file contains tab stops, as this is likely to result in no alignments
+ if (index($header,"\t") != -1){
+ $seqID_contains_tabs++;
+ }
+
+ ### small check if the sequence seems to be in FastA format
+ die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);
+
+ my $sequence_C_to_T = $sequence;
+ $sequence_C_to_T =~ tr/C/T/;
+ print CTOT "$header$sequence_C_to_T";
+
+ unless ($directional){
+ my $sequence_G_to_A = $sequence;
+ $sequence_G_to_A =~ tr/G/A/;
+ print GTOA "$header$sequence_G_to_A";
+ }
+ }
+ close CTOT or die "Failed to close filehandle $!\n";
+
+ if ($directional){
+ warn "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
+ }
+ else{
+ close GTOA or die "Failed to close filehandle $!\n";
+ warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
+ }
+ return ($C_to_T_infile,$G_to_A_infile);
+}
+
+sub biTransformFastAFiles_paired_end {
+ my ($file,$read_number) = @_;
+
+ if ($gzip){
+ warn "GZIP compression of temporary files is not supported for paired-end FastA data. Continuing to write uncompressed files\n";
+ sleep (2);
+ }
+
+ my ($dir,$filename);
+ if ($file =~ /\//){
+ ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
+ }
+ else{
+ $filename = $file;
+ }
+
+ ### gzipped version of the infile
+ if ($file =~ /\.gz$/){
+ open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n";
+ }
+ else{
+ open (IN,$file) or die "Couldn't read from file $file: $!\n";
+ }
+
+ if ($skip){
+ warn "Skipping the first $skip reads from $file\n";
+ sleep (1);
+ }
+ if ($upto){
+ warn "Processing reads up to sequence no. $upto from $file\n";
+ sleep (1);
+ }
+
+ my $C_to_T_infile = my $G_to_A_infile = $filename;
+
+ $C_to_T_infile =~ s/$/_C_to_T.fa/;
+ $G_to_A_infile =~ s/$/_G_to_A.fa/;
+
+ if ($prefix){
+ # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n";
+ $C_to_T_infile = "$prefix.$C_to_T_infile";
+ $G_to_A_infile = "$prefix.$G_to_A_infile";
+ # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n";
+ }
+
+ if ($directional){
+ if ($read_number == 1){
+ warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
+ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
+ }
+ elsif ($read_number == 2){
+ warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
+ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
+ }
+ else{
+ die "Read number needs to be 1 or 2, but was: $read_number\n\n";
+ }
+ }
+ else{ # all four strand output
+ warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
+ warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
+ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
+ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
+ }
+
+ my $count = 0;
+
+ while (1){
+ my $header = ;
+ my $sequence= ;
+ last unless ($header and $sequence);
+
+ $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces
+
+ ++$count;
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $sequence = uc$sequence; # make input file case insensitive
+
+ # detecting if the input file contains tab stops, as this is likely to result in no alignments
+ if (index($header,"\t") != -1){
+ $seqID_contains_tabs++;
+ }
+
+ ## small check if the sequence seems to be in FastA format
+ die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>/);
+
+ if ($read_number == 1){
+ if ($bowtie2){
+ $header =~ s/$/\/1\/1/;
+ }
+ else{
+ $header =~ s/$/\/1/;
+ }
+ }
+ elsif ($read_number == 2){
+ if ($bowtie2){
+ $header =~ s/$/\/2\/2/;
+ }
+ else{
+ $header =~ s/$/\/2/;
+ }
+ }
+ else{
+ die "Read number needs to be 1 or 2, but was: $read_number\n\n";
+ }
+ my $sequence_C_to_T = my $sequence_G_to_A = $sequence;
+
+ $sequence_C_to_T =~ tr/C/T/;
+ $sequence_G_to_A =~ tr/G/A/;
+
+ if ($directional){
+
+ if ($read_number == 1){
+ print CTOT "$header$sequence_C_to_T";
+ }
+ elsif ($read_number == 2){
+ print GTOA "$header$sequence_G_to_A";
+ }
+ }
+ else{
+ print CTOT "$header$sequence_C_to_T";
+ print GTOA "$header$sequence_G_to_A";
+ }
+ }
+
+ if ($directional){
+ if ($read_number == 1){
+ warn "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
+ }
+ else{
+ warn "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
+ }
+ }
+ else{
+ warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
+ }
+
+ if ($directional){
+ if ($read_number == 1){
+ return ($C_to_T_infile);
+ }
+ else{
+ return ($G_to_A_infile);
+ }
+ }
+ else{
+ return ($C_to_T_infile,$G_to_A_infile);
+ }
+}
+
+
+sub biTransformFastQFiles {
+ my $file = shift;
+ my ($dir,$filename);
+ if ($file =~ /\//){
+ ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
+ }
+ else{
+ $filename = $file;
+ }
+
+ ### gzipped version of the infile
+ if ($file =~ /\.gz$/){
+ open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n";
+ }
+ else{
+ open (IN,$file) or die "Couldn't read from file $file: $!\n";
+ }
+
+ if ($skip){
+ warn "Skipping the first $skip reads from $file\n";
+ sleep (1);
+ }
+ if ($upto){
+ warn "Processing reads up to sequence no. $upto from $file\n";
+ sleep (1);
+ }
+
+ my $C_to_T_infile = my $G_to_A_infile = $filename;
+
+ if ($prefix){
+ # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n";
+ $C_to_T_infile = "$prefix.$C_to_T_infile";
+ $G_to_A_infile = "$prefix.$G_to_A_infile";
+ # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n";
+ }
+
+ if ($pbat){ # PBAT-Seq
+ if ($gzip){
+ $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/;
+ }
+ else{
+ $G_to_A_infile =~ s/$/_G_to_A.fastq/;
+ }
+
+ warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
+
+ if ($gzip){
+ open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
+ }
+ else{
+ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
+ }
+ }
+ else{ # directional or non-directional
+ if ($gzip){
+ $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/;
+ }
+ else{
+ $C_to_T_infile =~ s/$/_C_to_T.fastq/;
+ }
+
+ warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
+
+ if ($gzip){
+ open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
+ }
+ else{
+ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; # uncompressed option
+ }
+
+ unless ($directional){
+ if ($gzip){
+ $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/;
+ }
+ else{
+ $G_to_A_infile =~ s/$/_G_to_A.fastq/;
+ }
+
+ warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
+
+ if ($gzip){
+ open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
+ }
+ else{
+ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
+ }
+ }
+ }
+
+ my $count = 0;
+ while (1){
+ my $identifier = ;
+ my $sequence = ;
+ my $identifier2 = ;
+ my $quality_score = ;
+ last unless ($identifier and $sequence and $identifier2 and $quality_score);
+
+ $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
+
+ ++$count;
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $sequence = uc$sequence; # make input file case insensitive
+
+ # detecting if the input file contains tab stops, as this is likely to result in no alignments
+ if (index($identifier,"\t") != -1){
+ $seqID_contains_tabs++;
+ }
+
+ ## small check if the sequence file appears to be a FastQ file
+ if ($count == 1){
+ if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
+ die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
+ }
+ }
+
+ if ($pbat){
+ my $sequence_G_to_A = $sequence;
+ $sequence_G_to_A =~ tr/G/A/;
+ print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
+ }
+ else{ # directional or non-directional
+ my $sequence_C_to_T = $sequence;
+ $sequence_C_to_T =~ tr/C/T/;
+ print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
+
+ unless ($directional){
+ my $sequence_G_to_A = $sequence;
+ $sequence_G_to_A =~ tr/G/A/;
+ print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
+ }
+ }
+ }
+
+ if ($directional){
+ close CTOT or die "Failed to close filehandle $!\n";
+ warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
+ }
+ elsif($pbat){
+ warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
+ close GTOA or die "Failed to close filehandle $!\n";
+ return ($G_to_A_infile);
+ }
+ else{
+ close CTOT or die "Failed to close filehandle $!\n";
+ close GTOA or die "Failed to close filehandle $!\n";
+ warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
+ }
+
+ return ($C_to_T_infile,$G_to_A_infile);
+}
+
+sub biTransformFastQFiles_paired_end {
+ my ($file,$read_number) = @_;
+ my ($dir,$filename);
+
+ if ($file =~ /\//){
+ ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
+ }
+ else{
+ $filename = $file;
+ }
+
+ ### gzipped version of the infile
+ if ($file =~ /\.gz$/){
+ open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n";
+ }
+ else{
+ open (IN,$file) or die "Couldn't read from file $file: $!\n";
+ }
+
+ if ($skip){
+ warn "Skipping the first $skip reads from $file\n";
+ sleep (1);
+ }
+ if ($upto){
+ warn "Processing reads up to sequence no. $upto from $file\n";
+ sleep (1);
+ }
+
+ my $C_to_T_infile = my $G_to_A_infile = $filename;
+
+ if ($gzip){
+ $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/;
+ $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/;
+ }
+ else{
+ $C_to_T_infile =~ s/$/_C_to_T.fastq/;
+ $G_to_A_infile =~ s/$/_G_to_A.fastq/;
+ }
+
+ if ($prefix){
+ # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n";
+ $C_to_T_infile = "$prefix.$C_to_T_infile";
+ $G_to_A_infile = "$prefix.$G_to_A_infile";
+ # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n";
+ }
+
+ if ($directional){
+ if ($read_number == 1){
+ warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
+ if ($gzip){
+ open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
+ }
+ else{
+ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
+ }
+ }
+ elsif ($read_number == 2){
+ warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
+ if ($gzip){
+ open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
+ }
+ else{
+ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
+ }
+ }
+ else{
+ die "Read number needs to be 1 or 2, but was $read_number!\n\n";
+ }
+ }
+ else{
+ warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
+ warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
+ if ($gzip){
+ open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
+ open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
+ }
+ else{
+ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
+ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
+ }
+ }
+
+ my $count = 0;
+ while (1){
+ my $identifier = ;
+ my $sequence = ;
+ my $identifier2 = ;
+ my $quality_score = ;
+ last unless ($identifier and $sequence and $identifier2 and $quality_score);
+ ++$count;
+
+ $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $sequence= uc$sequence; # make input file case insensitive
+
+ ## small check if the sequence file appears to be a FastQ file
+ if ($count == 1){
+ if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
+ die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
+ }
+ }
+ my $sequence_C_to_T = my $sequence_G_to_A = $sequence;
+
+ if ($read_number == 1){
+ if ($bowtie2){
+ $identifier =~ s/$/\/1\/1/;
+ }
+ else{
+ $identifier =~ s/$/\/1/;
+ }
+ }
+ elsif ($read_number == 2){
+ if ($bowtie2){
+ $identifier =~ s/$/\/2\/2/;
+ }
+ else{
+ $identifier =~ s/$/\/2/;
+ }
+ }
+ else{
+ die "Read number needs to be 1 or 2\n";
+ }
+
+ $sequence_C_to_T =~ tr/C/T/;
+ $sequence_G_to_A =~ tr/G/A/;
+
+ if ($directional){
+ if ($read_number == 1){
+ print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
+ }
+ else{
+ print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
+ }
+ }
+ else{
+ print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
+ print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
+ }
+ }
+
+ if ($directional){
+ if ($read_number == 1){
+ warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
+ }
+ else{
+ warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
+ }
+ }
+ else{
+ warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
+ }
+ if ($directional){
+ if ($read_number == 1){
+ close CTOT or die "Failed to close filehandle $!\n";
+ return ($C_to_T_infile);
+ }
+ else{
+ close GTOA or die "Failed to close filehandle $!\n";
+ return ($G_to_A_infile);
+ }
+ }
+ else{
+ close CTOT or die "Failed to close filehandle $!\n";
+ close GTOA or die "Failed to close filehandle $!\n";
+ return ($C_to_T_infile,$G_to_A_infile);
+ }
+}
+
+
+### SPECIAL BOWTIE 1 PAIRED-END FORMAT FOR GZIPPED OUTPUT FILES
+
+sub biTransformFastQFiles_paired_end_bowtie1_gzip {
+ my ($file_1,$file_2) = @_;
+ my ($dir,$filename);
+
+ if ($file_1 =~ /\//){
+ ($dir,$filename) = $file_1 =~ m/(.*\/)(.*)$/;
+ }
+ else{
+ $filename = $file_1;
+ }
+
+ ### gzipped version of infile 1
+ if ($file_1 =~ /\.gz$/){
+ open (IN_1,"zcat $file_1 |") or die "Couldn't read from file $file_1: $!\n";
+ }
+ else{
+ open (IN_1,$file_1) or die "Couldn't read from file $file_1: $!\n";
+ }
+ ### gzipped version of infile 2
+ if ($file_2 =~ /\.gz$/){
+ open (IN_2,"zcat $file_2 |") or die "Couldn't read from file $file_2: $!\n";
+ }
+ else{
+ open (IN_2,$file_2) or die "Couldn't read from file $file_2: $!\n";
+ }
+
+
+ if ($skip){
+ warn "Skipping the first $skip reads from $file_1 and $file_2\n";
+ sleep (1);
+ }
+ if ($upto){
+ warn "Processing reads up to sequence no. $upto from $file_1 and $file_2\n";
+ sleep (1);
+ }
+
+ my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename;
+
+ if ($prefix){
+ # warn "Prefixing $prefix:\nold: $CT_plus_GA_infile\nold: $GA_plus_CT_infile\n\n";
+ $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile";
+ $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile";
+ # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n";
+ }
+
+ $CT_plus_GA_infile =~ s/$/.CT_plus_GA.fastq.gz/;
+ $GA_plus_CT_infile =~ s/$/.GA_plus_CT.fastq.gz/;
+ # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n";
+
+ warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n";
+ open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n";
+ # open (CTPLUSGA,'>',"$temp_dir$CT_plus_GA_infile") or die "Couldn't write to file $!\n";
+
+ unless ($directional){
+ print "Writing a G -> A converted version of $file_1 and a C -> T converted version of $file_2 to $temp_dir$GA_plus_CT_infile\n";
+ open (GAPLUSCT,"| gzip -c - > ${temp_dir}${GA_plus_CT_infile}") or die "Can't write to file: $!\n";
+ }
+
+ ### for Bowtie 1 we need to write a single gzipped file with 1 line per pair of sequences in the the following format:
+ ###
+
+ my $count = 0;
+ while (1){
+ my $identifier_1 = ;
+ my $sequence_1 = ;
+ my $identifier2_1 = ;
+ my $quality_score_1 = ;
+
+ my $identifier_2 = ;
+ my $sequence_2 = ;
+ my $identifier2_2 = ;
+ my $quality_score_2 = ;
+
+ last unless ($identifier_1 and $sequence_1 and $identifier2_1 and $quality_score_1 and $identifier_2 and $sequence_2 and $identifier2_2 and $quality_score_2);
+
+ ++$count;
+
+ ## small check if the sequence file appears to be a FastQ file
+ if ($count == 1){
+ if ($identifier_1 !~ /^\@/ or $identifier2_1 !~ /^\+/){
+ die "Input file 1 doesn't seem to be in FastQ format at sequence $count: $!\n";
+ }
+ if ($identifier_2 !~ /^\@/ or $identifier2_2 !~ /^\+/){
+ die "Input file 2 doesn't seem to be in FastQ format at sequence $count: $!\n";
+ }
+ }
+
+ $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
+ chomp $identifier_1;
+ chomp $sequence_1;
+ chomp $sequence_2;
+ chomp $quality_score_1;
+ chomp $quality_score_2;
+
+ $identifier_1 =~ s/^\@//;
+ $identifier_1 =~ s/$/\/1/; #adding an extra /1 to the end which is being removed by Bowtie otherwise (which leads to no sequences alignments whatsoever)
+
+ if ($skip){
+ next unless ($count > $skip);
+ }
+ if ($upto){
+ last if ($count > $upto);
+ }
+
+ $sequence_1 = uc$sequence_1; # make input file 1 case insensitive
+ $sequence_2 = uc$sequence_2; # make input file 2 case insensitive
+
+ # print "$identifier_1\t$sequence_1\t$quality_score_1\t$sequence_2\t$quality_score_2\n";
+ my $sequence_1_C_to_T = $sequence_1;
+ my $sequence_2_G_to_A = $sequence_2;
+ $sequence_1_C_to_T =~ tr/C/T/;
+ $sequence_2_G_to_A =~ tr/G/A/;
+
+ print CTPLUSGA "$identifier_1\t$sequence_1_C_to_T\t$quality_score_1\t$sequence_2_G_to_A\t$quality_score_2\n";
+
+ unless ($directional){
+ my $sequence_1_G_to_A = $sequence_1;
+ my $sequence_2_C_to_T = $sequence_2;
+ $sequence_1_G_to_A =~ tr/G/A/;
+ $sequence_2_C_to_T =~ tr/C/T/;
+ print GAPLUSCT "$identifier_1\t$sequence_1_G_to_A\t$quality_score_1\t$sequence_2_C_to_T\t$quality_score_2\n";
+ }
+ }
+
+ close CTPLUSGA or die "Couldn't close filehandle\n";
+ warn "\nCreated C -> T converted version of FastQ file '$file_1' and G -> A converted version of FastQ file '$file_2' ($count sequences in total)\n";
+
+ if ($directional){
+ warn "\n";
+ return ($CT_plus_GA_infile);
+ }
+ else{
+ close GAPLUSCT or die "Couldn't close filehandle\n";
+ warn "Created G -> A converted version of FastQ file '$file_1' and C -> T converted version of FastQ file '$file_2' ($count sequences in total)\n\n";
+ return ($CT_plus_GA_infile,$GA_plus_CT_infile);
+ }
+}
+
+
+sub fix_IDs{
+ my $id = shift;
+ $id =~ s/[ \t]+/_/g; # replace spaces or tabs with underscores
+ return $id;
+}
+
+sub ensure_sensical_alignment_orientation_single_end{
+ my $index = shift; # index number if the sequence produced an alignment
+ my $strand = shift;
+ ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one
+ my $orientation = 0;
+ ##############################################################################################################
+ ## FORWARD converted read against FORWARD converted genome (read: C->T.....C->T.. genome:C->T.......C->T)
+ ## here we only want reads in the forward (+) orientation
+ if ($fhs[$index]->{name} eq 'CTreadCTgenome') {
+ ### if the alignment is (+) we count it, and return 1 for a correct orientation
+ if ($strand eq '+') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the orientation equals (-) the alignment is nonsensical
+ elsif ($strand eq '-') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ }
+ ###############################################################################################################
+ ## FORWARD converted read against reverse converted genome (read: C->T.....C->T.. genome: G->A.......G->A)
+ ## here we only want reads in the forward (-) orientation
+ elsif ($fhs[$index]->{name} eq 'CTreadGAgenome') {
+ ### if the alignment is (-) we count it and return 1 for a correct orientation
+ if ($strand eq '-') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the orientation equals (+) the alignment is nonsensical
+ elsif ($strand eq '+') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ }
+ ###############################################################################################################
+ ## Reverse converted read against FORWARD converted genome (read: G->A.....G->A.. genome: C->T.......C->T)
+ ## here we only want reads in the forward (-) orientation
+ elsif ($fhs[$index]->{name} eq 'GAreadCTgenome') {
+ ### if the alignment is (-) we count it and return 1 for a correct orientation
+ if ($strand eq '-') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the orientation equals (+) the alignment is nonsensical
+ elsif ($strand eq '+') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ }
+ ###############################################################################################################
+ ## Reverse converted read against reverse converted genome (read: G->A.....G->A.. genome: G->A.......G->A)
+ ## here we only want reads in the forward (+) orientation
+ elsif ($fhs[$index]->{name} eq 'GAreadGAgenome') {
+ ### if the alignment is (+) we count it and return 1 for a correct orientation
+ if ($strand eq '+') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the orientation equals (-) the alignment is nonsensical
+ elsif ($strand eq '-') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ } else{
+ die "One of the above conditions must be true\n";
+ }
+}
+
+sub ensure_sensical_alignment_orientation_paired_ends{
+ my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_; # index number if the sequence produced an alignment
+ ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one
+ my $orientation = 0;
+ ##############################################################################################################
+ ## [Index 0, sequence originated from (converted) forward strand]
+ ## CT converted read 1
+ ## GA converted read 2
+ ## CT converted genome
+ ## here we only want read 1 in (+) orientation and read 2 in (-) orientation
+ if ($fhs[$index]->{name} eq 'CTread1GAread2CTgenome') {
+ ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation
+ if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical
+ elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ else{
+ die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
+ }
+ }
+ ###############################################################################################################
+ ## [Index 1, sequence originated from (converted) reverse strand]
+ ## GA converted read 1
+ ## CT converted read 2
+ ## GA converted genome
+ ## here we only want read 1 in (+) orientation and read 2 in (-) orientation
+ elsif ($fhs[$index]->{name} eq 'GAread1CTread2GAgenome') {
+ ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation
+ if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical
+ elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ else{
+ die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
+ }
+ }
+ ###############################################################################################################
+ ## [Index 2, sequence originated from complementary to (converted) forward strand]
+ ## GA converted read 1
+ ## CT converted read 2
+ ## CT converted genome
+ ## here we only want read 1 in (-) orientation and read 2 in (+) orientation
+ elsif ($fhs[$index]->{name} eq 'GAread1CTread2CTgenome') {
+ ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation
+ if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical
+ elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ else{
+ die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
+ }
+ }
+ ###############################################################################################################
+ ## [Index 3, sequence originated from complementary to (converted) reverse strand]
+ ## CT converted read 1
+ ## GA converted read 2
+ ## GA converted genome
+ ## here we only want read 1 in (+) orientation and read 2 in (-) orientation
+ elsif ($fhs[$index]->{name} eq 'CTread1GAread2GAgenome') {
+ ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation
+ if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') {
+ $fhs[$index]->{seen}++;
+ $orientation = 1;
+ return $orientation;
+ }
+ ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical
+ elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') {
+ $fhs[$index]->{wrong_strand}++;
+ return $orientation;
+ }
+ else{
+ die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
+ }
+ }
+ else{
+ die "One of the above conditions must be true\n";
+ }
+}
+
+#####################################################################################################################################################
+
+### Bowtie 1 (default) | PAIRED-END | FASTA
+
+sub paired_end_align_fragments_to_bisulfite_genome_fastA {
+
+ my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+
+ if ($directional){
+ warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
+ }
+ else{
+ warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
+ }
+
+ ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the
+ ## data structure above
+ if ($directional){
+ warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+
+ foreach my $fh (@fhs) {
+
+ if ($directional){
+ unless ($fh->{inputfile_1}){
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ next;
+ }
+ }
+
+ my $bt_options = $bowtie_options;
+ if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
+ $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt_options .= ' --nofw';
+ }
+
+ warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n";
+ open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";
+
+ my $line_1 = $fh->{fh}->getline();
+ my $line_2 = $fh->{fh}->getline();
+
+ # if Bowtie produces an alignment we store the first line of the output
+ if ($line_1 and $line_2) {
+ chomp $line_1;
+ chomp $line_2;
+ my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
+ my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line
+
+ ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
+ ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
+
+ if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present
+ $fh->{last_seq_id} = $id_1;
+ }
+ elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present
+ $fh->{last_seq_id} = $id_2;
+ }
+ else{
+ die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
+ }
+
+ $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
+ $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
+ warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
+ }
+ # otherwise we just initialise last_seq_id and last_lines as undefined
+ else {
+ warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ }
+ }
+}
+
+### Bowtie 2 | PAIRED-END | FASTA
+
+sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
+ my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+ if ($directional){
+ warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
+ }
+ else{
+ warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
+ }
+
+ ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the
+ ## data structure above
+ if ($directional){
+ warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+
+ foreach my $fh (@fhs) {
+
+ if ($directional){
+ unless ($fh->{inputfile_1}){
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ next;
+ }
+ }
+
+ my $bt2_options = $bowtie_options;
+ if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
+ $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt2_options .= ' --nofw';
+ }
+
+ warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
+ open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";
+
+ ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence
+ while (1){
+ $_ = $fh->{fh}->getline();
+ if ($_) {
+ last unless ($_ =~ /^\@/); # SAM headers start with @
+ }
+ else{
+ last; # no alignment output
+ }
+ }
+
+ my $line_1 = $_;
+ my $line_2 = $fh->{fh}->getline();
+
+ # if Bowtie produces an alignment we store the first line of the output
+ if ($line_1 and $line_2) {
+ chomp $line_1;
+ chomp $line_2;
+ my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
+ my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line
+
+ ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
+ ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
+
+ if ($id_1 =~ s/\/1$//){ # removing the read 1 /1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with
+ $fh->{last_seq_id} = $id_1;
+ }
+ elsif ($id_2 =~ s/\/1$//){ # removing the read 1 /2 tag if present
+ $fh->{last_seq_id} = $id_2;
+ }
+ else{
+ warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
+ }
+
+ $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
+ $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
+ warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
+ }
+ # otherwise we just initialise last_seq_id and last_lines as undefined
+ else {
+ warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ }
+ }
+}
+
+### Bowtie 1 (default) | PAIRED-END | FASTQ
+
+sub paired_end_align_fragments_to_bisulfite_genome_fastQ {
+ my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+
+ if ($directional){
+ warn "Input file is $C_to_T_infile_1 (FastQ)\n";
+ }
+ elsif($pbat){
+ warn "Input file is $G_to_A_infile_1 (FastQ; PBAT-Seq)\n";
+ }
+ else{
+ warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 (FastQ)\n";
+ }
+
+ ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the
+ ## data structure above
+ if ($directional or $pbat){
+ warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+
+ foreach my $fh (@fhs) {
+
+ if ($directional or $pbat){
+ unless ($fh->{inputfile_1}){
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ next; # skipping unwanted filehandles
+ }
+ }
+
+ my $bt_options = $bowtie_options;
+ if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
+ $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt_options .= ' --nofw';
+ }
+
+ if ($gzip){
+ warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1}, with the options: $bt_options)\n";
+ open ($fh->{fh},"zcat ${temp_dir}$fh->{inputfile_1} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} --12 - |") or die "Can't open pipe to bowtie: $!";
+ }
+ else{
+ warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1} and ${temp_dir}$fh->{inputfile_2}, with the options: $bt_options))\n";
+ sleep(5);
+ open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";
+ }
+
+ my $line_1 = $fh->{fh}->getline();
+ my $line_2 = $fh->{fh}->getline();
+
+ # if Bowtie produces an alignment we store the first line of the output
+ if ($line_1 and $line_2) {
+ chomp $line_1;
+ chomp $line_2;
+ ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
+ ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
+
+ my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
+ my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line
+
+ if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present
+ $fh->{last_seq_id} = $id_1;
+ }
+ elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present
+ $fh->{last_seq_id} = $id_2;
+ }
+ else{
+ die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
+ }
+
+ $fh->{last_line_1} = $line_1; # this contains read 1 or read 2
+ $fh->{last_line_2} = $line_2; # this contains read 1 or read 2
+ warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
+ }
+
+ # otherwise we just initialise last_seq_id and last_lines as undefined
+ else {
+ warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ }
+ }
+}
+
+### Bowtie 2 | PAIRED-END | FASTQ
+
+sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
+ my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
+ if ($directional){
+ warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n";
+ }
+ else{
+ warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
+ }
+
+ ## Now starting up 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the
+ ## data structure above
+ if ($directional){
+ warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+
+ foreach my $fh (@fhs) {
+
+ if ($directional){
+ unless ($fh->{inputfile_1}){
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ next;
+ }
+ }
+
+ my $bt2_options = $bowtie_options;
+ if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
+ $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt2_options .= ' --nofw';
+ }
+
+ warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
+ open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";
+
+ ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence
+ while (1){
+ $_ = $fh->{fh}->getline();
+ if ($_) {
+ last unless ($_ =~ /^\@/); # SAM headers start with @
+ }
+ else{
+ last; # no alignment output
+ }
+ }
+
+ my $line_1 = $_;
+ my $line_2 = $fh->{fh}->getline();
+
+ # if Bowtie produces an alignment we store the first line of the output
+ if ($line_1 and $line_2) {
+ chomp $line_1;
+ chomp $line_2;
+ ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
+ ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
+
+ my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
+ my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line
+
+ if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with
+ $fh->{last_seq_id} = $id_1;
+ }
+ elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present
+ $fh->{last_seq_id} = $id_2;
+ }
+ else{
+ die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
+ }
+
+ $fh->{last_line_1} = $line_1; # this contains read 1 or read 2
+ $fh->{last_line_2} = $line_2; # this contains read 1 or read 2
+ warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
+ }
+
+ # otherwise we just initialise last_seq_id and last_lines as undefined
+ else {
+ warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line_1} = undef;
+ $fh->{last_line_2} = undef;
+ }
+ }
+}
+
+#####################################################################################################################################################
+
+### Bowtie 1 (default) | SINGLE-END | FASTA
+sub single_end_align_fragments_to_bisulfite_genome_fastA {
+ my ($C_to_T_infile,$G_to_A_infile) = @_;
+ if ($directional){
+ warn "Input file is $C_to_T_infile (FastA)\n";
+ }
+ else{
+ warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
+ }
+
+ ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in
+ ## data structure above
+ if ($directional){
+ warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+
+ foreach my $fh (@fhs) {
+
+ my $bt_options = $bowtie_options;
+ if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
+ $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt_options .= ' --nofw';
+ }
+
+ warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
+ if ($gzip){
+ open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!";
+ }
+ else{
+ open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data
+ }
+
+ # if Bowtie produces an alignment we store the first line of the output
+ $_ = $fh->{fh}->getline();
+ if ($_) {
+ chomp;
+ my $id = (split(/\t/))[0]; # this is the first element of the bowtie output (= the sequence identifier)
+ $fh->{last_seq_id} = $id;
+ $fh->{last_line} = $_;
+ warn "Found first alignment:\t$fh->{last_line}\n";
+ }
+ # otherwise we just initialise last_seq_id and last_line as undefined
+ else {
+ warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line} = undef;
+ }
+ }
+}
+
+### Bowtie 2 | SINGLE-END | FASTA
+sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
+ my ($C_to_T_infile,$G_to_A_infile) = @_;
+ if ($directional){
+ warn "Input file is $C_to_T_infile (FastA)\n";
+ }
+ else{
+ warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
+ }
+
+ ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in
+ ## data structure above
+ if ($directional){
+ warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+
+ foreach my $fh (@fhs) {
+
+ my $bt2_options = $bowtie_options;
+ if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
+ $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt2_options .= ' --nofw';
+ }
+
+ warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n";
+ open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";
+
+ ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence
+ while (1){
+ $_ = $fh->{fh}->getline();
+ if ($_) {
+ last unless ($_ =~ /^\@/); # SAM headers start with @
+ }
+ else{
+ last; # no alignment output
+ }
+ }
+
+ # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output
+ if ($_) {
+ chomp;
+ my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
+ $fh->{last_seq_id} = $id;
+ $fh->{last_line} = $_;
+ warn "Found first alignment:\t$fh->{last_line}\n";
+ }
+ # otherwise we just initialise last_seq_id and last_line as undefinded. This should only happen at the end of a file for Bowtie 2 output
+ else {
+ warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line} = undef;
+ }
+ }
+}
+
+
+### Bowtie 1 (default) | SINGLE-END | FASTQ
+sub single_end_align_fragments_to_bisulfite_genome_fastQ {
+ my ($C_to_T_infile,$G_to_A_infile) = @_;
+ if ($directional){
+ warn "Input file is $C_to_T_infile (FastQ)\n";
+ }
+ elsif($pbat){
+ warn "Input file is $G_to_A_infile (FastQ)\n";
+ }
+ else{
+ warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
+ }
+
+
+ ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in
+ ## the data structure above
+ if ($directional or $pbat){
+ warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+
+ foreach my $fh (@fhs) {
+ my $bt_options = $bowtie_options;
+ if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
+ $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt_options .= ' --nofw';
+ }
+
+ warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
+ sleep (5);
+
+ if ($gzip){
+ open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bowtie_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!";
+ }
+ else{
+ open ($fh->{fh},"$path_to_bowtie $bowtie_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data
+ }
+
+ # if Bowtie produces an alignment we store the first line of the output
+ $_ = $fh->{fh}->getline();
+ if ($_) {
+ chomp;
+ my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
+ $fh->{last_seq_id} = $id;
+ $fh->{last_line} = $_;
+ warn "Found first alignment:\t$fh->{last_line}\n";
+ }
+ # otherwise we just initialise last_seq_id and last_line as undefined
+ else {
+ warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line} = undef;
+ }
+ }
+}
+
+### Bowtie 2 | SINGLE-END | FASTQ
+sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
+
+ my ($C_to_T_infile,$G_to_A_infile) = @_;
+ if ($directional){
+ warn "Input file is $C_to_T_infile (FastQ)\n\n";
+ }
+ else{
+ warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n";
+ }
+
+ ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in
+ ## the data structure above
+ if ($directional){
+ warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ else{
+ warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
+ }
+ foreach my $fh (@fhs) {
+ my $bt2_options = $bowtie_options;
+ if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
+ $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
+ }
+ else {
+ $bt2_options .= ' --nofw';
+ }
+ warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n";
+ warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";
+
+ open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";
+ ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence
+ while (1){
+ $_ = $fh->{fh}->getline();
+ # warn "$_\n";
+ # sleep(1);
+ if ($_) {
+ last unless ($_ =~ /^\@/); # SAM headers start with @
+ }
+ else {
+ last;
+ }
+ }
+
+ # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output
+ if ($_) {
+ chomp;
+ my $id = (split(/\t/))[0]; # this is the first element of the Bowtie 2 output (= the sequence identifier)
+ $fh->{last_seq_id} = $id;
+ $fh->{last_line} = $_;
+ warn "Found first alignment:\t$fh->{last_line}\n";
+ # warn "storing $id and\n$_\n";
+ }
+ # otherwise we just initialise last_seq_id and last_line as undefined. This should only happen at the end of a file for Bowtie 2 output
+ else {
+ warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
+ $fh->{last_seq_id} = undef;
+ $fh->{last_line} = undef;
+ }
+ }
+}
+
+###########################################################################################################################################
+
+sub reset_counters_and_fhs{
+ my $filename = shift;
+ %counting=(
+ total_meCHH_count => 0,
+ total_meCHG_count => 0,
+ total_meCpG_count => 0,
+ total_meC_unknown_count => 0,
+ total_unmethylated_CHH_count => 0,
+ total_unmethylated_CHG_count => 0,
+ total_unmethylated_CpG_count => 0,
+ total_unmethylated_C_unknown_count => 0,
+ sequences_count => 0,
+ no_single_alignment_found => 0,
+ unsuitable_sequence_count => 0,
+ genomic_sequence_could_not_be_extracted_count => 0,
+ unique_best_alignment_count => 0,
+ low_complexity_alignments_overruled_count => 0,
+ CT_CT_count => 0, #(CT read/CT genome, original top strand)
+ CT_GA_count => 0, #(CT read/GA genome, original bottom strand)
+ GA_CT_count => 0, #(GA read/CT genome, complementary to original top strand)
+ GA_GA_count => 0, #(GA read/GA genome, complementary to original bottom strand)
+ CT_GA_CT_count => 0, #(CT read1/GA read2/CT genome, original top strand)
+ GA_CT_GA_count => 0, #(GA read1/CT read2/GA genome, complementary to original bottom strand)
+ GA_CT_CT_count => 0, #(GA read1/CT read2/CT genome, complementary to original top strand)
+ CT_GA_GA_count => 0, #(CT read1/GA read2/GA genome, original bottom strand)
+ alignments_rejected_count => 0, # only relevant if --directional was specified
+ );
+
+ if ($directional){
+ if ($filename =~ ','){ # paired-end files
+ @fhs=(
+ { name => 'CTreadCTgenome',
+ strand_identity => 'con ori forward',
+ bisulfiteIndex => $CT_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'CTreadGAgenome',
+ strand_identity => 'con ori reverse',
+ bisulfiteIndex => $GA_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'GAreadCTgenome',
+ strand_identity => 'compl ori con forward',
+ bisulfiteIndex => $CT_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'GAreadGAgenome',
+ strand_identity => 'compl ori con reverse',
+ bisulfiteIndex => $GA_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ );
+ }
+ else{ # single-end files
+ @fhs=(
+ { name => 'CTreadCTgenome',
+ strand_identity => 'con ori forward',
+ bisulfiteIndex => $CT_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'CTreadGAgenome',
+ strand_identity => 'con ori reverse',
+ bisulfiteIndex => $GA_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ );
+ }
+ }
+ elsif($pbat){
+ if ($filename =~ ','){ # paired-end files
+ @fhs=(
+ { name => 'CTreadCTgenome',
+ strand_identity => 'con ori forward',
+ bisulfiteIndex => $CT_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'CTreadGAgenome',
+ strand_identity => 'con ori reverse',
+ bisulfiteIndex => $GA_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'GAreadCTgenome',
+ strand_identity => 'compl ori con forward',
+ bisulfiteIndex => $CT_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'GAreadGAgenome',
+ strand_identity => 'compl ori con reverse',
+ bisulfiteIndex => $GA_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ );
+ }
+ else{ # single-end files
+ @fhs=(
+ { name => 'GAreadCTgenome',
+ strand_identity => 'compl ori con forward',
+ bisulfiteIndex => $CT_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'GAreadGAgenome',
+ strand_identity => 'compl ori con reverse',
+ bisulfiteIndex => $GA_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ );
+ }
+ }
+ else{
+ @fhs=(
+ { name => 'CTreadCTgenome',
+ strand_identity => 'con ori forward',
+ bisulfiteIndex => $CT_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'CTreadGAgenome',
+ strand_identity => 'con ori reverse',
+ bisulfiteIndex => $GA_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'GAreadCTgenome',
+ strand_identity => 'compl ori con forward',
+ bisulfiteIndex => $CT_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ { name => 'GAreadGAgenome',
+ strand_identity => 'compl ori con reverse',
+ bisulfiteIndex => $GA_index_basename,
+ seen => 0,
+ wrong_strand => 0,
+ },
+ );
+ }
+}
+
+
+sub process_command_line{
+ my @bowtie_options;
+ my $help;
+ my $mates1;
+ my $mates2;
+ my $path_to_bowtie;
+ my $fastq;
+ my $fasta;
+ my $skip;
+ my $qupto;
+ my $phred64;
+ my $phred33;
+ my $solexa;
+ my $mismatches;
+ my $seed_length;
+ my $best;
+ my $sequence_format;
+ my $version;
+ my $quiet;
+ my $chunk;
+ my $non_directional;
+ my $ceiling;
+ my $maxins;
+ my $minins;
+ my $unmapped;
+ my $multi_map;
+ my $output_dir;
+ my $bowtie2;
+ my $vanilla;
+ my $sam_no_hd;
+ my $seed_extension_fails;
+ my $reseed_repetitive_seeds;
+ my $most_valid_alignments;
+ my $score_min;
+ my $parallel;
+ my $temp_dir;
+ my $rdg;
+ my $rfg;
+ my $non_bs_mm;
+ my $samtools_path;
+ my $bam;
+ my $gzip;
+ my $pbat;
+ my $prefix;
+ my $old_flag;
+
+ my $command_line = GetOptions ('help|man' => \$help,
+ '1=s' => \$mates1,
+ '2=s' => \$mates2,
+ 'path_to_bowtie=s' => \$path_to_bowtie,
+ 'f|fasta' => \$fasta,
+ 'q|fastq' => \$fastq,
+ 's|skip=i' => \$skip,
+ 'u|upto=i' => \$qupto,
+ 'phred33-quals' => \$phred33,
+ 'phred64-quals|solexa1' => \$phred64,
+ 'solexa-quals' => \$solexa,
+ 'n|seedmms=i' => \$mismatches,
+ 'l|seedlen=i' => \$seed_length,
+ 'no_best' => \$best,
+ 'version' => \$version,
+ 'quiet' => \$quiet,
+ 'chunkmbs=i' => \$chunk,
+ 'non_directional' => \$non_directional,
+ 'I|minins=i' => \$minins,
+ 'X|maxins=i' => \$maxins,
+ 'e|maqerr=i' => \$ceiling,
+ 'un|unmapped' => \$unmapped,
+ 'ambiguous' => \$multi_map,
+ 'o|output_dir=s' => \$output_dir,
+ 'bowtie2' => \$bowtie2,
+ 'vanilla' => \$vanilla,
+ 'sam-no-hd' => \$sam_no_hd,
+ 'D=i' => \$seed_extension_fails,
+ 'R=i' => \$reseed_repetitive_seeds,
+ 'score_min=s' => \$score_min,
+ 'most_valid_alignments=i' => \$most_valid_alignments,
+ 'p=i' => \$parallel,
+ 'temp_dir=s' => \$temp_dir,
+ 'rdg=s' => \$rdg,
+ 'rfg=s' => \$rfg,
+ 'non_bs_mm' => \$non_bs_mm,
+ 'samtools_path=s' => \$samtools_path,
+ 'bam' => \$bam,
+ 'gzip' => \$gzip,
+ 'pbat' => \$pbat,
+ 'prefix=s' => \$prefix,
+ 'old_flag' => \$old_flag,
+ );
+
+
+ ### EXIT ON ERROR if there were errors with any of the supplied options
+ unless ($command_line){
+ die "Please respecify command line options\n";
+ }
+ ### HELPFILE
+ if ($help){
+ print_helpfile();
+ exit;
+ }
+ if ($version){
+ print << "VERSION";
+
+
+ Bismark - Bisulfite Mapper and Methylation Caller.
+
+ Bismark Version: $bismark_version
+ Copyright 2010-13 Felix Krueger, Babraham Bioinformatics
+ www.bioinformatics.babraham.ac.uk/projects/
+
+
+VERSION
+ exit;
+ }
+
+
+ ##########################
+ ### PROCESSING OPTIONS ###
+ ##########################
+
+ unless ($bowtie2){
+ $bowtie2 = 0;
+ }
+ unless ($sam_no_hd){
+ $sam_no_hd =0;
+ }
+
+ ### PATH TO BOWTIE
+ ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH
+ if ($path_to_bowtie){
+ unless ($path_to_bowtie =~ /\/$/){
+ $path_to_bowtie =~ s/$/\//;
+ }
+ if (-d $path_to_bowtie){
+ if ($bowtie2){
+ $path_to_bowtie = "${path_to_bowtie}bowtie2";
+ }
+ else{
+ $path_to_bowtie = "${path_to_bowtie}bowtie";
+ }
+ }
+ else{
+ die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n";
+ }
+ }
+ else{
+ if ($bowtie2){
+ $path_to_bowtie = 'bowtie2';
+ warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; }
+ else{
+ $path_to_bowtie = 'bowtie';
+ warn "Path to Bowtie specified as: $path_to_bowtie\n";
+ }
+ }
+
+ ### OUTPUT REQUESTED AS BAM FILE
+ if ($bam){
+ if ($vanilla){
+ die "Specifying BAM output is not compatible with \"--vanilla\" format. Please respecify\n\n";
+ }
+
+ ### PATH TO SAMTOOLS
+ if (defined $samtools_path){
+ # if Samtools was specified as full command
+ if ($samtools_path =~ /samtools$/){
+ if (-e $samtools_path){
+ # Samtools executable found
+ }
+ else{
+ die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
+ }
+ }
+ else{
+ unless ($samtools_path =~ /\/$/){
+ $samtools_path =~ s/$/\//;
+ }
+ $samtools_path .= 'samtools';
+ if (-e $samtools_path){
+ # Samtools executable found
+ }
+ else{
+ die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
+ }
+ }
+
+ warn "Alignments will be written out in BAM format. Samtools path provided as: '$samtools_path'\n";
+ $bam = 1;
+ }
+ # Check whether Samtools is in the PATH if no path was supplied by the user
+ else{
+ if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if samtools is in the PATH
+ $samtools_path = `which samtools`;
+ chomp $samtools_path;
+ warn "Alignments will be written out in BAM format. Samtools found here: '$samtools_path'\n";
+ $bam = 1;
+ }
+ }
+
+ unless (defined $samtools_path){
+ $bam = 2;
+ warn "Did not find Samtools on the system. Alignments will be compressed with GZIP instead (.sam.gz)\n";
+ }
+ sleep (1);
+ }
+
+
+ ####################################
+ ### PROCESSING ARGUMENTS
+
+ ### GENOME FOLDER
+ my $genome_folder = shift @ARGV; # mandatory
+ unless ($genome_folder){
+ warn "Genome folder was not specified!\n";
+ print_helpfile();
+ exit;
+ }
+
+ ### checking that the genome folder, all subfolders and the required bowtie index files exist
+ unless ($genome_folder =~/\/$/){
+ $genome_folder =~ s/$/\//;
+ }
+
+ if (chdir $genome_folder){
+ my $absolute_genome_folder = getcwd; ## making the genome folder path absolute
+ unless ($absolute_genome_folder =~/\/$/){
+ $absolute_genome_folder =~ s/$/\//;
+ }
+ warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n";
+ $genome_folder = $absolute_genome_folder;
+ }
+ else{
+ die "Failed to move to $genome_folder: $!\nUSAGE: bismark [options] {-1 -2