comparison methylation_extractor @ 17:59a61c1d5aee draft default tip

Uploaded
author eugen
date Thu, 16 Aug 2012 08:11:06 -0400
parents
children
comparison
equal deleted inserted replaced
16:bfda928ca5d8 17:59a61c1d5aee
1 #!/usr/bin/perl
2 use Getopt::Long;
3 use Cwd;
4
5 my @filenames;
6 my %counting;
7
8 my %fhs;
9 my %bases;
10 my ($single,$paired,$output_file) = process_commandline();
11 my $parent_dir = getcwd;
12
13 process_Bismark_results_file();
14
15 sub process_commandline{
16 my $help;
17 my $single_end;
18 my $paired_end;
19 my $ignore;
20 my $genomic_fasta;
21 my $full;
22 my $report;
23 my $extractor_version;
24 my $no_overlap;
25 my $merge_non_CpG;
26 my $vanilla;
27 my $output_file;
28
29 my $command_line = GetOptions ('p|paired-end' => \$paired_end,
30 's|single-end' => \$single_end,
31 'o|output=s' => \$output_file
32 );
33
34 ### EXIT ON ERROR if there were errors with any of the supplied options
35 unless ($command_line){
36 die "Please respecify command line options\n";
37 }
38
39
40 ### no files provided
41 unless (@ARGV){
42 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n";
43 }
44 @filenames = @ARGV;
45
46 if ($single_end){
47 $paired_end = 0; ### SINGLE END ALIGNMENTS
48 }
49 elsif ($paired_end){
50 $single_end = 0; ### PAIRED-END ALIGNMENTS
51 }
52 else{
53 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n";
54 }
55
56 return ($single_end,$paired_end, $output_file);
57 }
58
59 sub process_Bismark_results_file{
60 chdir $parent_dir or die "Unable to move to initial working directory $!\n";
61 foreach my $filename (@filenames){
62 %fhs = ();
63 %counting =(
64 total_meCHG_count => 0,
65 total_meCHH_count => 0,
66 total_meCpG_count => 0,
67 total_unmethylated_CHG_count => 0,
68 total_unmethylated_CHH_count => 0,
69 total_unmethylated_CpG_count => 0,
70 sequences_count => 0,
71 );
72 warn "\nNow reading in Bismark result file $filename\n\n";
73 open (IN,$filename) or die "Can't open file $filename: $!\n";
74
75 if($output_file){
76 $full_output=$output_file;
77 }
78 else{
79 #print "lbubb";
80 my($dir, $output_filename, $extension) = $filename =~ m/(.*\/)(.*)(\..*)$/;
81 my $cpg_output = my $other_c_output = my $full_output = $output_filename;
82 $full_output =~ s/^/Full_/;
83 $full_output = $dir."/".$full_output.".bed";
84 }
85
86 ### OPENING OUTPUT-FILEHANDLES
87 open ($fhs{full_output},'>',$full_output) or die "Failed to write to $full_output $!\n";
88
89 ### Write header
90 printf {$fhs{full_output}} ("chr\tpos\tstrand\tcontext\tcoverage\tmethylated\tpercentage methylated\n");
91
92 my $methylation_call_strings_processed = 0;
93 my $line_count = 0;
94
95
96 ### PROCESSING SINGLE-END RESULT FILES
97 if ($single){
98
99 while (<IN>){
100 ### SAM format can either start with header lines (starting with @) or start with alignments directly
101 if (/^\@/){ # skipping header lines (starting with @)
102 warn "skipping SAM header line:\t$_";
103 next;
104 }
105
106 ++$line_count;
107 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
108
109 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
110
111 $chrom = "chrC" if ("\U$chrom" =~ /CHLOR.*/);
112 $chrom = "chrM" if ("\U$chrom" =~ /MITOCH.*/);
113
114 if ("\U$chrom" !~ /CHR.*/){
115 $chrom = "chr".$chrom;
116 }
117
118 if (!defined $last_chrom){
119 print "Now parsing data from $chrom\n";
120 }
121
122 if (defined($last_chrom) and $last_chrom ne $chrom){
123 print "Parsed: $last_chrom\n";
124 write_hash($last_chrom);
125 print "Now parsing data from $chrom\n";
126 }
127
128 $meth_call =~ s/^XM:Z://;
129 $read_conversion =~ s/^XR:Z://;
130 $genome_conversion =~ s/^XG:Z://;
131 my $strand;
132 chomp $genome_conversion;
133
134
135 my $index;
136 if ($meth_call){
137 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ ## original top strand
138 $index = 0;
139 $strand = '+';
140 }
141 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ ## complementary to original top strand
142 $index = 1;
143 $strand = '-';
144 }
145 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ ## complementary to original bottom strand
146 $index = 2;
147 $strand = '+';
148 }
149 elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ ## original bottom strand
150 $index = 3;
151 $strand = '-';
152 }
153 else {
154 die "Unexpected combination of read and genome conversion: $read_conversion / $genome_conversion\n";
155 }
156
157 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output
158 if ($strand eq '-'){
159 $meth_call = reverse $meth_call;
160 }
161 }
162
163
164 ### printing out the methylation state of every C in the read
165 hash_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index);
166
167 ++$methylation_call_strings_processed; # 1 per single-end result
168
169 $last_chrom = $chrom;
170 }
171 write_hash($last_chrom);
172 print "\n\nDone!";
173 }
174
175 ### PROCESSING PAIRED-END RESULT FILES
176 elsif ($paired){
177 while (<IN>){
178 ### SAM format can either start with header lines (starting with @) or start with alignments directly
179 if (/^\@/){ # skipping header lines (starting with @)
180 warn "skipping SAM header line:\t$_";
181 next;
182 }
183
184 ++$line_count;
185 # example paired-end reads in SAM format (2 consecutive lines)
186 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
187 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT
188
189
190 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
191 $_ = <IN>; # reading in the paired read
192 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14];
193
194
195 $chrom = "chrC" if ("\U$chrom" =~ /CHLOR.*/);
196 $chrom = "chrM" if ("\U$chrom" =~ /MITOCH.*/);
197 if ("\U$chrom" !~ /CHR.*/){
198 $chrom = "chr".$chrom;
199 }
200 if (!defined $last_chrom){
201 print "Now parsing data from $chrom\n";
202 }
203 if (defined($last_chrom) and $last_chrom ne $chrom){
204 print "Parsed: $last_chrom\n";
205 write_hash($last_chrom);
206 print "Now parsing data from $chrom\n";
207 }
208 $meth_call_1 =~ s/^XM:Z://;
209 $meth_call_2 =~ s/^XM:Z://;
210 $first_read_conversion =~ s/^XR:Z://;
211 $second_read_conversion =~ s/^XR:Z://;
212 $genome_conversion =~ s/^XG:Z://;
213 chomp $genome_conversion;
214
215
216 my $index;
217 my $strand;
218
219 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT'){
220 $index = 0; ## this is OT
221 $strand = '+';
222 }
223 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT'){
224 $index = 1; ## this is CTOT
225 $strand = '-';
226 }
227 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA'){
228 $index = 2; ## this is CTOB
229 $strand = '+';
230 }
231 elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA'){
232 $index = 3; ## this is OB
233 $strand = '-';
234 }
235 else {
236 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
237 }
238
239 ### reversing the methylation call of the read that was reverse-complemented
240 if ($strand eq '+'){
241 # $meth_call_2 = reverse $meth_call_2;
242 }
243 else{
244 #$meth_call_1 = reverse $meth_call_1;
245 }
246
247 if ($meth_call_1 and $meth_call_2){
248 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
249 my $end_read_1;
250
251 if ($strand eq '+'){
252 $end_read_1 = $start_read_1+length($meth_call_1)-1;
253 #$start_read_2 += length($meth_call_2)-1; ## $meth_call_2 is already shortened! Passing on the start position on the reverse strand
254 }
255 else{
256 $end_read_1 = $start_read_1+length($meth_call_1)-1;
257 #$end_read_1 = $start_read_1;
258 #$start_read_1 += length($meth_call_1)-1; ## $meth_call_1 is already shortened! Passing on the start position on the reverse strand
259 }
260
261 if ($strand eq '+'){
262 ## we first pass the first read which is in + orientation on the forward strand
263 hash_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0);
264
265 # we next pass the second read which is in - orientation on the reverse strand
266 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
267 hash_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$end_read_1);
268 }
269
270 else{
271 ## we first pass the first read which is in - orientation on the reverse strand
272 hash_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0);
273
274 # we next pass the second read which is in + orientation on the forward strand
275 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
276 hash_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$end_read_1);
277 }
278
279 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
280 }
281 $last_chrom = $chrom;
282 }
283 write_hash($last_chrom);
284 print "\n\nDone!";
285 }
286 else{
287 die "Single-end or paired-end reads not specified properly\n";
288 }
289
290 print "\n\nProcessed $line_count lines from $filename in total\n";
291 print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
292 }
293 }
294
295
296 sub hash_individual_C_methylation_states_paired_end_files{
297
298 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$end_read_1) = @_;
299 my @methylation_calls = split(//,$meth_call);
300
301 #################################################################
302 ### . for bases not involving cytosines ###
303 ### X for methylated C in CHG context (was protected) ###
304 ### x for not methylated C in CHG context (was converted) ###
305 ### H for methylated C in CHH context (was protected) ###
306 ### h for not methylated C in CHH context (was converted) ###
307 ### Z for methylated C in CpG context (was protected) ###
308 ### z for not methylated C in CpG context (was converted) ###
309 #################################################################
310
311 #my $methyl_CHG_count = 0;
312 #my $methyl_CHH_count = 0;
313 #my $methyl_CpG_count = 0;
314 #my $unmethylated_CHG_count = 0;
315 #my $unmethylated_CHH_count = 0;
316 #my $unmethylated_CpG_count = 0;
317
318 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and $methylation_calls[$indexnon-CpG) SECTION IF --merge_non_CpG was specified
319
320 if ($strand eq '+') {
321 for my $index (0..$#methylation_calls) {
322
323 ### Returning as soon as the methylation calls start overlapping
324 #if ($start+$index >= $end_read_1) {
325 # return;
326 #}
327
328 if ($methylation_calls[$index] eq 'X') {
329 #$counting{total_meCHG_count}++;
330 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
331 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
332 }else{
333 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
334 }
335 } elsif ($methylation_calls[$index] eq 'x') {
336 #$counting{total_unmethylated_CHG_count}++;
337 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
338 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
339 }else{
340 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
341 }
342 } elsif ($methylation_calls[$index] eq 'Z') {
343 #$counting{total_meCpG_count}++;
344 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
345 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
346 }else{
347 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
348 }
349 } elsif ($methylation_calls[$index] eq 'z') {
350 #$counting{total_unmethylated_CpG_count}++;
351 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
352 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
353 }else{
354 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
355 }
356 } elsif ($methylation_calls[$index] eq 'H') {
357 #$counting{total_meCHH_count}++;
358 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
359 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
360 }else{
361 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
362 }
363 } elsif ($methylation_calls[$index] eq 'h') {
364 #$counting{total_unmethylated_CHH_count}++;
365 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
366 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
367 }else{
368 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
369 }
370 }
371 elsif ($methylation_calls[$index] eq '.'){}
372 else{
373 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
374 }
375 }
376 } elsif ($strand eq '-') {
377 for my $index (0..$#methylation_calls) {
378
379 ### Returning as soon as the methylation calls start overlapping
380 #if ($start-$index <= $end_read_1) {
381 # return;
382 #}
383 =head1
384 if ($start-$index == 100){
385 print "\nStrand: - ID: $id\n";
386 }
387
388 if ($methylation_calls[$index] eq 'X') {
389 #$counting{total_meCHG_count}++;
390 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
391 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
392 }else{
393 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
394 }
395 } elsif ($methylation_calls[$index] eq 'x') {
396 #$counting{total_unmethylated_CHG_count}++;
397 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
398 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
399 }else{
400 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
401 }
402 } elsif ($methylation_calls[$index] eq 'Z') {
403 #$counting{total_meCpG_count}++;
404 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
405 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
406 }else{
407 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
408 }
409 } elsif ($methylation_calls[$index] eq 'z') {
410 #$counting{total_unmethylated_CpG_count}++;
411 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
412 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
413 }else{
414 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
415 }
416 } elsif ($methylation_calls[$index] eq 'H') {
417 # $counting{total_meCHH_count}++;
418 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
419 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
420 }else{
421 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
422 }
423 } elsif ($methylation_calls[$index] eq 'h') {
424 #$counting{total_unmethylated_CHH_count}++;
425 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
426 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
427 }else{
428 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
429 }
430 }
431 elsif ($methylation_calls[$index] eq '.') {}
432 else{
433 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
434 }
435 }
436 =cut
437 if ($methylation_calls[$index] eq 'X') {
438 #$counting{total_meCHG_count}++;
439 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
440 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
441 }else{
442 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
443 }
444 } elsif ($methylation_calls[$index] eq 'x') {
445 #$counting{total_unmethylated_CHG_count}++;
446 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
447 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
448 }else{
449 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
450 }
451 } elsif ($methylation_calls[$index] eq 'Z') {
452 #$counting{total_meCpG_count}++;
453 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
454 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
455 }else{
456 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
457 }
458 } elsif ($methylation_calls[$index] eq 'z') {
459 #$counting{total_unmethylated_CpG_count}++;
460 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
461 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
462 }else{
463 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
464 }
465 } elsif ($methylation_calls[$index] eq 'H') {
466 # $counting{total_meCHH_count}++;
467 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
468 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
469 }else{
470 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
471 }
472 } elsif ($methylation_calls[$index] eq 'h') {
473 #$counting{total_unmethylated_CHH_count}++;
474 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
475 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
476 }else{
477 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
478 }
479 }
480 elsif ($methylation_calls[$index] eq '.') {}
481 else{
482 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
483 }
484 }
485 } else {
486 die "This cannot happen $!\n";
487 }
488
489 =head1
490 if ($strand eq '+') {
491 for my $index (0..$#methylation_calls) {
492
493 ### Returning as soon as the methylation calls start overlapping
494 #if ($start+$index >= $end_read_1) {
495 # return;
496 #}
497 if ($methylation_calls[$index] eq 'X') {
498 #$counting{total_meCHG_count}++;
499 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
500 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
501 }else{
502 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
503 }
504 } elsif ($methylation_calls[$index] eq 'x') {
505 #$counting{total_unmethylated_CHG_count}++;
506 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
507 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
508 }else{
509 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
510 }
511 } elsif ($methylation_calls[$index] eq 'Z') {
512 #$counting{total_meCpG_count}++;
513 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
514 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
515 }else{
516 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
517 }
518 } elsif ($methylation_calls[$index] eq 'z') {
519 #$counting{total_unmethylated_CpG_count}++;
520 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
521 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
522 }else{
523 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
524 }
525 } elsif ($methylation_calls[$index] eq 'H') {
526 #$counting{total_meCHH_count}++;
527 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
528 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
529 }else{
530 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
531 }
532 } elsif ($methylation_calls[$index] eq 'h') {
533 #$counting{total_unmethylated_CHH_count}++;
534 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
535 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
536 }else{
537 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
538 }
539 }
540 elsif ($methylation_calls[$index] eq '.'){}
541 else{
542 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
543 }
544 }
545 } elsif ($strand eq '-') {
546 for my $index (0..$#methylation_calls) {
547
548 ### Returning as soon as the methylation calls start overlapping
549 #if ($start-$index <= $end_read_1) {
550 # return;
551 #}
552
553 if ($methylation_calls[$index] eq 'X') {
554 #$counting{total_meCHG_count}++;
555 if (defined $bases{$start-$index}{"+"}{$methylation_calls[$index]}){
556 $bases{$start-$index}{"+"}{$methylation_calls[$index]}++;
557 }else{
558 $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1;
559 }
560 } elsif ($methylation_calls[$index] eq 'x') {
561 #$counting{total_unmethylated_CHG_count}++;
562 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
563 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
564 }else{
565 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
566 }
567 } elsif ($methylation_calls[$index] eq 'Z') {
568 #$counting{total_meCpG_count}++;
569 if (defined $bases{$start-$index}{"+"}{$methylation_calls[$index]}){
570 $bases{$start-$index}{"+"}{$methylation_calls[$index]}++;
571 }else{
572 $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1;
573 }
574 } elsif ($methylation_calls[$index] eq 'z') {
575 #$counting{total_unmethylated_CpG_count}++;
576 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
577 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
578 }else{
579 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
580 }
581 } elsif ($methylation_calls[$index] eq 'H') {
582 # $counting{total_meCHH_count}++;
583 if (defined $bases{$start-$=cutindex}{"+"}{$methylation_calls[$index]}){
584 $bases{$start-$index}{"+"}{$methylation_calls[$index]}++;
585 }else{
586 $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1;
587 }
588 } elsif ($methylation_calls[$index] eq 'h') {
589 #$counting{total_unmethylated_CHH_count}++;
590 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
591 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
592 }else{
593 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
594 }
595 }
596 elsif ($methylation_calls[$index] eq '.') {}
597 else{
598 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
599 }
600 }
601 } else {
602 die "This cannot happen $!\n";
603 }
604 =cut
605 }
606
607
608
609 sub hash_individual_C_methylation_states_single_end{
610
611 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index) = @_;
612 my @methylation_calls = split(//,$meth_call);
613
614 #################################################################
615 ### . for bases not involving cytosines ###
616 ### X for methylated C in CHG context (was protected) ###
617 ### x for not methylated C in CHG context (was converted) ###
618 ### H for methylated C in CHH context (was protected) ###
619 ### h for not methylated C in CHH context (was converted) ###
620 ### Z for methylated C in CpG context (was protected) ###
621 ### z for not methylated C in CpG context (was converted) ###
622 #################################################################
623
624 #my $methyl_CHG_count = 0;
625 #my $methyl_CHH_count = 0;
626 #my $methyl_CpG_count = 0;
627 #my $unmethylated_CHG_count = 0;
628 #my $unmethylated_CHH_count = 0;
629 #my $unmethylated_CpG_count = 0;
630
631 ### single-file CpG and other-context output
632 if ($strand eq '+') {
633 for my $index (0..$#methylation_calls) {
634
635 ### methylated Cs (any context) will receive a forward (+) orientation
636 ### not methylated Cs (any context) will receive a reverse (-) orientation
637 if ($methylation_calls[$index] eq 'X') {
638 #$counting{total_meCHG_count}++;
639 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){
640 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++;
641 }else{
642 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1;
643 }
644 }
645 elsif ($methylation_calls[$index] eq 'x') {
646 #$counting{total_unmethylated_CHG_count}++;
647 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){
648 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++;
649 }else{
650 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1;
651 }
652 }
653 elsif ($methylation_calls[$index] eq 'Z') {
654 #$counting{total_meCpG_count}++;
655 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){
656 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++;
657 }else{
658 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1;
659 }
660 }
661 elsif ($methylation_calls[$index] eq 'z') {
662 #$counting{total_unmethylated_CpG_count}++;
663 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){
664 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++;
665 }else{
666 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1;
667 }
668 }
669 elsif ($methylation_calls[$index] eq 'H') {
670 #$counting{total_meCHH_count}++;
671 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){
672 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++;
673 }else{
674 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1;
675 }
676 }
677 elsif ($methylation_calls[$index] eq 'h') {
678 #$counting{total_unmethylated_CHH_count}++;
679 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){
680 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++;
681 }else{
682 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1;
683 }
684 }
685 elsif ($methylation_calls[$index] eq '.') {
686 }
687 else{
688 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
689 }
690 }
691 }
692 elsif ($strand eq '-') {
693
694 unless ($ignore){ ### if --ignore was specified the start position has already been corrected
695 $start += length($meth_call)-1;
696 }
697
698 for my $index (0..$#methylation_calls) {
699 ### methylated Cs (any context) will receive a forward (+) orientation
700 ### not methylated Cs (any context) will receive a reverse (-) orientation
701 if ($methylation_calls[$index] eq 'X') {
702 #$counting{total_meCHG_count}++;
703 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){
704 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++;
705 }else{
706 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1;
707 }
708 }
709 elsif ($methylation_calls[$index] eq 'x') {
710 #$counting{total_unmethylated_CHG_count}++;
711 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){
712 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++;
713 }else{
714 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1;
715 }
716 }
717 elsif ($methylation_calls[$index] eq 'Z') {
718 #$counting{total_meCpG_count}++;
719 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){
720 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++;
721 }else{
722 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1;
723 }
724 }
725 elsif ($methylation_calls[$index] eq 'z') {
726 #$counting{total_unmethylated_CpG_count}++;
727 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){
728 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++;
729 }else{
730 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1;
731 }
732 }
733 elsif ($methylation_calls[$index] eq 'H') {
734 #$counting{total_meCHH_count}++;
735 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){
736 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++;
737 }else{
738 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1;
739 }
740 }
741 elsif ($methylation_calls[$index] eq 'h') {
742 #$counting{total_unmethylated_CHH_count}++;
743 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){
744 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++;
745 }else{
746 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1;
747 }
748 }
749 elsif ($methylation_calls[$index] eq '.'){
750 }
751 else{
752 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
753 }
754 }
755 }
756 else {
757 die "This cannot happen (or at least it shouldn't....)\n";
758 }
759 }
760
761 sub write_hash(){
762 my ($chrom) = @_;
763
764 for my $k1 (sort {$a <=> $b} (keys %bases)){
765 my $coverage = 0;
766 my $methylated = 0;
767 my $context;
768 for my $k2 (keys %{$bases{$k1}}){
769
770 for my $k3 (keys %{$bases{$k1}{$k2}}){
771 if ($k3 eq "Z" or $k3 eq "z"){
772 $context = "CPG";
773 }elsif ($k3 eq "H" or $k3 eq "h"){
774 $context = "CHH";
775 }elsif ($k3 eq "X" or $k3 eq "x"){
776 $context = "CHG";
777 }
778
779 #test, if $k3 is upper-case (==methylated base)
780 if ($k3 eq "\U$k3"){
781 $methylated+=$bases{$k1}{$k2}{$k3};
782 }
783 $coverage+=$bases{$k1}{$k2}{$k3};
784
785 }
786 my $percentage = ($methylated / $coverage)*100;
787
788 #####################################################################################
789 ### BED file format tabs: ###
790 ### 1. Chromosome ###
791 ### 2. Start position ###
792 ### 3. Strand ###
793 ### 4. Context (CpG,CHH,CHG) ###
794 ### 5. Coverage ###
795 ### 6. Methylated Cs ###
796 ### 7. Percentage methylated Cs ###
797 #####################################################################################
798
799 printf {$fhs{full_output}} ("%s\t%u\t%s\t%s\t%u\t%u\t%.2f\n", $chrom, $k1, $k2, $context, $coverage, $methylated, $percentage );
800 }
801 }
802 %bases = ();
803 }
804
805
806 sub write_hash_back(){
807 my ($chrom) = @_;
808
809 for my $k1 (sort (keys %bases)){
810 my $coverage = 0;
811 my $methylated = 0;
812 my $context;
813 my $k2 = (keys %{$bases{$k1}})[0];
814 for my $k3 (keys %{$bases{$k1}{$k2}}){
815 if ($k3 eq "Z" or $k3 eq "z"){
816 $context = "CPG";
817 }elsif ($k3 eq "H" or $k3 eq "h"){
818 $context = "CHH";
819 }elsif ($k3 eq "X" or $k3 eq "x"){
820 $context = "CHG";
821 }
822
823 #test, if $k3 is upper-case (==methylated base)
824 if ($k3 eq "\U$k3"){
825 $methylated+=$bases{$k1}{$k2}{$k3};
826 }
827 $coverage+=$bases{$k1}{$k2}{$k3};
828
829 }
830 my $percentage = ($methylated / $coverage)*100;
831
832 ###################################################################################
833 ### BED file format tabs: ###
834 ### 1. Chromosome ###
835 ### 2. Start position ###
836 ### 3. Strand ###
837 ### 4. Context (CpG,CHH,CHG) ###
838 ### 5. Coverage ###
839 ### 6. Methylated Cs ###
840 ### 7. Percentage methylated Cs ###
841 ###################################################################################
842
843 printf {$fhs{full_output}} ("%s\t%u\t%s\t%s\t%u\t%u\t%.2f\n", $chrom, $k1, $k2, $context, $coverage, $methylated, $percentage );
844
845 }
846 %bases = ();
847 }
848
849
850