Mercurial > repos > eugen > bs_seq_test_1
comparison methylation_extractor @ 17:59a61c1d5aee draft default tip
Uploaded
author | eugen |
---|---|
date | Thu, 16 Aug 2012 08:11:06 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
16:bfda928ca5d8 | 17:59a61c1d5aee |
---|---|
1 #!/usr/bin/perl | |
2 use Getopt::Long; | |
3 use Cwd; | |
4 | |
5 my @filenames; | |
6 my %counting; | |
7 | |
8 my %fhs; | |
9 my %bases; | |
10 my ($single,$paired,$output_file) = process_commandline(); | |
11 my $parent_dir = getcwd; | |
12 | |
13 process_Bismark_results_file(); | |
14 | |
15 sub process_commandline{ | |
16 my $help; | |
17 my $single_end; | |
18 my $paired_end; | |
19 my $ignore; | |
20 my $genomic_fasta; | |
21 my $full; | |
22 my $report; | |
23 my $extractor_version; | |
24 my $no_overlap; | |
25 my $merge_non_CpG; | |
26 my $vanilla; | |
27 my $output_file; | |
28 | |
29 my $command_line = GetOptions ('p|paired-end' => \$paired_end, | |
30 's|single-end' => \$single_end, | |
31 'o|output=s' => \$output_file | |
32 ); | |
33 | |
34 ### EXIT ON ERROR if there were errors with any of the supplied options | |
35 unless ($command_line){ | |
36 die "Please respecify command line options\n"; | |
37 } | |
38 | |
39 | |
40 ### no files provided | |
41 unless (@ARGV){ | |
42 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; | |
43 } | |
44 @filenames = @ARGV; | |
45 | |
46 if ($single_end){ | |
47 $paired_end = 0; ### SINGLE END ALIGNMENTS | |
48 } | |
49 elsif ($paired_end){ | |
50 $single_end = 0; ### PAIRED-END ALIGNMENTS | |
51 } | |
52 else{ | |
53 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n"; | |
54 } | |
55 | |
56 return ($single_end,$paired_end, $output_file); | |
57 } | |
58 | |
59 sub process_Bismark_results_file{ | |
60 chdir $parent_dir or die "Unable to move to initial working directory $!\n"; | |
61 foreach my $filename (@filenames){ | |
62 %fhs = (); | |
63 %counting =( | |
64 total_meCHG_count => 0, | |
65 total_meCHH_count => 0, | |
66 total_meCpG_count => 0, | |
67 total_unmethylated_CHG_count => 0, | |
68 total_unmethylated_CHH_count => 0, | |
69 total_unmethylated_CpG_count => 0, | |
70 sequences_count => 0, | |
71 ); | |
72 warn "\nNow reading in Bismark result file $filename\n\n"; | |
73 open (IN,$filename) or die "Can't open file $filename: $!\n"; | |
74 | |
75 if($output_file){ | |
76 $full_output=$output_file; | |
77 } | |
78 else{ | |
79 #print "lbubb"; | |
80 my($dir, $output_filename, $extension) = $filename =~ m/(.*\/)(.*)(\..*)$/; | |
81 my $cpg_output = my $other_c_output = my $full_output = $output_filename; | |
82 $full_output =~ s/^/Full_/; | |
83 $full_output = $dir."/".$full_output.".bed"; | |
84 } | |
85 | |
86 ### OPENING OUTPUT-FILEHANDLES | |
87 open ($fhs{full_output},'>',$full_output) or die "Failed to write to $full_output $!\n"; | |
88 | |
89 ### Write header | |
90 printf {$fhs{full_output}} ("chr\tpos\tstrand\tcontext\tcoverage\tmethylated\tpercentage methylated\n"); | |
91 | |
92 my $methylation_call_strings_processed = 0; | |
93 my $line_count = 0; | |
94 | |
95 | |
96 ### PROCESSING SINGLE-END RESULT FILES | |
97 if ($single){ | |
98 | |
99 while (<IN>){ | |
100 ### SAM format can either start with header lines (starting with @) or start with alignments directly | |
101 if (/^\@/){ # skipping header lines (starting with @) | |
102 warn "skipping SAM header line:\t$_"; | |
103 next; | |
104 } | |
105 | |
106 ++$line_count; | |
107 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
108 | |
109 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; | |
110 | |
111 $chrom = "chrC" if ("\U$chrom" =~ /CHLOR.*/); | |
112 $chrom = "chrM" if ("\U$chrom" =~ /MITOCH.*/); | |
113 | |
114 if ("\U$chrom" !~ /CHR.*/){ | |
115 $chrom = "chr".$chrom; | |
116 } | |
117 | |
118 if (!defined $last_chrom){ | |
119 print "Now parsing data from $chrom\n"; | |
120 } | |
121 | |
122 if (defined($last_chrom) and $last_chrom ne $chrom){ | |
123 print "Parsed: $last_chrom\n"; | |
124 write_hash($last_chrom); | |
125 print "Now parsing data from $chrom\n"; | |
126 } | |
127 | |
128 $meth_call =~ s/^XM:Z://; | |
129 $read_conversion =~ s/^XR:Z://; | |
130 $genome_conversion =~ s/^XG:Z://; | |
131 my $strand; | |
132 chomp $genome_conversion; | |
133 | |
134 | |
135 my $index; | |
136 if ($meth_call){ | |
137 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ ## original top strand | |
138 $index = 0; | |
139 $strand = '+'; | |
140 } | |
141 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ ## complementary to original top strand | |
142 $index = 1; | |
143 $strand = '-'; | |
144 } | |
145 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ ## complementary to original bottom strand | |
146 $index = 2; | |
147 $strand = '+'; | |
148 } | |
149 elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ ## original bottom strand | |
150 $index = 3; | |
151 $strand = '-'; | |
152 } | |
153 else { | |
154 die "Unexpected combination of read and genome conversion: $read_conversion / $genome_conversion\n"; | |
155 } | |
156 | |
157 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output | |
158 if ($strand eq '-'){ | |
159 $meth_call = reverse $meth_call; | |
160 } | |
161 } | |
162 | |
163 | |
164 ### printing out the methylation state of every C in the read | |
165 hash_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); | |
166 | |
167 ++$methylation_call_strings_processed; # 1 per single-end result | |
168 | |
169 $last_chrom = $chrom; | |
170 } | |
171 write_hash($last_chrom); | |
172 print "\n\nDone!"; | |
173 } | |
174 | |
175 ### PROCESSING PAIRED-END RESULT FILES | |
176 elsif ($paired){ | |
177 while (<IN>){ | |
178 ### SAM format can either start with header lines (starting with @) or start with alignments directly | |
179 if (/^\@/){ # skipping header lines (starting with @) | |
180 warn "skipping SAM header line:\t$_"; | |
181 next; | |
182 } | |
183 | |
184 ++$line_count; | |
185 # example paired-end reads in SAM format (2 consecutive lines) | |
186 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT | |
187 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT | |
188 | |
189 | |
190 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; | |
191 $_ = <IN>; # reading in the paired read | |
192 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14]; | |
193 | |
194 | |
195 $chrom = "chrC" if ("\U$chrom" =~ /CHLOR.*/); | |
196 $chrom = "chrM" if ("\U$chrom" =~ /MITOCH.*/); | |
197 if ("\U$chrom" !~ /CHR.*/){ | |
198 $chrom = "chr".$chrom; | |
199 } | |
200 if (!defined $last_chrom){ | |
201 print "Now parsing data from $chrom\n"; | |
202 } | |
203 if (defined($last_chrom) and $last_chrom ne $chrom){ | |
204 print "Parsed: $last_chrom\n"; | |
205 write_hash($last_chrom); | |
206 print "Now parsing data from $chrom\n"; | |
207 } | |
208 $meth_call_1 =~ s/^XM:Z://; | |
209 $meth_call_2 =~ s/^XM:Z://; | |
210 $first_read_conversion =~ s/^XR:Z://; | |
211 $second_read_conversion =~ s/^XR:Z://; | |
212 $genome_conversion =~ s/^XG:Z://; | |
213 chomp $genome_conversion; | |
214 | |
215 | |
216 my $index; | |
217 my $strand; | |
218 | |
219 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT'){ | |
220 $index = 0; ## this is OT | |
221 $strand = '+'; | |
222 } | |
223 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT'){ | |
224 $index = 1; ## this is CTOT | |
225 $strand = '-'; | |
226 } | |
227 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA'){ | |
228 $index = 2; ## this is CTOB | |
229 $strand = '+'; | |
230 } | |
231 elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA'){ | |
232 $index = 3; ## this is OB | |
233 $strand = '-'; | |
234 } | |
235 else { | |
236 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; | |
237 } | |
238 | |
239 ### reversing the methylation call of the read that was reverse-complemented | |
240 if ($strand eq '+'){ | |
241 # $meth_call_2 = reverse $meth_call_2; | |
242 } | |
243 else{ | |
244 #$meth_call_1 = reverse $meth_call_1; | |
245 } | |
246 | |
247 if ($meth_call_1 and $meth_call_2){ | |
248 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' | |
249 my $end_read_1; | |
250 | |
251 if ($strand eq '+'){ | |
252 $end_read_1 = $start_read_1+length($meth_call_1)-1; | |
253 #$start_read_2 += length($meth_call_2)-1; ## $meth_call_2 is already shortened! Passing on the start position on the reverse strand | |
254 } | |
255 else{ | |
256 $end_read_1 = $start_read_1+length($meth_call_1)-1; | |
257 #$end_read_1 = $start_read_1; | |
258 #$start_read_1 += length($meth_call_1)-1; ## $meth_call_1 is already shortened! Passing on the start position on the reverse strand | |
259 } | |
260 | |
261 if ($strand eq '+'){ | |
262 ## we first pass the first read which is in + orientation on the forward strand | |
263 hash_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0); | |
264 | |
265 # we next pass the second read which is in - orientation on the reverse strand | |
266 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 | |
267 hash_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$end_read_1); | |
268 } | |
269 | |
270 else{ | |
271 ## we first pass the first read which is in - orientation on the reverse strand | |
272 hash_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0); | |
273 | |
274 # we next pass the second read which is in + orientation on the forward strand | |
275 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 | |
276 hash_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$end_read_1); | |
277 } | |
278 | |
279 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls | |
280 } | |
281 $last_chrom = $chrom; | |
282 } | |
283 write_hash($last_chrom); | |
284 print "\n\nDone!"; | |
285 } | |
286 else{ | |
287 die "Single-end or paired-end reads not specified properly\n"; | |
288 } | |
289 | |
290 print "\n\nProcessed $line_count lines from $filename in total\n"; | |
291 print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; | |
292 } | |
293 } | |
294 | |
295 | |
296 sub hash_individual_C_methylation_states_paired_end_files{ | |
297 | |
298 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$end_read_1) = @_; | |
299 my @methylation_calls = split(//,$meth_call); | |
300 | |
301 ################################################################# | |
302 ### . for bases not involving cytosines ### | |
303 ### X for methylated C in CHG context (was protected) ### | |
304 ### x for not methylated C in CHG context (was converted) ### | |
305 ### H for methylated C in CHH context (was protected) ### | |
306 ### h for not methylated C in CHH context (was converted) ### | |
307 ### Z for methylated C in CpG context (was protected) ### | |
308 ### z for not methylated C in CpG context (was converted) ### | |
309 ################################################################# | |
310 | |
311 #my $methyl_CHG_count = 0; | |
312 #my $methyl_CHH_count = 0; | |
313 #my $methyl_CpG_count = 0; | |
314 #my $unmethylated_CHG_count = 0; | |
315 #my $unmethylated_CHH_count = 0; | |
316 #my $unmethylated_CpG_count = 0; | |
317 | |
318 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and $methylation_calls[$indexnon-CpG) SECTION IF --merge_non_CpG was specified | |
319 | |
320 if ($strand eq '+') { | |
321 for my $index (0..$#methylation_calls) { | |
322 | |
323 ### Returning as soon as the methylation calls start overlapping | |
324 #if ($start+$index >= $end_read_1) { | |
325 # return; | |
326 #} | |
327 | |
328 if ($methylation_calls[$index] eq 'X') { | |
329 #$counting{total_meCHG_count}++; | |
330 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
331 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
332 }else{ | |
333 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
334 } | |
335 } elsif ($methylation_calls[$index] eq 'x') { | |
336 #$counting{total_unmethylated_CHG_count}++; | |
337 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
338 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
339 }else{ | |
340 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
341 } | |
342 } elsif ($methylation_calls[$index] eq 'Z') { | |
343 #$counting{total_meCpG_count}++; | |
344 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
345 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
346 }else{ | |
347 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
348 } | |
349 } elsif ($methylation_calls[$index] eq 'z') { | |
350 #$counting{total_unmethylated_CpG_count}++; | |
351 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
352 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
353 }else{ | |
354 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
355 } | |
356 } elsif ($methylation_calls[$index] eq 'H') { | |
357 #$counting{total_meCHH_count}++; | |
358 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
359 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
360 }else{ | |
361 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
362 } | |
363 } elsif ($methylation_calls[$index] eq 'h') { | |
364 #$counting{total_unmethylated_CHH_count}++; | |
365 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
366 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
367 }else{ | |
368 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
369 } | |
370 } | |
371 elsif ($methylation_calls[$index] eq '.'){} | |
372 else{ | |
373 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
374 } | |
375 } | |
376 } elsif ($strand eq '-') { | |
377 for my $index (0..$#methylation_calls) { | |
378 | |
379 ### Returning as soon as the methylation calls start overlapping | |
380 #if ($start-$index <= $end_read_1) { | |
381 # return; | |
382 #} | |
383 =head1 | |
384 if ($start-$index == 100){ | |
385 print "\nStrand: - ID: $id\n"; | |
386 } | |
387 | |
388 if ($methylation_calls[$index] eq 'X') { | |
389 #$counting{total_meCHG_count}++; | |
390 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
391 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
392 }else{ | |
393 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
394 } | |
395 } elsif ($methylation_calls[$index] eq 'x') { | |
396 #$counting{total_unmethylated_CHG_count}++; | |
397 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
398 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
399 }else{ | |
400 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
401 } | |
402 } elsif ($methylation_calls[$index] eq 'Z') { | |
403 #$counting{total_meCpG_count}++; | |
404 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
405 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
406 }else{ | |
407 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
408 } | |
409 } elsif ($methylation_calls[$index] eq 'z') { | |
410 #$counting{total_unmethylated_CpG_count}++; | |
411 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
412 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
413 }else{ | |
414 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
415 } | |
416 } elsif ($methylation_calls[$index] eq 'H') { | |
417 # $counting{total_meCHH_count}++; | |
418 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
419 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
420 }else{ | |
421 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
422 } | |
423 } elsif ($methylation_calls[$index] eq 'h') { | |
424 #$counting{total_unmethylated_CHH_count}++; | |
425 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
426 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
427 }else{ | |
428 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
429 } | |
430 } | |
431 elsif ($methylation_calls[$index] eq '.') {} | |
432 else{ | |
433 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
434 } | |
435 } | |
436 =cut | |
437 if ($methylation_calls[$index] eq 'X') { | |
438 #$counting{total_meCHG_count}++; | |
439 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
440 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
441 }else{ | |
442 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
443 } | |
444 } elsif ($methylation_calls[$index] eq 'x') { | |
445 #$counting{total_unmethylated_CHG_count}++; | |
446 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
447 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
448 }else{ | |
449 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
450 } | |
451 } elsif ($methylation_calls[$index] eq 'Z') { | |
452 #$counting{total_meCpG_count}++; | |
453 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
454 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
455 }else{ | |
456 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
457 } | |
458 } elsif ($methylation_calls[$index] eq 'z') { | |
459 #$counting{total_unmethylated_CpG_count}++; | |
460 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
461 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
462 }else{ | |
463 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
464 } | |
465 } elsif ($methylation_calls[$index] eq 'H') { | |
466 # $counting{total_meCHH_count}++; | |
467 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
468 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
469 }else{ | |
470 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
471 } | |
472 } elsif ($methylation_calls[$index] eq 'h') { | |
473 #$counting{total_unmethylated_CHH_count}++; | |
474 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
475 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
476 }else{ | |
477 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
478 } | |
479 } | |
480 elsif ($methylation_calls[$index] eq '.') {} | |
481 else{ | |
482 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
483 } | |
484 } | |
485 } else { | |
486 die "This cannot happen $!\n"; | |
487 } | |
488 | |
489 =head1 | |
490 if ($strand eq '+') { | |
491 for my $index (0..$#methylation_calls) { | |
492 | |
493 ### Returning as soon as the methylation calls start overlapping | |
494 #if ($start+$index >= $end_read_1) { | |
495 # return; | |
496 #} | |
497 if ($methylation_calls[$index] eq 'X') { | |
498 #$counting{total_meCHG_count}++; | |
499 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
500 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
501 }else{ | |
502 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
503 } | |
504 } elsif ($methylation_calls[$index] eq 'x') { | |
505 #$counting{total_unmethylated_CHG_count}++; | |
506 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
507 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
508 }else{ | |
509 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
510 } | |
511 } elsif ($methylation_calls[$index] eq 'Z') { | |
512 #$counting{total_meCpG_count}++; | |
513 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
514 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
515 }else{ | |
516 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
517 } | |
518 } elsif ($methylation_calls[$index] eq 'z') { | |
519 #$counting{total_unmethylated_CpG_count}++; | |
520 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
521 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
522 }else{ | |
523 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
524 } | |
525 } elsif ($methylation_calls[$index] eq 'H') { | |
526 #$counting{total_meCHH_count}++; | |
527 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
528 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
529 }else{ | |
530 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
531 } | |
532 } elsif ($methylation_calls[$index] eq 'h') { | |
533 #$counting{total_unmethylated_CHH_count}++; | |
534 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
535 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
536 }else{ | |
537 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
538 } | |
539 } | |
540 elsif ($methylation_calls[$index] eq '.'){} | |
541 else{ | |
542 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
543 } | |
544 } | |
545 } elsif ($strand eq '-') { | |
546 for my $index (0..$#methylation_calls) { | |
547 | |
548 ### Returning as soon as the methylation calls start overlapping | |
549 #if ($start-$index <= $end_read_1) { | |
550 # return; | |
551 #} | |
552 | |
553 if ($methylation_calls[$index] eq 'X') { | |
554 #$counting{total_meCHG_count}++; | |
555 if (defined $bases{$start-$index}{"+"}{$methylation_calls[$index]}){ | |
556 $bases{$start-$index}{"+"}{$methylation_calls[$index]}++; | |
557 }else{ | |
558 $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1; | |
559 } | |
560 } elsif ($methylation_calls[$index] eq 'x') { | |
561 #$counting{total_unmethylated_CHG_count}++; | |
562 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
563 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
564 }else{ | |
565 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
566 } | |
567 } elsif ($methylation_calls[$index] eq 'Z') { | |
568 #$counting{total_meCpG_count}++; | |
569 if (defined $bases{$start-$index}{"+"}{$methylation_calls[$index]}){ | |
570 $bases{$start-$index}{"+"}{$methylation_calls[$index]}++; | |
571 }else{ | |
572 $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1; | |
573 } | |
574 } elsif ($methylation_calls[$index] eq 'z') { | |
575 #$counting{total_unmethylated_CpG_count}++; | |
576 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
577 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
578 }else{ | |
579 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
580 } | |
581 } elsif ($methylation_calls[$index] eq 'H') { | |
582 # $counting{total_meCHH_count}++; | |
583 if (defined $bases{$start-$=cutindex}{"+"}{$methylation_calls[$index]}){ | |
584 $bases{$start-$index}{"+"}{$methylation_calls[$index]}++; | |
585 }else{ | |
586 $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1; | |
587 } | |
588 } elsif ($methylation_calls[$index] eq 'h') { | |
589 #$counting{total_unmethylated_CHH_count}++; | |
590 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
591 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
592 }else{ | |
593 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
594 } | |
595 } | |
596 elsif ($methylation_calls[$index] eq '.') {} | |
597 else{ | |
598 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
599 } | |
600 } | |
601 } else { | |
602 die "This cannot happen $!\n"; | |
603 } | |
604 =cut | |
605 } | |
606 | |
607 | |
608 | |
609 sub hash_individual_C_methylation_states_single_end{ | |
610 | |
611 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index) = @_; | |
612 my @methylation_calls = split(//,$meth_call); | |
613 | |
614 ################################################################# | |
615 ### . for bases not involving cytosines ### | |
616 ### X for methylated C in CHG context (was protected) ### | |
617 ### x for not methylated C in CHG context (was converted) ### | |
618 ### H for methylated C in CHH context (was protected) ### | |
619 ### h for not methylated C in CHH context (was converted) ### | |
620 ### Z for methylated C in CpG context (was protected) ### | |
621 ### z for not methylated C in CpG context (was converted) ### | |
622 ################################################################# | |
623 | |
624 #my $methyl_CHG_count = 0; | |
625 #my $methyl_CHH_count = 0; | |
626 #my $methyl_CpG_count = 0; | |
627 #my $unmethylated_CHG_count = 0; | |
628 #my $unmethylated_CHH_count = 0; | |
629 #my $unmethylated_CpG_count = 0; | |
630 | |
631 ### single-file CpG and other-context output | |
632 if ($strand eq '+') { | |
633 for my $index (0..$#methylation_calls) { | |
634 | |
635 ### methylated Cs (any context) will receive a forward (+) orientation | |
636 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
637 if ($methylation_calls[$index] eq 'X') { | |
638 #$counting{total_meCHG_count}++; | |
639 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ | |
640 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; | |
641 }else{ | |
642 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; | |
643 } | |
644 } | |
645 elsif ($methylation_calls[$index] eq 'x') { | |
646 #$counting{total_unmethylated_CHG_count}++; | |
647 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ | |
648 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; | |
649 }else{ | |
650 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; | |
651 } | |
652 } | |
653 elsif ($methylation_calls[$index] eq 'Z') { | |
654 #$counting{total_meCpG_count}++; | |
655 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ | |
656 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; | |
657 }else{ | |
658 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; | |
659 } | |
660 } | |
661 elsif ($methylation_calls[$index] eq 'z') { | |
662 #$counting{total_unmethylated_CpG_count}++; | |
663 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ | |
664 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; | |
665 }else{ | |
666 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; | |
667 } | |
668 } | |
669 elsif ($methylation_calls[$index] eq 'H') { | |
670 #$counting{total_meCHH_count}++; | |
671 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ | |
672 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; | |
673 }else{ | |
674 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; | |
675 } | |
676 } | |
677 elsif ($methylation_calls[$index] eq 'h') { | |
678 #$counting{total_unmethylated_CHH_count}++; | |
679 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ | |
680 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; | |
681 }else{ | |
682 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; | |
683 } | |
684 } | |
685 elsif ($methylation_calls[$index] eq '.') { | |
686 } | |
687 else{ | |
688 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
689 } | |
690 } | |
691 } | |
692 elsif ($strand eq '-') { | |
693 | |
694 unless ($ignore){ ### if --ignore was specified the start position has already been corrected | |
695 $start += length($meth_call)-1; | |
696 } | |
697 | |
698 for my $index (0..$#methylation_calls) { | |
699 ### methylated Cs (any context) will receive a forward (+) orientation | |
700 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
701 if ($methylation_calls[$index] eq 'X') { | |
702 #$counting{total_meCHG_count}++; | |
703 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ | |
704 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; | |
705 }else{ | |
706 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; | |
707 } | |
708 } | |
709 elsif ($methylation_calls[$index] eq 'x') { | |
710 #$counting{total_unmethylated_CHG_count}++; | |
711 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ | |
712 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; | |
713 }else{ | |
714 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; | |
715 } | |
716 } | |
717 elsif ($methylation_calls[$index] eq 'Z') { | |
718 #$counting{total_meCpG_count}++; | |
719 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ | |
720 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; | |
721 }else{ | |
722 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; | |
723 } | |
724 } | |
725 elsif ($methylation_calls[$index] eq 'z') { | |
726 #$counting{total_unmethylated_CpG_count}++; | |
727 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ | |
728 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; | |
729 }else{ | |
730 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; | |
731 } | |
732 } | |
733 elsif ($methylation_calls[$index] eq 'H') { | |
734 #$counting{total_meCHH_count}++; | |
735 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ | |
736 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; | |
737 }else{ | |
738 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; | |
739 } | |
740 } | |
741 elsif ($methylation_calls[$index] eq 'h') { | |
742 #$counting{total_unmethylated_CHH_count}++; | |
743 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ | |
744 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; | |
745 }else{ | |
746 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; | |
747 } | |
748 } | |
749 elsif ($methylation_calls[$index] eq '.'){ | |
750 } | |
751 else{ | |
752 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
753 } | |
754 } | |
755 } | |
756 else { | |
757 die "This cannot happen (or at least it shouldn't....)\n"; | |
758 } | |
759 } | |
760 | |
761 sub write_hash(){ | |
762 my ($chrom) = @_; | |
763 | |
764 for my $k1 (sort {$a <=> $b} (keys %bases)){ | |
765 my $coverage = 0; | |
766 my $methylated = 0; | |
767 my $context; | |
768 for my $k2 (keys %{$bases{$k1}}){ | |
769 | |
770 for my $k3 (keys %{$bases{$k1}{$k2}}){ | |
771 if ($k3 eq "Z" or $k3 eq "z"){ | |
772 $context = "CPG"; | |
773 }elsif ($k3 eq "H" or $k3 eq "h"){ | |
774 $context = "CHH"; | |
775 }elsif ($k3 eq "X" or $k3 eq "x"){ | |
776 $context = "CHG"; | |
777 } | |
778 | |
779 #test, if $k3 is upper-case (==methylated base) | |
780 if ($k3 eq "\U$k3"){ | |
781 $methylated+=$bases{$k1}{$k2}{$k3}; | |
782 } | |
783 $coverage+=$bases{$k1}{$k2}{$k3}; | |
784 | |
785 } | |
786 my $percentage = ($methylated / $coverage)*100; | |
787 | |
788 ##################################################################################### | |
789 ### BED file format tabs: ### | |
790 ### 1. Chromosome ### | |
791 ### 2. Start position ### | |
792 ### 3. Strand ### | |
793 ### 4. Context (CpG,CHH,CHG) ### | |
794 ### 5. Coverage ### | |
795 ### 6. Methylated Cs ### | |
796 ### 7. Percentage methylated Cs ### | |
797 ##################################################################################### | |
798 | |
799 printf {$fhs{full_output}} ("%s\t%u\t%s\t%s\t%u\t%u\t%.2f\n", $chrom, $k1, $k2, $context, $coverage, $methylated, $percentage ); | |
800 } | |
801 } | |
802 %bases = (); | |
803 } | |
804 | |
805 | |
806 sub write_hash_back(){ | |
807 my ($chrom) = @_; | |
808 | |
809 for my $k1 (sort (keys %bases)){ | |
810 my $coverage = 0; | |
811 my $methylated = 0; | |
812 my $context; | |
813 my $k2 = (keys %{$bases{$k1}})[0]; | |
814 for my $k3 (keys %{$bases{$k1}{$k2}}){ | |
815 if ($k3 eq "Z" or $k3 eq "z"){ | |
816 $context = "CPG"; | |
817 }elsif ($k3 eq "H" or $k3 eq "h"){ | |
818 $context = "CHH"; | |
819 }elsif ($k3 eq "X" or $k3 eq "x"){ | |
820 $context = "CHG"; | |
821 } | |
822 | |
823 #test, if $k3 is upper-case (==methylated base) | |
824 if ($k3 eq "\U$k3"){ | |
825 $methylated+=$bases{$k1}{$k2}{$k3}; | |
826 } | |
827 $coverage+=$bases{$k1}{$k2}{$k3}; | |
828 | |
829 } | |
830 my $percentage = ($methylated / $coverage)*100; | |
831 | |
832 ################################################################################### | |
833 ### BED file format tabs: ### | |
834 ### 1. Chromosome ### | |
835 ### 2. Start position ### | |
836 ### 3. Strand ### | |
837 ### 4. Context (CpG,CHH,CHG) ### | |
838 ### 5. Coverage ### | |
839 ### 6. Methylated Cs ### | |
840 ### 7. Percentage methylated Cs ### | |
841 ################################################################################### | |
842 | |
843 printf {$fhs{full_output}} ("%s\t%u\t%s\t%s\t%u\t%u\t%.2f\n", $chrom, $k1, $k2, $context, $coverage, $methylated, $percentage ); | |
844 | |
845 } | |
846 %bases = (); | |
847 } | |
848 | |
849 | |
850 |