annotate sum_fastqc.pl @ 7:b769c810924e draft default tip

Added option for multiple Q scores
author estrain
date Wed, 17 Oct 2018 07:11:51 -0400
parents a2c2dc7bc724
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cce90961c022 Uploaded
estrain
parents:
diff changeset
1 #!/usr/bin/perl
cce90961c022 Uploaded
estrain
parents:
diff changeset
2
cce90961c022 Uploaded
estrain
parents:
diff changeset
3 ####################################################
cce90961c022 Uploaded
estrain
parents:
diff changeset
4 ##
cce90961c022 Uploaded
estrain
parents:
diff changeset
5 ## sum_fastqc.pl
cce90961c022 Uploaded
estrain
parents:
diff changeset
6 ##
cce90961c022 Uploaded
estrain
parents:
diff changeset
7 ## Errol Strain (estrain@gmail.com)
cce90961c022 Uploaded
estrain
parents:
diff changeset
8 ##
cce90961c022 Uploaded
estrain
parents:
diff changeset
9 ## Description: Takes raw FASTQC output and produces
cce90961c022 Uploaded
estrain
parents:
diff changeset
10 ## simple table summary
cce90961c022 Uploaded
estrain
parents:
diff changeset
11 ##
cce90961c022 Uploaded
estrain
parents:
diff changeset
12 ####################################################
cce90961c022 Uploaded
estrain
parents:
diff changeset
13
5
a2c2dc7bc724 Uploaded
estrain
parents: 3
diff changeset
# Command line:
#   sum_fastqc.pl <input name> <Q cutoffs, comma separated> <raw FastQC file>...
#
# $inname is echoed as the first column of every row; @qlist holds the
# quality cutoffs that each get their own "Q<cutoff>" column.  Both are read
# by print_stats below, so they stay file-level lexicals.
my($inname)=shift(@ARGV);
my($qscore)=shift(@ARGV);

# Fail early with a readable message instead of operating on undefined values.
unless (defined($inname) && defined($qscore)) {
    die "Usage: $0 <input name> <Q cutoffs, e.g. 20,30> <raw FastQC file>...\n";
}

# Whitespace inside the cutoff list is not significant ("20, 30" == "20,30").
$qscore =~ s/\s+//g;

# Drop empty entries (e.g. from "20,,30") so no bare "Q" column is produced.
my(@qlist) = grep { length($_) } split(/,/, $qscore);

# Header row: fixed columns followed by one column per requested cutoff.
print join("\t",
    "Input", "File", "FastQC", "Pass-Fail", "Reads", "Poor_Reads", "GC",
    map { "Q" . $_ } @qlist
), "\n";

# Every remaining argument is a raw FastQC file; emit one row for each.
foreach my $fastqc_file (@ARGV) {
    print_stats($fastqc_file);
}
0
cce90961c022 Uploaded
estrain
parents:
diff changeset
28
5
a2c2dc7bc724 Uploaded
estrain
parents: 3
diff changeset
# print_stats - print one tab-separated summary row for a raw FastQC file.
#
# Argument:
#   $infile - path to the raw FastQC output file to summarise
#
# Output (STDOUT), tab separated and newline terminated:
#   input name, file name, FastQC version, pass/fail, read count,
#   poor-quality read count, GC, then one qcal() percentage per entry of the
#   file-level @qlist.  Uses the file-level $inname for the first column.
#
# The file is read directly in Perl instead of being passed to head/awk/tail
# through backticks: the path is therefore never interpreted by a shell
# (spaces or metacharacters in file names are safe) and no GNU-specific
# "head -n -1" is required.  Dies if the file cannot be opened.
sub print_stats {
    my ($infile) = @_;

    open(my $fh, '<', $infile) or die "Cannot open $infile: $!\n";

    my @sumlines;        # first 10 lines: basic overview
    my @qlines;          # rows between "#Quality<TAB>Count" and ">>END_MODULE"
    my $state   = 0;     # 0 = before the quality table, 1 = inside, 2 = done
    my $line_no = 0;

    while (my $line = <$fh>) {
        chomp($line);
        $line_no++;
        push(@sumlines, $line) if $line_no <= 10;

        if ($state == 0) {
            # The header line itself is not part of the data.
            $state = 1 if $line =~ /#Quality\tCount/;
        }
        elsif ($state == 1) {
            if ($line =~ />>END_MODULE/) {
                $state = 2;
            }
            else {
                push(@qlines, $line);
            }
        }
    }
    close($fh);

    # Second tab-separated field of a summary line, or '' when absent.
    my $value_of = sub {
        my ($text) = @_;
        return '' unless defined $text;
        my @fields = split(/\t/, $text);
        return defined $fields[1] ? $fields[1] : '';
    };

    # Positions within the first 10 lines of the overview section.
    my $fastqc = $value_of->($sumlines[0]);
    my $pass   = $value_of->($sumlines[1]);
    my $fn     = $value_of->($sumlines[3]);
    my $nreads = $value_of->($sumlines[6]);
    my $npoor  = $value_of->($sumlines[7]);
    my $gc     = $value_of->($sumlines[9]);

    print join("\t", $inname, $fn, $fastqc, $pass, $nreads, $npoor, $gc);
    foreach my $qs (@qlist) {
        print "\t";
        print qcal($nreads, $qs, \@qlines);
    }
    print "\n";
}
0
cce90961c022 Uploaded
estrain
parents:
diff changeset
63
cce90961c022 Uploaded
estrain
parents:
diff changeset
# qcal - percentage of reads whose quality score is at or above a cutoff.
#
# Arguments (positional):
#   $nreads - total number of reads, used as the denominator
#   $cutoff - quality threshold; rows with quality >= $cutoff are counted
#   $qarray - array reference of "quality<TAB>count" strings
#
# Returns 100 * (sum of matching counts) / $nreads formatted with two
# decimals (e.g. "97.31").  When $nreads is undefined or zero the result is
# "0.00" rather than a fatal "Illegal division by zero".
sub qcal {
    my ($nreads, $cutoff, $qarray) = @_;

    my $sum = 0;
    foreach my $item (@{ $qarray || [] }) {
        my ($qval, $count) = split(/\t/, $item);
        next unless defined $count;      # row without a count contributes nothing
        $sum += $count if $qval >= $cutoff;
    }

    # No reads means there is nothing to take a percentage of.
    return sprintf("%.2f", 0) unless defined($nreads) && $nreads != 0;

    return sprintf("%.2f", 100 * $sum / $nreads);
}