Mercurial > repos > jdv > b2b_summarize_run
comparison summarize_run.pl @ 0:bd599efae04d draft
planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit b5d52d0664b01d252cf61b98be373d09f1ecc2df
| author | jdv |
|---|---|
| date | Wed, 17 Jul 2019 17:48:19 -0400 |
| parents | |
| children | 10c319d654df |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:bd599efae04d |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 | |
| 6 use Getopt::Long; | |
| 7 use BioX::Seq::Stream; | |
| 8 use List::Util qw/sum/; | |
| 9 | |
| 10 my $fn_raw1; | |
| 11 my $fn_raw2; | |
| 12 my $fn_filt1; | |
| 13 my $fn_filt2; | |
| 14 my $fn_bedgraph; | |
| 15 my $fn_qc1; | |
| 16 my $fn_qc2; | |
| 17 my $fn_consensus; | |
| 18 my $fn_out; | |
| 19 my $n_threads = 1; | |
| 20 my $max_aln = 100000; | |
| 21 | |
| 22 GetOptions( | |
| 23 'raw_1=s' => \$fn_raw1, | |
| 24 'raw_2=s' => \$fn_raw2, | |
| 25 'filt_1=s' => \$fn_filt1, | |
| 26 'filt_2=s' => \$fn_filt2, | |
| 27 'bedgraph=s' => \$fn_bedgraph, | |
| 28 'fastqc_1=s' => \$fn_qc1, | |
| 29 'fastqc_2=s' => \$fn_qc2, | |
| 30 'consensus=s' => \$fn_consensus, | |
| 31 'out=s' => \$fn_out, | |
| 32 'threads=i' => \$n_threads, | |
| 33 'max_aln=i' => \$max_aln, | |
| 34 ); | |
| 35 | |
| 36 | |
| 37 my @counts; | |
| 38 for ($fn_raw1, $fn_raw2, $fn_filt1, $fn_filt2) { | |
| 39 open my $in, '-|', 'wc', '-l', $_; | |
| 40 my $ret = <$in>; | |
| 41 close $in; | |
| 42 chomp $ret; | |
| 43 my ($count, $fn) = split ' ', $ret; | |
| 44 die "line length not multiple of four for $_\n" | |
| 45 if ($count % 4); | |
| 46 push @counts, $count/4; | |
| 47 } | |
| 48 | |
| 49 | |
| 50 die "raw pair count mismatch\n" if ($counts[0] != $counts[1]); | |
| 51 die "filtered pair count mismatch\n" if ($counts[2] != $counts[3]); | |
| 52 | |
| 53 #warn "calculating fragment length stats...\n"; | |
| 54 my @lens; | |
| 55 open my $stream, '-|', "frag_lens","--forward",$fn_filt1,"--reverse",$fn_filt2,"--ref",$fn_consensus,"--threads",$n_threads,"--max_aln",$max_aln; | |
| 56 while (<$stream>) { | |
| 57 chomp $_; | |
| 58 push @lens, $_; | |
| 59 } | |
| 60 close $stream; | |
| 61 | |
| 62 my $frag_mean = int( sum(@lens)/scalar(@lens)+0.5 ); | |
| 63 my $frag_sd = int( sqrt( sum( map {($_ - $frag_mean)**2} @lens)/(scalar(@lens)-1) )+0.5 ); | |
| 64 | |
| 65 # extract FastQC data | |
| 66 #warn "extracting FastQC stats...\n"; | |
| 67 | |
| 68 my @five_nums; | |
| 69 for my $fn ($fn_qc1, $fn_qc2) { | |
| 70 open my $in, '<', $fn; | |
| 71 | |
| 72 my $in_data = 0; | |
| 73 my @data; | |
| 74 LINE: | |
| 75 while (my $line = <$in>) { | |
| 76 chomp $line; | |
| 77 if ($in_data) { | |
| 78 if ($line =~ /^>>END_MODULE/) { | |
| 79 last LINE; | |
| 80 } | |
| 81 next if ($line =~ /^#/); | |
| 82 my ($score, $count) = split ' ', $line; | |
| 83 push @data, [$score,$count]; | |
| 84 } | |
| 85 elsif ($line =~ /^>>Per sequence quality scores/) { | |
| 86 $in_data = 1; | |
| 87 } | |
| 88 } | |
| 89 | |
| 90 push @five_nums, data_to_5( @data ); | |
| 91 } | |
| 92 | |
| 93 # Count contigs | |
| 94 | |
| 95 my $p = BioX::Seq::Stream->new($fn_consensus); | |
| 96 my %n_contigs; | |
| 97 my @names; | |
| 98 while (my $seq = $p->next_seq) { | |
| 99 | |
| 100 my $id = $seq->id; | |
| 101 push @names, $id; | |
| 102 while ($seq =~ /[^Nn]+/g) { | |
| 103 ++$n_contigs{$id}; | |
| 104 } | |
| 105 } | |
| 106 | |
| 107 # Parse assembly depth info | |
| 108 #warn "calculating coverage stats...\n"; | |
| 109 | |
| 110 open my $in, '<', $fn_bedgraph; | |
| 111 | |
| 112 my %cov_5nums; | |
| 113 my %counts; | |
| 114 my $last_end; | |
| 115 my $last_contig; | |
| 116 my $head = <$in>; | |
| 117 while (my $line = <$in>) { | |
| 118 chomp $line; | |
| 119 my ($contig,$start,$end,$depth) = split "\t", $line; | |
| 120 $last_contig //= $contig; | |
| 121 if ($contig ne $last_contig) { | |
| 122 | |
| 123 my @depths = sort {$a <=> $b} keys %counts; | |
| 124 my @data; | |
| 125 for (@depths) { | |
| 126 push @data, [$_, $counts{$_}]; | |
| 127 } | |
| 128 $cov_5nums{$last_contig} = data_to_5( @data ); | |
| 129 $last_contig = $contig; | |
| 130 %counts = (); | |
| 131 $last_end = undef; | |
| 132 } | |
| 133 | |
| 134 if (defined($last_end) && $last_end < $start) { | |
| 135 $counts{0} += $start - $last_end; | |
| 136 } | |
| 137 $counts{$depth} += $end - $start; | |
| 138 $last_end = $end; | |
| 139 } | |
| 140 my @depths = sort {$a <=> $b} keys %counts; | |
| 141 my @data; | |
| 142 for (@depths) { | |
| 143 push @data, [$_, $counts{$_}]; | |
| 144 } | |
| 145 $cov_5nums{$last_contig} = data_to_5( @data ); | |
| 146 | |
| 147 open my $out, '>', $fn_out; | |
| 148 print {$out} join("\t", | |
| 149 '#id', | |
| 150 'raw_read_pairs', | |
| 151 'filt_read_pairs', | |
| 152 'frag_len_mean', | |
| 153 'frag_len_sd', | |
| 154 'forward_qual', | |
| 155 'reverse_qual', | |
| 156 'n_contigs', | |
| 157 'coverage_depth', | |
| 158 ), "\n"; | |
| 159 for my $id (@names) { | |
| 160 print {$out} join("\t", | |
| 161 $id, | |
| 162 $counts[0], | |
| 163 $counts[2], | |
| 164 $frag_mean, | |
| 165 $frag_sd, | |
| 166 $five_nums[0], | |
| 167 $five_nums[1], | |
| 168 $n_contigs{$id}, | |
| 169 $cov_5nums{$id}, | |
| 170 ), "\n"; | |
| 171 } | |
| 172 close $out; | |
| 173 | |
| 174 exit; | |
| 175 | |
| 176 | |
| 177 sub data_to_5 { | |
| 178 | |
| 179 my (@data) = @_; | |
| 180 my $total = sum map {$_->[1]} @data; | |
| 181 my @five_num; | |
| 182 my $curr = 0; | |
| 183 for my $i (0..$#data) { | |
| 184 $curr += $data[$i]->[1]; | |
| 185 for my $j (0..4) { | |
| 186 next if (defined $five_num[$j]); | |
| 187 my $quant = $j/4; | |
| 188 if ($curr/$total > $quant) { | |
| 189 $five_num[$j] = $data[$i]->[0]; | |
| 190 } | |
| 191 elsif ($curr/$total == $quant) { | |
| 192 $five_num[$j] = $i < $#data | |
| 193 ? int(($data[$i]->[0]+$data[$i+1]->[0])/2) | |
| 194 : $data[$i]->[0]; | |
| 195 } | |
| 196 } | |
| 197 } | |
| 198 return join('|',@five_num); | |
| 199 | |
| 200 } |
