comparison libs/sratoolkit.2.8.0-centos_linux64/example/perl/mismatch-stats.pl @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
comparison
equal deleted inserted replaced
2:0d65b71ff8df 3:38ad1130d077
1 #!/usr/bin/env perl
2 # ===========================================================================
3 #
4 # PUBLIC DOMAIN NOTICE
5 # National Center for Biotechnology Information
6 #
7 # This software/database is a "United States Government Work" under the
8 # terms of the United States Copyright Act. It was written as part of
9 # the author's official duties as a United States Government employee and
10 # thus cannot be copyrighted. This software/database is freely available
11 # to the public for use. The National Library of Medicine and the U.S.
12 # Government have not placed any restriction on its use or reproduction.
13 #
14 # Although all reasonable efforts have been taken to ensure the accuracy
15 # and reliability of the software and data, the NLM and the U.S.
16 # Government do not and cannot warrant the performance or results that
17 # may be obtained by using this software or data. The NLM and the U.S.
18 # Government disclaim all warranties, express or implied, including
19 # warranties of performance, merchantability or fitness for any particular
20 # purpose.
21 #
22 # Please cite the author in any work or product based on this material.
23 #
24 # ===========================================================================
25
26 use warnings;
27
28 use Data::Dumper;
29
30 use constant MAX_READ_COUNT => 2;
31
32 my %opts = (
33 'row-range' => '-1000000',
34 'output-type' => 'proportion'
35 );
36
37 sub usage()
38 {
39 print <<"HELP";
40 report BASE statistics for an SRA
41
42 Usage:
43 $0 [<options>...] <accession>...
44 options are
45 -h | -? | --help
46 --row-range <row-range-spec> default is first 1000000 spots
47 --output-type <output-spec> set the type of data output
48 count, percent, or proportion
49 proportion is the default
50
51 Example:
52 $0 --row-range "5,7,11-15,25-37" --output-type count SRR797646
53
54 HELP
55 exit 0;
56 }
57
58 usage if scalar @ARGV == 0;
59 foreach (@ARGV) {
60 usage() if (/^-h$/ || /^-\?$/ || /^--help$/);
61 }
62
63 my $VDB_DUMP = `which vdb-dump` or die "Please put path to vdb-dump in PATH";
64 chomp $VDB_DUMP;
65
66 my @stats = ();
67 my $max_read = 0;
68 my $max_len = 0;
69
70 sub get_index($$$)
71 {
72 my ($pos, $read, $base) = @_;
73
74 return ($pos * MAX_READ_COUNT + $read) * 6 + $base;
75 }
76
77 sub resize_stats($)
78 {
79 my $need = get_index $_[0], MAX_READ_COUNT, 0;
80 my $have = scalar(@stats);
81
82 return unless ($have < $need);
83 splice @stats, $have, 0, map { 0 } (0..($need - $have));
84 }
85
86 sub translate_base($)
87 {
88 return 0 if $_[0] eq '=';
89 return 1 if $_[0] eq 'A';
90 return 2 if $_[0] eq 'C';
91 return 3 if $_[0] eq 'G';
92 return 4 if $_[0] eq 'T';
93 return 5;
94 }
95
96 sub translate_base_complement($)
97 {
98 return 0 if $_[0] eq '=';
99 return 1 if $_[0] eq 'T';
100 return 2 if $_[0] eq 'G';
101 return 3 if $_[0] eq 'C';
102 return 4 if $_[0] eq 'A';
103 return 5;
104 }
105
106 sub expand_and_summarize($)
107 {
108 my ($READ_ID, $ORIENT, $MM_READ) = \split(/\t/, ${$_[0]});
109 my $len = length $$MM_READ;
110 my $read = $$READ_ID - 1;
111
112 $max_read = $read if $max_read < $read;
113 $max_len = $len if $max_len < $len ;
114
115 resize_stats $len;
116 if ($$ORIENT eq 'false') {
117 for (my $i = 0; $i != $len; ++$i) {
118 my $base = translate_base(substr($$MM_READ, $i, 1));
119
120 ++$stats[get_index($i, $read, $base)];
121 }
122 }
123 else {
124 for (my $j = 0; $j != $len; ++$j) {
125 my $i = $len - $j - 1;
126 my $base = translate_base_complement(substr($$MM_READ, $i, 1));
127
128 ++$stats[get_index($i, $read, $base)];
129 }
130 }
131 }
132
133 sub check_aligned($)
134 {
135 my $aligned = 0;
136
137 if (`$VDB_DUMP -y "$_[0]"` =~ /Database/i) {
138 open CMD, '-|', "$VDB_DUMP -E \"$_[0]\"" or die "$!";
139
140 while (defined(local $_ = <CMD>)) {
141 chomp;
142 if (/PRIMARY_ALIGNMENT/) {
143 $aligned = 1;
144 last;
145 }
146 }
147 close CMD;
148 }
149 return $aligned;
150 }
151
152 sub process($)
153 {
154 my $cmd = "$VDB_DUMP -f tab -C \"SEQ_READ_ID,REF_ORIENTATION,MISMATCH_READ\" -R $opts{'row-range'} -T PRIMARY_ALIGNMENT \"$_[0]\"";
155 open CMD, '-|', $cmd or die "$!";
156
157 while (defined($_ = <CMD>)) {
158 chomp;
159 expand_and_summarize \$_;
160 }
161 close CMD;
162 }
163
164 for (my $i = 0; $i < scalar @ARGV; ++$i) {
165 $_ = $ARGV[$i];
166
167 if (/^-/) {
168 if (/^--row-range$/) {
169 $opts{'row-range'} = $ARGV[++$i];
170 next;
171 }
172 if (/^--output-type$/) {
173 $opts{'output-type'} = $ARGV[++$i];
174 next;
175 }
176 usage();
177 }
178 if (check_aligned $_) {
179 process $_;
180 }
181 else {
182 print "$_ is not aligned\n";
183 }
184 }
185
186 sub format_counts($$$$)
187 {
188 my ($x, $cnt) = @_;
189
190 return $x;
191 }
192
193 sub format_proportion($$$$)
194 {
195 my ($x, $cnt) = @_;
196
197 return $x/$cnt;
198 }
199
200 sub format_percent($$$$)
201 {
202 my ($x, $cnt) = @_;
203
204 return int($x * 100.0/$cnt + 0.5).'%';
205 }
206
207 my $format = \&format_proportion;
208
209 if ($opts{'output-type'} eq 'count') {
210 $format = \&format_counts;
211 }
212 elsif ($opts{'output-type'} eq 'proportion') {
213 $format = \&format_proportion;
214 }
215 elsif ($opts{'output-type'} eq 'percent') {
216 $format = \&format_percent;
217 }
218 else {
219 usage();
220 }
221
222 printf join("\t", ('Read', 'Pos', '=', 'A', 'C', 'G', 'T', 'N'))."\n";
223
224 for (my $read = 0; $read <= $max_read; ++$read) {
225 for (my $pos = 0; $pos != $max_len; ++$pos) {
226 my $idx = get_index $pos, $read, 0;
227 my $tot = $stats[$idx+0] + $stats[$idx+1] + $stats[$idx+2]
228 + $stats[$idx+3] + $stats[$idx+4] + $stats[$idx+5];
229
230 if ($tot) {
231 print join("\t", ($read + 1, $pos + 1,
232 &$format($stats[$idx+0], $tot),
233 &$format($stats[$idx+1], $tot),
234 &$format($stats[$idx+2], $tot),
235 &$format($stats[$idx+3], $tot),
236 &$format($stats[$idx+4], $tot),
237 &$format($stats[$idx+5], $tot)
238 ))."\n";
239 }
240 }
241 }