comparison libs/sratoolkit.2.8.0-centos_linux64/example/perl/base-stats.pl @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
comparison
equal deleted inserted replaced
2:0d65b71ff8df 3:38ad1130d077
1 #!/usr/bin/env perl
2 # =============================================================================
3 #
4 # PUBLIC DOMAIN NOTICE
5 # National Center for Biotechnology Information
6 #
7 # This software/database is a "United States Government Work" under the
8 # terms of the United States Copyright Act. It was written as part of
9 # the author's official duties as a United States Government employee and
10 # thus cannot be copyrighted. This software/database is freely available
11 # to the public for use. The National Library of Medicine and the U.S.
12 # Government have not placed any restriction on its use or reproduction.
13 #
14 # Although all reasonable efforts have been taken to ensure the accuracy
15 # and reliability of the software and data, the NLM and the U.S.
16 # Government do not and cannot warrant the performance or results that
17 # may be obtained by using this software or data. The NLM and the U.S.
18 # Government disclaim all warranties, express or implied, including
19 # warranties of performance, merchantability or fitness for any particular
20 # purpose.
21 #
22 # Please cite the author in any work or product based on this material.
23 #
24 # =============================================================================
25
26 =head1 NAME
27
28 base-stats - report base statistics for an SRA run
29
30 =head1 DESCRIPTION
31
32 This script computes base and GC content statistics for an SRA run. The output
33 is summarized by mate and position within the mate.
34
35 =head1 PURPOSE
36
37 This script demonstrates driving the SRA Toolkit tool 'vdb-dump' from Perl for
38 accessing data in the SRA. It requires that the toolkit be properly configured.
39 'vdb-dump' is the Jack-of-all-trades of the SRA Toolkit.
40
41 =head1 EXAMPLE
42
43 base-stats.pl SRR797646
44
45 =head1 SEE ALSO
46
47 base-stats.pl --help
48
49 vdb-dump --help
50
51 =cut
52
53 use strict;
54 use warnings;
55
56 sub usage()
57 {
58 print <<"HELP";
59 report base statistics for an SRA run
60
61 Usage:
62 $0 [<options>...] <accession>...
63 options are
64 -h | -? | --help shows this message
65 --row-range <row-range-spec> default is spots 1-1000000
66 row-range-spec a comma seperated list of row numbers and/or
67 ranges; a range is <first>-<last>
68
69 --output-type <output-spec> set the type of data output; proportion is
70 the default
71 output-spec one of 'count', 'percent', or 'proportion'
72
73 accession an SRA accession or path to an SRA file
74
75 Example:
76 $0 --row-range "5,7,11-15,25-37" --output-type count SRR797646
77
78 HELP
79 exit 0;
80 }
81
82 usage() if scalar @ARGV == 0;
83 foreach (@ARGV) {
84 usage() if (/^-h$/ || /^-\?$/ || /^--help$/);
85 }
86
87 use constant MAX_READ_COUNT => 2;
88
89 # the columns to be read from the input
90 use constant COLUMN_LIST => 'READ_START,READ_LEN,READ_TYPE,READ';
91
92 # need vdb-dump to read input files
93 my $VDB_DUMP = `which vdb-dump` or die "Please put path to vdb-dump in PATH";
94 chomp $VDB_DUMP;
95
96 my %opts = (
97 'row-range' => '1-1000000', # first million spots
98 'output-type' => 'proportion'
99 );
100
101 sub foreach_row($$)
102 {
103 my ($source, $callback) = @_;
104 my $cmd = $VDB_DUMP
105 . ' --rows "' . $opts{'row-range'} . '"' # set the row range
106 . ' --columns "' . COLUMN_LIST . '"' # set the columns to be read
107 . ' --format tab' # set output format to tab delimited
108 . ' "' . $source . '"'; # the file/accession to be read
109 open CMD, '-|', $cmd or die "failed to exec vdb-dump: $!";
110
111 while (defined(local $_ = <CMD>)) {
112 # print; # uncomment to print raw output from vdb-dump (e.g. for debugging)
113 chomp;
114
115 &$callback(\split(/\t/, $_));
116 }
117 close CMD;
118 }
119
120 sub sra_object_type($)
121 {
122 open CMD, '-|', $VDB_DUMP .' --obj_type "' . $_[0] . '"' or die "failed to exec vdb-dump: $!";
123 local $_ = <CMD>;
124
125 close CMD;
126 chomp;
127 return $_;
128 }
129
130 sub sra_list_tables($)
131 {
132 my @rslt = ();
133
134 if (sra_object_type($_[0]) eq 'Database') {
135 open CMD, '-|', $VDB_DUMP . ' --table_enum "' . $_[0] . '"' or die "failed to exec vdb-dump: $!";
136 while (defined(local $_ = <CMD>)) {
137 push @rslt, $1 if /^tbl #\d+:\s*(\w+)/;
138 }
139 close CMD;
140 }
141 return @rslt;
142 }
143
144 sub warn_if_aligned($);
145 sub print_result();
146 sub process;
147
148 my @stats = ();
149 my $max_read = 0;
150 my $max_pos = 0;
151
152 for (my $i = 0; $i < scalar @ARGV; ++$i) {
153 $_ = $ARGV[$i];
154
155 if (/^-/) {
156 if (/^--row-range$/) {
157 $opts{'row-range'} = $ARGV[++$i];
158 next;
159 }
160 if (/^--output-type$/) {
161 my $type = $ARGV[++$i];
162
163 if ( $type eq 'count'
164 || $type eq 'proportion'
165 || $type eq 'percent')
166 {
167 $opts{'output-type'} = $type;
168 next;
169 }
170 }
171 usage();
172 }
173 warn_if_aligned $_;
174 foreach_row($_, \&process);
175 }
176
177 print_result();
178 exit 0;
179
180 sub warn_if_aligned($)
181 {
182 local $_;
183
184 foreach (sra_list_tables($_[0])) {
185 if (/PRIMARY_ALIGNMENT/) {
186 print "$_[0] is aligned and is likely to be mostly ordered; results may be biased to the reference\n";
187 last;
188 }
189 }
190 }
191
192 sub get_stat($$$)
193 {
194 # (pos * 5 + base) * MAX_READ_COUNT + read_number;
195 return $stats[($_[1] * 5 + $_[2]) * MAX_READ_COUNT + $_[0]];
196 }
197
198 sub inc_stat($$$) # this is called a zillion times and needs to be fast
199 {
200 # (pos * 5 + base) * MAX_READ_COUNT + read_number;
201 ++$stats[($_[1] * 5 + $_[2]) * MAX_READ_COUNT + $_[0]];
202 }
203
204 sub resize_stats($)
205 {
206 my $length = $_[0];
207 my $need = MAX_READ_COUNT * 5 * $length;
208 my $have = scalar(@stats);
209
210 return if $have >= $need;
211 splice @stats, $have, 0, map { 0 } (1..($need - $have));
212 }
213
214 sub convert_bases($)
215 {
216 local $_ = ${$_[0]};
217
218 tr/ACGTN/01234/;
219 return $_;
220 }
221
222 # accumulate statistics for one spot
223 sub process
224 {
225 # this matches COLUMN_LIST above
226 my ($READ_START, $READ_LEN, $READ_TYPE, $READ) = @_;
227
228 # the data in these fields is comma delimited
229 # and describe how to split READ up into its pieces
230 my @start = split(/,\s*/, $$READ_START);
231 my @length = split(/,\s*/, $$READ_LEN);
232 my @type = split(/,\s*/, $$READ_TYPE);
233
234 # READ contains the bases for the whole spot (i.e. all the mates concatenated)
235 my @bases = split(//, convert_bases($READ));
236 my $reads = scalar(@start);
237
238 die "too many reads: $reads reads; adjust MAX_READ_COUNT and rerun" unless $reads <= MAX_READ_COUNT;
239
240 $max_read = $reads if ($max_read < $reads);
241 for (my $read = 0; $read != $reads; ++$read) {
242 # only count biological bases (i.e. no adapters, linkers, etc.)
243 next unless $type[$read] =~ /SRA_READ_TYPE_BIOLOGICAL/;
244 my $pos = $start[$read];
245 my $len = $length[$read];
246
247 resize_stats($len);
248 $max_pos = $len if ($max_pos < $len);
249 for (my $i = 0; $i != $len; ++$i) {
250 inc_stat($read, $i, $bases[$pos + $i]);
251 }
252 }
253 }
254
255 sub print_result() {
256 my $GCp_mean;
257 my $GCp_stdev;
258 my $sum = 0;
259 my $ssum = 0;
260 my $format =
261 $opts{'output-type'} eq 'count' ? sub { $_[0] } :
262 $opts{'output-type'} eq 'percent' ? sub { int($_[0] * 100.0/$_[1] + 0.5).'%' } :
263 sub { $_[0]/$_[1] };
264
265 for (my $read = 0; $read != $max_read; ++$read) {
266 for (my $pos = 0; $pos != $max_pos; ++$pos) {
267 my $gc = (get_stat($read, $pos, 1) + get_stat($read, $pos, 2)) * 1.0;
268 my $n = $gc + get_stat($read, $pos, 0) + get_stat($read, $pos, 3);
269 my $x = $gc/$n;
270
271 $sum += $x;
272 $ssum += $x * $x;
273 }
274 }
275 $GCp_mean = $sum/($max_pos*$max_read);
276 $GCp_stdev = sqrt(($ssum - $sum*$GCp_mean)/($max_pos*$max_read));
277
278 printf join("\t", ('Read', 'Pos', 'A', 'C', 'G', 'T', 'N', 'GC', 'Suspect'))."\n";
279 for (my $read = 0; $read != $max_read; ++$read) {
280 for (my $pos = 0; $pos != $max_pos; ++$pos) {
281 my $cntA = get_stat($read, $pos, 0);
282 my $cntC = get_stat($read, $pos, 1);
283 my $cntG = get_stat($read, $pos, 2);
284 my $cntT = get_stat($read, $pos, 3);
285 my $cntN = get_stat($read, $pos, 4);
286 my $GC = $cntC + $cntG;
287 my $tot = $cntA + $GC + $cntT;
288
289 # keep N out for GC
290 my $GC_score = ($GC/$tot - $GCp_mean)/$GCp_stdev;
291
292 # add N in
293 $tot += $cntN;
294 print join("\t", ($read + 1, $pos + 1,
295 &$format($cntA, $tot),
296 &$format($cntC, $tot),
297 &$format($cntG, $tot),
298 &$format($cntT, $tot),
299 &$format($cntN, $tot),
300 &$format($GC, $tot),
301 '*'x(int(abs($GC_score)))))."\n";
302 }
303 }
304 }