Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/example/perl/base-stats.pl @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:0d65b71ff8df | 3:38ad1130d077 |
---|---|
1 #!/usr/bin/env perl | |
2 # ============================================================================= | |
3 # | |
4 # PUBLIC DOMAIN NOTICE | |
5 # National Center for Biotechnology Information | |
6 # | |
7 # This software/database is a "United States Government Work" under the | |
8 # terms of the United States Copyright Act. It was written as part of | |
9 # the author's official duties as a United States Government employee and | |
10 # thus cannot be copyrighted. This software/database is freely available | |
11 # to the public for use. The National Library of Medicine and the U.S. | |
12 # Government have not placed any restriction on its use or reproduction. | |
13 # | |
14 # Although all reasonable efforts have been taken to ensure the accuracy | |
15 # and reliability of the software and data, the NLM and the U.S. | |
16 # Government do not and cannot warrant the performance or results that | |
17 # may be obtained by using this software or data. The NLM and the U.S. | |
18 # Government disclaim all warranties, express or implied, including | |
19 # warranties of performance, merchantability or fitness for any particular | |
20 # purpose. | |
21 # | |
22 # Please cite the author in any work or product based on this material. | |
23 # | |
24 # ============================================================================= | |
25 | |
26 =head1 NAME | |
27 | |
28 base-stats - report base statistics for an SRA run | |
29 | |
30 =head1 DESCRIPTION | |
31 | |
32 This script computes base and GC content statistics for an SRA run. The output | |
33 is summarized by mate and position within the mate. | |
34 | |
35 =head1 PURPOSE | |
36 | |
37 This script demonstrates driving the SRA Toolkit tool 'vdb-dump' from Perl for | |
38 accessing data in the SRA. It requires that the toolkit be properly configured. | |
39 'vdb-dump' is the Jack-of-all-trades of the SRA Toolkit. | |
40 | |
41 =head1 EXAMPLE | |
42 | |
43 base-stats.pl SRR797646 | |
44 | |
45 =head1 SEE ALSO | |
46 | |
47 base-stats.pl --help | |
48 | |
49 vdb-dump --help | |
50 | |
51 =cut | |
52 | |
53 use strict; | |
54 use warnings; | |
55 | |
56 sub usage() | |
57 { | |
58 print <<"HELP"; | |
59 report base statistics for an SRA run | |
60 | |
61 Usage: | |
62 $0 [<options>...] <accession>... | |
63 options are | |
64 -h | -? | --help shows this message | |
65 --row-range <row-range-spec> default is spots 1-1000000 | |
66 row-range-spec a comma seperated list of row numbers and/or | |
67 ranges; a range is <first>-<last> | |
68 | |
69 --output-type <output-spec> set the type of data output; proportion is | |
70 the default | |
71 output-spec one of 'count', 'percent', or 'proportion' | |
72 | |
73 accession an SRA accession or path to an SRA file | |
74 | |
75 Example: | |
76 $0 --row-range "5,7,11-15,25-37" --output-type count SRR797646 | |
77 | |
78 HELP | |
79 exit 0; | |
80 } | |
81 | |
82 usage() if scalar @ARGV == 0; | |
83 foreach (@ARGV) { | |
84 usage() if (/^-h$/ || /^-\?$/ || /^--help$/); | |
85 } | |
86 | |
87 use constant MAX_READ_COUNT => 2; | |
88 | |
89 # the columns to be read from the input | |
90 use constant COLUMN_LIST => 'READ_START,READ_LEN,READ_TYPE,READ'; | |
91 | |
92 # need vdb-dump to read input files | |
93 my $VDB_DUMP = `which vdb-dump` or die "Please put path to vdb-dump in PATH"; | |
94 chomp $VDB_DUMP; | |
95 | |
96 my %opts = ( | |
97 'row-range' => '1-1000000', # first million spots | |
98 'output-type' => 'proportion' | |
99 ); | |
100 | |
101 sub foreach_row($$) | |
102 { | |
103 my ($source, $callback) = @_; | |
104 my $cmd = $VDB_DUMP | |
105 . ' --rows "' . $opts{'row-range'} . '"' # set the row range | |
106 . ' --columns "' . COLUMN_LIST . '"' # set the columns to be read | |
107 . ' --format tab' # set output format to tab delimited | |
108 . ' "' . $source . '"'; # the file/accession to be read | |
109 open CMD, '-|', $cmd or die "failed to exec vdb-dump: $!"; | |
110 | |
111 while (defined(local $_ = <CMD>)) { | |
112 # print; # uncomment to print raw output from vdb-dump (e.g. for debugging) | |
113 chomp; | |
114 | |
115 &$callback(\split(/\t/, $_)); | |
116 } | |
117 close CMD; | |
118 } | |
119 | |
120 sub sra_object_type($) | |
121 { | |
122 open CMD, '-|', $VDB_DUMP .' --obj_type "' . $_[0] . '"' or die "failed to exec vdb-dump: $!"; | |
123 local $_ = <CMD>; | |
124 | |
125 close CMD; | |
126 chomp; | |
127 return $_; | |
128 } | |
129 | |
130 sub sra_list_tables($) | |
131 { | |
132 my @rslt = (); | |
133 | |
134 if (sra_object_type($_[0]) eq 'Database') { | |
135 open CMD, '-|', $VDB_DUMP . ' --table_enum "' . $_[0] . '"' or die "failed to exec vdb-dump: $!"; | |
136 while (defined(local $_ = <CMD>)) { | |
137 push @rslt, $1 if /^tbl #\d+:\s*(\w+)/; | |
138 } | |
139 close CMD; | |
140 } | |
141 return @rslt; | |
142 } | |
143 | |
144 sub warn_if_aligned($); | |
145 sub print_result(); | |
146 sub process; | |
147 | |
148 my @stats = (); | |
149 my $max_read = 0; | |
150 my $max_pos = 0; | |
151 | |
152 for (my $i = 0; $i < scalar @ARGV; ++$i) { | |
153 $_ = $ARGV[$i]; | |
154 | |
155 if (/^-/) { | |
156 if (/^--row-range$/) { | |
157 $opts{'row-range'} = $ARGV[++$i]; | |
158 next; | |
159 } | |
160 if (/^--output-type$/) { | |
161 my $type = $ARGV[++$i]; | |
162 | |
163 if ( $type eq 'count' | |
164 || $type eq 'proportion' | |
165 || $type eq 'percent') | |
166 { | |
167 $opts{'output-type'} = $type; | |
168 next; | |
169 } | |
170 } | |
171 usage(); | |
172 } | |
173 warn_if_aligned $_; | |
174 foreach_row($_, \&process); | |
175 } | |
176 | |
177 print_result(); | |
178 exit 0; | |
179 | |
180 sub warn_if_aligned($) | |
181 { | |
182 local $_; | |
183 | |
184 foreach (sra_list_tables($_[0])) { | |
185 if (/PRIMARY_ALIGNMENT/) { | |
186 print "$_[0] is aligned and is likely to be mostly ordered; results may be biased to the reference\n"; | |
187 last; | |
188 } | |
189 } | |
190 } | |
191 | |
192 sub get_stat($$$) | |
193 { | |
194 # (pos * 5 + base) * MAX_READ_COUNT + read_number; | |
195 return $stats[($_[1] * 5 + $_[2]) * MAX_READ_COUNT + $_[0]]; | |
196 } | |
197 | |
198 sub inc_stat($$$) # this is called a zillion times and needs to be fast | |
199 { | |
200 # (pos * 5 + base) * MAX_READ_COUNT + read_number; | |
201 ++$stats[($_[1] * 5 + $_[2]) * MAX_READ_COUNT + $_[0]]; | |
202 } | |
203 | |
204 sub resize_stats($) | |
205 { | |
206 my $length = $_[0]; | |
207 my $need = MAX_READ_COUNT * 5 * $length; | |
208 my $have = scalar(@stats); | |
209 | |
210 return if $have >= $need; | |
211 splice @stats, $have, 0, map { 0 } (1..($need - $have)); | |
212 } | |
213 | |
214 sub convert_bases($) | |
215 { | |
216 local $_ = ${$_[0]}; | |
217 | |
218 tr/ACGTN/01234/; | |
219 return $_; | |
220 } | |
221 | |
222 # accumulate statistics for one spot | |
223 sub process | |
224 { | |
225 # this matches COLUMN_LIST above | |
226 my ($READ_START, $READ_LEN, $READ_TYPE, $READ) = @_; | |
227 | |
228 # the data in these fields is comma delimited | |
229 # and describe how to split READ up into its pieces | |
230 my @start = split(/,\s*/, $$READ_START); | |
231 my @length = split(/,\s*/, $$READ_LEN); | |
232 my @type = split(/,\s*/, $$READ_TYPE); | |
233 | |
234 # READ contains the bases for the whole spot (i.e. all the mates concatenated) | |
235 my @bases = split(//, convert_bases($READ)); | |
236 my $reads = scalar(@start); | |
237 | |
238 die "too many reads: $reads reads; adjust MAX_READ_COUNT and rerun" unless $reads <= MAX_READ_COUNT; | |
239 | |
240 $max_read = $reads if ($max_read < $reads); | |
241 for (my $read = 0; $read != $reads; ++$read) { | |
242 # only count biological bases (i.e. no adapters, linkers, etc.) | |
243 next unless $type[$read] =~ /SRA_READ_TYPE_BIOLOGICAL/; | |
244 my $pos = $start[$read]; | |
245 my $len = $length[$read]; | |
246 | |
247 resize_stats($len); | |
248 $max_pos = $len if ($max_pos < $len); | |
249 for (my $i = 0; $i != $len; ++$i) { | |
250 inc_stat($read, $i, $bases[$pos + $i]); | |
251 } | |
252 } | |
253 } | |
254 | |
255 sub print_result() { | |
256 my $GCp_mean; | |
257 my $GCp_stdev; | |
258 my $sum = 0; | |
259 my $ssum = 0; | |
260 my $format = | |
261 $opts{'output-type'} eq 'count' ? sub { $_[0] } : | |
262 $opts{'output-type'} eq 'percent' ? sub { int($_[0] * 100.0/$_[1] + 0.5).'%' } : | |
263 sub { $_[0]/$_[1] }; | |
264 | |
265 for (my $read = 0; $read != $max_read; ++$read) { | |
266 for (my $pos = 0; $pos != $max_pos; ++$pos) { | |
267 my $gc = (get_stat($read, $pos, 1) + get_stat($read, $pos, 2)) * 1.0; | |
268 my $n = $gc + get_stat($read, $pos, 0) + get_stat($read, $pos, 3); | |
269 my $x = $gc/$n; | |
270 | |
271 $sum += $x; | |
272 $ssum += $x * $x; | |
273 } | |
274 } | |
275 $GCp_mean = $sum/($max_pos*$max_read); | |
276 $GCp_stdev = sqrt(($ssum - $sum*$GCp_mean)/($max_pos*$max_read)); | |
277 | |
278 printf join("\t", ('Read', 'Pos', 'A', 'C', 'G', 'T', 'N', 'GC', 'Suspect'))."\n"; | |
279 for (my $read = 0; $read != $max_read; ++$read) { | |
280 for (my $pos = 0; $pos != $max_pos; ++$pos) { | |
281 my $cntA = get_stat($read, $pos, 0); | |
282 my $cntC = get_stat($read, $pos, 1); | |
283 my $cntG = get_stat($read, $pos, 2); | |
284 my $cntT = get_stat($read, $pos, 3); | |
285 my $cntN = get_stat($read, $pos, 4); | |
286 my $GC = $cntC + $cntG; | |
287 my $tot = $cntA + $GC + $cntT; | |
288 | |
289 # keep N out for GC | |
290 my $GC_score = ($GC/$tot - $GCp_mean)/$GCp_stdev; | |
291 | |
292 # add N in | |
293 $tot += $cntN; | |
294 print join("\t", ($read + 1, $pos + 1, | |
295 &$format($cntA, $tot), | |
296 &$format($cntC, $tot), | |
297 &$format($cntG, $tot), | |
298 &$format($cntT, $tot), | |
299 &$format($cntN, $tot), | |
300 &$format($GC, $tot), | |
301 '*'x(int(abs($GC_score)))))."\n"; | |
302 } | |
303 } | |
304 } |