Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/example/perl/mismatch-stats.pl @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:0d65b71ff8df | 3:38ad1130d077 |
---|---|
1 #!/usr/bin/env perl | |
2 # =========================================================================== | |
3 # | |
4 # PUBLIC DOMAIN NOTICE | |
5 # National Center for Biotechnology Information | |
6 # | |
7 # This software/database is a "United States Government Work" under the | |
8 # terms of the United States Copyright Act. It was written as part of | |
9 # the author's official duties as a United States Government employee and | |
10 # thus cannot be copyrighted. This software/database is freely available | |
11 # to the public for use. The National Library of Medicine and the U.S. | |
12 # Government have not placed any restriction on its use or reproduction. | |
13 # | |
14 # Although all reasonable efforts have been taken to ensure the accuracy | |
15 # and reliability of the software and data, the NLM and the U.S. | |
16 # Government do not and cannot warrant the performance or results that | |
17 # may be obtained by using this software or data. The NLM and the U.S. | |
18 # Government disclaim all warranties, express or implied, including | |
19 # warranties of performance, merchantability or fitness for any particular | |
20 # purpose. | |
21 # | |
22 # Please cite the author in any work or product based on this material. | |
23 # | |
24 # =========================================================================== | |
25 | |
26 use warnings; | |
27 | |
28 use Data::Dumper; | |
29 | |
30 use constant MAX_READ_COUNT => 2; | |
31 | |
32 my %opts = ( | |
33 'row-range' => '-1000000', | |
34 'output-type' => 'proportion' | |
35 ); | |
36 | |
37 sub usage() | |
38 { | |
39 print <<"HELP"; | |
40 report BASE statistics for an SRA | |
41 | |
42 Usage: | |
43 $0 [<options>...] <accession>... | |
44 options are | |
45 -h | -? | --help | |
46 --row-range <row-range-spec> default is first 1000000 spots | |
47 --output-type <output-spec> set the type of data output | |
48 count, percent, or proportion | |
49 proportion is the default | |
50 | |
51 Example: | |
52 $0 --row-range "5,7,11-15,25-37" --output-type count SRR797646 | |
53 | |
54 HELP | |
55 exit 0; | |
56 } | |
57 | |
58 usage if scalar @ARGV == 0; | |
59 foreach (@ARGV) { | |
60 usage() if (/^-h$/ || /^-\?$/ || /^--help$/); | |
61 } | |
62 | |
63 my $VDB_DUMP = `which vdb-dump` or die "Please put path to vdb-dump in PATH"; | |
64 chomp $VDB_DUMP; | |
65 | |
66 my @stats = (); | |
67 my $max_read = 0; | |
68 my $max_len = 0; | |
69 | |
70 sub get_index($$$) | |
71 { | |
72 my ($pos, $read, $base) = @_; | |
73 | |
74 return ($pos * MAX_READ_COUNT + $read) * 6 + $base; | |
75 } | |
76 | |
77 sub resize_stats($) | |
78 { | |
79 my $need = get_index $_[0], MAX_READ_COUNT, 0; | |
80 my $have = scalar(@stats); | |
81 | |
82 return unless ($have < $need); | |
83 splice @stats, $have, 0, map { 0 } (0..($need - $have)); | |
84 } | |
85 | |
86 sub translate_base($) | |
87 { | |
88 return 0 if $_[0] eq '='; | |
89 return 1 if $_[0] eq 'A'; | |
90 return 2 if $_[0] eq 'C'; | |
91 return 3 if $_[0] eq 'G'; | |
92 return 4 if $_[0] eq 'T'; | |
93 return 5; | |
94 } | |
95 | |
96 sub translate_base_complement($) | |
97 { | |
98 return 0 if $_[0] eq '='; | |
99 return 1 if $_[0] eq 'T'; | |
100 return 2 if $_[0] eq 'G'; | |
101 return 3 if $_[0] eq 'C'; | |
102 return 4 if $_[0] eq 'A'; | |
103 return 5; | |
104 } | |
105 | |
106 sub expand_and_summarize($) | |
107 { | |
108 my ($READ_ID, $ORIENT, $MM_READ) = \split(/\t/, ${$_[0]}); | |
109 my $len = length $$MM_READ; | |
110 my $read = $$READ_ID - 1; | |
111 | |
112 $max_read = $read if $max_read < $read; | |
113 $max_len = $len if $max_len < $len ; | |
114 | |
115 resize_stats $len; | |
116 if ($$ORIENT eq 'false') { | |
117 for (my $i = 0; $i != $len; ++$i) { | |
118 my $base = translate_base(substr($$MM_READ, $i, 1)); | |
119 | |
120 ++$stats[get_index($i, $read, $base)]; | |
121 } | |
122 } | |
123 else { | |
124 for (my $j = 0; $j != $len; ++$j) { | |
125 my $i = $len - $j - 1; | |
126 my $base = translate_base_complement(substr($$MM_READ, $i, 1)); | |
127 | |
128 ++$stats[get_index($i, $read, $base)]; | |
129 } | |
130 } | |
131 } | |
132 | |
133 sub check_aligned($) | |
134 { | |
135 my $aligned = 0; | |
136 | |
137 if (`$VDB_DUMP -y "$_[0]"` =~ /Database/i) { | |
138 open CMD, '-|', "$VDB_DUMP -E \"$_[0]\"" or die "$!"; | |
139 | |
140 while (defined(local $_ = <CMD>)) { | |
141 chomp; | |
142 if (/PRIMARY_ALIGNMENT/) { | |
143 $aligned = 1; | |
144 last; | |
145 } | |
146 } | |
147 close CMD; | |
148 } | |
149 return $aligned; | |
150 } | |
151 | |
152 sub process($) | |
153 { | |
154 my $cmd = "$VDB_DUMP -f tab -C \"SEQ_READ_ID,REF_ORIENTATION,MISMATCH_READ\" -R $opts{'row-range'} -T PRIMARY_ALIGNMENT \"$_[0]\""; | |
155 open CMD, '-|', $cmd or die "$!"; | |
156 | |
157 while (defined($_ = <CMD>)) { | |
158 chomp; | |
159 expand_and_summarize \$_; | |
160 } | |
161 close CMD; | |
162 } | |
163 | |
164 for (my $i = 0; $i < scalar @ARGV; ++$i) { | |
165 $_ = $ARGV[$i]; | |
166 | |
167 if (/^-/) { | |
168 if (/^--row-range$/) { | |
169 $opts{'row-range'} = $ARGV[++$i]; | |
170 next; | |
171 } | |
172 if (/^--output-type$/) { | |
173 $opts{'output-type'} = $ARGV[++$i]; | |
174 next; | |
175 } | |
176 usage(); | |
177 } | |
178 if (check_aligned $_) { | |
179 process $_; | |
180 } | |
181 else { | |
182 print "$_ is not aligned\n"; | |
183 } | |
184 } | |
185 | |
186 sub format_counts($$$$) | |
187 { | |
188 my ($x, $cnt) = @_; | |
189 | |
190 return $x; | |
191 } | |
192 | |
193 sub format_proportion($$$$) | |
194 { | |
195 my ($x, $cnt) = @_; | |
196 | |
197 return $x/$cnt; | |
198 } | |
199 | |
200 sub format_percent($$$$) | |
201 { | |
202 my ($x, $cnt) = @_; | |
203 | |
204 return int($x * 100.0/$cnt + 0.5).'%'; | |
205 } | |
206 | |
207 my $format = \&format_proportion; | |
208 | |
209 if ($opts{'output-type'} eq 'count') { | |
210 $format = \&format_counts; | |
211 } | |
212 elsif ($opts{'output-type'} eq 'proportion') { | |
213 $format = \&format_proportion; | |
214 } | |
215 elsif ($opts{'output-type'} eq 'percent') { | |
216 $format = \&format_percent; | |
217 } | |
218 else { | |
219 usage(); | |
220 } | |
221 | |
222 printf join("\t", ('Read', 'Pos', '=', 'A', 'C', 'G', 'T', 'N'))."\n"; | |
223 | |
224 for (my $read = 0; $read <= $max_read; ++$read) { | |
225 for (my $pos = 0; $pos != $max_len; ++$pos) { | |
226 my $idx = get_index $pos, $read, 0; | |
227 my $tot = $stats[$idx+0] + $stats[$idx+1] + $stats[$idx+2] | |
228 + $stats[$idx+3] + $stats[$idx+4] + $stats[$idx+5]; | |
229 | |
230 if ($tot) { | |
231 print join("\t", ($read + 1, $pos + 1, | |
232 &$format($stats[$idx+0], $tot), | |
233 &$format($stats[$idx+1], $tot), | |
234 &$format($stats[$idx+2], $tot), | |
235 &$format($stats[$idx+3], $tot), | |
236 &$format($stats[$idx+4], $tot), | |
237 &$format($stats[$idx+5], $tot) | |
238 ))."\n"; | |
239 } | |
240 } | |
241 } |