comparison sync_reads @ 1:2367d00c5182 draft default tip

"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
author jdv
date Tue, 28 Sep 2021 06:12:40 +0000
parents
children
comparison
equal deleted inserted replaced
0:d32139014ec3 1:2367d00c5182
1 #!/usr/bin/env perl
2
3 use strict;
4 use warnings;
5 use 5.012;
6
7 use autodie qw/open close/;
8 use BioX::Seq::Stream;
9 use Getopt::Long;
10 use Pod::Usage;
11
12 my $reads1;
13 my $reads2;
14 my $singles;
15 my $sync_suffix = 'sync';
16 my $singles_suffix = 'singles';
17 my $compress = ''; # one of 'gzip' or 'dsrc'
18 my $outname1;
19 my $outname2;
20 my $singles1_name;
21 my $singles2_name;
22
23 use constant DSRC_BIN => 'dsrc';
24 use constant GZIP_BIN => 'gzip';
25 use constant NAME => 'sync_reads';
26 use constant VERSION => '0.005';
27
28 GetOptions(
29
30 'fwd=s' => \$reads1,
31 'rev=s' => \$reads2,
32 'singles' => \$singles,
33 'fwd_out=s' => \$outname1,
34 'rev_out=s' => \$outname2,
35 'fwd_singles_out=s' => \$singles1_name,
36 'rev_singles_out=s' => \$singles2_name,
37 'sync_suffix=s' => \$sync_suffix,
38 'singles_suffix=s' => \$singles_suffix,
39 'compress:s' => \$compress,
40 'help' => sub{ pod2usage(-verbose => 2); },
41 'version' => sub{ print 'This is ', NAME, ' v', VERSION, "\n";exit; },
42
43 ) or pod2usage( -verbose => 1 );
44
45 pod2usage(-verbose => 1, -msg => "Error: input files can\'t be read")
46 if (! -r $reads1 || ! -r $reads2);
47
48 # open output filehandles, possibly with compression
49 if (! defined $outname1) {
50 $outname1 = $reads1;
51 $outname1 =~ s/([^\.]+)$/$sync_suffix\.$1/;
52 }
53 if (! defined $outname2) {
54 $outname2 = $reads2;
55 $outname2 =~ s/([^\.]+)$/$sync_suffix\.$1/;
56 }
57 my $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>';
58 my $cmd = $compress eq 'gzip' ? GZIP_BIN . ' -c > '
59 : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s '
60 : '';
61 my $suffix = $compress eq 'gzip' ? '.gz'
62 : $compress eq 'dsrc' ? '.dsrc'
63 : '';
64 open my $s1, $mode, $cmd . $outname1 . $suffix;
65 open my $s2, $mode, $cmd . $outname2 . $suffix;
66
67
68 # open singles output filehandles if requested, possibly with compression
69 my $up1;
70 my $up2;
71 if ($singles) {
72 if (! defined $singles1_name) {
73 $singles1_name = $reads1;
74 $singles1_name =~ s/([^\.]+)$/$singles_suffix\.$1/;
75 }
76 if (! defined $singles2_name) {
77 $singles2_name = $reads2;
78 $singles2_name =~ s/([^\.]+)$/$singles_suffix\.$1/;
79 }
80 $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>';
81 $cmd = $compress eq 'gzip' ? GZIP_BIN . ' -c > '
82 : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s '
83 : '';
84 my $suffix = $compress eq 'gzip' ? '.gz'
85 : $compress eq 'dsrc' ? '.dsrc'
86 : '';
87 open $up1, $mode, $cmd . $singles1_name . $suffix;
88 open $up2, $mode, $cmd . $singles2_name . $suffix;
89 }
90
91 my $ll_f1 = {};
92 my $ll_f2 = {};
93 my $f1_prev;
94 my $f2_prev;
95 my $f1_open = 1;
96 my $f2_open = 1;
97
98 my $parser1 = BioX::Seq::Stream->new($reads1);
99 my $parser2 = BioX::Seq::Stream->new($reads2);
100
101 while ($f1_open || $f2_open) {
102
103 # process next read for file 1
104 if ($f1_open && defined (my $seq = $parser1->next_seq)) {
105 my $name = $seq->id;
106 $name =~ s/\/[12]$//; #remove directional tag if present
107 if (defined $ll_f2->{$name}) {
108 my $prev = $ll_f2->{$name}->{prev} // undef;
109 purge_cache( $ll_f2, $prev, $up2 );
110 purge_cache( $ll_f1, $f1_prev, $up1 );
111 print {$s1} $seq->as_fastq;
112 print {$s2} $ll_f2->{$name}->{read};
113 delete $ll_f2->{$name};
114 $f1_prev = undef;
115 }
116 else {
117 $ll_f1->{$name}->{read} = $seq->as_fastq;
118 $ll_f1->{$name}->{prev} = $f1_prev // undef;
119 $f1_prev = $name;
120 }
121 }
122 else {
123 $f1_open = 0;
124 }
125
126 # process next read for file 2
127 if ($f2_open && defined (my $seq = $parser2->next_seq)) {
128 my $name = $seq->id;
129 $name =~ s/\/[12]$//; #remove directional tag if present
130 if (defined $ll_f1->{$name}) {
131 my $prev = $ll_f1->{$name}->{prev} // undef;
132 purge_cache( $ll_f1, $prev, $up1 );
133 purge_cache( $ll_f2, $f2_prev, $up2 );
134 print {$s2} $seq->as_fastq;
135 print {$s1} $ll_f1->{$name}->{read};
136 delete $ll_f1->{$name};
137 $f2_prev = undef;
138 }
139 else {
140 $ll_f2->{$name}->{read} = $seq->as_fastq;
141 $ll_f2->{$name}->{prev} = $f2_prev // undef;
142 $f2_prev = $name;
143 }
144 }
145 else {
146 $f2_open = 0;
147 }
148 }
149
150 #handle remaining unpaired reads if necessary
151 if ($singles) {
152 purge_cache( $ll_f1, $f1_prev, $up1 );
153 purge_cache( $ll_f2, $f2_prev, $up2 );
154 close $up1;
155 close $up2;
156 }
157
158 exit;
159
160 sub purge_cache {
161
162 my ($ll, $prev, $fh) = @_;
163 while (defined $prev && defined $ll->{$prev}) {
164 my $prev2 = $ll->{$prev}->{prev} // undef;
165 print {$fh} $ll->{$prev}->{read} if ($singles);
166 delete $ll->{$prev};
167 $prev = $prev2;
168 }
169
170 }
171
172
173 __END__
174
175 =head1 NAME
176
177 sync_reads - resynchronize paired FASTQ files
178
179 =head1 SYNOPSIS
180
181 sync_reads [options] --fwd I<left_reads> --rev I<right reads>
182
183 =head1 DESCRIPTION
184
185 B<sync_reads> will re-synchronize two FASTQ files containing paired reads which
186 are no longer in sync due to individual removal of reads during pre-processing
187 (trimming, filtering, etc). In this case, "in sync" means that both files have
188 the same number of reads and, at any given read position in the files, the
189 corresponding reads represent proper pairs. The resulting files will contain
190 matching reads in order (assuming the input files were properly ordered). It
191 will optionally print out unpaired reads to separate files. Memory usage is
192 not dependent on the input file size but rather the maximum distance between
193 paired reads in the two files, as the read cache is flushed each time paired
194 reads are identified. In the worst-case scenario (one file has a single read
195 that pairs with the last read in the matching file) memory usage can approach
196 the largest file size, but in typical usage it rarely exceeds a few MB
197 regardless of file size.
198
199 B<IMPORTANT:> Reads in input files MUST be in the same order, aside from
200 missing reads, or the output will report many valid pairs as singletons.
201
202 =head1 OPTIONS
203
204 =head2 Mandatory
205
206 =over 4
207
208 =item B<--fwd> I<forward_fastq>
209
210 Specify FASTQ file containing the first of the trimmed read pairs
211
212 =item B<--rev> I<reverse_fastq>
213
214 Specify FASTQ file containing the second of the trimmed read pairs
215
216 =back
217
218 =head2 Optional
219
220 =over 4
221
222 =item B<--fwd_out> I<filename>
223
224 Specify output name for synced forward reads
225
226 =item B<--rev_out> I<filename>
227
228 Specify output name for synced reverse reads
229
230 =item B<--fwd_singles_out> I<filename>
231
232 Specify output name for forward singleton reads
233
234 =item B<--rev_singles_out> I<filename>
235
236 Specify output name for reverse singleton reads
237
238 =item B<--sync_suffix> I<suffix>
239
240 Specify suffix to add to synced read output files. This will be added to the
241 input read name before the final suffix (i.e. after the last period). Default
242 is 'sync'.
243
244 =item B<--compress> I<gzip|dsrc>
245
246 Specify type of compression for output files (will compress all output files)
247
248 =item B<--singles>
249
250 If given, unpaired reads will be written to separate output files. Default is
251 FALSE.
252
253 =item B<--singles_suffix> I<suffix>
254
255 Specify suffix to add to singles read output files. This will be added to the
256 input read name before the final suffix (i.e. after the last period). Default
257 is 'singles'.
258
259 =item B<--help>
260
261 Display this usage page
262
263 =item B<--version>
264
265 Print version information
266
267 =back
268
269 =head1 CAVEATS AND BUGS
270
271 Currently no input validation is performed on the input files. Files are
272 assumed to be standard FASTQ file format with each read represented by four
273 lines and no other extraneous information present. B<CRITICALLY>, they are also
274 assumed to be in the same input order after accounting for deleted reads
275 (the software will fail miserably if this is not the case).
276
277 Please submit bug reports to the issue tracker in the distribution repository.
278
279 =head1 AUTHOR
280
281 Jeremy Volkening <jdv@base2bio.com>
282
283 =head1 COPYRIGHT AND LICENSE
284
285 Copyright 2014-16 Jeremy Volkening
286
287 This program is free software: you can redistribute it and/or modify
288 it under the terms of the GNU General Public License as published by
289 the Free Software Foundation, either version 3 of the License, or
290 (at your option) any later version.
291
292 This program is distributed in the hope that it will be useful,
293 but WITHOUT ANY WARRANTY; without even the implied warranty of
294 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
295 GNU General Public License for more details.
296
297 You should have received a copy of the GNU General Public License
298 along with this program. If not, see <http://www.gnu.org/licenses/>.
299
300 =cut