diff sync_reads @ 1:2367d00c5182 draft default tip

"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
author jdv
date Tue, 28 Sep 2021 06:12:40 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sync_reads	Tue Sep 28 06:12:40 2021 +0000
@@ -0,0 +1,300 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use 5.012;
+
+use autodie qw/open close/;
+use BioX::Seq::Stream;
+use Getopt::Long;
+use Pod::Usage;
+
+my $reads1;
+my $reads2;
+my $singles;
+my $sync_suffix    = 'sync';
+my $singles_suffix = 'singles';
+my $compress       = ''; # one of 'gzip' or 'dsrc'
+my $outname1;
+my $outname2;
+my $singles1_name;
+my $singles2_name;
+
+use constant DSRC_BIN => 'dsrc';
+use constant GZIP_BIN => 'gzip';
+use constant NAME     => 'sync_reads';
+use constant VERSION  => '0.005';
+
+GetOptions(
+
+    'fwd=s'             => \$reads1,
+    'rev=s'             => \$reads2,
+    'singles'           => \$singles,
+    'fwd_out=s'         => \$outname1,
+    'rev_out=s'         => \$outname2,
+    'fwd_singles_out=s' => \$singles1_name,
+    'rev_singles_out=s' => \$singles2_name,
+    'sync_suffix=s'     => \$sync_suffix,
+    'singles_suffix=s'  => \$singles_suffix,
+    'compress:s'        => \$compress,
+    'help'              => sub{ pod2usage(-verbose => 2); },
+    'version'           => sub{ print 'This is ', NAME, ' v', VERSION, "\n";exit; },
+
+) or pod2usage( -verbose => 1 );
+
+pod2usage(-verbose => 1, -msg => "Error: input files can\'t be read")
+    if (! -r $reads1 || ! -r $reads2);
+
+# open output filehandles, possibly with compression
+if (! defined $outname1) {
+    $outname1 = $reads1;
+    $outname1 =~ s/([^\.]+)$/$sync_suffix\.$1/;
+}
+if (! defined $outname2) {
+    $outname2 = $reads2;
+    $outname2 =~ s/([^\.]+)$/$sync_suffix\.$1/;
+}
+my $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>';
+my $cmd  = $compress eq 'gzip' ? GZIP_BIN . ' -c > '
+      : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s '
+      : '';
+my $suffix  = $compress eq 'gzip' ? '.gz' 
+            : $compress eq 'dsrc' ? '.dsrc'
+            : '';
+open my $s1, $mode, $cmd . $outname1 . $suffix;
+open my $s2, $mode, $cmd . $outname2 . $suffix;
+
+
+# open singles output filehandles if requested, possibly with compression
+my $up1;
+my $up2;
+if ($singles) {
+    if (! defined $singles1_name) {
+        $singles1_name = $reads1;
+        $singles1_name =~ s/([^\.]+)$/$singles_suffix\.$1/;
+    }
+    if (! defined $singles2_name) {
+        $singles2_name = $reads2;
+        $singles2_name =~ s/([^\.]+)$/$singles_suffix\.$1/;
+    }
+    $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>';
+    $cmd  = $compress eq 'gzip' ? GZIP_BIN . ' -c > '
+          : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s '
+        : '';
+    my $suffix  = $compress eq 'gzip' ? '.gz' 
+                : $compress eq 'dsrc' ? '.dsrc'
+                : '';
+    open $up1, $mode, $cmd . $singles1_name . $suffix;
+    open $up2, $mode, $cmd . $singles2_name . $suffix;
+}
+
+my $ll_f1 = {};
+my $ll_f2 = {};
+my $f1_prev;
+my $f2_prev;
+my $f1_open = 1;
+my $f2_open = 1;
+
+my $parser1 = BioX::Seq::Stream->new($reads1);
+my $parser2 = BioX::Seq::Stream->new($reads2);
+
+while ($f1_open || $f2_open) {
+
+    # process next read for file 1
+    if ($f1_open && defined (my $seq = $parser1->next_seq)) {
+        my $name = $seq->id;
+        $name =~ s/\/[12]$//; #remove directional tag if present
+        if (defined $ll_f2->{$name}) {
+            my $prev = $ll_f2->{$name}->{prev} // undef;
+            purge_cache( $ll_f2, $prev,    $up2 );
+            purge_cache( $ll_f1, $f1_prev, $up1 );
+            print {$s1} $seq->as_fastq;
+            print {$s2} $ll_f2->{$name}->{read};
+            delete $ll_f2->{$name};
+            $f1_prev = undef;
+        }
+        else {
+            $ll_f1->{$name}->{read} = $seq->as_fastq;
+            $ll_f1->{$name}->{prev} = $f1_prev // undef;
+            $f1_prev = $name;
+        }
+    }
+    else {
+        $f1_open = 0;
+    }
+
+    # process next read for file 2
+    if ($f2_open && defined (my $seq = $parser2->next_seq)) {
+        my $name = $seq->id;
+        $name =~ s/\/[12]$//; #remove directional tag if present
+        if (defined $ll_f1->{$name}) {
+            my $prev = $ll_f1->{$name}->{prev} // undef;
+            purge_cache( $ll_f1, $prev,    $up1 );
+            purge_cache( $ll_f2, $f2_prev, $up2 );
+            print {$s2} $seq->as_fastq;
+            print {$s1} $ll_f1->{$name}->{read};
+            delete $ll_f1->{$name};
+            $f2_prev = undef;
+        }
+        else {
+            $ll_f2->{$name}->{read} = $seq->as_fastq;
+            $ll_f2->{$name}->{prev} = $f2_prev // undef;
+            $f2_prev = $name;
+        }
+    }
+    else {
+        $f2_open = 0;
+    }
+}
+
+#handle remaining unpaired reads if necessary
+if ($singles) {
+    purge_cache( $ll_f1, $f1_prev, $up1 );
+    purge_cache( $ll_f2, $f2_prev, $up2 );
+    close $up1;
+    close $up2;
+}
+
+exit;
+
+sub purge_cache {
+
+    my ($ll, $prev, $fh) = @_;
+    while (defined $prev && defined $ll->{$prev}) {
+        my $prev2 = $ll->{$prev}->{prev} // undef;
+        print {$fh} $ll->{$prev}->{read} if ($singles);
+        delete $ll->{$prev};
+        $prev = $prev2;
+    }
+
+}
+
+
+__END__
+
+=head1 NAME
+
+sync_reads - resynchronize paired FASTQ files
+
+=head1 SYNOPSIS
+
+sync_reads [options] --fwd I<left_reads> --rev I<right reads>
+
+=head1 DESCRIPTION
+
+B<sync_reads> will re-synchronize two FASTQ files containing paired reads which
+are no longer in sync due to individual removal of reads during pre-processing
+(trimming, filtering, etc). In this case, "in sync" means that both files have
+the same number of reads and, at any given read position in the files, the
+corresponding reads represent proper pairs. The resulting files will contain
+matching reads in order (assuming the input files were properly ordered). It
+will optionally print out unpaired reads to separate files. Memory usage is
+not dependent on the input file size but rather the maximum distance between
+paired reads in the two files, as the read cache is flushed each time paired
+reads are identified. In the worst-case scenario (one file has a single read
+that pairs with the last read in the matching file) memory usage can approach
+the largest file size, but in typical usage it rarely exceeds a few MB
+regardless of file size.
+
+B<IMPORTANT:> Reads in input files MUST be in the same order, aside from
+missing reads, or the output will report many valid pairs as singletons.
+
+=head1 OPTIONS
+
+=head2 Mandatory
+
+=over 4
+
+=item B<--fwd> I<forward_fastq>
+
+Specify FASTQ file containing the first of the trimmed read pairs
+
+=item B<--rev> I<reverse_fastq>
+
+Specify FASTQ file containing the second of the trimmed read pairs
+
+=back
+
+=head2 Optional
+
+=over 4 
+
+=item B<--fwd_out> I<filename>
+
+Specify output name for synced forward reads
+
+=item B<--rev_out> I<filename>
+
+Specify output name for synced reverse reads
+
+=item B<--fwd_singles_out> I<filename>
+
+Specify output name for forward singleton reads
+
+=item B<--rev_singles_out> I<filename>
+
+Specify output name for reverse singleton reads
+
+=item B<--sync_suffix> I<suffix>
+
+Specify suffix to add to synced read output files. This will be added to the
+input read name before the final suffix (i.e. after the last period). Default
+is 'sync'.
+
+=item B<--compress> I<gzip|dsrc>
+
+Specify type of compression for output files (will compress all output files)
+
+=item B<--singles>
+
+If given, unpaired reads will be written to separate output files. Default is
+FALSE.
+
+=item B<--singles_suffix> I<suffix>
+
+Specify suffix to add to singles read output files. This will be added to the
+input read name before the final suffix (i.e. after the last period). Default
+is 'singles'.
+
+=item B<--help>
+
+Display this usage page
+
+=item B<--version>
+
+Print version information
+
+=back
+
+=head1 CAVEATS AND BUGS
+
+Currently no input validation is performed on the input files. Files are
+assumed to be standard FASTQ file format with each read represented by four
+lines and no other extraneous information present. B<CRITICALLY>, they are also
+assumed to be in the same input order after accounting for deleted reads
+(the software will fail miserably if this is not the case).
+
+Please submit bug reports to the issue tracker in the distribution repository.
+
+=head1 AUTHOR
+
+Jeremy Volkening <jdv@base2bio.com>
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright 2014-16 Jeremy Volkening
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+=cut