changeset 7:2c843c8df821 draft

planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
author tiagoantao
date Thu, 31 Mar 2016 10:32:59 -0400
parents 5cd3cc89bf59
children 8b0f010d356c
files barcode_sort.pl barcode_sort.xml
diffstat 2 files changed, 146 insertions(+), 66 deletions(-) [+]
line wrap: on
line diff
--- a/barcode_sort.pl	Tue Mar 29 12:45:39 2016 -0400
+++ b/barcode_sort.pl	Thu Mar 31 10:32:59 2016 -0400
@@ -1,78 +1,153 @@
 #!/usr/bin/perl -w
 use strict; use warnings;
 
-my $b = 8;			# set barcode length (Hohenlohe lab 2015 bestRAD barcodes are 8bp)
-my $n = 2;			# set number of nucleotides to expect before the barcode (bestRAD libraries run at U. Oregon have 2bp here)
-my $c = $b + $n;	# set position for start of enzyme cutsite, which occurs after the initial nucleotides plus the barcode
-my $e = 6;			# set length of remaining enzyme cutsite sequence (e.g. 6 for SbfI) -- for other than SbfI, need to change the actual sequence below!!!
+my $cut_site = $ARGV[7]; #let the user specify the cut site keyword (e.g. "TGCA" for the SbfI 6 bp cutter)
+
+my $b = $ARGV[6]; #let the user specify barcode length
+my $n = 2;        # set number of nucleotides to expect before the barcode (bestRAD libraries run at U. Oregon have 2bp here)
+my $c = $b + $n;  # set position for start of enzyme cutsite, which occurs after the initial nucleotides plus the barcode
+my $e = 6;        # set length of remaining enzyme cutsite sequence (e.g. 6 for SbfI) -- for other than SbfI, need to change the actual sequence below!!!
+my $e = length($cut_site); #calculate the length of the cut site
 # note this is the length of what's left after enzyme digestion, NOT the full length of the enzyme recognition site
 # the program expects all correct forward reads to follow the pattern: $n initial nucleotides, then $b nucleotides of barcode, then $e nucleotides of the cutsite
 
 my $is_resilient;
 
-$is_resilient = $ARGV[5];  # true if is resilient to errors
-
-open (IN, $ARGV[0]);	# read file with barcodes
-my %counts = ();		# make a hash of barcodes that will be searched
-while(<IN>) {			# counts of each barcode can be tracked with this hash, with a few more lines of code below	
-	chomp($_);
-	$counts{$_} = 0;
+open (IN, $ARGV[0]);    # read file with barcodes
+my %counts = ();        # make a hash of barcodes that will be searched
+while(<IN>) {           # counts of each barcode can be tracked with this hash, with a few more lines of code below
+    chomp($_);
+    $counts{$_} = 0;
 }
 close IN;
 
-open (IN1, $ARGV[1]);		# read fastq file of raw forward reads
-open (IN2, $ARGV[2]);		# read fastq file of raw reverse reads  -- these must have pairs in identical order
-open (OUT1, ">$ARGV[3]");	# create fastq outfile for flipped forward reads (cutsite end)
-open (OUT2, ">$ARGV[4]");	# create fastq outfile for flipped reverse reads (randomly sheared end)
-my $forward; my $reverse; my $barcode;		# establish string variables for all parts of fastq files
+open (IN1, $ARGV[1]);        # read fastq file of raw forward reads
+open (IN2, $ARGV[2]);        # read fastq file of raw reverse reads  -- these must have pairs in identical order
+open (OUT1, ">$ARGV[3]");    # create fastq outfile for flipped forward reads (cutsite end)
+open (OUT2, ">$ARGV[4]");    # create fastq outfile for flipped reverse reads (randomly sheared end)
+my $forward; my $reverse; my $barcode;        # establish string variables for all parts of fastq files
 my $name1; my $name2; my $third1; my $third2; my $qual1; my $qual2;
-while($name1 = <IN1>) {		# start reading through the paired fastq input files
-	$name2 = <IN2>;			# read all parts of a single read pair (4 lines in each of the 2 fastq files)
-	$forward = <IN1>;
-	$reverse = <IN2>;
-	$third1 = <IN1>; $third2 = <IN2>; $qual1 = <IN1>; $qual2 = <IN2>;
-	my $which = 0; my $for; my $rev;		# establish variables used below
-	if(substr($forward, $c, $e) eq "TGCAGG") {			# check for SbfI cutsite in the correct place in forward read
-		if(substr($reverse, $c, $e) eq "TGCAGG") {		# check for SbfI cutsite in the correct place in reverse read
-			$for = substr($forward, $n, $b);			# this is where a barcode should be if it's in the forward read
-			$rev = substr($reverse, $n, $b);			# this is where a barcode should be if it's in the reverse read
-			if(exists $counts{$for} && (exists $counts{$rev}) == 0) {		# if a correct barcode and cutsite are in forward but not reverse read...
-				$which = 1; $barcode = $for; $counts{$for}++;				# which = 1 means the pair is correctly oriented
-			}
-			elsif((exists $counts{$for}) == 0 && exists $counts{$rev}) {	# if a correct barcode and cutsite are only in the reverse read...
-				$which = 2; $barcode = $rev; $counts{$rev}++;				# which = 2 means the pair needs to be flipped
-			}
-		}
-		else {											# the cutsite is only found in the forward read
-			$barcode = substr($forward, $n, $b);			
-			if(exists $counts{$barcode}) {$which = 1; $counts{$barcode}++; }	# if a correct barcode is in the forward read, the pair is correctly oriented
-		}
-	}
-	elsif(substr($reverse, $c, $e) eq "TGCAGG") {		# cutsite not in forward read but is in reverse read
-		$barcode = substr($reverse, $n, $b);				
-		if(exists $counts{$barcode}) {$which = 2; $counts{$barcode}++; }		# pair needs to be flipped
-	}													# if a cutsite and correct barcode has not been found in either read, which = 0 at this point
-	if($which == 1) {									# if the pair is correctly oriented, print out fastq format for the pair
-		my $temp1 = substr($forward, $n);				# trim initial nucleotides off read and quality scores...
-		my $temp2 = substr($qual1, $n);					# so that output keeps barcode and cutsite but not other nucleotides...
-		print OUT1 "$name1$temp1$third1$temp2";			# and is ready to go into process_radtags
-		print OUT2 "$name2$reverse$third2$qual2";
-	}
-	elsif($which == 2) {								# if the pair needs to be flipped, print out fastq format for the flipped pair
-		my $temp1 = substr($reverse, $n);
-		my $temp2 = substr($qual2, $n);
-		print OUT1 "$name2$temp1$third2$temp2";
-		print OUT2 "$name1$forward$third1$qual1";
-	}													# if which == 0, nothing is printed out for the pair
+
+$is_resilient = $ARGV[5];  # true if is resilient to errors - Brian's additions
+
+
+sub strDiffMaxDelta {
+
+    my ( $s1, $s2, $maxDelta ) = @_;
+    my $diffCount = () = ( $s1 ^ $s2 ) =~ /[^\x00]/g;
+    return $diffCount <= $maxDelta;
+
 }
-close IN1;
-close IN2;
-close OUT1;
-close OUT2;
+
+if ($is_resilient eq "true") {
+
+    my $max_errors = $ARGV[8];
+
+    while($name1 = <IN1>) {        # start reading through the paired fastq input files
+        $name2 = <IN2>;            # read all parts of a single read pair (4 lines in each of the 2 fastq files)
+        $forward = <IN1>;
+        $reverse = <IN2>;
+        $third1 = <IN1>; $third2 = <IN2>; $qual1 = <IN1>; $qual2 = <IN2>;
+        my $which = 0; my $for; my $rev;        # establish variables used below
+
+        if(strDiffMaxDelta(substr($forward, $c, $e), $cut_site, $max_errors)) {            # check for SbfI cutsite in the correct place in forward read
+            if(strDiffMaxDelta(substr($reverse, $c, $e), $cut_site, $max_errors)) {        # check for SbfI cutsite in the correct place in reverse read
+                $for = substr($forward, $n, $b);            # this is where a barcode should be if it's in the forward read
+                $rev = substr($reverse, $n, $b);            # this is where a barcode should be if it's in the reverse read
+                if(exists $counts{$for} && (exists $counts{$rev}) == 0) {        # if a correct barcode and cutsite are in forward but not reverse read...
+                    $which = 1; $barcode = $for; $counts{$for}++;                # which = 1 means the pair is correctly oriented
+                }
+                elsif((exists $counts{$for}) == 0 && exists $counts{$rev}) {    # if a correct barcode and cutsite are only in the reverse read...
+                    $which = 2; $barcode = $rev; $counts{$rev}++;                # which = 2 means the pair needs to be flipped
+                }
+            }
+            else {                                            # the cutsite is only found in the forward read
+                #$barcode = substr($forward, $n, $b);
+                #if(exists $counts{$barcode}) {$which = 1; $counts{$barcode}++; }    # if a correct barcode is in the forward read, the pair is correctly oriented
+                $which = 1;
 
-foreach my $key (sort keys %counts) {
-	print "$key" . "\t" . "$counts{$key}\n";
+            }
+        }
+        elsif(strDiffMaxDelta(substr($reverse, $c, $e), $cut_site, $max_errors)){    # cutsite not in forward read but is in reverse read
+            #$barcode = substr($reverse, $n, $b);
+            #if(exists $counts{$barcode}) {$which = 2; $counts{$barcode}++; }        # pair needs to be flipped
+            $which = 2;
+        }                                                    # if a cutsite and correct barcode has not been found in either read, which = 0 at this point
+        if($which == 1) {                                    # if the pair is correctly oriented, print out fastq format for the pair
+            my $temp1 = substr($forward, $n);                # trim initial nucleotides off read and quality scores...
+            my $temp2 = substr($qual1, $n);                    # so that output keeps barcode and cutsite but not other nucleotides...
+            print OUT1 "$name1$temp1$third1$temp2";            # and is ready to go into process_radtags
+            print OUT2 "$name2$reverse$third2$qual2";
+        }
+        elsif($which == 2) {                                # if the pair needs to be flipped, print out fastq format for the flipped pair
+            my $temp1 = substr($reverse, $n);
+            my $temp2 = substr($qual2, $n);
+            print OUT1 "$name2$temp1$third2$temp2";
+            print OUT2 "$name1$forward$third1$qual1";
+        }                                                    # if which == 0, nothing is printed out for the pair
+    }
+    close IN1;
+    close IN2;
+    close OUT1;
+    close OUT2;
+
+    foreach my $key (sort keys %counts) {
+        print "$key" . "\t" . "$counts{$key}\n";
+    }
+
 }
 
 
+else{ #Paul's code
 
+    while($name1 = <IN1>) {        # start reading through the paired fastq input files
+        $name2 = <IN2>;            # read all parts of a single read pair (4 lines in each of the 2 fastq files)
+        $forward = <IN1>;
+        $reverse = <IN2>;
+        $third1 = <IN1>; $third2 = <IN2>; $qual1 = <IN1>; $qual2 = <IN2>;
+
+        my $which = 0; my $for; my $rev;        # establish variables used below
+        if(substr($forward, $c, $e) eq "TGCAGG") {            # check for SbfI cutsite in the correct place in forward read
+            if(substr($reverse, $c, $e) eq "TGCAGG") {        # check for SbfI cutsite in the correct place in reverse read
+                $for = substr($forward, $n, $b);            # this is where a barcode should be if it's in the forward read
+                $rev = substr($reverse, $n, $b);            # this is where a barcode should be if it's in the reverse read
+                if(exists $counts{$for} && (exists $counts{$rev}) == 0) {        # if a correct barcode and cutsite are in forward but not reverse read...
+                    $which = 1; $barcode = $for; $counts{$for}++;                # which = 1 means the pair is correctly oriented
+                }
+                elsif((exists $counts{$for}) == 0 && exists $counts{$rev}) {    # if a correct barcode and cutsite are only in the reverse read...
+                    $which = 2; $barcode = $rev; $counts{$rev}++;                # which = 2 means the pair needs to be flipped
+                }
+            }
+            else {                                            # the cutsite is only found in the forward read
+                $barcode = substr($forward, $n, $b);
+                if(exists $counts{$barcode}) {$which = 1; $counts{$barcode}++; }    # if a correct barcode is in the forward read, the pair is correctly oriented
+            }
+        }
+        elsif(substr($reverse, $c, $e) eq "TGCAGG") {        # cutsite not in forward read but is in reverse read
+            $barcode = substr($reverse, $n, $b);
+            if(exists $counts{$barcode}) {$which = 2; $counts{$barcode}++; }        # pair needs to be flipped
+        }                                                    # if a cutsite and correct barcode has not been found in either read, which = 0 at this point
+        if($which == 1) {                                    # if the pair is correctly oriented, print out fastq format for the pair
+            my $temp1 = substr($forward, $n);                # trim initial nucleotides off read and quality scores...
+            my $temp2 = substr($qual1, $n);                    # so that output keeps barcode and cutsite but not other nucleotides...
+            print OUT1 "$name1$temp1$third1$temp2";            # and is ready to go into process_radtags
+            print OUT2 "$name2$reverse$third2$qual2";
+        }
+        elsif($which == 2) {                                # if the pair needs to be flipped, print out fastq format for the flipped pair
+            my $temp1 = substr($reverse, $n);
+            my $temp2 = substr($qual2, $n);
+            print OUT1 "$name2$temp1$third2$temp2";
+            print OUT2 "$name1$forward$third1$qual1";
+        }                                                    # if which == 0, nothing is printed out for the pair
+
+
+    }
+    close IN1;
+    close IN2;
+    close OUT1;
+    close OUT2;
+
+    foreach my $key (sort keys %counts) {
+        print "$key" . "\t" . "$counts{$key}\n";
+    }
+}
--- a/barcode_sort.xml	Tue Mar 29 12:45:39 2016 -0400
+++ b/barcode_sort.xml	Thu Mar 31 10:32:59 2016 -0400
@@ -7,14 +7,16 @@
 
 <command interpreter="perl">
 
-barcode_sort.pl $barcode $f1 $f2 $barcoded $nonbarcoded
+barcode_sort.pl $barcode $f1 $f2 $barcoded $nonbarcoded $barcode_length $cut_site
 
 </command>
 
 <inputs>
-  <param name="barcode" format="txt" type="data" label="Barcode file 1" />
-  <param name="f1" type="data" format="fastq" label="First reads" />
-  <param name="f2" type="data" format="fastq" label="Second reads" />
+  <param name="barcode" format="txt" type="data" label="Barcode file" />
+  <param name="barcode_length" type="integer" value="6" label="Barcode length" />
+  <param name="f1" type="data" format="fastq" label="First read FASTQ" />
+  <param name="f2" type="data" format="fastq" label="Second read FASTQ" />
+  <param name="cut_site" type="txt" value="TGCA" label="Cut site" />
 </inputs>
 <outputs>
   <data format="fastq" name="barcoded" label="Barcoded sequences"/>
@@ -27,7 +29,11 @@
 
 **What it does**
 
-This program takes 2 input sequence files where the barcode can be on either the 1st or 2nd read (but not both) and sorts all barcoded reads into a single file and all non-barcoded files into a second file. This separation is needed for the STACKs program as it does not handle sequences where the barcode is arbitrarily on the 1st or 2nd read.
+This program takes 2 input sequence files where the barcode can be on either
+the 1st or 2nd read (but not both) and sorts all barcoded reads into a single
+file and all non-barcoded files into a second file. This separation is needed
+for the STACKs program as it does not handle sequences where the barcode is
+arbitrarily on the 1st or 2nd read.
 
 --------
 
@@ -43,4 +49,3 @@
 </help>
 
 </tool>
-