Mercurial > repos > tiagoantao > barcode_stacks
changeset 0:d838c6a2ec0f draft
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
author | tiagoantao |
---|---|
date | Mon, 28 Mar 2016 18:46:00 -0400 |
parents | |
children | 89ab2498e32c |
files | barcode_sort.pl barcode_sort.xml tool_dependencies.xml |
diffstat | 3 files changed, 135 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/barcode_sort.pl Mon Mar 28 18:46:00 2016 -0400 @@ -0,0 +1,74 @@ +#!/usr/bin/perl -w +use strict; use warnings; + +my $b = 8; # set barcode length (Hohenlohe lab 2015 bestRAD barcodes are 8bp) +my $n = 2; # set number of nucleotides to expect before the barcode (bestRAD libraries run at U. Oregon have 2bp here) +my $c = $b + $n; # set position for start of enzyme cutsite, which occurs after the initial nucleotides plus the barcode +my $e = 6; # set length of remaining enzyme cutsite sequence (e.g. 6 for SbfI) -- for other than SbfI, need to change the actual sequence below!!! +# note this is the length of what's left after enzyme digestion, NOT the full length of the enzyme recognition site +# the program expects all correct forward reads to follow the pattern: $n initial nucleotides, then $b nucleotides of barcode, then $e nucleotides of the cutsite + +open (IN, $ARGV[0]); # read file with barcodes +my %counts = (); # make a hash of barcodes that will be searched +while(<IN>) { # counts of each barcode can be tracked with this hash, with a few more lines of code below + chomp($_); + $counts{$_} = 0; +} +close IN; + +open (IN1, $ARGV[1]); # read fastq file of raw forward reads +open (IN2, $ARGV[2]); # read fastq file of raw reverse reads -- these must have pairs in identical order +open (OUT1, ">$ARGV[3]"); # create fastq outfile for flipped forward reads (cutsite end) +open (OUT2, ">$ARGV[4]"); # create fastq outfile for flipped reverse reads (randomly sheared end) +my $forward; my $reverse; my $barcode; # establish string variables for all parts of fastq files +my $name1; my $name2; my $third1; my $third2; my $qual1; my $qual2; +while($name1 = <IN1>) { # start reading through the paired fastq input files + $name2 = <IN2>; # read all parts of a single read pair (4 lines in each of the 2 fastq files) + $forward = <IN1>; + $reverse = <IN2>; + $third1 = <IN1>; $third2 = <IN2>; $qual1 = <IN1>; $qual2 = <IN2>; + my $which = 0; my $for; my $rev; # establish variables used below + if(substr($forward, $c, $e) eq "TGCAGG") { # check for SbfI cutsite in the correct place in forward read + if(substr($reverse, $c, $e) eq "TGCAGG") { # check for SbfI cutsite in the correct place in reverse read + $for = substr($forward, $n, $b); # this is where a barcode should be if it's in the forward read + $rev = substr($reverse, $n, $b); # this is where a barcode should be if it's in the reverse read + if(exists $counts{$for} && (exists $counts{$rev}) == 0) { # if a correct barcode and cutsite are in forward but not reverse read... + $which = 1; $barcode = $for; $counts{$for}++; # which = 1 means the pair is correctly oriented + } + elsif((exists $counts{$for}) == 0 && exists $counts{$rev}) { # if a correct barcode and cutsite are only in the reverse read... + $which = 2; $barcode = $rev; $counts{$rev}++; # which = 2 means the pair needs to be flipped + } + } + else { # the cutsite is only found in the forward read + $barcode = substr($forward, $n, $b); + if(exists $counts{$barcode}) {$which = 1; $counts{$barcode}++; } # if a correct barcode is in the forward read, the pair is correctly oriented + } + } + elsif(substr($reverse, $c, $e) eq "TGCAGG") { # cutsite not in forward read but is in reverse read + $barcode = substr($reverse, $n, $b); + if(exists $counts{$barcode}) {$which = 2; $counts{$barcode}++; } # pair needs to be flipped + } # if a cutsite and correct barcode has not been found in either read, which = 0 at this point + if($which == 1) { # if the pair is correctly oriented, print out fastq format for the pair + my $temp1 = substr($forward, $n); # trim initial nucleotides off read and quality scores... + my $temp2 = substr($qual1, $n); # so that output keeps barcode and cutsite but not other nucleotides... + print OUT1 "$name1$temp1$third1$temp2"; # and is ready to go into process_radtags + print OUT2 "$name2$reverse$third2$qual2"; + } + elsif($which == 2) { # if the pair needs to be flipped, print out fastq format for the flipped pair + my $temp1 = substr($reverse, $n); + my $temp2 = substr($qual2, $n); + print OUT1 "$name2$temp1$third2$temp2"; + print OUT2 "$name1$forward$third1$qual1"; + } # if which == 0, nothing is printed out for the pair +} +close IN1; +close IN2; +close OUT1; +close OUT2; + +foreach my $key (sort keys %counts) { + print "$key" . "\t" . "$counts{$key}\n"; +} + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/barcode_sort.xml Mon Mar 28 18:46:00 2016 -0400 @@ -0,0 +1,55 @@ +<tool id="barcode_sort_stacks" name="Barcode sorting for Stacks" version="1.0.0" force_history_refresh="True"> + <description>Run the STACKS populations program</description> + + +<requirements> + <requirement type="package" version="5.18">perl</requirement> +</requirements> + + + +<stdio> + <exit_code range="1" level="fatal" description="Error in script execution" /> +</stdio> + +<command interpreter="perl"> + +barcode_sort.pl $barcode $f1 $f2 $barcoded $nonbarcoded + +</command> + +<inputs> + <param name="barcode" format="txt" type="data" label="Barcode file 1" /> + <param name="f1" format="fastq" label="First reads" /> + <param name="f2" format="fastq" label="Second reads" /> +</inputs> +<outputs> + <data format="fastq" name="barcoded" label="Barcoded sequences"/> + <data format="fastq" name="nonbarcoded" label="Non barcoded sequences"/> +</outputs> + +<help> + +.. class:: infomark + +**What it does** + +This program takes 2 input sequence files where the barcode can be on either the 1st or 2nd read (but not both) and sorts all barcoded reads into a single file and all non-barcoded files into a second file. This separation is needed for the STACKs program as it does not handle sequences where the barcode is arbitrarily on the 1st or 2nd read. + +-------- + +**Created by:** + +Paul Hohenlohe with changes from Brian Hand. + +**Integrated by:** + +Tiago Antao + + +</help> + +<citations> +</citations> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Mar 28 18:46:00 2016 -0400 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="perl" version="5.18"> + <repository changeset_revision="95dad0955d7e" name="package_perl_5_18" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>