changeset 0:8dc0438702c7 draft

Uploaded
author lparsons
date Thu, 07 Nov 2013 16:33:14 -0500
parents
children 1daaf4774a43
files fastx_barcode_splitter.xml fastx_barcode_splitter_galaxy_wrapper.sh test-data/fastx_barcode_splitter1.fastq test-data/fastx_barcode_splitter1.out test-data/fastx_barcode_splitter1.txt tool_dependencies.xml
diffstat 6 files changed, 386 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastx_barcode_splitter.xml	Thu Nov 07 16:33:14 2013 -0500
@@ -0,0 +1,95 @@
+<tool id="cshl_princeton_fastx_barcode_splitter" version="1.1" name="Barcode Splitter" force_history_refresh="True">
+	<description></description>
+    <requirements>
+        <requirement type="package" version="0.0.13">fastx_toolkit</requirement>
+    </requirements>
+	<command interpreter="bash">
+		fastx_barcode_splitter_galaxy_wrapper.sh "$BARCODE" "$input" "primary_$output.id" "$output.files_path" "$input.extension" --mismatches $mismatches --partial $partial 
+		#if $refBarcodeLocation.barcodeLocation == "idxfile":
+		  --idxfile "$refBarcodeLocation.idxfile" 
+		#else: 
+		  $refBarcodeLocation.EOL 
+		#end if
+		> "$output"
+	</command> <!-- dataset paths are quoted so that each one reaches the wrapper as a single argument -->
+
+	<inputs>
+		<param format="txt" version="1.1" name="BARCODE" type="data" label="Barcodes to use" />
+		<param format="fasta,fastqsanger,fastqsolexa,fastqillumina" version="1.1" name="input" type="data" label="Library to split" />
+		<!-- Barcode location: "bol" and "eol" each supply a hidden EOL value that the command inserts as-is; "idxfile" asks for a separate index read dataset instead. -->
+		<conditional name="refBarcodeLocation">
+			<param version="1.1" name="barcodeLocation" type="select" label="Barcodes found at">
+				<option value="bol">Start of sequence (5' end)</option>
+				<option value="eol">End of sequence (3' end)</option>
+				<option value="idxfile">Separate index file</option>
+			</param>
+			<when value="bol">
+				<param version="1.1" name="EOL" type="hidden" value="--bol" />
+			</when>
+			<when value="eol">
+				<param version="1.1" name="EOL" type="hidden" value="--eol" />
+			</when>
+			<when value="idxfile">
+				<param version="1.1" name="idxfile" type="data" format="fasta,fastq,fastqsanger" label="Select index read file" />
+			</when>
+		</conditional>
+
+		<param version="1.1" name="mismatches" type="integer" size="3" value="0" label="Number of allowed mismatches" />
+		
+		<param version="1.1" name="partial" type="integer" size="3" value="0" label="Number of allowed barcodes nucleotide deletions" />
+	
+	</inputs>
+	
+	<tests>
+		<test>
+			<!-- Split a FASTQ file on barcodes at the start of each read, allowing up to 2 mismatches -->
+			<param version="1.1" name="BARCODE" value="fastx_barcode_splitter1.txt" />
+			<param version="1.1" name="input" value="fastx_barcode_splitter1.fastq" ftype="fastqsolexa" />
+			<param version="1.1" name="barcodeLocation" value="bol" />
+			<param version="1.1" name="mismatches" value="2" />
+			<param version="1.1" name="partial" value="0" />
+			<output version="1.1" name="output" file="fastx_barcode_splitter1.out" />
+		</test>
+	</tests>
+
+	<outputs>
+		<data version="1.1" format="html" name="output" /> <!-- receives the wrapper's standard output (see the redirect at the end of the command) -->
+	</outputs>
+<help>
+
+**What it does**
+
+This tool splits a FASTQ or FASTA file into several files, using barcodes as the split criteria.
+
+--------
+
+**Barcode file Format**
+
+Barcode files are simple text files.
+Each line should contain an identifier (descriptive name for the barcode), and the barcode itself (A/C/G/T), separated by a TAB character.
+Example::
+
+    #This line is a comment (starts with a 'number' sign)
+    BC1	GATCT
+    BC2	ATCGT
+    BC3	GTGAT
+    BC4	TGTCT
+    
+For each barcode, a new FASTQ file will be created (with the barcode's identifier as part of the file name).
+Sequences matching the barcode will be stored in the appropriate file.
+
+One additional FASTQ file will be created (the 'unmatched' file), where sequences not matching any barcode will be stored.
+
+The output of this tool is an HTML file, displaying the split counts and the file names.
+In addition, each fastq file produced will be loaded into the galaxy history automatically.
+
+
+------
+
+This tool is based on `FASTX-toolkit`__ by Assaf Gordon.
+
+ .. __: http://hannonlab.cshl.edu/fastx_toolkit/
+ 
+</help>
+<!-- FASTX-barcode-splitter is part of the FASTX-toolkit, by A.Gordon (gordon@cshl.edu) -->
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastx_barcode_splitter_galaxy_wrapper.sh	Thu Nov 07 16:33:14 2013 -0500
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+#    FASTX-toolkit - FASTA/FASTQ preprocessing tools.
+#    Copyright (C) 2009  A. Gordon (gordon@cshl.edu)
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as
+#   published by the Free Software Foundation, either version 3 of the
+#   License, or (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#    You should have received a copy of the GNU Affero General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#  Modified by Lance Parsons (lparsons@princeton.edu)
+#	2011-03-15	Adapted to allow galaxy to determine filetype
+#
+#This is a shell script wrapper for 'fastx_barcode_splitter.pl'
+#
+# 1. Output files are saved at the dataset's files_path directory.
+#    
+# 2. 'fastx_barcode_splitter.pl' outputs a textual table.
+#    This script turns it into pretty HTML with working URL
+#    (so lazy users can just click on the URLs and get their files)
+
+if [ "$1x" = "x" ]; then
+	echo "Usage: $0 [BARCODE FILE] [FASTQ FILE] [LIBRARY_NAME] [OUTPUT_PATH] [FILETYPE]" >&2
+	exit 1
+fi
+
+BARCODE_FILE="$1"
+FASTQ_FILE="$2"
+LIBNAME="$3"
+OUTPUT_PATH="$4"
+FILETYPE="$5"
+shift 5
+# The rest of the parameters are passed to the split program
+
+if [ "${OUTPUT_PATH}x" = "x" ]; then
+	echo "Usage: $0 [BARCODE FILE] [FASTQ FILE] [LIBRARY_NAME] [OUTPUT_PATH] [FILETYPE]" >&2
+	exit 1
+fi
+
+#Sanitize library name, make sure we can create a file with this name
+LIBNAME=${LIBNAME%.gz}	# drop one trailing ".gz", if present
+LIBNAME=${LIBNAME%.txt}	# then one trailing ".txt", if present
+LIBNAME=$(echo "$LIBNAME" | tr -cd '[:alnum:]_')	# delete everything except alphanumerics and "_"
+
+if [ ! -r "$FASTQ_FILE" ]; then	# the library to split must be readable
+	echo "Error: Input file ($FASTQ_FILE) not found!" >&2
+	exit 1
+fi
+if [ ! -r "$BARCODE_FILE" ]; then	# and so must the barcode table
+	echo "Error: barcode file ($BARCODE_FILE) not found!" >&2
+	exit 1
+fi
+mkdir -p "$OUTPUT_PATH"	# create the output directory, including missing parents
+if [ ! -d "$OUTPUT_PATH" ]; then	# mkdir's status is not checked; verify the result instead
+	echo "Error: failed to create output path '$OUTPUT_PATH'" >&2
+	exit 1
+fi
+
+PUBLICURL=""	# not referenced anywhere else in this script
+BASEPATH="$OUTPUT_PATH/"	# directory part; removed again from the report printed below
+#PREFIX="$BASEPATH"`date "+%Y-%m-%d_%H%M__"`"${LIBNAME}__"
+PREFIX="$BASEPATH""${LIBNAME}_"	# handed to the splitter as --prefix
+SUFFIX="_visible_$FILETYPE"	# handed to the splitter as --suffix
+
+if ! RESULTS=$(gzip -cdf "$FASTQ_FILE" | fastx_barcode_splitter.pl --bcfile "$BARCODE_FILE" --prefix "$PREFIX" --suffix "$SUFFIX" "$@"); then	# status tested is the splitter's (last pipeline stage)
+	echo "Error: fastx_barcode_splitter.pl failed" >&2	# stdout carries the HTML report, so report on stderr
+	exit 1	# stop here rather than print a table built from failed output
+fi
+
+#
+# Convert the textual tab-separated table into simple HTML table,
+# with the local path replaces with a valid URL
+#HTMLSUMMARY=${PREFIX}stats_visible_html
+echo "<html><body><table border=1>" 
+echo "$RESULTS" | sed -r "s|$BASEPATH(.*)|\\1|" | sed '
+i<tr><td>
+s|\t|</td><td>|g
+a<\/td><\/tr>
+'
+echo "<p>"
+echo "</table></body></html>"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastx_barcode_splitter1.fastq	Thu Nov 07 16:33:14 2013 -0500
@@ -0,0 +1,168 @@
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GATCTAGTAGTAGTAGA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GATCTAGTAGTAGTAGA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GATCTAGTAGTAGTAGA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GATCTAGTAGTAGTAGA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GATCTAGTAGTAGTAGA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGTCTAGTAGTAGTAGA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGTCTTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGTCTGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGTATTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGTATTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGTATTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGTACGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGTACTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGTACGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+ATCGTTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+ATCGTGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+ATCGTTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+ATCGTGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+ATCTTTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+ATCTTGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+ATCTTGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+ATCTTTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+ATCTCGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+ATCTCGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+ATCTCTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+ATCTCGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGAATGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGAATTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGAATGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGAATTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGAATGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGAATTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGAATGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGAATTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GGAATGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+TAGTTGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+TAGTTGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+TAGTTTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+TAGTTTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+TAGTTGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+TAGTTTCTCTATGTACA
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
+@CSHL_3_FC042AGLLWW:1:2:7:203
+TGTCTGAGTATACACAT
++CSHL_3_FC042AGLLWW:1:2:7:203
+aab^V^aU]`aa^aZaa
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastx_barcode_splitter1.out	Thu Nov 07 16:33:14 2013 -0500
@@ -0,0 +1,24 @@
+<html><body><table border=1>
+<tr><td>
+Barcode</td><td>Count</td><td>Location
+</td></tr>
+<tr><td>
+BC1</td><td>11</td><td><a href="fastx_barcode_splitter1_fastq__BC1.txt">fastx_barcode_splitter1_fastq__BC1.txt</a>
+</td></tr>
+<tr><td>
+BC2</td><td>12</td><td><a href="fastx_barcode_splitter1_fastq__BC2.txt">fastx_barcode_splitter1_fastq__BC2.txt</a>
+</td></tr>
+<tr><td>
+BC3</td><td>9</td><td><a href="fastx_barcode_splitter1_fastq__BC3.txt">fastx_barcode_splitter1_fastq__BC3.txt</a>
+</td></tr>
+<tr><td>
+BC4</td><td>1</td><td><a href="fastx_barcode_splitter1_fastq__BC4.txt">fastx_barcode_splitter1_fastq__BC4.txt</a>
+</td></tr>
+<tr><td>
+unmatched</td><td>9</td><td><a href="fastx_barcode_splitter1_fastq__unmatched.txt">fastx_barcode_splitter1_fastq__unmatched.txt</a>
+</td></tr>
+<tr><td>
+total</td><td>42
+</td></tr>
+<p>
+</table></body></html>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastx_barcode_splitter1.txt	Thu Nov 07 16:33:14 2013 -0500
@@ -0,0 +1,4 @@
+BC1	GATCT
+BC2	ATCGT
+BC3	GTGAT
+BC4	TGTCT
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Thu Nov 07 16:33:14 2013 -0500
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency> <!-- fastx_toolkit 0.0.13: same name and version as the requirement declared in fastx_barcode_splitter.xml -->
+    <package name="fastx_toolkit" version="0.0.13">
+        <repository changeset_revision="ec66ae4c269b" name="package_fastx_toolkit_0_0_13" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>