Mercurial > repos > davidvanzessen > sff_extract_demultiplex

--- a/demultiplex.xml	Mon Jul 07 05:49:20 2014 -0400
+++ b/demultiplex.xml	Tue Jul 08 09:38:54 2014 -0400
@@ -4,10 +4,12 @@
 		<requirement type="package" version="0.0.13">fastx_toolkit</requirement>
 	</requirements>
 	<command interpreter="bash">
-		r_wrapper.sh $input $out_file $out_file.files_path $where $mismatches $partial $input.name $trim_start $trim_end
+		wrapper.sh $input $out_file $out_file.files_path $where $mismatches $partial $input.name
 		#for $i, $b in enumerate($barcodes)
             "$b.id"
             "$b.mid"
+            "$b.trim_start"
+            "$b.trim_end"
     #end for
 	</command>
 	<inputs>
@@ -318,6 +320,10 @@
 				<option value="TGTGCGCGTG">MID-151</option>
 				<option value="CACGCGCACA">MID-151 reverse complement</option>
 			</param>
+
+			<param name="trim_start" type="integer" size="3" value="0" label="How many nucleotides to trim from the start" />
+
+			<param name="trim_end" type="integer" size="3" value="0" label="How many nucleotides to trim from the end" />
 		</repeat>

 		<param name="where" type="select" label="Barcodes found at">
@@ -325,21 +331,19 @@
 			<option value="eol">End: 3' end</option>
 		</param>

-
-
 		<param name="mismatches" type="integer" size="3" value="2" label="Max. number of mismatches allowed." />

 		<param name="partial" type="integer" size="3" value="0" label="Allow partial overlap of barcodes." />

-		<param name="trim_start" type="integer" size="3" value="25" label="How many nucleotides to trim from the start" />
-
-		<param name="trim_end" type="integer" size="3" value="25" label="How many nucleotides to trim from the end" />
-
 	</inputs>
 	<outputs>
 		<data format="html" name="out_file" />
 	</outputs>
 	<help>
-		Splitting FASTA or FASTQ files, this tool uses sff2fastq (https://github.com/indraniel/sff2fastq) to extract a fastq file and fastx_barcode_splitter.pl (http://hannonlab.cshl.edu/fastx_toolkit/commandline.html) to demultiplex.
+- This tool splits SFF files into FASTQ, FASTA and (optionally) trimmed FASTA files, with a FastQC report on the FASTQ files. It uses:
+- sff2fastq (https://github.com/indraniel/sff2fastq) to extract a fastq file.
+- fastx_barcode_splitter.pl (http://hannonlab.cshl.edu/fastx_toolkit/commandline.html) to demultiplex.
+- fastqc (http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) to provide analysis of the fastq files.
+
 	</help>
 </tool>
Binary file fastqc_v0.11.2.zip has changed
--- a/r_wrapper.sh	Mon Jul 07 05:49:20 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,41 +0,0 @@
-#!/bin/bash
-input=$1
-output=$2
-outDir=$3
-mkdir $outDir
-EOL=$4
-mismatches=$5
-partial=$6
-name=$(basename "$7")
-ext="${name##*.}"
-name="${name%.*}"
-prefix=$name"_"
-trim_start=$8
-trim_end=$9
-dir="$(cd "$(dirname "$0")" && pwd)"
-
-for ((i=10;i<=$#;i=i+2))
-do
-	j=$((i+1))
-  echo -e "${!i}\t${!j}" >> $outDir/barcodes.txt
-done
-
-cd $outDir
-echo "$3"
-result=`$dir/sff2fastq $input | $dir/fastx_barcode_splitter.pl --bcfile $outDir/barcodes.txt --prefix "$prefix" --suffix ".fastq" --$EOL --mismatches $mismatches --partial $partial`
-echo "$result" | tail -n +2 | sed 's/\t/,/g' > output.txt
-echo "<html><head><title>$name demultiplex</title></head><body><table border='1'><thead><tr><th>ID</th><th>Count</th><th>FASTQ</th><th>FASTA</th><th>Trimmed FASTA</th></tr></thead><tbody>" >> $output
-ls
-while IFS=, read barcode count location
-	do
-		if [ "total" == "$barcode" ]
-		then
-			echo "<tr><td>$barcode</td><td>$count</td><td></td><td></td></tr>" >> $output
-			break
-		fi
-		file=$name"_"$barcode
-		cat $file.fastq | awk 'NR%4==1{printf ">%s\n", substr($0,2)}NR%4==2{print}' > $file.fasta
-		python $dir/trim.py --input $file.fasta --output ${file}_trimmed.fasta --start $trim_start --end $trim_end
-		echo "<tr><td>$barcode</td><td>$count</td><td><a href='$file.fastq'>$file.fastq</a></td><td><a href='$file.fasta'>$file.fasta</a></td><td><a href='${file}_trimmed.fasta'>${file}_trimmed.fasta</a></td></tr>" >> $output
-done < output.txt
-echo "</tbody></body></html>" >> $output
--- a/trim.py	Mon Jul 07 05:49:20 2014 -0400
+++ b/trim.py	Tue Jul 08 09:38:54 2014 -0400
@@ -11,6 +11,11 @@
 start = int(args.start)
 end = int(args.end)

+print args.input
+print args.output
+print start
+print end
+
 if end <= 0 and start <= 0:
 	import shutil
 	shutil.copy(args.input, args.output)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/wrapper.sh	Tue Jul 08 09:38:54 2014 -0400
@@ -0,0 +1,79 @@
+#!/bin/bash
+#
+# Demultiplex an SFF file: convert it to FASTQ with sff2fastq, split the reads
+# per barcode with fastx_barcode_splitter.pl, run FastQC on every per-barcode
+# FASTQ, derive FASTA and trimmed FASTA files, and append an HTML index table.
+#
+# Arguments:
+#   $1  input SFF file
+#   $2  HTML report file (appended to)
+#   $3  output directory for all generated files
+#   $4  barcode position, passed to the splitter as "--$4" (e.g. "eol")
+#   $5  maximum number of mismatches
+#   $6  allowed partial overlap
+#   $7  display name of the input; its extension is stripped to build names
+#   $8 onward: repeated groups of four values per barcode:
+#       id, sequence, trim_start, trim_end
+input=$1
+output=$2
+outDir=$3
+mkdir -p "$outDir" || exit 1
+EOL=$4
+mismatches=$5
+partial=$6
+name=$(basename "$7")
+name="${name%.*}"
+prefix="${name}_"
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+# FastQC ships as a zip next to this script; unpack it into the current
+# working directory and make the launcher executable.
+unzip "$dir/fastqc_v0.11.2.zip" -d "$PWD/" > "$PWD/unziplog.log"
+chmod 755 "$PWD/FastQC/fastqc"
+
+# Per-barcode trim lengths, keyed by barcode id.
+declare -A trim_start
+declare -A trim_end
+for ((i = 8; i <= $#; i += 4)); do
+  seq_idx=$((i + 1))
+  start_idx=$((i + 2))
+  end_idx=$((i + 3))
+  id="${!i}"
+  echo "$id"
+  trim_start[$id]=${!start_idx}
+  trim_end[$id]=${!end_idx}
+  # One "id<TAB>sequence" line per barcode, used as the splitter's --bcfile.
+  printf '%s\t%s\n' "$id" "${!seq_idx}" >> "$outDir/barcodes.txt"
+done
+# The "unmatched" bucket is never trimmed.
+trim_start["unmatched"]=0
+trim_end["unmatched"]=0
+
+echo "trim_start = ${trim_start[*]}"
+echo "trim_end = ${trim_end[*]}"
+
+workdir=$PWD
+cd "$outDir" || exit 1
+echo "$3"
+result=$("$dir/sff2fastq" "$input" | "$dir/fastx_barcode_splitter.pl" --bcfile "$outDir/barcodes.txt" --prefix "$prefix" --suffix ".fastq" "--$EOL" --mismatches "$mismatches" --partial "$partial")
+# Skip the first line of the splitter output and turn tabs into commas.
+echo "$result" | tail -n +2 | sed 's/\t/,/g' > output.txt
+echo "<html><head><title>$name demultiplex</title></head><body><table border='1'><thead><tr><th>ID</th><th>Count</th><th>FASTQ</th><th>FASTA</th><th>Trimmed FASTA</th><th>FASTQC</th></tr></thead><tbody>" >> "$output"
+while IFS=, read -r barcode count location; do
+  if [[ "$barcode" == "total" ]]; then
+    # Summary row: pad to the six columns declared in the table header.
+    echo "<tr><td>$barcode</td><td>$count</td><td></td><td></td><td></td><td></td></tr>" >> "$output"
+    break
+  fi
+  file="${name}_${barcode}"
+  # NOTE(review): nothing visible in this script writes into this directory - confirm it is still needed.
+  mkdir -p "$outDir/fastqc_$barcode"
+  # FastQC's stderr is discarded on purpose; its output goes to $outDir.
+  "$workdir/FastQC/fastqc" "$file.fastq" -o "$outDir" 2> /dev/null
+  # FASTQ -> FASTA: line 1 of every 4 becomes the ">" header, line 2 is the sequence.
+  awk 'NR%4==1{printf ">%s\n", substr($0,2)}NR%4==2{print}' < "$file.fastq" > "$file.fasta"
+  # Fall back to 0 (no trimming) when no trim length was configured for this id.
+  python "$dir/trim.py" --input "$file.fasta" --output "${file}_trimmed.fasta" --start "${trim_start[$barcode]:-0}" --end "${trim_end[$barcode]:-0}"
+  echo "<tr><td>$barcode</td><td>$count</td><td><a href='$file.fastq'>$file.fastq</a></td><td><a href='$file.fasta'>$file.fasta</a></td><td><a href='${file}_trimmed.fasta'>${file}_trimmed.fasta</a></td><td><a href='${name}_${barcode}_fastqc.html'>Report</a></td></tr>" >> "$output"
+done < output.txt
+echo "</tbody></table></body></html>" >> "$output"