Mercurial > repos > davidvanzessen > sff_extract_demultiplex
changeset 5:35b55f1c0c59 draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 08 Jul 2014 09:38:54 -0400 |
parents | 8e3d95d7f342 |
children | 6b348d07da49 |
files | demultiplex.xml fastqc_v0.11.2.zip r_wrapper.sh trim.py wrapper.sh |
diffstat | 5 files changed, 75 insertions(+), 49 deletions(-) [+] |
line wrap: on
line diff
--- a/demultiplex.xml Mon Jul 07 05:49:20 2014 -0400 +++ b/demultiplex.xml Tue Jul 08 09:38:54 2014 -0400 @@ -4,10 +4,12 @@ <requirement type="package" version="0.0.13">fastx_toolkit</requirement> </requirements> <command interpreter="bash"> - r_wrapper.sh $input $out_file $out_file.files_path $where $mismatches $partial $input.name $trim_start $trim_end + wrapper.sh $input $out_file $out_file.files_path $where $mismatches $partial $input.name #for $i, $b in enumerate($barcodes) "$b.id" "$b.mid" + "$b.trim_start" + "$b.trim_end" #end for </command> <inputs> @@ -318,6 +320,10 @@ <option value="TGTGCGCGTG">MID-151</option> <option value="CACGCGCACA">MID-151 reverse complement</option> </param> + + <param name="trim_start" type="integer" size="3" value="0" label="How many nucleotides to trim from the start" /> + + <param name="trim_end" type="integer" size="3" value="0" label="How many nucleotides to trim from the end" /> </repeat> <param name="where" type="select" label="Barcodes found at"> @@ -325,21 +331,19 @@ <option value="eol">End: 3' end</option> </param> - - <param name="mismatches" type="integer" size="3" value="2" label="Max. number of mismatches allowed." /> <param name="partial" type="integer" size="3" value="0" label="Allow partial overlap of barcodes." /> - <param name="trim_start" type="integer" size="3" value="25" label="How many nucleotides to trim from the start" /> - - <param name="trim_end" type="integer" size="3" value="25" label="How many nucleotides to trim from the end" /> - </inputs> <outputs> <data format="html" name="out_file" /> </outputs> <help> - Splitting FASTA or FASTQ files, this tool uses sff2fastq (https://github.com/indraniel/sff2fastq) to extract a fastq file and fastx_barcode_splitter.pl (http://hannonlab.cshl.edu/fastx_toolkit/commandline.html) to demultiplex. 
+- Splitting sff files into FASTQ, FASTA and (optional) trimmed FASTA files with a FASTQC report on the FASTQ file, this tool uses: +- sff2fastq (https://github.com/indraniel/sff2fastq) to extract a fastq file. +- fastx_barcode_splitter.pl (http://hannonlab.cshl.edu/fastx_toolkit/commandline.html) to demultiplex. +- fastqc (http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) to provide analysis of the fastq files. + </help> </tool>
--- a/r_wrapper.sh Mon Jul 07 05:49:20 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -#!/bin/bash -input=$1 -output=$2 -outDir=$3 -mkdir $outDir -EOL=$4 -mismatches=$5 -partial=$6 -name=$(basename "$7") -ext="${name##*.}" -name="${name%.*}" -prefix=$name"_" -trim_start=$8 -trim_end=$9 -dir="$(cd "$(dirname "$0")" && pwd)" - -for ((i=10;i<=$#;i=i+2)) -do - j=$((i+1)) - echo -e "${!i}\t${!j}" >> $outDir/barcodes.txt -done - -cd $outDir -echo "$3" -result=`$dir/sff2fastq $input | $dir/fastx_barcode_splitter.pl --bcfile $outDir/barcodes.txt --prefix "$prefix" --suffix ".fastq" --$EOL --mismatches $mismatches --partial $partial` -echo "$result" | tail -n +2 | sed 's/\t/,/g' > output.txt -echo "<html><head><title>$name demultiplex</title></head><body><table border='1'><thead><tr><th>ID</th><th>Count</th><th>FASTQ</th><th>FASTA</th><th>Trimmed FASTA</th></tr></thead><tbody>" >> $output -ls -while IFS=, read barcode count location - do - if [ "total" == "$barcode" ] - then - echo "<tr><td>$barcode</td><td>$count</td><td></td><td></td></tr>" >> $output - break - fi - file=$name"_"$barcode - cat $file.fastq | awk 'NR%4==1{printf ">%s\n", substr($0,2)}NR%4==2{print}' > $file.fasta - python $dir/trim.py --input $file.fasta --output ${file}_trimmed.fasta --start $trim_start --end $trim_end - echo "<tr><td>$barcode</td><td>$count</td><td><a href='$file.fastq'>$file.fastq</a></td><td><a href='$file.fasta'>$file.fasta</a></td><td><a href='${file}_trimmed.fasta'>${file}_trimmed.fasta</a></td></tr>" >> $output -done < output.txt -echo "</tbody></body></html>" >> $output
--- a/trim.py Mon Jul 07 05:49:20 2014 -0400 +++ b/trim.py Tue Jul 08 09:38:54 2014 -0400 @@ -11,6 +11,11 @@ start = int(args.start) end = int(args.end) +print args.input +print args.output +print start +print end + if end <= 0 and start <= 0: import shutil shutil.copy(args.input, args.output)
#!/bin/bash
#
# wrapper.sh (added in changeset 5:35b55f1c0c59)
#
# Extracts reads from an SFF file, demultiplexes them per barcode, and for
# every resulting FASTQ file produces a FastQC report, a FASTA conversion and
# a trimmed FASTA. An HTML table linking all generated files is appended to
# OUTPUT.
#
# Usage:
#   wrapper.sh INPUT OUTPUT OUT_DIR WHERE MISMATCHES PARTIAL NAME \
#              [ID MID TRIM_START TRIM_END]...
#
#   INPUT       SFF file handed to sff2fastq
#   OUTPUT      HTML report file (appended to)
#   OUT_DIR     directory that receives all generated files; it is also passed
#               to --bcfile after a cd, so it is expected to be absolute
#   WHERE       passed to fastx_barcode_splitter.pl as --WHERE (e.g. "eol")
#   MISMATCHES  value for --mismatches
#   PARTIAL     value for --partial
#   NAME        original dataset name; its basename without the last
#               extension becomes the prefix of every generated file
#   ID MID TRIM_START TRIM_END
#               one group per barcode: identifier, barcode sequence and the
#               --start / --end values given to trim.py for that barcode
#
# sff2fastq, fastx_barcode_splitter.pl, trim.py and fastqc_v0.11.2.zip are
# looked up in the directory this script lives in.

set -o pipefail

# Print an error message to stderr and abort the script.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Print the command-line synopsis to stderr.
usage() {
  printf 'Usage: %s INPUT OUTPUT OUT_DIR WHERE MISMATCHES PARTIAL NAME [ID MID TRIM_START TRIM_END]...\n' \
    "${0##*/}" >&2
}

# Convert FASTQ on stdin to FASTA on stdout: of every 4-line record the header
# (with its first character replaced by ">") and the sequence line are kept.
fastq_to_fasta() {
  awk 'NR % 4 == 1 { printf ">%s\n", substr($0, 2) } NR % 4 == 2 { print }'
}

# Emit the summary row. It has six cells so that it lines up with the
# six-column table header.
#   $1 - label, $2 - read count
total_row() {
  printf '%s\n' "<tr><td>$1</td><td>$2</td><td></td><td></td><td></td><td></td></tr>"
}

# Emit the table row for one barcode, linking to its generated files.
#   $1 - file name prefix (dataset name), $2 - barcode id, $3 - read count
report_row() {
  local stem="${1}_${2}"
  printf '%s\n' "<tr><td>$2</td><td>$3</td><td><a href='$stem.fastq'>$stem.fastq</a></td><td><a href='$stem.fasta'>$stem.fasta</a></td><td><a href='${stem}_trimmed.fasta'>${stem}_trimmed.fasta</a></td><td><a href='${stem}_fastqc.html'>Report</a></td></tr>"
}

# Entry point; see the usage block at the top of the file.
# Returns 2 on malformed arguments, exits 1 when a required step fails.
main() {
  if (( $# < 7 )); then
    usage
    return 2
  fi
  if (( ($# - 7) % 4 != 0 )); then
    printf '%s: barcode arguments must come in groups of four (ID MID TRIM_START TRIM_END)\n' \
      "${0##*/}" >&2
    return 2
  fi

  local input=$1 output=$2 out_dir=$3 where=$4 mismatches=$5 partial=$6
  local name
  name=$(basename "$7")
  name=${name%.*}
  local prefix="${name}_"
  local work_dir=$PWD
  local script_dir
  script_dir=$(cd "$(dirname "$0")" && pwd) || die "cannot resolve script directory"

  mkdir -p "$out_dir" || die "cannot create $out_dir"

  # -o: overwrite without prompting if FastQC/ already exists in work_dir.
  unzip -o "$script_dir/fastqc_v0.11.2.zip" -d "$work_dir/" > "$work_dir/unziplog.log" \
    || die "cannot unpack FastQC"
  chmod 755 "$work_dir/FastQC/fastqc" || die "cannot make FastQC executable"

  # Per-barcode trim settings, keyed by barcode id, plus the barcode file
  # (ID<TAB>MID per line) consumed by fastx_barcode_splitter.pl.
  local -A trim_start=() trim_end=()
  local barcode_file="$out_dir/barcodes.txt"
  local i id mid_idx start_idx end_idx
  for (( i = 8; i <= $#; i += 4 )); do
    mid_idx=$(( i + 1 ))
    start_idx=$(( i + 2 ))
    end_idx=$(( i + 3 ))
    id=${!i}
    [[ -n "$id" ]] || die "empty barcode id at argument $i"
    printf '%s\n' "$id"
    trim_start[$id]=${!start_idx}
    trim_end[$id]=${!end_idx}
    printf '%s\t%s\n' "$id" "${!mid_idx}" >> "$barcode_file"
  done
  # Reads that match no barcode are reported under "unmatched"; never trim them.
  trim_start["unmatched"]=0
  trim_end["unmatched"]=0

  printf '%s\n' "trim_start = ${trim_start[*]}"
  printf '%s\n' "trim_end = ${trim_end[*]}"

  # The splitter is given a relative prefix, so it must run inside out_dir.
  cd "$out_dir" || die "cannot cd to $out_dir"
  printf '%s\n' "$out_dir"

  local result
  result=$("$script_dir/sff2fastq" "$input" \
    | "$script_dir/fastx_barcode_splitter.pl" --bcfile "$barcode_file" \
        --prefix "$prefix" --suffix ".fastq" "--$where" \
        --mismatches "$mismatches" --partial "$partial") \
    || die "demultiplexing failed"

  # Drop the header line of the splitter summary and store it as CSV.
  printf '%s\n' "$result" | tail -n +2 | tr '\t' ',' > output.txt

  printf '%s\n' "<html><head><title>$name demultiplex</title></head><body><table border='1'><thead><tr><th>ID</th><th>Count</th><th>FASTQ</th><th>FASTA</th><th>Trimmed FASTA</th><th>FASTQC</th></tr></thead><tbody>" >> "$output"

  local barcode count _ stem
  while IFS=, read -r barcode count _; do
    if [[ "$barcode" == "total" ]]; then
      total_row "$barcode" "$count" >> "$output"
      break
    fi
    stem="${name}_${barcode}"
    # NOTE(review): FastQC's stderr is discarded as in the original script,
    # presumably because the caller treats stderr output as failure - confirm.
    "$work_dir/FastQC/fastqc" "$stem.fastq" -o "$out_dir" 2> /dev/null
    fastq_to_fasta < "$stem.fastq" > "$stem.fasta"
    # Fall back to 0 (no trimming) for a barcode without configured values.
    python "$script_dir/trim.py" --input "$stem.fasta" --output "${stem}_trimmed.fasta" \
      --start "${trim_start[$barcode]:-0}" --end "${trim_end[$barcode]:-0}"
    report_row "$name" "$barcode" "$count" >> "$output"
  done < output.txt

  printf '%s\n' "</tbody></table></body></html>" >> "$output"
}

main "$@"