# HG changeset patch # User davidvanzessen # Date 1404826734 14400 # Node ID 35b55f1c0c598c1e5068afa365b38d2c750b7427 # Parent 8e3d95d7f342a4c429f8cea630b8b361c0a964d2 Uploaded diff -r 8e3d95d7f342 -r 35b55f1c0c59 demultiplex.xml --- a/demultiplex.xml Mon Jul 07 05:49:20 2014 -0400 +++ b/demultiplex.xml Tue Jul 08 09:38:54 2014 -0400 @@ -4,10 +4,12 @@ fastx_toolkit - r_wrapper.sh $input $out_file $out_file.files_path $where $mismatches $partial $input.name $trim_start $trim_end + wrapper.sh $input $out_file $out_file.files_path $where $mismatches $partial $input.name #for $i, $b in enumerate($barcodes) "$b.id" "$b.mid" + "$b.trim_start" + "$b.trim_end" #end for @@ -318,6 +320,10 @@ + + + + @@ -325,21 +331,19 @@ - - - - - - - Splitting FASTA or FASTQ files, this tool uses sff2fastq (https://github.com/indraniel/sff2fastq) to extract a fastq file and fastx_barcode_splitter.pl (http://hannonlab.cshl.edu/fastx_toolkit/commandline.html) to demultiplex. +- Splitting sff files into FASTQ, FASTA and (optional) trimmed FASTA files with a FASTQC report on the FASTQ file, this tool uses: +- sff2fastq (https://github.com/indraniel/sff2fastq) to extract a fastq file. +- fastx_barcode_splitter.pl (http://hannonlab.cshl.edu/fastx_toolkit/commandline.html) to demultiplex. +- fastqc (http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) to provide analysis of the fastq files. + diff -r 8e3d95d7f342 -r 35b55f1c0c59 fastqc_v0.11.2.zip Binary file fastqc_v0.11.2.zip has changed diff -r 8e3d95d7f342 -r 35b55f1c0c59 r_wrapper.sh --- a/r_wrapper.sh Mon Jul 07 05:49:20 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -#!/bin/bash -input=$1 -output=$2 -outDir=$3 -mkdir $outDir -EOL=$4 -mismatches=$5 -partial=$6 -name=$(basename "$7") -ext="${name##*.}" -name="${name%.*}" -prefix=$name"_" -trim_start=$8 -trim_end=$9 -dir="$(cd "$(dirname "$0")" && pwd)" - -for ((i=10;i<=$#;i=i+2)) -do - j=$((i+1)) - echo -e "${!i}\t${!j}" >> $outDir/barcodes.txt -done - -cd $outDir -echo "$3" -result=`$dir/sff2fastq $input | $dir/fastx_barcode_splitter.pl --bcfile $outDir/barcodes.txt --prefix "$prefix" --suffix ".fastq" --$EOL --mismatches $mismatches --partial $partial` -echo "$result" | tail -n +2 | sed 's/\t/,/g' > output.txt -echo "$name demultiplex" >> $output -ls -while IFS=, read barcode count location - do - if [ "total" == "$barcode" ] - then - echo "" >> $output - break - fi - file=$name"_"$barcode - cat $file.fastq | awk 'NR%4==1{printf ">%s\n", substr($0,2)}NR%4==2{print}' > $file.fasta - python $dir/trim.py --input $file.fasta --output ${file}_trimmed.fasta --start $trim_start --end $trim_end - echo "" >> $output -done < output.txt -echo "" >> $output diff -r 8e3d95d7f342 -r 35b55f1c0c59 trim.py --- a/trim.py Mon Jul 07 05:49:20 2014 -0400 +++ b/trim.py Tue Jul 08 09:38:54 2014 -0400 @@ -11,6 +11,11 @@ start = int(args.start) end = int(args.end) +print args.input +print args.output +print start +print end + if end <= 0 and start <= 0: import shutil shutil.copy(args.input, args.output) diff -r 8e3d95d7f342 -r 35b55f1c0c59 wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wrapper.sh Tue Jul 08 09:38:54 2014 -0400 @@ -0,0 +1,58 @@ +#!/bin/bash +input=$1 +output=$2 +outDir=$3 +mkdir $outDir +EOL=$4 +mismatches=$5 +partial=$6 +name=$(basename "$7") +ext="${name##*.}" +name="${name%.*}" +prefix=$name"_" +dir="$(cd "$(dirname "$0")" && pwd)" + +unzip $dir/fastqc_v0.11.2.zip -d $PWD/ > $PWD/unziplog.log +chmod 755 $PWD/FastQC/fastqc + +declare -A trim_start +declare -A trim_end +for ((i=8;i<=$#;i=i+4)) +do + j=$((i+1)) + start_int=$((i+2)) + end_int=$((i+3)) + id="${!i}" + echo "$id" + trim_start[$id]=${!start_int} + trim_end[$id]=${!end_int} + echo -e "$id\t${!j}" >> $outDir/barcodes.txt + +done +trim_start["unmatched"]=0 +trim_end["unmatched"]=0 + +echo "trim_start = ${trim_start[@]}" +echo "trim_end = ${trim_end[@]}" + +workdir=$PWD +cd $outDir +echo "$3" +result=`$dir/sff2fastq $input | $dir/fastx_barcode_splitter.pl --bcfile $outDir/barcodes.txt --prefix "$prefix" --suffix ".fastq" --$EOL --mismatches $mismatches --partial $partial` +echo "$result" | tail -n +2 | sed 's/\t/,/g' > output.txt +echo "$name demultiplex
IDCountFASTQFASTATrimmed FASTA
$barcode$count
$barcode$count$file.fastq$file.fasta${file}_trimmed.fasta
" >> $output +while IFS=, read barcode count location + do + if [ "total" == "$barcode" ] + then + echo "" >> $output + break + fi + file=$name"_"$barcode + mkdir $outDir/fastqc_$barcode + $workdir/FastQC/fastqc $file.fastq -o $outDir 2> /dev/null + cat $file.fastq | awk 'NR%4==1{printf ">%s\n", substr($0,2)}NR%4==2{print}' > $file.fasta + python $dir/trim.py --input $file.fasta --output ${file}_trimmed.fasta --start ${trim_start[$barcode]} --end ${trim_end[$barcode]} + echo "" >> $output +done < output.txt +echo "" >> $output
IDCountFASTQFASTATrimmed FASTAFASTQC
$barcode$count
$barcode$count$file.fastq$file.fasta${file}_trimmed.fastaReport