Mercurial > repos > davidvanzessen > sff_extract_demultiplex
changeset 3:79be0752711d draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 01 Jul 2014 08:27:57 -0400 |
parents | afddfd016ba6 |
children | 8e3d95d7f342 |
files | demultiplex.xml r_wrapper.sh trim.py |
diffstat | 3 files changed, 51 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
--- a/demultiplex.xml Thu May 08 07:50:42 2014 -0400 +++ b/demultiplex.xml Tue Jul 01 08:27:57 2014 -0400 @@ -4,7 +4,7 @@ <requirement type="package" version="0.0.13">fastx_toolkit</requirement> </requirements> <command interpreter="bash"> - r_wrapper.sh $input $out_file $out_file.files_path $where $mismatches $partial $input.name + r_wrapper.sh $input $out_file $out_file.files_path $where $mismatches $partial $input.name $trim_start $trim_end #for $i, $b in enumerate($barcodes) "$b.id" "$b.mid" @@ -173,10 +173,17 @@ <option value="bol">Start: 5' end</option> <option value="eol">End: 3' end</option> </param> - + + + <param name="mismatches" type="integer" size="3" value="2" label="Max. number of mismatches allowed." /> <param name="partial" type="integer" size="3" value="0" label="Allow partial overlap of barcodes." /> + + <param name="trim_start" type="integer" size="3" value="25" label="How many nucleotides to trim from the start" /> + + <param name="trim_end" type="integer" size="3" value="25" label="How many nucleotides to trim from the end" /> + </inputs> <outputs> <data format="html" name="out_file" />
--- a/r_wrapper.sh Thu May 08 07:50:42 2014 -0400 +++ b/r_wrapper.sh Tue Jul 01 08:27:57 2014 -0400 @@ -10,9 +10,11 @@ ext="${name##*.}" name="${name%.*}" prefix=$name"_" +trim_start=$8 +trim_end=$9 dir="$(cd "$(dirname "$0")" && pwd)" -for ((i=8;i<=$#;i=i+2)) +for ((i=10;i<=$#;i=i+2)) do j=$((i+1)) echo -e "${!i}\t${!j}" >> $outDir/barcodes.txt @@ -22,7 +24,7 @@ echo "$3" result=`$dir/sff2fastq $input | $dir/fastx_barcode_splitter.pl --bcfile $outDir/barcodes.txt --prefix "$prefix" --suffix ".fastq" --$EOL --mismatches $mismatches --partial $partial` echo "$result" | tail -n +2 | sed 's/\t/,/g' > output.txt -echo "<html><head><title>$name demultiplex</title></head><body><table border='1'><thead><tr><th>ID</th><th>Count</th><th>FASTQ</th><th>FASTA</th></tr></thead><tbody>" >> $output +echo "<html><head><title>$name demultiplex</title></head><body><table border='1'><thead><tr><th>ID</th><th>Count</th><th>FASTQ</th><th>FASTA</th><th>Trimmed FASTA</th></tr></thead><tbody>" >> $output ls while IFS=, read barcode count location do @@ -33,8 +35,7 @@ fi file=$name"_"$barcode cat $file.fastq | awk 'NR%4==1{printf ">%s\n", substr($0,2)}NR%4==2{print}' > $file.fasta - #cat $file.fastq | perl -e '$i=0;while(< >){if(/^\@/&&$i==0){s/^\@/\>/;print;}elsif($i==1){print;$i=-3}$i++;}' > $file.fasta - #cat $file.fastq - echo "<tr><td>$barcode</td><td>$count</td><td><a href='$file.fastq'>$file.fastq</a></td><td><a href='$file.fasta'>$file.fasta</a></td></tr>" >> $output + python $dir/trim.py --input $file.fasta --output ${file}_trimmed.fasta --start $trim_start --end $trim_end + echo "<tr><td>$barcode</td><td>$count</td><td><a href='$file.fastq'>$file.fastq</a></td><td><a href='$file.fasta'>$file.fasta</a></td><td><a href='${file}_trimmed.fasta'>${file}_trimmed.fasta</a></td></tr>" >> $output done < output.txt -echo "</tbody></body></html>" >> $output +echo "</tbody></body><a href='trimmed.fasta'>Original fasta after trim</a></html>" >> $output
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trim.py Tue Jul 01 08:27:57 2014 -0400 @@ -0,0 +1,35 @@ +import argparse + +#docs.python.org/dev/library/argparse.html +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="Input folder with files") +parser.add_argument("--output", help="Output file") +parser.add_argument("--start", help="How many nucleotides to trim from the start", type=int) +parser.add_argument("--end", help="How many nucleotides to trim from the end", type=int) + +args = parser.parse_args() +start = int(args.start) +end = int(args.end) + +if end <= 0: + import shutil + shutil.copy(args.input, args.output) + import sys + sys.exit() + +currentSeq = "" +currentId = "" + +with open(args.input, 'r') as i: + with open(args.output, 'w') as o: + for line in i.readlines(): + if line[0] is ">": + if currentSeq is not "" or currentId is not "": + o.write(currentId) + o.write(currentSeq[start:-end] + "\n") + currentId = line + currentSeq = "" + else: + currentSeq += line.rstrip() + o.write(currentId) + o.write(currentSeq.rstrip()[start:-end] + "\n")