Mercurial > repos > davidvanzessen > sff_extract_demultiplex

--- a/demultiplex.xml	Thu May 08 07:50:42 2014 -0400
+++ b/demultiplex.xml	Tue Jul 01 08:27:57 2014 -0400
@@ -4,7 +4,7 @@
 		<requirement type="package" version="0.0.13">fastx_toolkit</requirement>
 	</requirements>
 	<command interpreter="bash">
-		r_wrapper.sh $input $out_file $out_file.files_path $where $mismatches $partial $input.name
+		r_wrapper.sh $input $out_file $out_file.files_path $where $mismatches $partial $input.name $trim_start $trim_end
 		#for $i, $b in enumerate($barcodes)
             "$b.id"
             "$b.mid"
@@ -173,10 +173,17 @@
 			<option value="bol">Start: 5' end</option>
 			<option value="eol">End: 3' end</option>
 		</param>
-
+
+
+
 		<param name="mismatches" type="integer" size="3" value="2" label="Max. number of mismatches allowed." />

 		<param name="partial" type="integer" size="3" value="0" label="Allow partial overlap of barcodes." />
+
+		<param name="trim_start" type="integer" size="3" value="25" label="How many nucleotides to trim from the start" />
+
+		<param name="trim_end" type="integer" size="3" value="25" label="How many nucleotides to trim from the end" />
+
 	</inputs>
 	<outputs>
 		<data format="html" name="out_file" />
--- a/r_wrapper.sh	Thu May 08 07:50:42 2014 -0400
+++ b/r_wrapper.sh	Tue Jul 01 08:27:57 2014 -0400
@@ -10,9 +10,11 @@
 ext="${name##*.}"
 name="${name%.*}"
 prefix=$name"_"
+trim_start=$8
+trim_end=$9
 dir="$(cd "$(dirname "$0")" && pwd)"

-for ((i=8;i<=$#;i=i+2))
+for ((i=10;i<=$#;i=i+2))
 do
 	j=$((i+1))
   echo -e "${!i}\t${!j}" >> $outDir/barcodes.txt
@@ -22,7 +24,7 @@
 echo "$3"
 result=`$dir/sff2fastq $input | $dir/fastx_barcode_splitter.pl --bcfile $outDir/barcodes.txt --prefix "$prefix" --suffix ".fastq" --$EOL --mismatches $mismatches --partial $partial`
 echo "$result" | tail -n +2 | sed 's/\t/,/g' > output.txt
-echo "<html><head><title>$name demultiplex</title></head><body><table border='1'><thead><tr><th>ID</th><th>Count</th><th>FASTQ</th><th>FASTA</th></tr></thead><tbody>" >> $output
+echo "<html><head><title>$name demultiplex</title></head><body><table border='1'><thead><tr><th>ID</th><th>Count</th><th>FASTQ</th><th>FASTA</th><th>Trimmed FASTA</th></tr></thead><tbody>" >> $output
 ls
 while IFS=, read barcode count location
 	do
@@ -33,8 +35,7 @@
 		fi
 		file=$name"_"$barcode
 		cat $file.fastq | awk 'NR%4==1{printf ">%s\n", substr($0,2)}NR%4==2{print}' > $file.fasta
-		#cat $file.fastq | perl -e '$i=0;while(< >){if(/^\@/&&$i==0){s/^\@/\>/;print;}elsif($i==1){print;$i=-3}$i++;}' > $file.fasta
-		#cat $file.fastq
-		echo "<tr><td>$barcode</td><td>$count</td><td><a href='$file.fastq'>$file.fastq</a></td><td><a href='$file.fasta'>$file.fasta</a></td></tr>" >> $output
+		python $dir/trim.py --input $file.fasta --output ${file}_trimmed.fasta --start $trim_start --end $trim_end
+		echo "<tr><td>$barcode</td><td>$count</td><td><a href='$file.fastq'>$file.fastq</a></td><td><a href='$file.fasta'>$file.fasta</a></td><td><a href='${file}_trimmed.fasta'>${file}_trimmed.fasta</a></td></tr>" >> $output
 done < output.txt
-echo "</tbody></body></html>" >> $output
+echo "</tbody></body><a href='trimmed.fasta'>Original fasta after trim</a></html>" >> $output
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trim.py	Tue Jul 01 08:27:57 2014 -0400
@@ -0,0 +1,35 @@
+import argparse
+
+#docs.python.org/dev/library/argparse.html
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="Input folder with files")
+parser.add_argument("--output", help="Output file")
+parser.add_argument("--start", help="How many nucleotides to trim from the start", type=int)
+parser.add_argument("--end", help="How many nucleotides to trim from the end", type=int)
+
+args = parser.parse_args()
+start = int(args.start)
+end = int(args.end)
+
+if end <= 0:
+	import shutil
+	shutil.copy(args.input, args.output)
+	import sys
+	sys.exit()
+
+currentSeq = ""
+currentId = ""
+
+with open(args.input, 'r') as i:
+	with open(args.output, 'w') as o:
+		for line in i.readlines():
+			if line[0] is ">":
+				if currentSeq is not "" or currentId is not "":
+					o.write(currentId)
+					o.write(currentSeq[start:-end] + "\n")
+				currentId = line
+				currentSeq = ""
+			else:
+				currentSeq += line.rstrip()
+		o.write(currentId)
+		o.write(currentSeq.rstrip()[start:-end] + "\n")