# HG changeset patch
# User nikos
# Date 1410438605 14400
# Node ID 14d6929f8aa15ecd69b5f7399dd95832d4ff4f12
# Parent  fd7e52483bbcc3ef4f2d8cc0b505f120cea476a7
Tool tests and/or test-data are missing.

diff -r fd7e52483bbc -r 14d6929f8aa1 estimate_unique_counts.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/estimate_unique_counts.sh	Thu Sep 11 08:30:05 2014 -0400
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+############################################################
+# Create a table of fragments containing the coordinates [RNA molecule, start, end],
+# the number of mapped reads, the number of unique barcodes and the 3'-most
+# nucleotide of the cDNA (the one that was ligated to).
+############################################################
+# Remove untemplated nucleotides and create the positions_temp file
+
+# defaults
+output_dir="output_dir"
+priming_pos=-1
+
+# parse input
+while getopts hf:b:p:o: myarg
+do case "$myarg" in
+	h) echo "Usage: estimate_unique_counts.sh -f <bam file> -b <barcode file> -o <output dir>"
+	   exit ;;
+	f) bamfile="$OPTARG" ;;     #required
+	b) barcodes="$OPTARG" ;;    #required
+	p) priming_pos="$OPTARG" ;; #optional
+	o) output_dir="$OPTARG" ;;  #optional
+	[?]) echo "Usage: estimate_unique_counts.sh -f <bam file> -b <barcode file>"
+	     exit 1 ;;
+	esac
+done
+
+mkdir -p "$output_dir"
+
+# Keep only flag 99 alignments (properly paired read 1 on the forward strand) and shift
+# the fragment start by the number of untemplated nucleotides (0-3 nt) read from the MD tag.
+samtools view $bamfile | awk 'BEGIN{OFS="\t"}{if(substr($0,1,1)!="@"){print}}' - | awk -v out="${output_dir}/trimming_stats.txt" 'BEGIN{OFS="\t";counter[0]=0;counter[1]=0;counter[2]=0;counter[3]=0}
+function abs(value){return(value<0?-value:value)}
+($2==99 && /[\s\t]MD:Z:/ && !/MD:Z:([012][ACGT])/) {print($1, $3, $4+0, $4+abs($9)-1);counter[0]++;next};
+($2==99 && /[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/) {print($1, $3, $4+1, $4+abs($9)-1);counter[1]++;next};
+($2==99 && (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/ || (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/))) {print($1, $3, $4+2, $4+abs($9)-1);counter[2]++;next};
+($2==99 && ((/[\s\t]MD:Z:1[ACGT]0[ACGT]/)||(/MD:Z:0[ACGT]1[ACGT]/)||(/MD:Z:0[ACGT]0[ACGT]0[ACGT]/)||(/MD:Z:2[ACGT]/))) {print($1, $3, $4+3, $4+abs($9)-1);counter[3]++;next}
+END{print("No trimming:",counter[0],"1 nt trimmed:", counter[1],"2 nt trimmed:", counter[2],"3 nt trimmed:",counter[3]) > out}' | sort -k1,1 | gzip > positions_temp_sorted.gz &
+
+# Compute the barcode length (use the first line and take the string length of the second column)
+
+TMP=`head -1 $barcodes | awk '{print $2}'`
+BAR_LEN=${#TMP}
+
+# Remove the leading "@" from the barcodes and sort them
+sed 's/^.//' $barcodes | sort -k1,1 | gzip > barcodes_temp_sorted.gz &
+
+wait
+
+# Merge positions and barcodes (join on the read ID, drop the read ID column, then
+# discard barcodes that contain an N or have the wrong length)
+join -1 1 <(zcat positions_temp_sorted.gz) <(zcat barcodes_temp_sorted.gz) | cut -f 2,3,4,5 -d " " | awk '{if($4 !~ /N/){print}}' | awk -v bar_len="${BAR_LEN}" '{if(length($4)==bar_len){print}}' | gzip > merged_temp.gz
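+
+# Illustrative note (hypothetical values): after this step each line of merged_temp.gz
+# holds RNA_ID, fragment start, fragment end and the barcode sequence, e.g.
+#   16S_rRNA 120 245 ACGTTGCAAT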
+
+#### If the priming flag is set, override the fragment end coordinate with the given priming position
+if [ $priming_pos != -1 ]; then
+	zcat merged_temp.gz | awk -v pos="${priming_pos}" '{print $1, $2, pos, $4}' - > merged_temp2
+	cat merged_temp2 | gzip > merged_temp.gz
+	rm merged_temp2
+fi
+
+# File summary.txt columns: RNA_ID, Start, End, barcode sequence, sequenced_count [= number of sequenced fragments fulfilling the previous requirements]
+# (the multidimensional arrays below require GNU awk >= 4)
+
+zcat merged_temp.gz | awk '{barcode[$1][$2][$3][$4]++}END{
+for(RNA in barcode){
+for(start_position in barcode[RNA]){
+for(end_position in barcode[RNA][start_position]){
+for(barseq in barcode[RNA][start_position][end_position]){print RNA,start_position,end_position,barseq,barcode[RNA][start_position][end_position][barseq]}}}}}' > $output_dir/summary.txt
+
+# File unique_barcodes.txt columns: RNA_ID, Start, End, number of unique barcodes observed for this fragment
+# [PROBLEM: how to treat the different 3' cDNA ends seen for the same fragment? If the template was homogeneous it should always be the same]
+
+awk '{barcode[$1][$2][$3]++}END{
+for(RNA in barcode){
+for(start_position in barcode[RNA]){
+for(end_position in barcode[RNA][start_position]){print RNA,start_position,end_position,barcode[RNA][start_position][end_position]}}}}' $output_dir/summary.txt > $output_dir/unique_barcodes.txt &
+
+# File read_counts.txt columns: RNA_ID, Start, End, sequenced_count
+
+zcat merged_temp.gz | awk '{barcode[$1][$2][$3]++}END{
+for(RNA in barcode){
+for(start_position in barcode[RNA]){
+for(end_position in barcode[RNA][start_position]){print RNA,start_position,end_position,barcode[RNA][start_position][end_position]}}}}' > $output_dir/read_counts.txt &
+
+wait
+
+## Remove temporary files (merged_temp.gz is deliberately kept, in case debugging is needed)
+
+rm positions_temp_sorted.gz
+rm barcodes_temp_sorted.gz
+#rm merged_temp.gz
+
+## End of remove temp files

diff -r fd7e52483bbc -r 14d6929f8aa1 estimate_unique_counts.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/estimate_unique_counts.xml	Thu Sep 11 08:30:05 2014 -0400
@@ -0,0 +1,52 @@
+
+        samtools
+
+        estimate_unique_counts.sh
+
+        ## Inputs
+        -f $input1 -b $input2
+
+        ##
+        #if str( $priming.flag ) == 'False':
+            -p -1
+        #else
+            -p $priming.position
+        #end if
+
+
diff -r fd7e52483bbc -r 14d6929f8aa1 preprocessing.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/preprocessing.sh	Thu Sep 11 08:30:05 2014 -0400
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+# Preprocessing workflow - trimming adapters, barcodes etc.
+
+# Barcode in the oligo used: NWTRYSNNNN
+
+# This means that each proper read must begin with: NNNN(C|G)(A|G)(C|T)A(A|T)N
+
+# As a regex:
+
+# ^[ACGT][ACGT][ACGT][ACGT][CG][AG][CT][A][AT][ACGT]
+
+
+READ1=${1}
+READ2=${2}
+BARCODE=$3
+ADAPTER1=$4
+ADAPTER2=$5
+CUTOFF=$6
+CUTADAPT_LOG=${7}
+TRIM_LENGTH=$8
+OVERLAP_LENGTH=$9
+OUT_R1=${10}
+OUT_R2=${11}
+OUT_BARCODES=${12}
+
+BAR_LENGTH=${#BARCODE}
+
+#########################################################################################
+
+# Reverse complement the barcode sequence and turn it into a regular expression.
+
+REV_COMPLEMENT=$(perl -0777ne's/\n //g; tr/ATGCatgcNnYyRrKkMmBbVvDdHh/TACGtacgNnRrYyMmKkVvBbHhDd/; print scalar reverse $_;' <(echo $BARCODE))
+
+REG_EXP=$(sed 's/A/[A]/g;s/C/[C]/g;s/G/[G]/g;s/T/[T]/g;s/R/[AG]/g;s/Y/[CT]/g;s/S/[GC]/g;s/W/[AT]/g;s/K/[GT]/g;s/M/[AC]/g;s/B/[CGT]/g;s/D/[AGT]/g;s/H/[ACT]/g;s/V/[ACG]/g;s/N/[AGCT]/g;' <<< $REV_COMPLEMENT)
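+
+# Worked example (editorial sketch, using the barcode documented in the header):
+# for BARCODE=NWTRYSNNNN the reverse complement is NNNNSRYAWN, so REG_EXP should
+# expand to
+#   [AGCT][AGCT][AGCT][AGCT][GC][AG][CT][A][AT][AGCT]
+# i.e. the signature regex quoted above (it gets anchored with ^ in the awk below).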
+
+########################################################################
+#1. Remove all the reads that do not start with the signature
+
+# (the first awk blanks the sequence lines of non-matching reads, the second awk blanks the corresponding quality strings), followed by cutadapt.
+
+# After cutadapt, remove the last 15 nt - they may be derived from the random primer [this should be optional, since the user may 1) use a different random primer length, or 2) have short reads and would lose too much information] - and remove all reads shorter than 30 nt
+
+# (10 nt barcode + 20 nt for mapping).
+
+echo -e '------------------------Read1------------------------\n' > $CUTADAPT_LOG
+
+awk '{if(NR%4==2){if(/^'"$REG_EXP"'/){print}else{print ""}}else{print}}' $READ1 |
+
+awk 'BEGIN{trim_flag=0; trimming_stats=0; all_processed=0}
+{
+	if(NR%4==1){print; all_processed++}
+	if(NR%4==2){if(length($1)==0){trim_flag=1;trimming_stats++}else{trim_flag=0};print}
+	if(NR%4==3){print}
+	if(NR%4==0){if(trim_flag==1){print ""}else{print $0}}
+}END{print(trimming_stats, all_processed) > "trimming_stats.error"}' |
+
+cutadapt -a $ADAPTER1 -q $CUTOFF --format=fastq -O $OVERLAP_LENGTH - 2>>$CUTADAPT_LOG |
+
+awk -v len="${TRIM_LENGTH}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' |
+
+awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{print}}' | gzip > R1.fastq.gz &
+
+wait
+
+########################################################################
+#2. Trim the adapter, primer and possible random barcode from the second read
+
+echo -e '------------------------Read2------------------------\n' >> $CUTADAPT_LOG
+
+cutadapt -a $ADAPTER2 -q $CUTOFF --format=fastq -O $OVERLAP_LENGTH $READ2 2>>$CUTADAPT_LOG |
+
+awk -v len1="${TRIM_LENGTH}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' - |
+
+awk '{if(NR%2==0 && length($1)<20){printf("\n")}else{print}}' | gzip > R2.fastq.gz &
+
+wait
+
+########################################################################
+#3. Remove empty reads - drop each pair for which at least one read of the pair was removed (such pairs are problematic for tophat mapping)
+
+# First define which lines to keep from both fastq files (k for keep, d for discard in the lines_to_keep file)
+
+paste <(zcat R1.fastq.gz) <(zcat R2.fastq.gz) | awk 'BEGIN{OFS="\n"}{if(NR%4==2 && NF==2){print("k","k","k","k")}else{if(NR%4==2 && NF<2){print("d","d","d","d")}}}' > lines_to_keep
+
+paste lines_to_keep <(zcat R1.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | gzip > R1_readsANDbarcodes.fastq.gz &
+
+paste lines_to_keep <(zcat R2.fastq.gz) | awk '{if($1=="k")print($2,$3)}' > $OUT_R2 &
+
+wait
+
+########################################################################
+#4. Extract the barcode sequence from the first read:
+zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' > $OUT_R1 &
+
+zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > ${OUT_BARCODES} &
+
+wait
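+
+# Illustrative note (hypothetical read ID and barcode): each line of ${OUT_BARCODES}
+# now holds the read header and the extracted barcode, tab separated, e.g.
+#   @HWI-ST123:4:1101:1224:2088	ACGTTGCAAT
+# estimate_unique_counts.sh later strips the leading "@" and joins this file with
+# the mapped read positions on the read ID.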
+
+########################################################################
+#6. Remove temporary fastq files
+
+rm R1.fastq.gz R2.fastq.gz R1_readsANDbarcodes.fastq.gz
+rm lines_to_keep
+
+########################################################################
+#7. Problem: awk adds trailing spaces to some lines of the fastq files; remove them:
+
+mv $OUT_R1 R1.temp.fastq
+mv $OUT_R2 R2.temp.fastq
+
+awk '{print($1)}' R1.temp.fastq > $OUT_R1 &
+awk '{print($1)}' R2.temp.fastq > $OUT_R2 &
+
+wait
+rm R?.temp.fastq

diff -r fd7e52483bbc -r 14d6929f8aa1 preprocessing.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/preprocessing.xml	Thu Sep 11 08:30:05 2014 -0400
@@ -0,0 +1,61 @@
+
+        HRF-Seq raw reads
+
+        cutadapt
+        samtools
+        bowtie2
+
+        preprocessing.sh
+
+        ## Inputs
+        $input1 $input2
+
+        ## Barcode sequence
+        $barcode_seq
+
+        ## Cutadapt options
+        $adapter1 $adapter2 $quality_cutoff $log
+
+        ## Trimming length
+        $trim
+
+        ## Minimum overlap length
+        $overlap_length
+
+        ## Outputs
+        $output1 $output2 $barcodes
+
+
diff -r fd7e52483bbc -r 14d6929f8aa1 tool_dependencies.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Thu Sep 11 08:30:05 2014 -0400
@@ -0,0 +1,23 @@
+
+        http://pypi.python.org/packages/source/c/cutadapt/cutadapt-1.4.2.tar.gz
+        python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin
+
+        $INSTALL_DIR/lib/python
+        $INSTALL_DIR/bin
+
+
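
Example invocation (editorial sketch; the file names, adapter sequences and numeric
settings below are hypothetical - only the positional argument order of preprocessing.sh
and the -f/-b/-o flags of estimate_unique_counts.sh come from the scripts above):

    bash preprocessing.sh R1.fastq R2.fastq NWTRYSNNNN \
        AGATCGGAAGAGC AGATCGGAAGAGC 20 cutadapt.log 15 5 \
        out_R1.fastq out_R2.fastq barcodes.txt
    # map out_R1.fastq / out_R2.fastq (e.g. with bowtie2/tophat) to produce mapped.bam, then:
    bash estimate_unique_counts.sh -f mapped.bam -b barcodes.txt -o unique_counts_out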