# HG changeset patch
# User nikos
# Date 1410438605 14400
# Node ID 14d6929f8aa15ecd69b5f7399dd95832d4ff4f12
# Parent  fd7e52483bbcc3ef4f2d8cc0b505f120cea476a7
Tool tests and/or test-data are missing.

diff -r fd7e52483bbc -r 14d6929f8aa1 estimate_unique_counts.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/estimate_unique_counts.sh	Thu Sep 11 08:30:05 2014 -0400
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+############################################################
+# Create a table of fragments containing the coordinates [RNA molecule, start, end],
+# the number of mapped reads, the number of unique barcodes and the 3'-most
+# nucleotide of the cDNA (the one that was ligated to).
+############################################################
+# Remove untemplated nucleotides and create the positions_temp file
+
+# defaults
+output_dir="output_dir"
+priming_pos=-1
+
+# parse input
+while getopts hf:b:p:o: myarg
+do case "$myarg" in
+	h) echo "Usage: estimate_unique_counts.sh -f <bam file> -b <barcode file> -o <output dir>"
+	   exit ;;
+	f) bamfile="$OPTARG" ;;     #required
+	b) barcodes="$OPTARG" ;;    #required
+	p) priming_pos="$OPTARG" ;; #optional
+	o) output_dir="$OPTARG" ;;  #optional
+	[?]) echo "Usage: estimate_unique_counts.sh -f <bam file> -b <barcode file>"
+	     exit 1 ;;
+	esac
+done
+
+mkdir -p "$output_dir"
+
+# Keep only flag 99 alignments (properly paired read 1 on the forward strand) and shift
+# the fragment start by the number of untemplated nucleotides (0-3 nt) read from the MD tag.
+samtools view $bamfile | awk 'BEGIN{OFS="\t"}{if(substr($0,1,1)!="@"){print}}' - | awk -v out="${output_dir}/trimming_stats.txt" 'BEGIN{OFS="\t";counter[0]=0;counter[1]=0;counter[2]=0;counter[3]=0}
+function abs(value){return(value<0?-value:value)}
+($2==99 && /[\s\t]MD:Z:/ && !/MD:Z:([012][ACGT])/) {print($1, $3, $4+0, $4+abs($9)-1);counter[0]++;next};
+($2==99 && /[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/) {print($1, $3, $4+1, $4+abs($9)-1);counter[1]++;next};
+($2==99 && (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/ || (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/))) {print($1, $3, $4+2, $4+abs($9)-1);counter[2]++;next};
+($2==99 && ((/[\s\t]MD:Z:1[ACGT]0[ACGT]/)||(/MD:Z:0[ACGT]1[ACGT]/)||(/MD:Z:0[ACGT]0[ACGT]0[ACGT]/)||(/MD:Z:2[ACGT]/))) {print($1, $3, $4+3, $4+abs($9)-1);counter[3]++;next}
+END{print("No trimming:",counter[0],"1 nt trimmed:", counter[1],"2 nt trimmed:", counter[2],"3 nt trimmed:",counter[3]) > out}' | sort -k1,1 | gzip > positions_temp_sorted.gz &
+
+# Compute the barcode length (use the first line and take the string length of the second column)
+
+TMP=`head -1 $barcodes | awk '{print $2}'`
+BAR_LEN=${#TMP}
+
+# Remove the leading "@" from the barcodes and sort them
+sed 's/^.//' $barcodes | sort -k1,1 | gzip > barcodes_temp_sorted.gz &
+
+wait
+
+# Merge positions and barcodes (join on the read ID, drop the read ID column, then
+# discard barcodes that contain an N or have the wrong length)
+join -1 1 <(zcat positions_temp_sorted.gz) <(zcat barcodes_temp_sorted.gz) | cut -f 2,3,4,5 -d " " | awk '{if($4 !~ /N/){print}}' | awk -v bar_len="${BAR_LEN}" '{if(length($4)==bar_len){print}}' | gzip > merged_temp.gz
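+
+# Illustrative note (hypothetical values): after this step each line of merged_temp.gz
+# holds RNA_ID, fragment start, fragment end and the barcode sequence, e.g.
+#   16S_rRNA 120 245 ACGTTGCAAT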
+
+#### If the priming flag is set, override the fragment end coordinate with the given priming position
+if [ $priming_pos != -1 ]; then
+	zcat merged_temp.gz | awk -v pos="${priming_pos}" '{print $1, $2, pos, $4}' - > merged_temp2
+	cat merged_temp2 | gzip > merged_temp.gz
+	rm merged_temp2
+fi
+
+# File summary.txt columns: RNA_ID, Start, End, barcode sequence, sequenced_count [= number of sequenced fragments fulfilling the previous requirements]
+# (the multidimensional arrays below require GNU awk >= 4)
+
+zcat merged_temp.gz | awk '{barcode[$1][$2][$3][$4]++}END{
+for(RNA in barcode){
+for(start_position in barcode[RNA]){
+for(end_position in barcode[RNA][start_position]){
+for(barseq in barcode[RNA][start_position][end_position]){print RNA,start_position,end_position,barseq,barcode[RNA][start_position][end_position][barseq]}}}}}' > $output_dir/summary.txt
+
+# File unique_barcodes.txt columns: RNA_ID, Start, End, number of unique barcodes observed for this fragment
+# [PROBLEM: how to treat the different 3' cDNA ends seen for the same fragment? If the template was homogeneous it should always be the same]
+
+awk '{barcode[$1][$2][$3]++}END{
+for(RNA in barcode){
+for(start_position in barcode[RNA]){
+for(end_position in barcode[RNA][start_position]){print RNA,start_position,end_position,barcode[RNA][start_position][end_position]}}}}' $output_dir/summary.txt > $output_dir/unique_barcodes.txt &
+
+# File read_counts.txt columns: RNA_ID, Start, End, sequenced_count
+
+zcat merged_temp.gz | awk '{barcode[$1][$2][$3]++}END{
+for(RNA in barcode){
+for(start_position in barcode[RNA]){
+for(end_position in barcode[RNA][start_position]){print RNA,start_position,end_position,barcode[RNA][start_position][end_position]}}}}' > $output_dir/read_counts.txt &
+
+wait
+
+## Remove temporary files (merged_temp.gz is deliberately kept, in case debugging is needed)
+
+rm positions_temp_sorted.gz
+rm barcodes_temp_sorted.gz
+#rm merged_temp.gz
+
+## End of remove temp files

diff -r fd7e52483bbc -r 14d6929f8aa1 estimate_unique_counts.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/estimate_unique_counts.xml	Thu Sep 11 08:30:05 2014 -0400
@@ -0,0 +1,52 @@
+
+        samtools
+
+        estimate_unique_counts.sh
+
+        ## Inputs
+        -f $input1 -b $input2
+
+        ##
+        #if str( $priming.flag ) == 'False':
+            -p -1
+        #else
+            -p $priming.position
+        #end if
+
+
diff -r fd7e52483bbc -r 14d6929f8aa1 preprocessing.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/preprocessing.sh	Thu Sep 11 08:30:05 2014 -0400
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+# Preprocessing workflow - trimming adapters, barcodes etc.
+
+# Barcode in the oligo used: NWTRYSNNNN
+
+# This means that each proper read must begin with: NNNN(C|G)(A|G)(C|T)A(A|T)N
+
+# As a regex:
+
+# ^[ACGT][ACGT][ACGT][ACGT][CG][AG][CT][A][AT][ACGT]
+
+
+READ1=${1}
+READ2=${2}
+BARCODE=$3
+ADAPTER1=$4
+ADAPTER2=$5
+CUTOFF=$6
+CUTADAPT_LOG=${7}
+TRIM_LENGTH=$8
+OVERLAP_LENGTH=$9
+OUT_R1=${10}
+OUT_R2=${11}
+OUT_BARCODES=${12}
+
+BAR_LENGTH=${#BARCODE}
+
+#########################################################################################
+
+# Reverse complement the barcode sequence and turn it into a regular expression.
+
+REV_COMPLEMENT=$(perl -0777ne's/\n //g; tr/ATGCatgcNnYyRrKkMmBbVvDdHh/TACGtacgNnRrYyMmKkVvBbHhDd/; print scalar reverse $_;' <(echo $BARCODE))
+
+REG_EXP=$(sed 's/A/[A]/g;s/C/[C]/g;s/G/[G]/g;s/T/[T]/g;s/R/[AG]/g;s/Y/[CT]/g;s/S/[GC]/g;s/W/[AT]/g;s/K/[GT]/g;s/M/[AC]/g;s/B/[CGT]/g;s/D/[AGT]/g;s/H/[ACT]/g;s/V/[ACG]/g;s/N/[AGCT]/g;' <<< $REV_COMPLEMENT)
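+
+# Worked example (editorial sketch, using the barcode documented in the header):
+# for BARCODE=NWTRYSNNNN the reverse complement is NNNNSRYAWN, so REG_EXP should
+# expand to
+#   [AGCT][AGCT][AGCT][AGCT][GC][AG][CT][A][AT][AGCT]
+# i.e. the signature regex quoted above (it gets anchored with ^ in the awk below).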
+
+########################################################################
+#1. Remove all the reads that do not start with the signature
+
+# (the first awk blanks the sequence lines of non-matching reads, the second awk blanks the corresponding quality strings), followed by cutadapt.
+
+# After cutadapt, remove the last 15 nt - they may be derived from the random primer [this should be optional, since the user may 1) use a different random primer length, or 2) have short reads and would lose too much information] - and remove all reads shorter than 30 nt
+
+# (10 nt barcode + 20 nt for mapping).
+
+echo -e '------------------------Read1------------------------\n' > $CUTADAPT_LOG
+
+awk '{if(NR%4==2){if(/^'"$REG_EXP"'/){print}else{print ""}}else{print}}' $READ1 |
+
+awk 'BEGIN{trim_flag=0; trimming_stats=0; all_processed=0}
+{
+	if(NR%4==1){print; all_processed++}
+	if(NR%4==2){if(length($1)==0){trim_flag=1;trimming_stats++}else{trim_flag=0};print}
+	if(NR%4==3){print}
+	if(NR%4==0){if(trim_flag==1){print ""}else{print $0}}
+}END{print(trimming_stats, all_processed) > "trimming_stats.error"}' |
+
+cutadapt -a $ADAPTER1 -q $CUTOFF --format=fastq -O $OVERLAP_LENGTH - 2>>$CUTADAPT_LOG |
+
+awk -v len="${TRIM_LENGTH}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' |
+
+awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{print}}' | gzip > R1.fastq.gz &
+
+wait
+
+########################################################################
+#2. Trim the adapter, primer and possible random barcode from the second read
+
+echo -e '------------------------Read2------------------------\n' >> $CUTADAPT_LOG
+
+cutadapt -a $ADAPTER2 -q $CUTOFF --format=fastq -O $OVERLAP_LENGTH $READ2 2>>$CUTADAPT_LOG |
+
+awk -v len1="${TRIM_LENGTH}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' - |
+
+awk '{if(NR%2==0 && length($1)<20){printf("\n")}else{print}}' | gzip > R2.fastq.gz &
+
+wait
+
+########################################################################
+#3. Remove empty reads - drop each pair for which at least one read of the pair was removed (such pairs are problematic for tophat mapping)
+
+# First define which lines to keep from both fastq files (k for keep, d for discard in the lines_to_keep file)
+
+paste <(zcat R1.fastq.gz) <(zcat R2.fastq.gz) | awk 'BEGIN{OFS="\n"}{if(NR%4==2 && NF==2){print("k","k","k","k")}else{if(NR%4==2 && NF<2){print("d","d","d","d")}}}' > lines_to_keep
+
+paste lines_to_keep <(zcat R1.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | gzip > R1_readsANDbarcodes.fastq.gz &
+
+paste lines_to_keep <(zcat R2.fastq.gz) | awk '{if($1=="k")print($2,$3)}' > $OUT_R2 &
+
+wait
+
+########################################################################
+#4. Extract the barcode sequence from the first read:
+zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' > $OUT_R1 &
+
+zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > ${OUT_BARCODES} &
+
+wait
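+
+# Illustrative note (hypothetical read ID and barcode): each line of ${OUT_BARCODES}
+# now holds the read header and the extracted barcode, tab separated, e.g.
+#   @HWI-ST123:4:1101:1224:2088	ACGTTGCAAT
+# estimate_unique_counts.sh later strips the leading "@" and joins this file with
+# the mapped read positions on the read ID.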
+
+########################################################################
+#6. Remove temporary fastq files
+
+rm R1.fastq.gz R2.fastq.gz R1_readsANDbarcodes.fastq.gz
+rm lines_to_keep
+
+########################################################################
+#7. Problem: awk adds trailing spaces to some lines of the fastq files; remove them:
+
+mv $OUT_R1 R1.temp.fastq
+mv $OUT_R2 R2.temp.fastq
+
+awk '{print($1)}' R1.temp.fastq > $OUT_R1 &
+awk '{print($1)}' R2.temp.fastq > $OUT_R2 &
+
+wait
+rm R?.temp.fastq

diff -r fd7e52483bbc -r 14d6929f8aa1 preprocessing.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/preprocessing.xml	Thu Sep 11 08:30:05 2014 -0400
@@ -0,0 +1,61 @@
+
+        HRF-Seq raw reads
+
+        cutadapt
+        samtools
+        bowtie2
+
+        preprocessing.sh
+
+        ## Inputs
+        $input1 $input2
+
+        ## Barcode sequence
+        $barcode_seq
+
+        ## Cutadapt options
+        $adapter1 $adapter2 $quality_cutoff $log
+
+        ## Trimming length
+        $trim
+
+        ## Minimum overlap length
+        $overlap_length
+
+        ## Outputs
+        $output1 $output2 $barcodes
+
+
diff -r fd7e52483bbc -r 14d6929f8aa1 tool_dependencies.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Thu Sep 11 08:30:05 2014 -0400
@@ -0,0 +1,23 @@
+
+        http://pypi.python.org/packages/source/c/cutadapt/cutadapt-1.4.2.tar.gz
+        python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin
+
+        $INSTALL_DIR/lib/python
+        $INSTALL_DIR/bin
+
+
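
Example invocation (editorial sketch; the file names, adapter sequences and numeric
settings below are hypothetical - only the positional argument order of preprocessing.sh
and the -f/-b/-o flags of estimate_unique_counts.sh come from the scripts above):

    bash preprocessing.sh R1.fastq R2.fastq NWTRYSNNNN \
        AGATCGGAAGAGC AGATCGGAAGAGC 20 cutadapt.log 15 5 \
        out_R1.fastq out_R2.fastq barcodes.txt
    # map out_R1.fastq / out_R2.fastq (e.g. with bowtie2/tophat) to produce mapped.bam, then:
    bash estimate_unique_counts.sh -f mapped.bam -b barcodes.txt -o unique_counts_out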