# HG changeset patch # User nikos # Date 1410438223 14400 # Node ID fd7e52483bbcc3ef4f2d8cc0b505f120cea476a7 # Parent 3ecde8c1bd83ca8b8fa9aad0dda1a3abbebaf511 Deleted selected files diff -r 3ecde8c1bd83 -r fd7e52483bbc hrf-seq/estimate_unique_counts.sh --- a/hrf-seq/estimate_unique_counts.sh Thu Sep 11 08:21:15 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,86 +0,0 @@ -#!/bin/bash - -############################################################ -#Create table of fragments containing coordinates[RNA molecule, start,end], number of mapped reads, number of unique barcodes and 3'most nucleotide of cDNA (on that was ligated to) -############################################################ -#Remove untemplated nucleotides and create positions_temp file - -#defaults -output_dir="output_dir" -priming_pos=-1 - -#parse input -while getopts hf:b:p:o: myarg -do case "$myarg" in - h) echo "Usage: estimate_unique_counts.sh -f -b -o " - exit ;; - f) bamfile="$OPTARG" ;; #required - b) barcodes="$OPTARG" ;; #required - p) priming_pos="$OPTARG" ;; #optional - o) output_dir="$OPTARG" ;; #optional - [?]) echo "Usage: estimate_unique_counts.sh -f -b " - exit 1 ;; - esac -done - -mkdir $output_dir - -samtools view $bamfile | awk 'BEGIN{OFS="\t"}{if(substr($0,1,1)!="@"){print}}' - | awk -v out="${output_dir}/trimming_stats.txt" 'BEGIN{OFS="\t";counter[0]=0;counter[1]=0;counter[2]=0;counter[3]=0} -function abs(value){return(value<0?-value:value)} -($2==99 && /[\s\t]MD:Z:/ && !/MD:Z:([012][ACGT])/) {print($1, $3, $4+0, $4+abs($9)-1);counter[0]++;next}; -($2==99 && /[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/) {print($1, $3, $4+1, $4+abs($9)-1);counter[1]++;next}; -($2==99 && (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/ || (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/))) {print($1, $3, $4+2, $4+abs($9)-1);counter[2]++;next}; -($2==99 && ((/[\s\t]MD:Z:1[ACGT]0[ACGT]/)||(/MD:Z:0[ACGT]1[ACGT]/)||(/MD:Z:0[ACGT]0[ACGT]0[ACGT]/)||(/MD:Z:2[ACGT]/))) {print($1, $3, $4+3, $4+abs($9)-1);counter[3]++;next} -END{print("No trimming:",counter[0],"1 nt trimmed:", counter[1],"2 nt trimmed:", counter[2],"3 nt trimmed:",counter[3]) > out}' | sort -k1,1 | gzip > positions_temp_sorted.gz & - -# Computing barcode length (Use the first line and compute the string length of the second column - -TMP=`head -1 $barcodes | awk '{print $2}'` -BAR_LEN=`echo ${#TMP}` - -# Remove "@" from barcodes and sort them -sed 's/^.//' $barcodes | sort -k1,1 | gzip > barcodes_temp_sorted.gz & - -wait - -# Merge poistions and barcodes -join -1 1 <(zcat positions_temp_sorted.gz) <(zcat barcodes_temp_sorted.gz) | cut -f 2,3,4,5 -d " " | awk '{if($4 !~ /N/){print}}' | awk -v bar_len="${BAR_LEN}" '{if(length($4)==bar_len){print}}' | gzip > merged_temp.gz - -#### If priming flag is set.... -if [ $priming_pos != -1 ]; then - zcat merged_temp.gz | awk -v pos="${priming_pos}" '{print $1, $2, pos, $4}' - > merged_temp2 - cat merged_temp2 | gzip > merged_temp.gz - rm merged_temp2 -fi - -#File summary.gz columns: RNA_ID, Start, End, barcode sequence, sequenced_count[=number of sequenced fragments fulfilling previous requiremnts] - -zcat merged_temp.gz | awk '{barcode[$1][$2][$3][$4]++}END{ -for(RNA in barcode){ -for(start_position in barcode[RNA]){ -for(end_position in barcode[RNA][start_position]){ -for(barseq in barcode[RNA][start_position][end_position]){print RNA,start_position,end_position,barseq,barcode[RNA][start_position][end_position][barseq]}}}}}' > $output_dir/summary.txt - -#File unique_barcodes columns: RNA_ID, Start, End, number of unique barcodes observed for this fragment [PROBLEM: How to treat the different 3cdns for the same fragment? if the template was homogenous then it should be always the same] - -awk '{barcode[$1][$2][$3]++}END{ -for(RNA in barcode){ -for(start_position in barcode[RNA]){ -for(end_position in barcode[RNA][start_position]){print RNA,start_position,end_position,barcode[RNA][start_position][end_position]}}}}' $output_dir/summary.txt > $output_dir/unique_barcodes.txt & - -#read_counts.gz colums: RNA_ID, Start, End, sequenced_count - -zcat merged_temp.gz | awk '{barcode[$1][$2][$3]++}END{ -for(RNA in barcode){ -for(start_position in barcode[RNA]){ -for(end_position in barcode[RNA][start_position]){print RNA,start_position,end_position,barcode[RNA][start_position][end_position]}}}}' > $output_dir/read_counts.txt & - -wait - -##Remove temporary files - didn't do it, in case debugging needed. - -rm positions_temp_sorted.gz -rm barcodes_temp_sorted.gz -#rm merged_temp.gz - -##End of remove temp files diff -r 3ecde8c1bd83 -r fd7e52483bbc hrf-seq/estimate_unique_counts.xml --- a/hrf-seq/estimate_unique_counts.xml Thu Sep 11 08:21:15 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,52 +0,0 @@ - - - - samtools - - - estimate_unique_counts.sh - - ## Inputs - -f $input1 -b $input2 - - ## - #if str( $priming.flag ) == 'False': - -p -1 - #else - -p $priming.position - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r 3ecde8c1bd83 -r fd7e52483bbc hrf-seq/preprocessing.sh --- a/hrf-seq/preprocessing.sh Thu Sep 11 08:21:15 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,116 +0,0 @@ -#!/bin/bash - -#Preprocessing workflow - trimming adapters, barcodes etc - -# Barcode in the oligo used: NWTRYSNNNN - -# Which means that each proper read must begin with: NNNN(C|G)(A|G)(C|T)A(A|T)N - -# As regex: - -# ^[ACGT][ACGT][ACGT][ACGT][CG][AG][CT][A][AT][ACGT] - - -READ1=${1} -READ2=${2} -BARCODE=$3 -ADAPTER1=$4 -ADAPTER2=$5 -CUTOFF=$6 -CUTADAPT_LOG=${7} -TRIM_LENGTH=$8 -OVERLAP_LENGTH=$9 -OUT_R1=${10} -OUT_R2=${11} -OUT_BARCODES=${12} - -BAR_LENGTH=`eval echo ${#BARCODE}` - -######################################################################################### - -# Reverse complement the barcode sequence and create a regular expression. - -REV_COMPLEMENT=$(perl -0777ne's/\n //g; tr/ATGCatgcNnYyRrKkMmBbVvDdHh/TACGtacgNnRrYyMmKkVvBbHhDd/; print scalar reverse $_;' <(echo $BARCODE)) - -REG_EXP=$(sed 's/A/[A]/g;s/C/[C]/g;s/G/[G]/g;s/T/[T]/g;s/R/[AG]/g;s/Y/[CT]/g;s/S/[GC]/g;s/W/[AT]/g;s/K/[GT]/g;s/M/[AC]/g;s/B/[CGT]/g;s/D/[AGT]/g;s/H/[ACT]/g;s/V/[ACG]/g;s/N/[AGCT]/g;' <<< $REV_COMPLEMENT) - -######################################################################## -#1. Remove all the reads that do not start with the signature - -# (first awk removes them, second awk removes corresponding quality strings) followed by cutadapt. - -# After cutadapt, remove last 15 nt - may be derived from the random primer [should be optional, as the user may: 1) use different random primer length, 2) have short reasd and would lose too much info] and remove all the reads that are shorter than 30 nt - -# (10 nt barcode + 20 nt for mapping) - -echo -e '------------------------Read1------------------------\n' > $CUTADAPT_LOG - -awk '{if(NR%4==2){if(/^'"$REG_EXP"'/){print}else{print ""}}else{print}}' $READ1 | - -awk 'BEGIN{trim_flag=0; trimming_stats=0; all_processed=0} -{ - if(NR%4==1){print; all_processed++} - if(NR%4==2){if(length($1)==0){trim_flag=1;trimming_stats++}else{trim_flag=0};print} - if(NR%4==3){print} - if(NR%4==0){if(trim_flag==1){print ""}else{print $0}} -}END{print(trimming_stats, all_processed) > "trimming_stats.error"}' | - -cutadapt -a $ADAPTER1 -q $CUTOFF --format=fastq -O $OVERLAP_LENGTH - 2>>$CUTADAPT_LOG | - -awk -v len="${TRIM_LENGTH}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' | - -awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{print}}' | gzip > R1.fastq.gz & - -wait - -######################################################################## -#2. Trim the adapter, primer and possible random barcode from the second read - -echo -e '------------------------Read2------------------------\n' >> $CUTADAPT_LOG - -cutadapt -a $ADAPTER2 -q $CUTOFF --format=fastq -O $OVERLAP_LENGTH $READ2 2>>$CUTADAPT_LOG | - -awk -v len1="${TRIM_LENGTH}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' - | - -awk '{if(NR%2==0 && length($1)<20){printf("\n")}else{print}}' | gzip > R2.fastq.gz & - -wait - -######################################################################## -#3. Remove empty reads - remove each pair from for which at least one read of the pair got removed (they are problematic for tophat mapping) - -#first define which lines to keep from both fastq files (k for keep, d for discard in the lines_to_keep file) - -paste <(zcat R1.fastq.gz) <(zcat R2.fastq.gz) | awk 'BEGIN{OFS="\n"}{if(NR%4==2 && NF==2){print("k","k","k","k")}else{if(NR%4==2 && NF<2){print("d","d","d","d")}}}' > lines_to_keep - -paste lines_to_keep <(zcat R1.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | gzip > R1_readsANDbarcodes.fastq.gz & - -paste lines_to_keep <(zcat R2.fastq.gz) | awk '{if($1=="k")print($2,$3)}' > $OUT_R2 & - -wait - -######################################################################## -#4. Extract the barcode sequence from the first read: -zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' > $OUT_R1 & - -zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > ${OUT_BARCODES} & - -wait - -######################################################################## -#6. Remove temporary fastq files" - -rm R1.fastq.gz R2.fastq.gz R1_readsANDbarcodes.fastq.gz -rm lines_to_keep - -######################################################################## -#7. problem! Spaces added at the end of the strings in fastq files (AWK induced). I will remove them: - -mv $OUT_R1 R1.temp.fastq -mv $OUT_R2 R2.temp.fastq - -awk '{print($1)}' R1.temp.fastq > $OUT_R1 & -awk '{print($1)}' R2.temp.fastq > $OUT_R2 & - -wait -rm R?.temp.fastq diff -r 3ecde8c1bd83 -r fd7e52483bbc hrf-seq/preprocessing.xml --- a/hrf-seq/preprocessing.xml Thu Sep 11 08:21:15 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ - - HRF-Seq raw reads - - cutadapt - samtools - bowtie2 - - - - preprocessing.sh - - ## Inputs - $input1 $input2 - - ## Barcode sequence - $barcode_seq - - ## Cutadapt options - $adapter1 $adapter2 $quality_cutoff $log - - ## Trimming length - $trim - - ## Minimum overlap length - $overlap_length - - ## Outputs - $output1 $output2 $barcodes - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r 3ecde8c1bd83 -r fd7e52483bbc hrf-seq/rna_probing_coverage.R --- a/hrf-seq/rna_probing_coverage.R Thu Sep 11 08:21:15 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,138 +0,0 @@ -## Setup R error handling to go to stderr -options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) - -# we need that to not crash galaxy with an UTF8 error on German LC settings. -#Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") - -suppressMessages(library('getopt')) - -options(stringAsfactors = FALSE, useFancyQuotes = FALSE) -args <- commandArgs(trailingOnly = TRUE) - -#get options, using the spec as defined by the enclosed list. -#we read the options from the default: commandArgs(TRUE). -spec = matrix(c( - 'verbose', 'v', 2, "integer", - 'help' , 'h', 0, "logical", - 'sample', 's', 1, "character", - 'control', 'c', 2, "character", - 'window', 'w', 1, "integer", - 'ball', 'b', 1, "double", - 'peak', 'p', 1, "integer", - 'output', 'o', 1, "character", - 'counts', 'x', 1, "character", - 'ratio', 'y', 1, "character", - 'priming', 'z', 1, "character", - 'plots', 'l', 2, "character" - ), byrow=TRUE, ncol=4); - -opt = getopt(spec); - -# if help was asked for print a friendly message -# and exit with a non-zero error code -if ( !is.null(opt$help) ) { - cat(getopt(spec, usage=TRUE)); - q(status=1); -} - - -#This is streamlined protocol that works only with reads mapped to one RNA molecule (16S rRNA) #For plot used in NAR preparation (without two bottom plots) see under many #'s - -suppressMessages(library('GenomicRanges')); -#setwd("/home/nikos/rna_probing_galaxy") - -#Inputs: -#Options: -window_size <- opt$window -ball_size <- opt$ball -peak <- opt$peak - -#Initialize variables: -counts <- list() -gene_coverage <- c() -gene_counts <- c() -gene_ratio <- c() -gene_priming <- c() -#End of initializing - -####Read in datasets: -counts[[1]] <- read.table( opt$sample ) - -#if ( !is.null( opt$control ) ) { -# counts[[2]] <- read.table( opt$control ) -#} -##### - -#Filter out the short inserts not to deal with size selection correctors. -for(i in seq(1,length(counts))){ - counts[[i]] <- subset(counts[[i]], (counts[[i]][,3]-counts[[i]][,2])>=peak) -} - -############Initilize matrices for storing data: -counter <-1 -for(i in seq(1,length(counts))){ - temp_storage <- counts[[i]] - gene_coverage <- matrix(nrow=max(c(temp_storage[,3], nrow(gene_coverage))), ncol=counter) - gene_counts <- matrix(nrow=max(c(temp_storage[,3], nrow(gene_counts))), ncol=counter) - gene_priming <- matrix(nrow=max(c(temp_storage[,3], nrow(gene_priming))), ncol=counter) - gene_ratio <- matrix(nrow=max(c(temp_storage[,3], nrow(gene_ratio))), ncol=counter) - counter <- counter+1 -} -###########End of Initilize matrices for storing data - -################## -##Fill in the matrices with data -counter <-1 -for(i in seq(1,length(counts))){ - temp_storage <- counts[[i]] - cover <- coverage(IRanges(start=temp_storage[,2], end=(temp_storage[,3]-peak)), weight=temp_storage[,4]) - gene_coverage[1:length((rep(runValue(cover), runLength(cover)))),counter] <- c((rep(runValue(cover), runLength(cover)))) - stopping_reads <- aggregate(temp_storage[,4]~temp_storage[,2],temp_storage, sum) - gene_counts[stopping_reads[,1],counter] <- stopping_reads[,2] - gene_counts[is.na(gene_counts)] <- 0 - priming_reads <- aggregate(temp_storage[,4]~temp_storage[,3],temp_storage, sum) - gene_priming[priming_reads[,1],counter] <- priming_reads[,2] - gene_priming[is.na(gene_priming)] <- 0 - counter <- counter+1 -} - -#End of filling in -gene_ratio <- gene_counts/gene_coverage - -###Export to Galaxy -data <- data.frame(gene_counts, gene_coverage, gene_priming, gene_ratio ) -colnames(data) <- c("Read counts", "Effective Coverage EUC", "Termination EUC", "TCR") - -write.table( data, opt$output, sep = "\t", quote = F, row.names = F) - - -#Return plots -if ( !is.null(opt$plots) ) { - pdf(opt$plots) - - # Termination signal - plot(gene_counts, type= 'l', main = "Termination signal", ylab = "", xlab = "Position") - - # Priming signal - plot(gene_priming, type= 'l', main = "Priming signal", ylab = "", xlab = "Position") - # Effective Coverage - - y <- gene_coverage - y[is.na(y)] <- 0 - x <- seq_along(y) - y2 <- rep(y, each=2) - y2 <- y2[-length(y2)] - x2 <- rep(x, each=2)[-1] - x3 <- c(min(x2), x2, max(x2)) - y3 <- c(0, y2, 0) - - # because polygon() is dumb and wants a pre-existing plot - plot(x, y, ylim=c(0, max(y)), type="n", main = "Effective Coverage", ylab = "", xlab = "Position") - - polygon(x3, y3, border=NA, col="black") - lines(x2, y2) - - # Termination Coverage Ratio - plot(gene_ratio, type= 'l', main = "Termination Coverage Ratio", ylab = "", xlab = "Position") - dump <- dev.off() -} diff -r 3ecde8c1bd83 -r fd7e52483bbc hrf-seq/rna_probing_coverage.xml --- a/hrf-seq/rna_probing_coverage.xml Thu Sep 11 08:21:15 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ - - (Termination Coverage Ratio) on HRF-Seq data - - rna_probing_coverage.R - - ## Inputs - --sample '$input1' - - ##--control '$input2' - - ## Plotting - #if $pdf: - --plots '$plots' - #end if - - ## Options - #if str( $params.full) == "yes": - - -p $params.peak - -b $params.ball - -w $params.window - #else - -p 100 - -b 4 - -w 1 - #end if - - ## Output - -o '$output' - - - - - - - - - - - - - - - - - - - - - - - - - - pdf is True - - - - - diff -r 3ecde8c1bd83 -r fd7e52483bbc hrf-seq/tool_dependencies.xml --- a/hrf-seq/tool_dependencies.xml Thu Sep 11 08:21:15 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ - - - - - - http://pypi.python.org/packages/source/c/cutadapt/cutadapt-1.4.2.tar.gz - python setup.py install --home $INSTALL_DIR --install-scripts $INSTALL_DIR/bin - - $INSTALL_DIR/lib/python - $INSTALL_DIR/bin - - - - - - - - - - - - - - - -