Mercurial > repos > nikos > rna_probing
changeset 21:f64937805d0d draft
Fixed bugs in k2n.R
author | nikos |
---|---|
date | Wed, 11 Feb 2015 12:43:49 -0500 |
parents | 4821a9e5dcf6 |
children | 5ee5afb56ca4 |
files | k2n.R normalize.R summarize_unique_barcodes.sh summarize_unique_barcodes.xml tool_dependencies.xml |
diffstat | 5 files changed, 97 insertions(+), 21 deletions(-) [+] |
line wrap: on
line diff
--- a/k2n.R Wed Feb 11 08:30:45 2015 -0500 +++ b/k2n.R Wed Feb 11 12:43:49 2015 -0500 @@ -1,18 +1,103 @@ -## Setup R error handling to go to stderr +# Setup R error handling to go to stderr options( show.error.messages = FALSE, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) -# we need that to not crash galaxy with an UTF8 error on German LC settings. +# we need that to not crash galaxy with an UTF8 error on LC settings. Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") options(stringAsfactors = FALSE, useFancyQuotes = FALSE) args <- commandArgs(trailingOnly = TRUE) -.libPaths(args[1]) -suppressMessages(require(RNAprobR)) +# Declare functions + +k2n_calc <- function(merged_file, unique_barcode_file, output_file) +{ + + # Read in inputs: + merged <- read.table( merged_file ) + colnames(merged) <- c("RNAid", "Start", "End", "Barcodes") + barcode_length <- max(nchar(as.character(merged$Barcodes))) + read_counts <- as.data.frame(table(subset(merged, select = - Barcodes))) + read_counts <- read_counts[read_counts$Freq != 0, ] + + max_observed <- max(read.table(unique_barcode_file)[,4]) + + #Check if max_observed equals max possible complexity. If yes - subtract 1 + #(otherwise 'while' loop below never ends) + if( max_observed == 4**barcode_length ) { + max_observed <- max_observed - 1 + barcodes_saturated <- TRUE + } else + barcodes_saturated <- FALSE + + # Remove top-quartile of reads: + barcodes_nt <- merged[ do.call( + paste, as.list(subset(merged, select = - Barcodes))) %in% + do.call(paste, as.list(read_counts[( + read_counts$Freq ) <= quantile(read_counts$Freq, 0.75), + c("RNAid", "Start", "End") ] ) ) , "Barcodes" ] + + # make the matrix with the nucleotide freqs per position: + nt_counts <- matrix( nrow = 4, ncol = barcode_length ) + for( h in 1:ncol( nt_counts ) ){ + j <- 1 + for( nt_local in c( "A","C","G","T" ) ) { + nt_counts[ j, h ] <- sum( substr( as.character( barcodes_nt ), h, + h) == nt_local ) + j <- j + 1 + } + } + + # Calculate frequencies + nt_freqs <- nt_counts / colSums( nt_counts ) + + nt_values <- split(nt_freqs, rep(1:ncol(nt_freqs), each = nrow(nt_freqs))) + + all_posible_comb <- expand.grid( nt_values ) + + probs <- apply( all_posible_comb, 1, prod ) + + ###Create Mf_to_Sf: + results <- c() + + i <- 1 + results[ i ] <- sum( 1 - ( 1 - probs )**i ) + j <- 1 + while( results[ i ] <= max_observed ) { + i <- i + 1 + results[ i ] <- sum( 1 - ( 1 - probs )**i ) #Mf to Sf + if(results[ i ] == results[ i - 1]){ + if(j > 2) + stop("Too low complexity to estimate k2n distribution") + else + j <- j + 1 + }else + j <- 1 + } + + #assign molecules number to unique barcode number: + k2n <- c() + for( i in 1:floor( max( results ) ) ) { + abs( results - i ) -> difference + + #if you want to know how many molecules underlie n unique barcodes + #ask k2n[n] + k2n[ i ] <- which( difference == min( difference ) ) + } + + #Assign +Inf for fragments which have saturated the barcodes + #(see correct_oversaturation function): + if( barcodes_saturated ) + k2n[max_observed + 1] <- Inf + + if(!missing(output_file)) { + write(k2n, file = output_file ) + } else + k2n +} # Read inputs -merged <- args[2] -unique_barcodes <- args[3] -output <- args[4] +merged <- args[1] +unique_barcodes <- args[2] +output <- args[3] k2n_calc( merged, unique_barcodes, output )
--- a/normalize.R Wed Feb 11 08:30:45 2015 -0500 +++ b/normalize.R Wed Feb 11 12:43:49 2015 -0500 @@ -3,7 +3,7 @@ ## Setup R error handling to go to stderr options( show.error.messages = FALSE, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) -# we need that to not crash galaxy with an UTF8 error on German LC settings. +# we need that to not crash galaxy with an UTF8 error on LC settings. Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") suppressMessages(library('getopt'))
--- a/summarize_unique_barcodes.sh Wed Feb 11 08:30:45 2015 -0500 +++ b/summarize_unique_barcodes.sh Wed Feb 11 12:43:49 2015 -0500 @@ -25,10 +25,9 @@ -t: Trim untemplated nucleotides. -k: Produce k2n file. Warning: Can be sloooow! -r: Rscript path --l: R lib path -o: Output folder (default: "output_dir") ------------------------------------- -Usage : summarize_unique_barcodes.sh -f <BAM_file> -b <BARCODES> -p <PRIMING_POSITION> -t -k -r <R_SCRIPT_PATH> -l <R_LIB_PATH> +Usage : summarize_unique_barcodes.sh -f <BAM_file> -b <BARCODES> -p <PRIMING_POSITION> -t -k -r <R_SCRIPT_PATH> End-of-message exit } @@ -38,7 +37,7 @@ trim_flag="False" #parse input -while getopts hf:b::p:o:ktr:l: myarg +while getopts hf:b::p:o:ktr: myarg do case "$myarg" in h) print_help exit ;; @@ -49,7 +48,6 @@ p) priming_pos="$OPTARG" ;; o) output_dir="$OPTARG" ;; r) R_SCRIPT_PATH="$OPTARG" ;; #required - l) R_LIB_PATH="$OPTARG" ;; #required [?]) echo "ERROR: Unknown parameter" print_help exit 1 ;; @@ -222,7 +220,7 @@ #Produce k2n file if [ "$k2n" == "True" ]; then - Rscript $R_SCRIPT_PATH/k2n.R $R_LIB_PATH merged_temp.gz $output_dir/unique_barcodes.txt $output_dir/k2n.txt + Rscript $R_SCRIPT_PATH/k2n.R merged_temp.gz $output_dir/unique_barcodes.txt $output_dir/k2n.txt fi #Remove temp files
--- a/summarize_unique_barcodes.xml Wed Feb 11 08:30:45 2015 -0500 +++ b/summarize_unique_barcodes.xml Wed Feb 11 12:43:49 2015 -0500 @@ -8,8 +8,6 @@ <requirement type="R-module">RNAprobR</requirement> <requirement type="package" version="1.0.0">RNAprobR</requirement> <requirement type="set_environment">RNA_PROBING_SCRIPT_PATH</requirement> - <requirement type="set_environment">RNAprobR_PATH</requirement> - </requirements> <command interpreter="bash"> @@ -33,8 +31,6 @@ #end if -r \$RNA_PROBING_SCRIPT_PATH - -l \$RNAprobR_PATH - </command> <!-- basic error handling -->
--- a/tool_dependencies.xml Wed Feb 11 08:30:45 2015 -0500 +++ b/tool_dependencies.xml Wed Feb 11 12:43:49 2015 -0500 @@ -10,10 +10,7 @@ <repository changeset_revision="d9964efbfbe3" name="package_r_3_1_1" owner="fubar" toolshed="https://testtoolshed.g2.bx.psu.edu" /> </package> <package name="RNAprobR" version="1.0.0"> - <repository changeset_revision="cabd73083325" name="package_rna_probing_0_99_0" owner="nikos" toolshed="https://testtoolshed.g2.bx.psu.edu" /> - <set_environment version="1.0"> - <environment_variable action="set_to" name="RNA_PROBING_LIB_PATH">$REPOSITORY_INSTALL_DIR</environment_variable> - </set_environment> + <repository changeset_revision="0455dc195eea" name="package_rna_probing_0_99_0" owner="nikos" toolshed="https://testtoolshed.g2.bx.psu.edu" /> </package> <set_environment version="1.0"> <environment_variable action="set_to" name="RNA_PROBING_SCRIPT_PATH">$REPOSITORY_INSTALL_DIR</environment_variable>