changeset 21:f64937805d0d draft

Fixed bugs in k2n.R
author nikos
date Wed, 11 Feb 2015 12:43:49 -0500
parents 4821a9e5dcf6
children 5ee5afb56ca4
files k2n.R normalize.R summarize_unique_barcodes.sh summarize_unique_barcodes.xml tool_dependencies.xml
diffstat 5 files changed, 97 insertions(+), 21 deletions(-) [+]
line wrap: on
line diff
--- a/k2n.R	Wed Feb 11 08:30:45 2015 -0500
+++ b/k2n.R	Wed Feb 11 12:43:49 2015 -0500
@@ -1,18 +1,103 @@
-## Setup R error handling to go to stderr
+# Setup R error handling to go to stderr
 options( show.error.messages = FALSE, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
 
-# we need that to not crash galaxy with an UTF8 error on German LC settings.
+# we need that to not crash galaxy with an UTF8 error on LC settings.
 Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
 
 options(stringAsfactors = FALSE, useFancyQuotes = FALSE)
 args <- commandArgs(trailingOnly = TRUE)
 
-.libPaths(args[1])
-suppressMessages(require(RNAprobR))
+# Declare functions
+
+k2n_calc <- function(merged_file, unique_barcode_file, output_file)
+{
+
+    # Read in inputs:
+    merged <- read.table( merged_file )
+    colnames(merged) <- c("RNAid", "Start", "End", "Barcodes")
+    barcode_length <- max(nchar(as.character(merged$Barcodes)))
+    read_counts <- as.data.frame(table(subset(merged, select = - Barcodes)))
+    read_counts <- read_counts[read_counts$Freq != 0, ]
+
+    max_observed <- max(read.table(unique_barcode_file)[,4])
+
+    #Check if max_observed equals max possible complexity. If yes - subtract 1
+    #(otherwise 'while' loop below never ends)
+    if( max_observed == 4**barcode_length ) {
+        max_observed <- max_observed - 1
+        barcodes_saturated <- TRUE
+    } else
+        barcodes_saturated <- FALSE
+
+    # Remove top-quartile of reads:
+    barcodes_nt <- merged[ do.call(
+        paste, as.list(subset(merged, select = - Barcodes))) %in%
+            do.call(paste, as.list(read_counts[(
+                read_counts$Freq ) <= quantile(read_counts$Freq, 0.75),
+                c("RNAid", "Start", "End") ] ) ) , "Barcodes" ]
+
+    # make the matrix with the nucleotide freqs per position:
+    nt_counts <- matrix( nrow = 4, ncol = barcode_length )
+    for( h in 1:ncol( nt_counts ) ){
+        j <- 1
+        for( nt_local in c( "A","C","G","T" ) ) {
+            nt_counts[ j, h ] <- sum( substr( as.character( barcodes_nt ), h,
+                                              h) == nt_local )
+            j <- j + 1
+        }
+    }
+
+    # Calculate frequencies
+    nt_freqs <- nt_counts / colSums( nt_counts )
+
+    nt_values <- split(nt_freqs, rep(1:ncol(nt_freqs), each = nrow(nt_freqs)))
+
+    all_posible_comb <- expand.grid( nt_values )
+
+    probs <- apply( all_posible_comb, 1, prod )
+
+    ###Create Mf_to_Sf:
+    results <- c()
+
+    i <- 1
+    results[ i ] <- sum( 1 - ( 1 - probs )**i )
+    j <- 1
+    while( results[ i ] <= max_observed ) {
+        i <- i + 1
+        results[ i ] <- sum( 1 - ( 1 - probs )**i )   #Mf to Sf
+        if(results[ i ] == results[ i - 1]){
+            if(j > 2)
+                stop("Too low complexity to estimate k2n distribution")
+            else
+                j <- j + 1
+        }else
+            j <- 1
+    }
+
+    #assign molecules number to unique barcode number:
+    k2n <- c()
+    for( i in 1:floor( max( results ) ) ) {
+        abs( results - i ) -> difference
+
+        #if you want to know how many molecules underlie n unique barcodes
+        #ask k2n[n]
+        k2n[ i ] <- which( difference == min( difference ) )
+    }
+
+    #Assign +Inf for fragments which have saturated the barcodes
+    #(see correct_oversaturation function):
+    if( barcodes_saturated )
+        k2n[max_observed + 1] <- Inf
+
+    if(!missing(output_file)) {
+        write(k2n, file = output_file )
+    } else
+        k2n
+}
 
 # Read inputs
-merged <- args[2]
-unique_barcodes <- args[3]
-output <- args[4]
+merged <- args[1]
+unique_barcodes <- args[2]
+output <- args[3]
 
 k2n_calc( merged, unique_barcodes, output )
--- a/normalize.R	Wed Feb 11 08:30:45 2015 -0500
+++ b/normalize.R	Wed Feb 11 12:43:49 2015 -0500
@@ -3,7 +3,7 @@
 ## Setup R error handling to go to stderr
 options( show.error.messages = FALSE, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
 
-# we need that to not crash galaxy with an UTF8 error on German LC settings.
+# we need that to not crash galaxy with an UTF8 error on LC settings.
 Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
 
 suppressMessages(library('getopt'))
--- a/summarize_unique_barcodes.sh	Wed Feb 11 08:30:45 2015 -0500
+++ b/summarize_unique_barcodes.sh	Wed Feb 11 12:43:49 2015 -0500
@@ -25,10 +25,9 @@
 -t: Trim untemplated nucleotides.
 -k: Produce k2n file. Warning: Can be sloooow!
 -r: Rscript path
--l: R lib path
 -o: Output folder (default: "output_dir")
 -------------------------------------
-Usage : summarize_unique_barcodes.sh -f <BAM_file> -b <BARCODES> -p <PRIMING_POSITION> -t -k -r <R_SCRIPT_PATH> -l <R_LIB_PATH>
+Usage : summarize_unique_barcodes.sh -f <BAM_file> -b <BARCODES> -p <PRIMING_POSITION> -t -k -r <R_SCRIPT_PATH>
 End-of-message
 exit
 }
@@ -38,7 +37,7 @@
 trim_flag="False"
 
 #parse input
-while getopts hf:b::p:o:ktr:l: myarg
+while getopts hf:b::p:o:ktr: myarg
 do  case "$myarg" in
     h)  print_help
         exit ;;
@@ -49,7 +48,6 @@
     p)  priming_pos="$OPTARG" ;;
     o)  output_dir="$OPTARG" ;;
     r)  R_SCRIPT_PATH="$OPTARG" ;; #required
-    l)  R_LIB_PATH="$OPTARG" ;; #required
     [?])  echo "ERROR: Unknown parameter"
         print_help
         exit 1 ;;
@@ -222,7 +220,7 @@
 
 #Produce k2n file
 if [ "$k2n" == "True" ]; then
-    Rscript $R_SCRIPT_PATH/k2n.R $R_LIB_PATH merged_temp.gz $output_dir/unique_barcodes.txt $output_dir/k2n.txt
+    Rscript $R_SCRIPT_PATH/k2n.R merged_temp.gz $output_dir/unique_barcodes.txt $output_dir/k2n.txt
 fi
 
 #Remove temp files
--- a/summarize_unique_barcodes.xml	Wed Feb 11 08:30:45 2015 -0500
+++ b/summarize_unique_barcodes.xml	Wed Feb 11 12:43:49 2015 -0500
@@ -8,8 +8,6 @@
         <requirement type="R-module">RNAprobR</requirement>
         <requirement type="package" version="1.0.0">RNAprobR</requirement>
         <requirement type="set_environment">RNA_PROBING_SCRIPT_PATH</requirement>
-        <requirement type="set_environment">RNAprobR_PATH</requirement>
-
     </requirements>
 
     <command interpreter="bash">
@@ -33,8 +31,6 @@
         #end if
 
         -r \$RNA_PROBING_SCRIPT_PATH
-        -l \$RNAprobR_PATH
-
     </command>
 
     <!-- basic error handling -->
--- a/tool_dependencies.xml	Wed Feb 11 08:30:45 2015 -0500
+++ b/tool_dependencies.xml	Wed Feb 11 12:43:49 2015 -0500
@@ -10,10 +10,7 @@
         <repository changeset_revision="d9964efbfbe3" name="package_r_3_1_1" owner="fubar" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
     <package name="RNAprobR" version="1.0.0">
-        <repository changeset_revision="cabd73083325" name="package_rna_probing_0_99_0" owner="nikos" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-        <set_environment version="1.0">
-            <environment_variable action="set_to" name="RNA_PROBING_LIB_PATH">$REPOSITORY_INSTALL_DIR</environment_variable>
-        </set_environment>
+        <repository changeset_revision="0455dc195eea" name="package_rna_probing_0_99_0" owner="nikos" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
     <set_environment version="1.0">
         <environment_variable action="set_to" name="RNA_PROBING_SCRIPT_PATH">$REPOSITORY_INSTALL_DIR</environment_variable>