Mercurial > repos > nikos > rna_probing
diff k2n.R @ 24:431aebd93843 draft default tip
Fixed a bug in k2n.R where the function k2n_calc() would result in an error for single-end read files.
author | nikos |
---|---|
date | Wed, 05 Aug 2015 09:21:02 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/k2n.R Wed Aug 05 09:21:02 2015 -0400 @@ -0,0 +1,119 @@ +# Setup R error handling to go to stderr +options( show.error.messages = FALSE, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on LC settings. +Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs(trailingOnly = TRUE) + +# Declare functions + +k2n_calc <- function(merged_file, unique_barcode_file, output_file, paired = TRUE) +{ + + # Read in inputs: + merged <- read.table( merged_file ) + colnames(merged) <- c("RNAid", "Start", "End", "Barcodes") + barcode_length <- max(nchar(as.character(merged$Barcodes))) + + #Single end + if( !paired) + read_counts <- as.data.frame(table(subset(merged, select = - (End:Barcodes)))) + #Paired end + else + read_counts <- as.data.frame(table(subset(merged, select = - Barcodes))) + read_counts <- read_counts[read_counts$Freq != 0, ] + + max_observed <- max(read.table(unique_barcode_file)[,4]) + + #Check if max_observed equals max possible complexity. If yes - subtract 1 + #(otherwise 'while' loop below never ends) + if( max_observed == 4**barcode_length ) { + max_observed <- max_observed - 1 + barcodes_saturated <- TRUE + } else + barcodes_saturated <- FALSE + + # Remove top-quartile of reads: + #Single end + if (!paired){ + barcodes_nt <- merged[ do.call( + paste, as.list(subset(merged, select = - (End:Barcodes)))) %in% + do.call(paste, as.list(read_counts[( + read_counts$Freq ) <= quantile(read_counts$Freq, 0.75), + c("RNAid", "Start") ] ) ) , "Barcodes" ] + } else { + barcodes_nt <- merged[ do.call( + paste, as.list(subset(merged, select = - Barcodes))) %in% + do.call(paste, as.list(read_counts[( + read_counts$Freq ) <= quantile(read_counts$Freq, 0.75), + c("RNAid", "Start", "End") ] ) ) , "Barcodes" ] + } + + # make the matrix with the nucleotide freqs per position: + nt_counts <- matrix( nrow = 4, ncol = barcode_length ) + for( h in 1:ncol( nt_counts ) ){ + j <- 1 + for( nt_local in c( "A","C","G","T" ) ) { + nt_counts[ j, h ] <- sum( substr( as.character( barcodes_nt ), h, + h) == nt_local ) + j <- j + 1 + } + } + + # Calculate frequencies + nt_freqs <- nt_counts / colSums( nt_counts ) + + nt_values <- split(nt_freqs, rep(1:ncol(nt_freqs), each = nrow(nt_freqs))) + + all_posible_comb <- expand.grid( nt_values ) + + probs <- apply( all_posible_comb, 1, prod ) + + ###Create Mf_to_Sf: + results <- c() + + i <- 1 + results[ i ] <- sum( 1 - ( 1 - probs )**i ) + j <- 1 + while( results[ i ] <= max_observed ) { + i <- i + 1 + results[ i ] <- sum( 1 - ( 1 - probs )**i ) #Mf to Sf + if(results[ i ] == results[ i - 1]){ + if(j > 2) + stop("Too low complexity to estimate k2n distribution") + else + j <- j + 1 + }else + j <- 1 + } + + #assign molecules number to unique barcode number: + k2n <- c() + for( i in 1:floor( max( results ) ) ) { + abs( results - i ) -> difference + + #if you want to know how many molecules underlie n unique barcodes + #ask k2n[n] + k2n[ i ] <- which( difference == min( difference ) ) + } + + #Assign +Inf for fragments which have saturated the barcodes + #(see correct_oversaturation function): + if( barcodes_saturated ) + k2n[max_observed + 1] <- Inf + + if(!missing(output_file)) { + write(k2n, file = output_file ) + } else + k2n +} + +# Read inputs +merged <- args[1] +unique_barcodes <- args[2] +output <- args[3] +paired_check <- as.logical(args[4]) + +k2n_calc( merged, unique_barcodes, output, paired = paired_check)