diff k2n.R @ 0:83dfe38f6a09 draft

Uploaded
author nikos
date Tue, 04 Nov 2014 14:28:45 -0500
parents
children 2ae336f19de0
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/k2n.R	Tue Nov 04 14:28:45 2014 -0500
@@ -0,0 +1,74 @@
+suppressMessages(library('getopt'))
+
+options(stringAsfactors = FALSE, useFancyQuotes = FALSE)
+args <- commandArgs(trailingOnly = TRUE)
+
+#get options, using the spec as defined by the enclosed list.
+#we read the options from the default: commandArgs(TRUE).
+spec = matrix( c(
+  'verbose', 'v', 2, "integer",
+  'help' , 'h', 0, "logical",
+  'merged', 'm', 1, "character",
+  'read_counts', 'c', 1, "character",
+  'max_observed', 'b', 1, "integer",
+  'barcode_length', 'l', 1, "integer",
+  'output', 'o', 1, "character"
+  ), byrow = TRUE, ncol = 4 );
+
+opt = getopt( spec );
+
+# if help was asked for print a friendly message
+# and exit with a non-zero error code
+if ( !is.null( opt$help ) ) {
+    cat( getopt( spec, usage=TRUE ) ) ;
+    q( status = 1 );
+}
+
+# Read inputs
+merged <- read.table( opt$merged )
+read_counts <- read.table( opt$read_counts )
+
+barcodes_nt <- merged[ do.call( paste, as.list( merged[ ,1:3 ] ) ) %in% do.call( paste, as.list( read_counts[ ( read_counts[ , 4 ] ) <= summary( read_counts[ , 4 ] )[ 5 ], 1:3 ] ) ) , 4 ]
+
+##make the matrix with the nucleotide freqs per position:
+nt_counts <- matrix( nrow = 4, ncol = opt$barcode_length )
+for( h in 1:ncol( nt_counts ) ){
+    j <- 1
+    for( nt_local in c( "A","C","G","T" ) ) {
+        nt_counts[ j, h ] <- sum( substr( as.character( barcodes_nt ), h, h) == nt_local )
+        j <- j + 1
+    }
+}
+
+# Calculate frequencies
+nt_freqs <- nt_counts / colSums( nt_counts )
+
+nt_values <- list()
+for( i in 1:ncol( nt_freqs ) ) {
+    nt_values[[ i ]] <- nt_freqs[ , i ]
+}
+
+
+all_posible_comb <- expand.grid( nt_values )
+
+probs <- apply( all_posible_comb, 1, prod )
+
+###Create Mf_to_Sf:
+results <- c()
+
+i <- 1
+results[ i ] <- sum( 1 - ( 1 - probs )**i )
+
+while( results[ i ] <= opt$max_observed ) {
+    i <- i + 1
+    results[ i ] <- sum( 1 - ( 1 - probs )**i )   #Mf to Sf
+}
+
+#assign molecules number to unique barcode number:
+Uf_to_Mf <- c()
+for( i in 1:floor( max( results ) ) ) {
+abs( results - i ) -> difference
+Uf_to_Mf[ i ] <- which( difference == min( difference ) ) #if you want to know how many molecules underlie n unique barcodes ask Uf_to_Mf[n]
+}
+
+write( Uf_to_Mf, file = opt$output )