Mercurial > repos > devteam > dwt_ivc_all

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/execute_dwt_IvC_all.pl	Thu Jan 23 12:31:01 2014 -0500
@@ -0,0 +1,210 @@
+#!/usr/bin/perl -w
+use warnings;
+use IO::Handle;
+
+$usage = "execute_dwt_IvC_all.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] [PDF.out]  \n";
+die $usage unless @ARGV == 4;
+
+#get the input arguments
+my $firstInputFile = $ARGV[0];
+my $secondInputFile = $ARGV[1];
+my $firstOutputFile = $ARGV[2];
+my $secondOutputFile = $ARGV[3];
+
+open (INPUT1, "<", $firstInputFile) || die("Could not open file $firstInputFile \n");
+open (INPUT2, "<", $secondInputFile) || die("Could not open file $secondInputFile \n");
+open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n");
+open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \n");
+open (ERROR,  ">", "error.txt")  or die ("Could not open file error.txt \n");
+
+#save all error messages into the error file $errorFile using the error file handle ERROR
+STDERR -> fdopen( \*ERROR,  "w" ) or die ("Could not direct errors to the error file error.txt \n");
+
+
+print "There are two input data files: \n";
+print "The input data file is: $firstInputFile \n";
+print "The control data file is: $secondInputFile \n";
+
+# IvC test
+$test = "IvC";
+
+# construct an R script to implement the IvC test
+print "\n";
+
+$r_script = "get_dwt_IvC_test.r";
+print "$r_script \n";
+
+# R script
+open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n";
+print Rcmd "
+        ###########################################################################################
+        # code to do wavelet Indel vs. Control
+        # signal is the difference I-C; function is second moment i.e. variance from zero not mean
+        # to perform wavelet transf. of signal, scale-by-scale analysis of the function
+        # create null bands by permuting the original data series
+        # generate plots and table matrix of correlation coefficients including p-values
+        ############################################################################################
+        library(\"Rwave\");
+        library(\"wavethresh\");
+        library(\"waveslim\");
+
+        options(echo = FALSE)
+
+        # normalize data
+        norm <- function(data){
+            v <- (data - mean(data))/sd(data);
+            if(sum(is.na(v)) >= 1){
+                v <- data;
+            }
+            return(v);
+        }
+
+        dwt_cor <- function(data.short, names.short, data.long, names.long, test, pdf, table, filter = 4, bc = \"symmetric\", wf = \"haar\", boundary = \"reflection\") {
+            print(test);
+            print(pdf);
+            print(table);
+
+            pdf(file = pdf);
+            final_pvalue = NULL;
+            title = NULL;
+
+            short.levels <- wd(data.short[, 1], filter.number = filter, bc = bc)\$nlevels;
+            title <- c(\"motif\");
+            for (i in 1:short.levels){
+            	title <- c(title, paste(i, \"moment2\", sep = \"_\"), paste(i, \"pval\", sep = \"_\"), paste(i, \"test\", sep = \"_\"));
+            }
+            print(title);
+
+            # loop to compare a vs a
+            for(i in 1:length(names.short)){
+        		wave1.dwt = NULL;
+        		m2.dwt = diff = var.dwt = NULL;
+        		out = NULL;
+                out <- vector(length = length(title));
+
+        		print(names.short[i]);
+        		print(names.long[i]);
+
+        		# need exit if not comparing motif(a) vs motif(a)
+        		if (names.short[i] != names.long[i]){
+                	stop(paste(\"motif\", names.short[i], \"is not the same as\", names.long[i], sep = \" \"));
+        		}
+        		else {
+                	# signal is the difference I-C data sets
+                    diff<-data.short[,i]-data.long[,i];
+
+                    # normalize the signal
+                    diff<-norm(diff);
+
+                    # function is 2nd moment
+                    # 2nd moment m_j = 1/N[sum_N(W_j + V_J)^2] = 1/N sum_N(W_j)^2 + (X_bar)^2
+            		wave1.dwt <- dwt(diff, wf = wf, short.levels, boundary = boundary);
+            		var.dwt <- wave.variance(wave1.dwt);
+                	m2.dwt <- vector(length = short.levels)
+                    for(level in 1:short.levels){
+                    	m2.dwt[level] <- var.dwt[level, 1] + (mean(diff)^2);
+                    }
+
+            		# CI bands by permutation of time series
+            		feature1 = feature2 = NULL;
+            		feature1 = data.short[, i];
+            		feature2 = data.long[, i];
+            		null = results = med = NULL;
+            		m2_25 = m2_975 = NULL;
+
+            		for (k in 1:1000) {
+                		nk_1 = nk_2 = NULL;
+                		m2_null = var_null = NULL;
+                		null.levels = null_wave1 = null_diff = NULL;
+                		nk_1 <- sample(feature1, length(feature1), replace = FALSE);
+                		nk_2 <- sample(feature2, length(feature2), replace = FALSE);
+                		null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels;
+                		null_diff <- nk_1-nk_2;
+                		null_diff <- norm(null_diff);
+                		null_wave1 <- dwt(null_diff, wf = wf, short.levels, boundary = boundary);
+                        var_null <- wave.variance(null_wave1);
+                		m2_null <- vector(length = null.levels);
+                		for(level in 1:null.levels){
+                        	m2_null[level] <- var_null[level, 1] + (mean(null_diff)^2);
+                		}
+                		null= rbind(null, m2_null);
+            		}
+
+            		null <- apply(null, 2, sort, na.last = TRUE);
+            		m2_25 <- null[25,];
+            		m2_975 <- null[975,];
+            		med <- apply(null, 2, median, na.rm = TRUE);
+
+            		# plot
+            		results <- cbind(m2.dwt, m2_25, m2_975);
+            		matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2), xlab = \"Wavelet Scale\", ylab = c(\"Wavelet 2nd Moment\", test), main = (names.short[i]), cex.main = 0.75);
+            		abline(h = 1);
+
+            		# get pvalues by comparison to null distribution
+            		out <- c(names.short[i]);
+            		for (m in 1:length(m2.dwt)){
+                    	print(paste(\"scale\", m, sep = \" \"));
+                        print(paste(\"m2\", m2.dwt[m], sep = \" \"));
+                        print(paste(\"median\", med[m], sep = \" \"));
+                        out <- c(out, format(m2.dwt[m], digits = 4));
+                        pv = NULL;
+                        if(is.na(m2.dwt[m])){
+                        	pv <- \"NA\";
+                        }
+                        else {
+                        	if (m2.dwt[m] >= med[m]){
+                            	# R tail test
+                                tail <- \"R\";
+                                pv <- (length(which(null[, m] >= m2.dwt[m])))/(length(na.exclude(null[, m])));
+                            }
+                            else{
+                                if (m2.dwt[m] < med[m]){
+                                	# L tail test
+                                    tail <- \"L\";
+                                    pv <- (length(which(null[, m] <= m2.dwt[m])))/(length(na.exclude(null[, m])));
+                                }
+                            }
+                        }
+                        out <- c(out, pv);
+                        print(pv);
+                        out <- c(out, tail);
+                    }
+                    final_pvalue <-rbind(final_pvalue, out);
+                    print(out);
+                }
+            }
+
+            colnames(final_pvalue) <- title;
+            write.table(final_pvalue, file = table, sep = \"\\t\", quote = FALSE, row.names = FALSE);
+            dev.off();
+        }\n";
+
+print Rcmd "
+        # execute
+        # read in data
+
+        inputData <- read.delim(\"$firstInputFile\");
+        inputDataNames <- colnames(inputData);
+
+        controlData <- read.delim(\"$secondInputFile\");
+        controlDataNames <- colnames(controlData);
+
+        # call the test function to implement IvC test
+        dwt_cor(inputData, inputDataNames, controlData, controlDataNames, test = \"$test\", pdf = \"$secondOutputFile\", table = \"$firstOutputFile\");
+        print (\"done with the correlation test\");
+\n";
+
+print Rcmd "#eof\n";
+
+close Rcmd;
+
+system("echo \"wavelet IvC test started on \`hostname\` at \`date\`\"\n");
+system("R --no-restore --no-save --no-readline < $r_script > $r_script.out\n");
+system("echo \"wavelet IvC test ended on \`hostname\` at \`date\`\"\n");
+
+#close the input and output and error files
+close(ERROR);
+close(OUTPUT2);
+close(OUTPUT1);
+close(INPUT2);
+close(INPUT1);
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/execute_dwt_IvC_all.xml	Thu Jan 23 12:31:01 2014 -0500
@@ -0,0 +1,112 @@
+<tool id="compute_p-values_second_moments_feature_occurrences_between_two_datasets_using_discrete_wavelet_transfom" name="Compute P-values and Second Moments for Feature Occurrences" version="1.0.0">
+  <description>between two datasets using Discrete Wavelet Transfoms</description>
+
+  <command interpreter="perl">
+  	execute_dwt_IvC_all.pl $inputFile1 $inputFile2 $outputFile1 $outputFile2
+  </command>
+
+  <inputs>
+  	<param format="tabular" name="inputFile1" type="data" label="Select the first input file"/>
+  	<param format="tabular" name="inputFile2" type="data" label="Select the second input file"/>
+  </inputs>
+
+  <outputs>
+    <data format="tabular" name="outputFile1"/>
+    <data format="pdf" name="outputFile2"/>
+  </outputs>
+
+  <help>
+
+.. class:: infomark
+
+**What it does**
+
+This program generates plots and computes table matrix of second moments, p-values, and test orientations at multiple scales for the correlation between the occurrences of features in one dataset and their occurrences in another using multiscale wavelet analysis technique.
+
+The program assumes that the user has two sets of DNA sequences, S1 and S1, each of which consists of one or more sequences of equal length. Each sequence in each set is divided into the same number of multiple intervals n such that n = 2^k, where k is a positive integer and  k >= 1. Thus, n could be any value of the set {2, 4, 8, 16, 32, 64, 128, ...}. k represents the number of scales.
+
+The program has two input files obtained as follows:
+
+For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S1 and S1, and builds two tabular files representing the count results in each interval of S1 and S1. These are the input files of the program.
+
+The program gives two output files:
+
+- The first output file is a TABULAR format file representing the second moments, p-values, and test orientations for each feature at each scale.
+- The second output file is a PDF file consisting of as many figures as the number of features, such that each figure represents the values of the second moment for that feature at every scale.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+In order to obtain empirical p-values, a random perumtation test is implemented by the program, which results in the fact that the program gives slightly different results each time it is run on the same input file.
+
+-----
+
+**Example**
+
+Counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S1 gives the following tabular file::
+
+	deletionHoptspot	insertionHoptspot	dnaPolPauseFrameshift	topoisomeraseCleavageSite	translinTarget
+		226			403			416			221				1165
+		236			444			380			241				1223
+		242			496			391			195				1116
+		243			429			364			191				1118
+		244			410			371			236				1063
+		230			386			370			217				1087
+		275			404			402			214				1044
+		265			443			365			231				1086
+		255			390			354			246				1114
+		281			384			406			232				1102
+		263			459			369			251				1135
+		280			433			400			251				1159
+		278			385			382			231				1147
+		248			393			389			211				1162
+		251			403			385			246				1114
+		239			383			347			227				1172
+
+And counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S2 gives the following tabular file::
+
+	deletionHoptspot	insertionHoptspot	dnaPolPauseFrameshift	topoisomeraseCleavageSite	translinTarget
+		235			374			407			257				1159
+		244			356			353			212				1128
+		233			343			322			204				1110
+		222			329			398			253				1054
+		216			325			328			253				1129
+		257			368			352			221				1115
+		238			360			346			224				1102
+		225			350			377			248				1107
+		230			330			365			236				1132
+		241			389			357			220				1120
+		274			354			392			235				1120
+		250			379			354			210				1102
+		254			329			320			251				1080
+		221			355			406			279				1127
+		224			330			390			249				1129
+		246			366			364			218				1176
+
+
+We notice that the number of scales here is 4 because 16 = 2^4. Runnig the program on the above input files gives the following output:
+
+The first output file::
+
+	motif				1_moment2	1_pval	1_test	2_moment2	2_pval	2_test	3_moment2	3_pval	3_test	4_moment2	4_pval	4_test
+
+	deletionHoptspot		0.8751		0.376	L	1.549		0.168	R	0.6152		0.434	L	0.5735		0.488	R
+	insertionHoptspot		0.902		0.396	L	1.172		0.332	R	0.6843		0.456	L	1.728		0.213	R
+	dnaPolPauseFrameshift		1.65		0.013	R	0.267		0.055	L	0.1387		0.124	L	0.4516		0.498	L
+	topoisomeraseCleavageSite	0.7443		0.233	L	1.023		0.432	R	1.933		0.155	R	1.09		0.3	R
+	translinTarget			0.5084		0.057	L	0.8219		0.446	L	3.604		0.019	R	0.4377		0.492	L
+
+The second output file:
+
+.. image:: ${static_path}/operation_icons/dwt_IvC_1.png
+.. image:: ${static_path}/operation_icons/dwt_IvC_2.png
+.. image:: ${static_path}/operation_icons/dwt_IvC_3.png
+.. image:: ${static_path}/operation_icons/dwt_IvC_4.png
+.. image:: ${static_path}/operation_icons/dwt_IvC_5.png
+
+  </help>
+
+</tool>