changeset 2:27b1e4252ce0 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/dwt_cor_ava_perclass commit f929353ffb0623f2218d7dec459c7da62f3b0d24"
author devteam
date Mon, 06 Jul 2020 18:11:43 +0000
parents 631cad17338d
children
files execute_dwt_cor_aVa_perClass.R execute_dwt_cor_aVa_perClass.pl execute_dwt_cor_aVa_perClass.xml test-data/in1.tsv test-data/in2.tsv test-data/out2.pdf
diffstat 6 files changed, 245 insertions(+), 240 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/execute_dwt_cor_aVa_perClass.R	Mon Jul 06 18:11:43 2020 +0000
@@ -0,0 +1,178 @@
+##################################################################################
+## code to do all correlation tests of form: motif(a) vs. motif(a)
+## add code to create null bands by permuting the original data series
+## generate plots and table matrix of correlation coefficients including p-values
+##################################################################################
+library("wavethresh");
+library("waveslim");
+
+options(echo = FALSE)
+
+## normalize data
+norm <- function(data) {
+    v <- (data - mean(data)) / sd(data);
+    if (sum(is.na(v)) >= 1) {
+        v <- data;
+    }
+    return(v);
+}
+
+dwt_cor <- function(data_short, names_short, data_long, names_long, test, pdf, table, filter = 4, bc = "symmetric", method = "kendall", wf = "haar", boundary = "reflection") {
+    print(test);
+    print(pdf);
+    print(table);
+
+    pdf(file = pdf);
+    final_pvalue <- NULL;
+    title <- NULL;
+
+    short_levels <- wavethresh::wd(data_short[, 1], filter.number = filter, bc = bc)$nlevels;
+    title <- c("motif");
+    for (i in 1:short_levels) {
+        title <- c(title, paste(i, "cor", sep = "_"), paste(i, "pval", sep = "_"));
+    }
+    print(title);
+
+    ## normalize the raw data
+    data_short <- apply(data_short, 2, norm);
+    data_long <- apply(data_long, 2, norm);
+
+    for (i in seq_len(length(names_short))) {
+        ## Kendall Tau
+        ## DWT wavelet correlation function
+        ## include significance to compare
+        wave1_dwt <- NULL;
+        wave2_dwt <- NULL;
+        tau_dwt <- NULL;
+        out <- NULL;
+
+        print(names_short[i]);
+        print(names_long[i]);
+
+        ## need exit if not comparing motif(a) vs motif(a)
+        if (names_short[i] != names_long[i]) {
+            stop(paste("motif", names_short[i], "is not the same as", names_long[i], sep = " "));
+        }
+        else {
+            wave1_dwt <- waveslim::dwt(data_short[, i], wf = wf, short_levels, boundary = boundary);
+            wave2_dwt <- waveslim::dwt(data_long[, i], wf = wf, short_levels, boundary = boundary);
+            tau_dwt <- vector(length = short_levels)
+
+            ## perform cor test on wavelet coefficients per scale
+            for (level in 1:short_levels) {
+                w1_level <- NULL;
+                w2_level <- NULL;
+                w1_level <- (wave1_dwt[[level]]);
+                w2_level <- (wave2_dwt[[level]]);
+                tau_dwt[level] <- cor.test(w1_level, w2_level, method = method)$estimate;
+            }
+
+            ## CI bands by permutation of time series
+            feature1 <- NULL;
+            feature2 <- NULL;
+            feature1 <- data_short[, i];
+            feature2 <- data_long[, i];
+            null <- NULL;
+            results <- NULL;
+            med <- NULL;
+            cor_25 <- NULL;
+            cor_975 <- NULL;
+
+            for (k in 1:1000) {
+                nk_1 <- NULL;
+                nk_2 <- NULL;
+                null_levels <- NULL;
+                cor <- NULL;
+                null_wave1 <- NULL;
+                null_wave2 <- NULL;
+
+                nk_1 <- sample(feature1, length(feature1), replace = FALSE);
+                nk_2 <- sample(feature2, length(feature2), replace = FALSE);
+                null_levels <- wavethresh::wd(nk_1, filter.number = filter, bc = bc)$nlevels;
+                cor <- vector(length = null_levels);
+                null_wave1 <- waveslim::dwt(nk_1, wf = wf, short_levels, boundary = boundary);
+                null_wave2 <- waveslim::dwt(nk_2, wf = wf, short_levels, boundary = boundary);
+
+                for (level in 1:null_levels) {
+                    null_level1 <- NULL;
+                    null_level2 <- NULL;
+                    null_level1 <- (null_wave1[[level]]);
+                    null_level2 <- (null_wave2[[level]]);
+                    cor[level] <- cor.test(null_level1, null_level2, method = method)$estimate;
+                }
+                null <- rbind(null, cor);
+            }
+
+            null <- apply(null, 2, sort, na.last = TRUE);
+            print(paste("NAs", length(which(is.na(null))), sep = " "));
+            cor_25 <- null[25, ];
+            cor_975 <- null[975, ];
+            med <- (apply(null, 2, median, na.rm = TRUE));
+
+            ## plot
+            results <- cbind(tau_dwt, cor_25, cor_975);
+            matplot(results, type = "b", pch = "*", lty = 1, col = c(1, 2, 2), ylim = c(-1, 1), xlab = "Wavelet Scale", ylab = "Wavelet Correlation Kendall's Tau", main = (paste(test, names_short[i], sep = " ")), cex.main = 0.75);
+            abline(h = 0);
+
+            ## get pvalues by comparison to null distribution
+            ### modify pval calculation for error type II of T test ####
+            out <- (names_short[i]);
+            for (m in seq_len(length(tau_dwt))) {
+                print(paste("scale", m, sep = " "));
+                print(paste("tau", tau_dwt[m], sep = " "));
+                print(paste("med", med[m], sep = " "));
+                out <- c(out, format(tau_dwt[m], digits = 3));
+                pv <- NULL;
+                if (is.na(tau_dwt[m])) {
+                    pv <- "NA";
+                }
+                else {
+                    if (tau_dwt[m] >= med[m]) {
+                        ## R tail test
+                        print(paste("R"));
+                        ### per sv ok to use inequality not strict
+                        pv <- (length(which(null[, m] >= tau_dwt[m]))) / (length(na.exclude(null[, m])));
+                        if (tau_dwt[m] == med[m]) {
+                            print("tau == med");
+                            print(summary(null[, m]));
+                        }
+                }
+                    else if (tau_dwt[m] < med[m]) {
+                        ## L tail test
+                        print(paste("L"));
+                        pv <- (length(which(null[, m] <= tau_dwt[m]))) / (length(na.exclude(null[, m])));
+                    }
+                }
+                out <- c(out, pv);
+                print(paste("pval", pv, sep = " "));
+            }
+            final_pvalue <- rbind(final_pvalue, out);
+            print(out);
+        }
+    }
+    colnames(final_pvalue) <- title;
+    write.table(final_pvalue, file = table, sep = "\\t", quote = FALSE, row.names = FALSE)
+    dev.off();
+}
+## execute
+## read in data
+args <- commandArgs(trailingOnly = TRUE)
+
+input_data1 <- NULL;
+input_data2 <- NULL;
+input_data_short1 <- NULL;
+input_data_short2 <- NULL;
+input_data_names_short1 <- NULL;
+input_data_names_short2 <- NULL;
+
+input_data1 <- read.delim(args[1]);
+input_data_short1 <- input_data1[, +c(seq_len(ncol(input_data1)))];
+input_data_names_short1 <- colnames(input_data_short1);
+
+input_data2 <- read.delim(args[2]);
+input_data_short2 <- input_data2[, +c(seq_len(ncol(input_data2)))];
+input_data_names_short2 <- colnames(input_data_short2);
+
+## cor test for motif(a) in input_data1 vs motif(a) in input_data2
+dwt_cor(input_data_short1, input_data_names_short1, input_data_short2, input_data_names_short2, test = "cor_aVa", pdf = args[3], table = args[4]);
+print("done with the correlation test");
--- a/execute_dwt_cor_aVa_perClass.pl	Tue Jun 03 14:17:08 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,221 +0,0 @@
-#!/usr/bin/perl -w
-
-use warnings;
-use IO::Handle;
-
-$usage = "execute_dwt_cor_aVa_perClass.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] [PDF.out]  \n";
-die $usage unless @ARGV == 4;
-
-#get the input arguments
-my $firstInputFile = $ARGV[0];
-my $secondInputFile = $ARGV[1];
-my $firstOutputFile = $ARGV[2];
-my $secondOutputFile = $ARGV[3];
-
-open (INPUT1, "<", $firstInputFile) || die("Could not open file $firstInputFile \n");
-open (INPUT2, "<", $secondInputFile) || die("Could not open file $secondInputFile \n");
-open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n");
-open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \n");
-open (ERROR,  ">", "error.txt")  or die ("Could not open file error.txt \n");
-
-#save all error messages into the error file $errorFile using the error file handle ERROR
-STDERR -> fdopen( \*ERROR,  "w" ) or die ("Could not direct errors to the error file error.txt \n");
-
-print "There are two input data files: \n";
-print "The input data file is: $firstInputFile \n";
-print "The control data file is: $secondInputFile \n";
-
-# IvC test
-$test = "cor_aVa";
-
-# construct an R script to implement the IvC test
-print "\n";
-
-$r_script = "get_dwt_cor_aVa_test.r"; 
-print "$r_script \n";
-
-open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n";
-print Rcmd "
-	##################################################################################
-	# code to do all correlation tests of form: motif(a) vs. motif(a)
-	# add code to create null bands by permuting the original data series
-	# generate plots and table matrix of correlation coefficients including p-values
-	##################################################################################
-	library(\"Rwave\");
-	library(\"wavethresh\");
-	library(\"waveslim\");
-	
-	options(echo = FALSE)
-	
-	# normalize data
-	norm <- function(data){
-        v <- (data - mean(data))/sd(data);
-        if(sum(is.na(v)) >= 1){
-        	v <- data;
-        }
-        return(v);
-	}
-	
-	dwt_cor <- function(data.short, names.short, data.long, names.long, test, pdf, table, filter = 4, bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") {
-		print(test);
-	    print(pdf);
-		print(table);
-		
-	    pdf(file = pdf);   
-	    final_pvalue = NULL;
-		title = NULL;
-		
-	    short.levels <- wd(data.short[, 1], filter.number = filter, bc = bc)\$nlevels;
-		title <- c(\"motif\");
-        for (i in 1:short.levels){
-	        title <- c(title, paste(i, \"cor\", sep = \"_\"), paste(i, \"pval\", sep = \"_\"));
-        }
-        print(title);
-	
-        # normalize the raw data
-        data.short <- apply(data.short, 2, norm);
-        data.long <- apply(data.long, 2, norm);
-        
-        for(i in 1:length(names.short)){
-        	# Kendall Tau
-            # DWT wavelet correlation function
-            # include significance to compare
-            wave1.dwt = wave2.dwt = NULL;
-            tau.dwt = NULL;
-            out = NULL;
-
-            print(names.short[i]);
-            print(names.long[i]);
-            
-            # need exit if not comparing motif(a) vs motif(a)
-            if (names.short[i] != names.long[i]){
-            	stop(paste(\"motif\", names.short[i], \"is not the same as\", names.long[i], sep = \" \"));
-            }
-            else {
-            	wave1.dwt <- dwt(data.short[, i], wf = wf, short.levels, boundary = boundary);
-                wave2.dwt <- dwt(data.long[, i], wf = wf, short.levels, boundary = boundary);
-                tau.dwt <- vector(length=short.levels)
-                       
-				#perform cor test on wavelet coefficients per scale 
-				for(level in 1:short.levels){
-                	w1_level = w2_level = NULL;
-                    w1_level <- (wave1.dwt[[level]]);
-                    w2_level <- (wave2.dwt[[level]]);
-                    tau.dwt[level] <- cor.test(w1_level, w2_level, method = method)\$estimate;
-                }
-                
-                # CI bands by permutation of time series
-                feature1 = feature2 = NULL;
-                feature1 = data.short[, i];
-                feature2 = data.long[, i];
-                null = results = med = NULL; 
-                cor_25 = cor_975 = NULL;
-                
-                for (k in 1:1000) {
-                	nk_1 = nk_2 = NULL;
-                    null.levels = NULL;
-                    cor = NULL;
-                    null_wave1 = null_wave2 = NULL;
-                    
-                    nk_1 <- sample(feature1, length(feature1), replace = FALSE);
-                    nk_2 <- sample(feature2, length(feature2), replace = FALSE);
-                    null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels;
-                    cor <- vector(length = null.levels);
-                    null_wave1 <- dwt(nk_1, wf = wf, short.levels, boundary = boundary);
-                    null_wave2 <- dwt(nk_2, wf = wf, short.levels, boundary = boundary);
-
-                    for(level in 1:null.levels){
-                    	null_level1 = null_level2 = NULL;
-                        null_level1 <- (null_wave1[[level]]);
-                        null_level2 <- (null_wave2[[level]]);
-                        cor[level] <- cor.test(null_level1, null_level2, method = method)\$estimate;
-                    }
-                    null = rbind(null, cor);
-                }
-                
-                null <- apply(null, 2, sort, na.last = TRUE);
-                print(paste(\"NAs\", length(which(is.na(null))), sep = \" \"));
-                cor_25 <- null[25,];
-                cor_975 <- null[975,];
-                med <- (apply(null, 2, median, na.rm = TRUE));
-
-				# plot
-                results <- cbind(tau.dwt, cor_25, cor_975);
-                matplot(results, type = \"b\", pch = \"*\" , lty = 1, col = c(1, 2, 2), ylim = c(-1, 1), xlab = \"Wavelet Scale\", ylab = \"Wavelet Correlation Kendall's Tau\", main = (paste(test, names.short[i], sep = \" \")), cex.main = 0.75);
-                abline(h = 0);
-
-                # get pvalues by comparison to null distribution
- 			    ### modify pval calculation for error type II of T test ####
-                out <- (names.short[i]);
-                for (m in 1:length(tau.dwt)){
-                	print(paste(\"scale\", m, sep = \" \"));
-                    print(paste(\"tau\", tau.dwt[m], sep = \" \"));
-                    print(paste(\"med\", med[m], sep = \" \"));
-					out <- c(out, format(tau.dwt[m], digits = 3));	
-                    pv = NULL;
-                    if(is.na(tau.dwt[m])){
-                    	pv <- \"NA\"; 
-                    } 
-                    else {
-                    	if (tau.dwt[m] >= med[m]){
-                        	# R tail test
-                            print(paste(\"R\"));
-                            ### per sv ok to use inequality not strict
-                            pv <- (length(which(null[, m] >= tau.dwt[m])))/(length(na.exclude(null[, m])));
-                            if (tau.dwt[m] == med[m]){
-								print(\"tau == med\");
-                                print(summary(null[, m]));
-                            }
-                    	}
-                        else if (tau.dwt[m] < med[m]){
-                        	# L tail test
-                            print(paste(\"L\"));
-                            pv <- (length(which(null[, m] <= tau.dwt[m])))/(length(na.exclude(null[, m])));
-                        }
-					}
-					out <- c(out, pv);
-                    print(paste(\"pval\", pv, sep = \" \"));
-                }
-                final_pvalue <- rbind(final_pvalue, out);
-				print(out);
-        	}
-        }
-        colnames(final_pvalue) <- title;
-        write.table(final_pvalue, file = table, sep = \"\\t\", quote = FALSE, row.names = FALSE)
-        dev.off();
-	}\n";
-
-print Rcmd "
-	# execute
-	# read in data 
-		
-	inputData1 = inputData2 = NULL;
-	inputData.short1 = inputData.short2 = NULL;
-	inputDataNames.short1 = inputDataNames.short2 = NULL;
-		
-	inputData1 <- read.delim(\"$firstInputFile\");
-	inputData.short1 <- inputData1[, +c(1:ncol(inputData1))];
-	inputDataNames.short1 <- colnames(inputData.short1);
-		
-	inputData2 <- read.delim(\"$secondInputFile\");
-	inputData.short2 <- inputData2[, +c(1:ncol(inputData2))];
-	inputDataNames.short2 <- colnames(inputData.short2);
-	
-	# cor test for motif(a) in inputData1 vs motif(a) in inputData2
-	dwt_cor(inputData.short1, inputDataNames.short1, inputData.short2, inputDataNames.short2, test = \"$test\", pdf = \"$secondOutputFile\", table = \"$firstOutputFile\");
-	print (\"done with the correlation test\");
-	
-	#eof\n";
-close Rcmd;
-
-system("echo \"wavelet IvC test started on \`hostname\` at \`date\`\"\n");
-system("R --no-restore --no-save --no-readline < $r_script > $r_script.out\n");
-system("echo \"wavelet IvC test ended on \`hostname\` at \`date\`\"\n");
-
-#close the input and output and error files
-close(ERROR);
-close(OUTPUT2);
-close(OUTPUT1);
-close(INPUT2);
-close(INPUT1);
-
--- a/execute_dwt_cor_aVa_perClass.xml	Tue Jun 03 14:17:08 2014 -0400
+++ b/execute_dwt_cor_aVa_perClass.xml	Mon Jul 06 18:11:43 2020 +0000
@@ -1,33 +1,47 @@
-<tool id="compute_p-values_correlation_coefficients_feature_occurrences_between_two_datasets_using_discrete_wavelet_transfom" name="Compute P-values and Correlation Coefficients for Feature Occurrences" version="1.0.0">
+<tool id="compute_p-values_correlation_coefficients_feature_occurrences_between_two_datasets_using_discrete_wavelet_transfom" name="Compute P-values and Correlation Coefficients for Feature Occurrences" version="1.0.1">
   <description>between two datasets using Discrete Wavelet Transfoms</description>
-  
-  <command interpreter="perl">
-  	execute_dwt_cor_aVa_perClass.pl $inputFile1 $inputFile2 $outputFile1 $outputFile2
+  <requirements>
+    <requirement type="package" version="1.7.5">r-waveslim</requirement>
+    <requirement type="package" version="4.6.8">r-wavethresh</requirement>
+  </requirements>
+  <command detect_errors="exit_code">
+    Rscript --vanilla '$__tool_directory__/execute_dwt_cor_aVa_perClass.R'
+      '$inputFile1'
+      '$inputFile2'
+      '$outputFile2'
+      '$outputFile1'
   </command>
-
   <inputs>
-  	<param format="tabular" name="inputFile1" type="data" label="Select the first input file"/>	
-  	<param format="tabular" name="inputFile2" type="data" label="Select the second input file"/>
+    <param format="tabular" name="inputFile1" type="data" label="Select the first input file"/>
+  <param format="tabular" name="inputFile2" type="data" label="Select the second input file"/>
   </inputs>
-  
   <outputs>
-    <data format="tabular" name="outputFile1"/> 
-    <data format="pdf" name="outputFile2"/>
+    <data format="tabular" name="outputFile1" label="${tool.name} on ${on_string}: statistics"/>
+    <data format="pdf" name="outputFile2" label="${tool.name} on ${on_string}: pdf"/>
   </outputs>
-  	
-  <help> 
-
+  <tests>
+    <test>
+      <param ftype="tabular" name="inputFile1" value="in1.tsv"/>
+      <param ftype="tabular" name="inputFile2" value="in2.tsv"/>
+      <output name="outputFile1" ftype="tabular">
+        <assert_contents><has_line_matching expression="^translinTarget.*"/></assert_contents>
+        <assert_contents><has_line_matching expression="^deletionHoptspot.*" /></assert_contents>
+      </output>
+      <output name="outputFile2" ftype="pdf" file="out2.pdf" compare="sim_size"/>
+    </test>
+  </tests>
+  <help>
 .. class:: infomark
 
 **What it does**
 
-This program generates plots and computes table matrix of coefficient correlations and p-values at multiple scales for the correlation between the occurrences of features in one dataset and their occurrences in another using multiscale wavelet analysis technique. 
+This program generates plots and computes table matrix of coefficient correlations and p-values at multiple scales for the correlation between the occurrences of features in one dataset and their occurrences in another using multiscale wavelet analysis technique.
 
 The program assumes that the user has two sets of DNA sequences, S1 and S1, each of which consists of one or more sequences of equal length. Each sequence in each set is divided into the same number of multiple intervals n such that n = 2^k, where k is a positive integer and  k >= 1. Thus, n could be any value of the set {2, 4, 8, 16, 32, 64, 128, ...}. k represents the number of scales.
 
 The program has two input files obtained as follows:
 
-For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S1 and S1, and builds two tabular files representing the count results in each interval of S1 and S1. These are the input files of the program. 
+For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S1 and S1, and builds two tabular files representing the count results in each interval of S1 and S1. These are the input files of the program.
 
 The program gives two output files:
 
@@ -40,7 +54,7 @@
 
 **Note**
 
-In order to obtain empirical p-values, a random perumtation test is implemented by the program, which results in the fact that the program gives slightly different results each time it is run on the same input file. 
+In order to obtain empirical p-values, a random perumtation test is implemented by the program, which results in the fact that the program gives slightly different results each time it is run on the same input file.
 
 -----
 
@@ -86,7 +100,7 @@
 		111			150			138			102				451
 		94			128			151			138				481
 
-  
+
 We notice that the number of scales here is 4 because 16 = 2^4. Running the program on the above input files gives the following output:
 
 The first output file::
@@ -107,6 +121,6 @@
 .. image:: dwt_cor_aVa_4.png
 .. image:: dwt_cor_aVa_5.png
 
-  </help>  
-  
+  </help>
+
 </tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/in1.tsv	Mon Jul 06 18:11:43 2020 +0000
@@ -0,0 +1,17 @@
+deletionHoptspot	insertionHoptspot	dnaPolPauseFrameshift	topoisomeraseCleavageSite	translinTarget
+269	366	330	238	1129
+239	328	327	283	1188
+254	351	358	297	1151
+262	371	355	256	1107
+254	361	352	234	1192
+265	354	367	240	1182
+255	359	333	235	1217
+271	389	387	272	1241
+240	305	341	249	1159
+272	351	337	257	1169
+275	351	337	233	1158
+305	331	361	253	1172
+277	341	343	253	1113
+266	362	355	267	1162
+235	326	329	241	1230
+254	335	360	251	1172
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/in2.tsv	Mon Jul 06 18:11:43 2020 +0000
@@ -0,0 +1,17 @@
+deletionHoptspot	insertionHoptspot	dnaPolPauseFrameshift	topoisomeraseCleavageSite	translinTarget
+104	146	142	113	478
+89	146	151	94	495
+100	176	151	88	435
+96	163	128	114	468
+99	138	144	91	513
+112	126	162	106	468
+86	127	145	83	491
+104	145	171	110	496
+91	121	147	104	469
+103	141	145	98	458
+92	134	142	117	468
+97	146	145	107	471
+115	121	136	109	470
+113	135	138	101	491
+111	150	138	102	451
+94	128	151	138	481
Binary file test-data/out2.pdf has changed