# HG changeset patch
# User devteam
# Date 1396357986 14400
# Node ID d8c414b9d7745cf95f88b110580313464f2507f4
Imported from capsule None
diff -r 000000000000 -r d8c414b9d774 rcve.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rcve.py Tue Apr 01 09:13:06 2014 -0400
@@ -0,0 +1,142 @@
+#!/usr/bin/env python
+
+import sys
+from rpy import *
+import numpy
+
+def stop_err(msg):
+ sys.stderr.write(msg)
+ sys.exit()
+
+
+def sscombs(s):
+ if len(s) == 1:
+ return [s]
+ else:
+ ssc = sscombs(s[1:])
+ return [s[0]] + [s[0]+comb for comb in ssc] + ssc
+
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+
+print "Predictor columns: %s; Response column: %d" % ( x_cols, y_col+1 )
+fout = open(outfile,'w')
+
+for i, line in enumerate( file ( infile )):
+ line = line.rstrip('\r\n')
+ if len( line )>0 and not line.startswith( '#' ):
+ elems = line.split( '\t' )
+ break
+ if i == 30:
+ break # Hopefully we'll never get here...
+
+if len( elems )<1:
+ stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k, col in enumerate(x_cols):
+ x_cols[k] = int(col)-1
+ x_vals.append([])
+ """
+ try:
+ float( elems[x_cols[k]] )
+ except:
+ try:
+ msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." % ( col, elems[x_cols[k]] )
+ except:
+ msg = "This operation cannot be performed on non-numeric data."
+ stop_err( msg )
+ """
+NA = 'NA'
+for ind, line in enumerate( file( infile )):
+ if line and not line.startswith( '#' ):
+ try:
+ fields = line.split("\t")
+ try:
+ yval = float(fields[y_col])
+ except Exception, ey:
+ yval = r('NA')
+ #print >>sys.stderr, "ey = %s" %ey
+ y_vals.append(yval)
+ for k, col in enumerate(x_cols):
+ try:
+ xval = float(fields[col])
+ except Exception, ex:
+ xval = r('NA')
+ #print >>sys.stderr, "ex = %s" %ex
+ x_vals[k].append(xval)
+ except:
+ pass
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+dat = r.list( x=array(x_vals1), y=y_vals )
+
+set_default_mode(NO_CONVERSION)
+try:
+ full = r.lm( r("y ~ x"), data=r.na_exclude(dat) ) #full model includes all the predictor variables specified by the user
+except RException, rex:
+ stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
+set_default_mode(BASIC_CONVERSION)
+
+summary = r.summary(full)
+fullr2 = summary.get('r.squared','NA')
+
+if fullr2 == 'NA':
+ stop_err("Error in linear regression")
+
+if len(x_vals) < 10:
+ s = ""
+ for ch in range(len(x_vals)):
+ s += str(ch)
+else:
+ stop_err("This tool only works with less than 10 predictors.")
+
+print >> fout, "#Model\tR-sq\tRCVE_Terms\tRCVE_Value"
+all_combos = sorted(sscombs(s), key=len)
+all_combos.reverse()
+for j, cols in enumerate(all_combos):
+ #if len(cols) == len(s): #Same as the full model above
+ # continue
+ if len(cols) == 1:
+ x_vals1 = x_vals[int(cols)]
+ else:
+ x_v = []
+ for col in cols:
+ x_v.append(x_vals[int(col)])
+ x_vals1 = numpy.asarray(x_v).transpose()
+ dat = r.list(x=array(x_vals1), y=y_vals)
+ set_default_mode(NO_CONVERSION)
+ red = r.lm(r("y ~ x"), data= dat) #Reduced model
+ set_default_mode(BASIC_CONVERSION)
+ summary = r.summary(red)
+ redr2 = summary.get('r.squared','NA')
+ try:
+ rcve = (float(fullr2)-float(redr2))/float(fullr2)
+ except:
+ rcve = 'NA'
+ col_str = ""
+ for col in cols:
+ col_str = col_str + str(int(x_cols[int(col)]) + 1) + " "
+ col_str.strip()
+ rcve_col_str = ""
+ for col in s:
+ if col not in cols:
+ rcve_col_str = rcve_col_str + str(int(x_cols[int(col)]) + 1) + " "
+ rcve_col_str.strip()
+ if len(cols) == len(s): #full model
+ rcve_col_str = "-"
+ rcve = "-"
+ try:
+ redr2 = "%.4f" % (float(redr2))
+ except:
+ pass
+ try:
+ rcve = "%.4f" % (float(rcve))
+ except:
+ pass
+ print >> fout, "%s\t%s\t%s\t%s" % ( col_str, redr2, rcve_col_str, rcve )
diff -r 000000000000 -r d8c414b9d774 rcve.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rcve.xml Tue Apr 01 09:13:06 2014 -0400
@@ -0,0 +1,75 @@
+
+
+
+ R
+ numpy
+ rpy
+
+
+ rcve.py
+ $input1
+ $response_col
+ $predictor_cols
+ $out_file1
+ 1>/dev/null
+
+
+
+
+
+
+
+
+
+
+
+
+ rpy
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool computes the RCVE (Relative Contribution to Variance) for all possible variable subsets using the following formula:
+
+**RCVE(i) = [R-sq (full: 1,2,..,i..,p-1) - R-sq(without i: 1,2,...,p-1)] / R-sq (full: 1,2,..,i..,p-1)**,
+which denotes the case where the 'i'th predictor is dropped.
+
+
+In general,
+**RCVE(X+) = [R-sq (full: {X,X+}) - R-sq(reduced: {X})] / R-sq (full: {X,X+})**,
+where,
+
+- {X,X+} denotes the set of all predictors,
+- X+ is the set of predictors for which we compute RCVE (and therefore drop from the full model to obtain a reduced one),
+- {X} is the set of the predictors that are left in the reduced model after excluding {X+}
+
+
+The 4 columns in the output are described below:
+
+- Column 1 (Model): denotes the variables present in the model ({X})
+- Column 2 (R-sq): denotes the R-squared value corresponding to the model in Column 1
+- Column 3 (RCVE_Terms): denotes the variable/s for which RCVE is computed ({X+}). These are the variables that are absent in the reduced model in Column 1. A '-' in this column indicates that the model in Column 1 is the Full model.
+- Column 4 (RCVE): denotes the RCVE value corresponding to the variable/s in Column 3. A '-' in this column indicates that the model in Column 1 is the Full model.
+
+
+
+
diff -r 000000000000 -r d8c414b9d774 test-data/rcve_out.dat
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rcve_out.dat Tue Apr 01 09:13:06 2014 -0400
@@ -0,0 +1,8 @@
+#Model R-sq RCVE_Terms RCVE_Value
+2 3 4 0.3997 - -
+3 4 0.3319 2 0.1697
+2 4 0.2974 3 0.2561
+2 3 0.3985 4 0.0031
+4 0.1226 2 3 0.6934
+3 0.2733 2 4 0.3164
+2 0.2972 3 4 0.2564
diff -r 000000000000 -r d8c414b9d774 test-data/reg_inp.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/reg_inp.tab Tue Apr 01 09:13:06 2014 -0400
@@ -0,0 +1,100 @@
+2.04 2.01 1070 5
+2.56 3.40 1254 6
+3.75 3.68 1466 6
+1.10 1.54 706 4
+3.00 3.32 1160 5
+0.05 0.33 756 3
+1.38 0.36 1058 2
+1.50 1.97 1008 7
+1.38 2.03 1104 4
+4.01 2.05 1200 7
+1.50 2.13 896 7
+1.29 1.34 848 3
+1.90 1.51 958 5
+3.11 3.12 1246 6
+1.92 2.14 1106 4
+0.81 2.60 790 5
+1.01 1.90 954 4
+3.66 3.06 1500 6
+2.00 1.60 1046 5
+2.05 1.96 1054 4
+2.60 1.96 1198 6
+2.55 1.56 940 3
+0.38 1.60 456 6
+2.48 1.92 1150 7
+2.74 3.09 636 6
+1.77 0.78 744 5
+1.61 2.12 644 5
+0.99 1.85 842 3
+1.62 1.78 852 5
+2.03 1.03 1170 3
+3.50 3.44 1034 10
+3.18 2.42 1202 5
+2.39 1.74 1018 5
+1.48 1.89 1180 5
+1.54 1.43 952 3
+1.57 1.64 1038 4
+2.46 2.69 1090 6
+2.42 1.79 694 5
+2.11 2.72 1096 6
+2.04 2.15 1114 5
+1.68 2.22 1256 6
+1.64 1.55 1208 5
+2.41 2.34 820 6
+2.10 2.92 1222 4
+1.40 2.10 1120 5
+2.03 1.64 886 4
+1.99 2.83 1126 7
+2.24 1.76 1158 4
+0.45 1.81 676 6
+2.31 2.68 1214 7
+2.41 2.55 1136 6
+2.56 2.70 1264 6
+2.50 1.66 1116 3
+2.92 2.23 1292 4
+2.35 2.01 604 5
+2.82 1.24 854 6
+1.80 1.95 814 6
+1.29 1.73 778 3
+1.68 1.08 800 2
+3.44 3.46 1424 7
+1.90 3.01 950 6
+2.06 0.54 1056 3
+3.30 3.20 956 8
+1.80 1.50 1352 5
+2.00 1.71 852 5
+1.68 1.99 1168 5
+1.94 2.76 970 6
+0.97 1.56 776 4
+1.12 1.78 854 6
+1.31 1.32 1232 5
+1.68 0.87 1140 6
+3.09 1.75 1084 4
+1.87 1.41 954 2
+2.00 2.77 1000 4
+2.39 1.78 1084 4
+1.50 1.34 1058 4
+1.82 1.52 816 5
+1.80 2.97 1146 7
+2.01 1.75 1000 6
+1.88 1.64 856 4
+1.64 1.80 798 4
+2.42 3.37 1324 6
+0.22 1.15 704 6
+2.31 1.72 1222 5
+0.95 2.27 948 6
+1.99 2.85 1182 8
+1.86 2.21 1000 6
+1.79 1.94 910 6
+3.02 4.25 1374 9
+1.85 1.83 1014 6
+1.98 2.75 1420 7
+2.15 1.71 400 6
+1.46 2.20 998 7
+2.29 2.13 776 6
+2.39 2.38 1134 7
+1.80 1.64 772 4
+2.64 1.87 1304 6
+2.08 2.53 1212 4
+0.70 1.78 818 6
+0.89 1.20 864 2
\ No newline at end of file
diff -r 000000000000 -r d8c414b9d774 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Tue Apr 01 09:13:06 2014 -0400
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+