Mercurial > repos > devteam > partialr_square
changeset 0:51e06a12e2ad draft default tip
Imported from capsule None
| author | devteam | 
|---|---|
| date | Tue, 01 Apr 2014 09:13:02 -0400 | 
| parents | |
| children | |
| files | partialR_square.py partialR_square.xml test-data/partialR_result.tabular test-data/regr_inp.tabular tool_dependencies.xml | 
| diffstat | 5 files changed, 244 insertions(+), 0 deletions(-) [+] | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/partialR_square.py Tue Apr 01 09:13:02 2014 -0400 @@ -0,0 +1,145 @@ +#!/usr/bin/env python + +import sys +from rpy import * +import numpy + +#export PYTHONPATH=~/galaxy/lib/ +#running command python partialR_square.py reg_inp.tab 4 1,2,3 partialR_result.tabular + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + + +def sscombs(s): + if len(s) == 1: + return [s] + else: + ssc = sscombs(s[1:]) + return [s[0]] + [s[0]+comb for comb in ssc] + ssc + + +infile = sys.argv[1] +y_col = int(sys.argv[2])-1 +x_cols = sys.argv[3].split(',') +outfile = sys.argv[4] + +print "Predictor columns: %s; Response column: %d" % ( x_cols, y_col+1 ) +fout = open(outfile,'w') + +for i, line in enumerate( file ( infile )): + line = line.rstrip('\r\n') + if len( line )>0 and not line.startswith( '#' ): + elems = line.split( '\t' ) + break + if i == 30: + break # Hopefully we'll never get here... + +if len( elems )<1: + stop_err( "The data in your input dataset is either missing or not formatted properly." ) + +y_vals = [] +x_vals = [] + +for k, col in enumerate(x_cols): + x_cols[k] = int(col)-1 + x_vals.append([]) + """ + try: + float( elems[x_cols[k]] ) + except: + try: + msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." % ( col, elems[x_cols[k]] ) + except: + msg = "This operation cannot be performed on non-numeric data." + stop_err( msg ) + """ +NA = 'NA' +for ind, line in enumerate( file( infile )): + if line and not line.startswith( '#' ): + try: + fields = line.split("\t") + try: + yval = float(fields[y_col]) + except Exception, ey: + yval = r('NA') + #print >> sys.stderr, "ey = %s" %ey + y_vals.append(yval) + for k, col in enumerate(x_cols): + try: + xval = float(fields[col]) + except Exception, ex: + xval = r('NA') + #print >> sys.stderr, "ex = %s" %ex + x_vals[k].append(xval) + except: + pass + +x_vals1 = numpy.asarray(x_vals).transpose() +dat = r.list(x=array(x_vals1), y=y_vals) + +set_default_mode(NO_CONVERSION) +try: + full = r.lm(r("y ~ x"), data= r.na_exclude(dat)) #full model includes all the predictor variables specified by the user +except RException, rex: + stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.") +set_default_mode(BASIC_CONVERSION) + +summary = r.summary(full) +fullr2 = summary.get('r.squared','NA') + +if fullr2 == 'NA': + stop_err("Error in linear regression") + +if len(x_vals) < 10: + s = "" + for ch in range(len(x_vals)): + s += str(ch) +else: + stop_err("This tool only works with less than 10 predictors.") + +print >> fout, "#Model\tR-sq\tpartial_R_Terms\tpartial_R_Value" +all_combos = sorted(sscombs(s), key=len) +all_combos.reverse() +for j, cols in enumerate(all_combos): + #if len(cols) == len(s): #Same as the full model above + # continue + if len(cols) == 1: + x_vals1 = x_vals[int(cols)] + else: + x_v = [] + for col in cols: + x_v.append(x_vals[int(col)]) + x_vals1 = numpy.asarray(x_v).transpose() + dat = r.list(x=array(x_vals1), y=y_vals) + set_default_mode(NO_CONVERSION) + red = r.lm(r("y ~ x"), data= dat) #Reduced model + set_default_mode(BASIC_CONVERSION) + summary = r.summary(red) + redr2 = summary.get('r.squared','NA') + try: + partial_R = (float(fullr2)-float(redr2))/(1-float(redr2)) + except: + partial_R = 'NA' + col_str = "" + for col in cols: + col_str = col_str + str(int(x_cols[int(col)]) + 1) + " " + col_str.strip() + partial_R_col_str = "" + for col in s: + if col not in cols: + partial_R_col_str = partial_R_col_str + str(int(x_cols[int(col)]) + 1) + " " + partial_R_col_str.strip() + if len(cols) == len(s): #full model + partial_R_col_str = "-" + partial_R = "-" + try: + redr2 = "%.4f" % (float(redr2)) + except: + pass + try: + partial_R = "%.4f" % (float(partial_R)) + except: + pass + print >> fout, "%s\t%s\t%s\t%s" % ( col_str, redr2, partial_R_col_str, partial_R )
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/partialR_square.xml Tue Apr 01 09:13:02 2014 -0400 @@ -0,0 +1,73 @@ +<tool id="partialRsq" name="Compute partial R square" version="1.0.0"> + <description> </description> + <requirements> + <requirement type="package" version="2.11.0">R</requirement> + <requirement type="package" version="1.7.1">numpy</requirement> + <requirement type="package" version="1.0.3">rpy</requirement> + </requirements> + <command interpreter="python"> + partialR_square.py + $input1 + $response_col + $predictor_cols + $out_file1 + 1>/dev/null + </command> + <inputs> + <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/> + <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" /> + <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true"> + <validator type="no_options" message="Please select at least one column."/> + </param> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input1" /> + </outputs> + <requirements> + <requirement type="python-module">rpy</requirement> + </requirements> + <tests> + <!-- Test data with vlid values --> + <test> + <param name="input1" value="regr_inp.tabular"/> + <param name="response_col" value="3"/> + <param name="predictor_cols" value="1,2"/> + <output name="out_file1" file="partialR_result.tabular"/> + </test> + + </tests> + <help> + +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters* + +----- + +.. class:: infomark + +**What it does** + +This tool computes the Partial R squared for all possible variable subsets using the following formula: + +**Partial R squared = [SSE(without i: 1,2,...,p-1) - SSE (full: 1,2,..,i..,p-1) / SSE(without i: 1,2,...,p-1)]**, which denotes the case where the 'i'th predictor is dropped. + + + +In general, **Partial R squared = [SSE(without i: 1,2,...,p-1) - SSE (full: 1,2,..,i..,p-1) / SSE(without i: 1,2,...,p-1)]**, where, + +- SSE (full: 1,2,..,i..,p-1) = Sum of Squares left out by the full set of predictors SSE(X1, X2 … Xp) +- SSE (full: 1,2,..,i..,p-1) = Sum of Squares left out by the set of predictors excluding; for example, if we omit the first predictor, it will be SSE(X2 … Xp). + + +The 4 columns in the output are described below: + +- Column 1 (Model): denotes the variables present in the model +- Column 2 (R-sq): denotes the R-squared value corresponding to the model in Column 1 +- Column 3 (Partial R squared_Terms): denotes the variable/s for which Partial R squared is computed. These are the variables that are absent in the reduced model in Column 1. A '-' in this column indicates that the model in Column 1 is the Full model. +- Column 4 (Partial R squared): denotes the Partial R squared value corresponding to the variable/s in Column 3. A '-' in this column indicates that the model in Column 1 is the Full model. + +*R Development Core Team (2010). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. ISBN 3-900051-07-0, URL http://www.R-project.org.* + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/partialR_result.tabular Tue Apr 01 09:13:02 2014 -0400 @@ -0,0 +1,4 @@ +#Model R-sq partial_R_Terms partial_R_Value +1 2 0.9388 - - +2 0.7280 1 0.7750 +1 0.9104 2 0.3167 \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/regr_inp.tabular Tue Apr 01 09:13:02 2014 -0400 @@ -0,0 +1,10 @@ +7 33 42 +4 41 33 +16 7 75 +3 49 28 +21 5 91 +8 31 55 +7 35 52 +5 30 16 +15 10 69 +20 10 94
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Tue Apr 01 09:13:02 2014 -0400 @@ -0,0 +1,12 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="numpy" version="1.7.1"> + <repository changeset_revision="55a7a5e9d63f" name="package_numpy_1_7" owner="devteam" prior_installation_required="False" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + </package> + <package name="rpy" version="1.0.3"> + <repository changeset_revision="c0eb80864491" name="package_rpy_1_0_3" owner="devteam" prior_installation_required="False" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + </package> + <package name="R" version="2.11.0"> + <repository changeset_revision="2c0a13200a73" name="package_r_2_11_0" owner="devteam" prior_installation_required="False" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>
