Mercurial > repos > kellrott > dichotomize

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dichotomize/dichotomize.py	Fri Dec 21 16:55:36 2012 -0500
@@ -0,0 +1,207 @@
+#!/usr/bin/env python
+
+import sys
+import csv
+import re
+from argparse import ArgumentParser
+
+
+class FloatSplitter:
+	"""Dichotomize a numeric column: positive when the value exceeds `splitval`."""
+	def __init__(self, name, column, splitval):
+		self.name = name          # text returned by str() / repr() for this rule
+		self.column = column      # column identifier supplied by the caller
+		self.splitval = splitval  # numeric threshold
+
+	def split(self, val):
+		"""Return True when `val`, parsed as a float, is strictly greater than the threshold."""
+		return float(val) > self.splitval
+
+	def __str__(self):
+		return self.name
+	__repr__ = __str__  # one definition serves both; the original defined __repr__ twice
+
+class SetMember:
+	"""Dichotomize a categorical column: positive when the value equals `label`."""
+	def __init__(self, name, column, label):
+		self.name, self.column, self.label = name, column, label
+
+	def split(self, val):
+		"""Return True when `val` is equal to this rule's label."""
+		return val == self.label
+
+	def __repr__(self):
+		"""Return the rule name."""
+		return self.name
+
+	__str__ = __repr__  # str() and repr() both give the rule name
+
+
+NA_SET = [ 'nan', 'na', 'n/a', '' ]  # cell values treated as missing data
+
+
+class SplitGenerator:
+	"""Base class: builds splitter rules for one column of the phenotype matrix.
+
+	args   -- parsed command-line options (EnumerationGen reads args.max)
+	name   -- column header, used as the prefix of generated rule names
+	values -- list of the column's raw string cells, one per sample row
+	column -- index of the column in the input table, copied onto every rule
+	"""
+	def __init__(self, args, name, values, column=None):
+		self.args = args
+		self.name = name
+		self.values = values
+		self.column = column
+
+	def float_values(self):
+		"""Return the non-missing cells as floats; raises ValueError on non-numeric text."""
+		return [float(v) for v in self.values if v not in NA_SET]
+
+	def value_set(self):
+		"""Return a dict keyed by the distinct non-missing cell values."""
+		return dict((v, True) for v in self.values if v not in NA_SET)
+
+	def is_valid(self):
+		"""Return True when the column has at least one value and all of them parse as floats."""
+		try:
+			return len(self.float_values()) > 0
+		except ValueError:
+			return False
+
+
+class EnumerationGen(SplitGenerator):
+	"""Yield one SetMember rule per distinct label of a categorical column."""
+
+	def is_valid(self):
+		"""Return True when the column has more than one and fewer than args.max labels."""
+		return 1 < len(self.value_set()) < self.args.max
+
+	def __iter__(self):
+		# sorted() keeps the output column order stable from run to run
+		for v in sorted(self.value_set()):
+			yield SetMember( "%s:label=%s" % (self.name, v), self.column, v )
+
+
+class MeanGen(SplitGenerator):
+	"""Yield a single FloatSplitter thresholded at the column mean."""
+
+	def __iter__(self):
+		fvals = self.float_values()
+		if fvals:  # an all-missing column has no mean; yield nothing
+			yield FloatSplitter( "%s:mean" % (self.name), self.column, sum(fvals) / float(len(fvals)) )
+
+
+class MedianGen(SplitGenerator):
+	"""Yield a single FloatSplitter thresholded at the column median."""
+
+	def __iter__(self):
+		fvals = sorted(self.float_values())
+		if not fvals:
+			return
+		mid = len(fvals) // 2  # floor division: a valid list index on Python 2 and 3
+		if len(fvals) % 2:
+			median = fvals[mid]
+		else:
+			median = (fvals[mid - 1] + fvals[mid]) / 2.0
+		yield FloatSplitter("%s:median" % (self.name), self.column, median)
+
+
+class_map = {
+	"enumerate" : EnumerationGen,
+	"mean" : MeanGen,
+	"median" : MedianGen
+}
+
+
+def read_matrix(path):
+	"""Read a tab-separated table; return (header, colMap, colVals, rowCount).
+
+	header maps column index -> name, colMap maps name -> index and colVals maps
+	column index -> list of cells. Short rows are padded with empty cells.
+	"""
+	header, colMap, colVals, rowCount = {}, {}, {}, 0
+	with open(path) as handle:
+		for row in csv.reader(handle, delimiter="\t"):
+			if not header:
+				for i, a in enumerate(row):
+					header[i] = a
+					colMap[a] = i
+					colVals[i] = []
+			else:
+				rowCount += 1
+				for col in header:
+					colVals[col].append(row[col] if col < len(row) else "")
+	return header, colMap, colVals, rowCount
+
+
+def auto_splitters(args, header, colVals):
+	"""Mean split for numeric columns, label enumeration otherwise; column 0 is skipped."""
+	found = []
+	for i in range(1, len(header)):
+		gen = MeanGen(args, header[i], colVals[i], i)
+		if not gen.is_valid():
+			gen = EnumerationGen(args, header[i], colVals[i], i)
+		if gen.is_valid():
+			found.extend(gen)
+	return found
+
+
+def script_splitters(args, script_file, header, colMap, colVals):
+	"""Build the rules listed in a split script made of "mode,column" lines."""
+	found = []
+	with open(script_file) as handle:
+		for line in handle:
+			row = line.rstrip().split(",", 1)  # a column name may itself contain commas
+			if len(row) != 2:
+				continue
+			index = colMap.get(row[1])
+			if index is None:
+				# not a header name: accept 'c<N>' and use N directly as the column index
+				res = re.search(r'^c(\d+)', row[1])
+				index = int(res.group(1)) if res else None
+			if index not in header:
+				raise ValueError("Unknown column %r in split script" % (row[1],))
+			found.extend(class_map[row[0]](args, header[index], colVals[index], index))
+	return found
+
+
+def main():
+	"""Parse options, build the dichotomization rules and write the matrix to stdout."""
+	parser = ArgumentParser()
+	parser.add_argument("src", help="Source Phenotype Matrix", default=None)
+	parser.add_argument("-p", "--pos", help="Positive Set", default="1")
+	parser.add_argument("-n", "--neg", help="Negative Set", default="0")
+	parser.add_argument("-m", "--max", help="Max group dichotomizations", type=int, default=3)
+	parser.add_argument("-g", "--group-min", dest="min", help="Min members of group", type=int, default=1)
+	parser.add_argument("-s", "--script-file", dest="script_file", help="Split Script", default=None)
+	args = parser.parse_args()
+
+	header, colMap, colVals, rowCount = read_matrix(args.src)
+	if args.script_file is None:
+		start_splitter = auto_splitters(args, header, colVals)
+	else:
+		start_splitter = script_splitters(args, args.script_file, header, colMap, colVals)
+
+	# keep only rules whose positive and negative groups both reach --group-min
+	theories = []
+	for t in start_splitter:
+		set_count = { True : 0, False : 0}
+		for v in colVals[t.column]:
+			if v not in NA_SET:
+				set_count[t.split(v)] += 1
+		if min(set_count.values()) >= args.min:
+			theories.append(t)
+
+	writer = csv.writer( sys.stdout, delimiter="\t", lineterminator="\n" )
+	writer.writerow(["sample"] + [str(t) for t in theories])
+	for i in range(rowCount):
+		out = [colVals[0][i]]
+		for t in theories:
+			v = colVals[t.column][i]
+			out.append("" if v in NA_SET else (args.pos if t.split(v) else args.neg))  # missing stays empty
+		writer.writerow(out)
+
+
+if __name__ == "__main__":
+	main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dichotomize/dichotomize.xml	Fri Dec 21 16:55:36 2012 -0500
@@ -0,0 +1,83 @@
+<tool id="dichotomize" name="Dichotomize" version="1.0.0">
+  <description> a clinical tabular file</description>
+  <command interpreter="python">dichotomize.py
+#if $dichot_mode.mode=="manual":
+-s ${script_file}
+#else
+-p '${dichot_mode.pos_string}' -n '${dichot_mode.neg_string}' -m ${dichot_mode.max_dichot}
+#end if
+$infile > $outfile
+  </command>
+  <inputs>
+	  <param name="infile" type="data" format="tabular" label="Input Phenotype File"/>
+    <conditional name="dichot_mode">
+      <param name="mode" type="select" label="Dichotimize Method">
+        <option value="auto">Auto</option>
+        <option value="manual">Manual</option>
+      </param>
+      <when value="auto">
+        <param name="max_dichot" type="integer" value="3" label="Max Enumerated Dichotomies"/>
+        <param name="pos_string" type="text" value="+" label="Positive Label"/>
+        <param name="neg_string" type="text" value="-" label="Negative Label"/>
+      </when>
+      <when value="manual">
+        <repeat name="rule" title="Dichtomize Rule" min="1">
+          <param name="col" type="text" value="c1" label="Column"/>
+          <param name="mode" type="select" label="Mode">
+            <option value="mean">Mean Split</option>
+            <option value="median">Median Split</option>
+            <option value="enumerate">Enumeration</option>
+          </param>
+        </repeat>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+      <data name="outfile" format="tabular"/>
+  </outputs>
+  <configfiles>
+    <configfile name="script_file">
+#if $dichot_mode.mode=="manual":
+#for r in $dichot_mode.rule:
+${r.mode},${r.col}
+#end for
+#end if
+    </configfile>
+  </configfiles>
+  <help>
+Turn a phenotype file into a dichotomous feature file.
+The standard tests are:
+
+ - Split float values by mean
+ - Set membership for enumerated labels
+
+*Input Phenotype File*
+    A tabular file of clinical features: columns are features, rows are samples. The first column is assumed to be the sample name.
+
+*Positive Label*
+    The label printed into the file for positive entries in the dichotomous matrix
+
+*Negative Label*
+    The label printed into the file for negative entries in the dichotomous matrix
+
+*Dichotimize Method*
+
+    *Auto*
+
+        *Max Enumerated Dichotomies*
+            In the case that a column is not all numbers (and thus, no mean can be found), how many different ways of setting up
+            dichotomous groups should be tried? Each grouping is split along samples matching and not-matching a label.
+            Non-numeric columns with more than 'max' different values are ignored
+
+    *Manual*
+
+        *Column*
+            Either the column name, or a 'c1', 'c2', ... in order to select a column
+            by its number
+
+
+        *Mode*
+            Dichotomizing method: mean, median, or enumeration
+
+  </help>
+</tool>