Mercurial > repos > kellrott > dichotomize
changeset 0:40e5c7e8265e draft default tip
Uploaded
author | kellrott |
---|---|
date | Fri, 21 Dec 2012 16:55:36 -0500 |
parents | |
children | |
files | dichotomize/dichotomize.py dichotomize/dichotomize.xml |
diffstat | 2 files changed, 290 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
"""dichotomize/dichotomize.py: turn a phenotype matrix into dichotomous features.

The input is a tab-delimited file whose first row is the header and whose
first column is the sample name.  Every other column is turned into zero or
more true/false "splits":

* numeric columns are split at their mean (or median, when requested), and
* non-numeric columns get one set-membership split per distinct label.

Which splits are generated is either decided automatically or read from a
"split script" made of ``mode,column`` lines (see ``--script-file``).  The
result is written to stdout as a tab-delimited matrix with one column per
split that survived the ``--group-min`` filter.
"""

import sys
import csv
import re
from argparse import ArgumentParser


# Cell values treated as missing data.  Cells are compared verbatim, so the
# match is case-sensitive.
NA_SET = ['nan', 'na', 'n/a', '']


class FloatSplitter(object):
    """Split a numeric column by comparing each value against a threshold."""

    def __init__(self, name, column, splitval):
        self.name = name          # label used for the output column
        self.column = column      # index of the source column
        self.splitval = splitval  # threshold a value must exceed to be True

    def split(self, val):
        """Return True when ``val``, parsed as a float, exceeds the threshold.

        Raises ValueError if ``val`` cannot be parsed as a float.
        """
        return float(val) > self.splitval

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name


class SetMember(object):
    """Split a column by testing each value for equality with one label."""

    def __init__(self, name, column, label):
        self.name = name      # label used for the output column
        self.column = column  # index of the source column
        self.label = label    # value that counts as a member (True)

    def split(self, val):
        """Return True when ``val`` equals this splitter's label."""
        return val == self.label

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name


class SplitGenerator(object):
    """Base class for objects that yield splitters for a single column.

    Parameters:
        args:   parsed command-line options (only ``args.max`` is read, and
                only by ``EnumerationGen``).
        name:   the column's header text, used to build splitter names.
        values: the column's cell values, one string per data row.
        column: index of the column in the input matrix; it is handed to
                every splitter this generator yields.  Optional so that the
                original three-argument call still works.
    """

    def __init__(self, args, name, values, column=None):
        self.args = args
        self.name = name
        self.values = values
        self.column = column

    def float_values(self):
        """Return the non-missing values parsed as floats.

        Raises ValueError if any non-missing value is not a number.
        """
        return [float(v) for v in self.values if v not in NA_SET]

    def value_set(self):
        """Return a dict whose keys are the distinct values of the column.

        Missing-value markers are not filtered out here; they count as
        values like any other.
        """
        return dict.fromkeys(self.values, True)

    def is_valid(self):
        """Return True when the column holds at least one number and nothing
        but numbers and missing-value markers."""
        try:
            return len(self.float_values()) > 0
        except ValueError:
            return False


class EnumerationGen(SplitGenerator):
    """Yield one ``SetMember`` splitter per distinct value of the column."""

    def is_valid(self):
        """Return True when the column has more than one distinct value but
        fewer than ``args.max``."""
        return 1 < len(self.value_set()) < self.args.max

    def __iter__(self):
        for v in self.value_set():
            yield SetMember("%s:label=%s" % (self.name, v), self.column, v)


class MeanGen(SplitGenerator):
    """Yield a single ``FloatSplitter`` thresholded at the column mean."""

    def __iter__(self):
        fvals = self.float_values()
        if not fvals:
            # Nothing to average; yielding no splitter avoids a division by zero.
            return
        yield FloatSplitter("%s:mean" % (self.name), self.column,
                            sum(fvals) / float(len(fvals)))


class MedianGen(SplitGenerator):
    """Yield a single ``FloatSplitter`` thresholded at the column median."""

    def __iter__(self):
        fvals = sorted(self.float_values())
        if not fvals:
            return
        # Floor division keeps the index an int on Python 2 and Python 3.
        mid = len(fvals) // 2
        if len(fvals) % 2:
            median = fvals[mid]
        else:
            median = (fvals[mid - 1] + fvals[mid]) / 2.0
        yield FloatSplitter("%s:median" % (self.name), self.column, median)


# Maps the mode keyword used in a split script to its generator class.
class_map = {
    "enumerate": EnumerationGen,
    "mean": MeanGen,
    "median": MedianGen
}


def _read_matrix(path):
    """Load the tab-delimited matrix at ``path``.

    Returns ``(header, col_map, col_vals, row_count)`` where ``header`` maps
    column index to name, ``col_map`` maps name to index, ``col_vals`` maps
    column index to the list of cell values, and ``row_count`` is the number
    of data rows.  Blank rows are skipped, and rows shorter than the header
    are padded with empty strings (which are missing-value markers).
    """
    header = {}
    col_map = {}
    col_vals = {}
    row_count = 0
    have_header = False
    with open(path) as handle:
        for row in csv.reader(handle, delimiter="\t"):
            if not row:
                continue
            if not have_header:
                have_header = True
                for i, a in enumerate(row):
                    header[i] = a
                    col_map[a] = i
                    col_vals[i] = []
            else:
                row_count += 1
                for col in header:
                    col_vals[col].append(row[col] if col < len(row) else "")
    return header, col_map, col_vals, row_count


def _column_index(spec, col_map, column_count):
    """Resolve a column reference to an index, or return None.

    ``spec`` is either a header name or ``c<N>``, where N is used directly as
    the index into the header.
    """
    if spec in col_map:
        return col_map[spec]
    res = re.search(r'^c(\d+)', spec)
    if res:
        index = int(res.group(1))
        if index < column_count:
            return index
    return None


def _auto_splitters(args, header, col_vals):
    """Pick splitters for every column but the first: a mean split for
    numeric columns, otherwise an enumeration when the column qualifies."""
    splitters = []
    for i in range(1, len(header)):
        mean = MeanGen(args, header[i], col_vals[i], i)
        if mean.is_valid():
            splitters.extend(mean)
        else:
            enum = EnumerationGen(args, header[i], col_vals[i], i)
            if enum.is_valid():
                splitters.extend(enum)
    return splitters


def _script_splitters(args, header, col_map, col_vals):
    """Build splitters from the ``mode,column`` lines of ``args.script_file``.

    Lines that do not have exactly two comma-separated fields are ignored.
    An unknown mode, an unresolvable column, or a numeric mode applied to a
    non-numeric column ends the program with an error message.
    """
    splitters = []
    with open(args.script_file) as handle:
        for line in handle:
            row = line.rstrip().split(",")
            if len(row) != 2:
                continue
            mode, spec = row
            if mode not in class_map:
                sys.exit("Unknown split mode: %s" % mode)
            index = _column_index(spec, col_map, len(header))
            if index is None:
                sys.exit("Unknown column: %s" % spec)
            gen = class_map[mode](args, header[index], col_vals[index], index)
            try:
                splitters.extend(gen)
            except ValueError:
                sys.exit("Column %s is not numeric; cannot apply %s"
                         % (spec, mode))
    return splitters


def _filter_splitters(splitters, col_vals, group_min):
    """Keep the splitters whose True and False groups each contain at least
    ``group_min`` samples; missing values are not counted."""
    kept = []
    for t in splitters:
        set_count = {True: 0, False: 0}
        for val in col_vals[t.column]:
            if val not in NA_SET:
                set_count[t.split(val)] += 1
        if min(set_count.values()) >= group_min:
            kept.append(t)
    return kept


def _write_matrix(out_handle, theories, col_vals, row_count, pos, neg):
    """Write the dichotomized matrix: a header row, then one row per sample
    with ``pos``/``neg`` per splitter and an empty cell for missing values."""
    writer = csv.writer(out_handle, delimiter="\t", lineterminator="\n")
    writer.writerow(["sample"] + [str(t) for t in theories])
    for i in range(row_count):
        out = [col_vals[0][i]]
        for t in theories:
            val = col_vals[t.column][i]
            if val in NA_SET:
                out.append("")
            elif t.split(val):
                out.append(pos)
            else:
                out.append(neg)
        writer.writerow(out)


def main(argv=None):
    """Parse the command line, build the splitters and print the matrix."""
    parser = ArgumentParser()
    parser.add_argument("src", help="Source Phenotype Matrix", default=None)
    parser.add_argument("-p", "--pos", help="Positive Set", default="1")
    parser.add_argument("-n", "--neg", help="Negative Set", default="0")
    parser.add_argument("-m", "--max", help="Max group dichotomizations", type=int, default=3)
    parser.add_argument("-g", "--group-min", dest="min", help="Min members of group", type=int, default=1)
    parser.add_argument("-s", "--script-file", dest="script_file", help="Split Script", default=None)
    args = parser.parse_args(argv)

    header, col_map, col_vals, row_count = _read_matrix(args.src)

    if args.script_file is None:
        start_splitter = _auto_splitters(args, header, col_vals)
    else:
        start_splitter = _script_splitters(args, header, col_map, col_vals)

    theories = _filter_splitters(start_splitter, col_vals, args.min)
    _write_matrix(sys.stdout, theories, col_vals, row_count, args.pos, args.neg)


if __name__ == "__main__":
    main()
<!-- dichotomize/dichotomize.xml: tool definition that wraps dichotomize.py -->
<tool id="dichotomize" name="Dichotomize" version="1.0.0">
  <description> a clinical tabular file</description>
  <!-- Manual mode passes the generated rule script; auto mode passes the
       labels and the enumeration limit.  The input file is the positional
       argument and the result is captured from stdout. -->
  <command interpreter="python">dichotomize.py
#if $dichot_mode.mode=="manual":
-s ${script_file}
#else
-p '${dichot_mode.pos_string}' -n '${dichot_mode.neg_string}' -m ${dichot_mode.max_dichot}
#end if
$infile > $outfile
  </command>
  <inputs>
    <param name="infile" type="data" format="tabular" label="Input Phenotype File"/>
    <conditional name="dichot_mode">
      <param name="mode" type="select" label="Dichotomize Method">
        <option value="auto">Auto</option>
        <option value="manual">Manual</option>
      </param>
      <when value="auto">
        <param name="max_dichot" type="integer" value="3" label="Max Enumerated Dichotomies"/>
        <param name="pos_string" type="text" value="+" label="Positive Label"/>
        <param name="neg_string" type="text" value="-" label="Negative Label"/>
      </when>
      <when value="manual">
        <repeat name="rule" title="Dichotomize Rule" min="1">
          <param name="col" type="text" value="c1" label="Column"/>
          <param name="mode" type="select" label="Mode">
            <option value="mean">Mean Split</option>
            <option value="median">Median Split</option>
            <option value="enumerate">Enumeration</option>
          </param>
        </repeat>
      </when>
    </conditional>
  </inputs>
  <outputs>
    <data name="outfile" format="tabular"/>
  </outputs>
  <configfiles>
    <!-- One "mode,column" line per rule; this is the split script that
         dichotomize.py reads through its -s option. -->
    <configfile name="script_file">
#if $dichot_mode.mode=="manual":
#for r in $dichot_mode.rule:
${r.mode},${r.col}
#end for
#end if
    </configfile>
  </configfiles>
  <help>
Turn a phenotype file into a dichotomous feature file.
The standard tests are:

 - Split float values by mean
 - Set membership for enumerated labels

*Input Phenotype File*
  A tabular file of clinical features, columns=features, rows=samples.
  The first column is assumed to be the sample name

*Positive Label*
  The label printed into the file for positive entries in the dichotomous matrix

*Negative Label*
  The label printed into the file for negative entries in the dichotomous matrix

*Dichotomize Method*

  *Auto*

    *Max Enumerated Dichotomies*
      In the case that a column is not all numbers (and thus, no mean can be found), how many different ways of setting up
      dichotomous groups should be tried? Each grouping is split along samples matching and not-matching a label.
      Non-numeric columns with more than 'max' different values are ignored

  *Manual*

    *Column*
      Either the column name, or a 'c1', 'c2', ... in order to select a column
      by its number

    *Mode*
      Dichotomizing method: mean, median, enumeration

  </help>
</tool>