Mercurial > repos > kellrott > matrix_manipulate
changeset 0:eeaa112c9ee0 draft
Uploaded
line wrap: on
line diff
#!/usr/bin/env python
"""Flatten columns of a tab-separated matrix using the values of a pivot column.

Usage:
    matrix_flatten.py [-m MISSING] pivot_column [flatten_column ...] < in > out

The first column of the input is the row key.  For every distinct non-empty
value ``P`` of the pivot column and every flatten column ``C`` an output
column named ``P:C`` is produced; each key becomes a single output row.
Columns may be given by header name or as ``cN`` (``c1`` is the column right
after the key column).
"""

import sys
import csv
import re
from optparse import OptionParser


def _resolve_column(ref, cols, col_pos):
    """Return the index of the column `ref` names, or None if it is unknown."""
    if ref in col_pos:
        return col_pos[ref]
    match = re.match(r"^c(\d+)$", ref)
    if match:
        index = int(match.group(1))
        # c0 would be the key column itself, which cannot be pivoted/flattened.
        if 0 < index < len(cols):
            return index
    return None


def flatten(table, pivot_col, rep_cols, missing=""):
    """Flatten `rep_cols` of `table` by the values found in `pivot_col`.

    Parameters:
        table: iterable of rows (lists of strings); the first non-empty row
            is the header and the first field of each row is the key.
        pivot_col: header name or ``cN`` reference of the pivot column.
        rep_cols: header names or ``cN`` references of columns to flatten.
            Unknown references are skipped.
        missing: value used for cells that no input row fills.

    Returns:
        A list of output rows, header first; ``[]`` for empty input.  Keys
        and pivot values appear in order of first occurrence.

    Raises:
        ValueError: if `pivot_col` does not name a column.
    """
    rows = [list(row) for row in table if row]
    if not rows:
        return []
    cols = rows.pop(0)
    width = len(cols)
    # For duplicated header names the last occurrence wins.
    col_pos = dict((name, i) for i, name in enumerate(cols))

    pivot = _resolve_column(pivot_col, cols, col_pos)
    if pivot is None:
        raise ValueError("unknown pivot column: %s" % pivot_col)
    sources = []
    for ref in rep_cols:
        index = _resolve_column(ref, cols, col_pos)
        if index is not None:
            sources.append(index)

    # Short rows are padded so every row can be indexed by any header column.
    rows = [row + [""] * (width - len(row)) for row in rows]

    # slots: pivot value -> [(output position, source column index)].
    # Position and name are appended together so they can never misalign.
    names = [cols[0]]
    slots = {}
    for row in rows:
        value = row[pivot]
        if value and value not in slots:
            slots[value] = []
            for src in sources:
                slots[value].append((len(names), src))
                names.append(value + ":" + cols[src])

    out_rows = {}
    order = []
    for row in rows:
        key = row[0]
        if key not in out_rows:
            out_rows[key] = [key] + [missing] * (len(names) - 1)
            order.append(key)
        # Later input rows overwrite earlier ones for the same key/pivot.
        for pos, src in slots.get(row[pivot], ()):
            out_rows[key][pos] = row[src]

    return [names] + [out_rows[key] for key in order]


def main(argv=None):
    """Command line entry point: read stdin, write the flattened matrix to stdout."""
    parser = OptionParser(usage="%prog [options] pivot_column [flatten_column ...]")
    parser.add_option("-m", "--missing", dest="missing", help="Missing", default='')
    opts, args = parser.parse_args(argv)
    if not args:
        parser.error("a pivot column is required")

    reader = csv.reader(sys.stdin, delimiter="\t")
    writer = csv.writer(sys.stdout, delimiter="\t", lineterminator="\n")
    try:
        result = flatten(reader, args[0], args[1:], opts.missing)
    except ValueError as err:
        sys.stderr.write("%s\n" % err)
        sys.exit(1)
    writer.writerows(result)


if __name__ == "__main__":
    main()
<tool id="matrix_flatten" name="Matrix Flatten" version="1.0.0">
  <description> using a state column</description>
  <!-- matrix_flatten.py reads the matrix from stdin, writes to stdout and takes
       the pivot column followed by the flatten columns as positional arguments. -->
  <command interpreter="python">matrix_flatten.py
--missing "$missing"
"$pivot_column"
#for $f in $flatten:
"$f.column"
#end for
&lt; $input_tabular > $out
  </command>
  <inputs>
    <param name="input_tabular" type="data" label="Matrix File"/>
    <param name="pivot_column" type="text" size="90" label="Pivot Column" value="c1"/>
    <repeat name="flatten" title="Flatten Columns Rule" min="1">
      <param name="column" type="text" size="90" label="Flatten Column" value="c2"/>
    </repeat>
    <param name="missing" type="text" label="Missing Value String" value=""/>
  </inputs>
  <outputs>
    <data name="out" format="tabular" label="Flattened Matrix"/>
  </outputs>
  <help>
Use the state of one column to flatten out other columns into discrete columns.
The first column is the primary key, with the next column starting as c1. (So in the example below, c1 is DrugType.)
Columns can be referenced by the cN value or their proper name (i.e. 'c1' or 'DrugType').

Example File

+--------+----------+-----------+
|#sample | DrugType | DrugDose  |
+--------+----------+-----------+
|sample_1| DrugA    | 1         |
+--------+----------+-----------+
|sample_1| DrugB    | 2         |
+--------+----------+-----------+
|sample_2| DrugA    | 3         |
+--------+----------+-----------+
|sample_2| DrugC    | 5         |
+--------+----------+-----------+


One might want to flatten the DrugDose column (c2), using the DrugType column (c1). Using Matrix Flatten, the matrix would become


+--------+---------------+----------------+----------------+
|#sample |DrugA:DrugDose |DrugB:DrugDose  |DrugC:DrugDose  |
+--------+---------------+----------------+----------------+
|sample_1| 1             | 2              |                |
+--------+---------------+----------------+----------------+
|sample_2| 3             |                | 5              |
+--------+---------------+----------------+----------------+

  </help>
</tool>
#!/usr/bin/env python
"""join.py:

Usage:
  join.py [options] file1 file2 [file3 ...]

Options:
  -h         header
  -i         output only features in common with all files
  -f         use 'float' mode to save memory
  -o FILE    write the joined matrix to FILE instead of stdout
  -q         run quietly
"""
import os, os.path, sys, getopt, re
import array

delim = "\t"
verbose = True


def usage(code = 0):
    """Print the module usage text to stderr and exit with `code` (unless None)."""
    sys.stderr.write( __doc__ )
    if code is not None:
        sys.exit(code)


def log(msg, die = False):
    """Write `msg` to stderr; exit with status 1 when `die` is true.

    Non-fatal messages are suppressed in quiet mode, fatal ones never are.
    """
    if verbose or die:
        sys.stderr.write(msg)
    if die:
        sys.exit(1)


def _parse_values(fields, use_float):
    """Return `fields` unchanged, or packed as 32-bit floats (bad cells -> nan)."""
    if not use_float:
        return fields
    out = array.array("f")
    for field in fields:
        try:
            out.append(float(field))
        except ValueError:
            out.append(float('nan'))
    return out


def readFile(inFile, header = True, use_float = False):
    """Read a tab-delimited matrix keyed by its first column.

    Parameters:
        inFile: path of the matrix file.
        header: when true the first line is stored under the "HEADER" key.
        use_float: store row values as ``array.array('f')`` instead of strings.

    Returns:
        ``(dataMap, dataWidth)``: row label -> values, and the number of value
        columns (0 for a file without any rows).

    Exits with status 1 if the header is missing or a row has the wrong width.
    """
    dataWidth = None
    dataMap = {}
    with open(inFile, "r") as f:
        if header:
            line = f.readline()
            # Covers both a whitespace-only first line and a totally empty file.
            if not line.strip():
                log("ERROR: missing header\n", die = True)
            pline = line.rstrip("\n\r").split(delim)
            dataMap["HEADER"] = pline
            dataWidth = len(pline) - 1
        for line in f:
            if line.isspace():
                continue
            pline = line.rstrip("\n\r").split(delim)
            values = pline[1:]
            if dataWidth is None:
                dataWidth = len(values)
            if len(values) != dataWidth:
                log("ERROR: %s: row '%s' has %d values, expected %d\n"
                    % (inFile, pline[0], len(values), dataWidth), die = True)
            dataMap[pline[0]] = _parse_values(values, use_float)
    if dataWidth is None:
        dataWidth = 0
    return (dataMap, dataWidth)


def main(args):
    """Join the matrices named in `args` (or listed on stdin) by row label.

    Rows are taken from the first file (restricted to rows common to all
    files with -i); values missing from a later file are left blank.
    """
    ## parse arguments
    try:
        opts, args = getopt.getopt(args, "hiqfo:")
    except getopt.GetoptError as err:
        sys.stderr.write( str(err) + "\n" )
        usage(2)

    if len(args) > 0:
        files = args
    else:
        files = [line.rstrip("\n\r") for line in sys.stdin]

    if len(files) < 2:
        sys.stderr.write("incorrect number of arguments\n")
        usage(1)

    header = False
    useIntersection = False
    output = None
    use_float = False
    global verbose
    for o, a in opts:
        if o == "-h":
            header = True
        elif o == "-i":
            useIntersection = True
        elif o == "-q":
            verbose = False
        elif o == "-o":
            output = a
        elif o == "-f":
            use_float = True

    ## read files
    fileData = {}
    fileWidth = {}
    for name in files:
        (fileData[name], fileWidth[name]) = readFile(name, header = header, use_float = use_float)
    features = set(fileData[files[0]]) - set(["HEADER"])
    if useIntersection:
        for name in files:
            features &= set(fileData[name])
    features = sorted(features)

    if output is not None:
        ohandle = open(output, "w")
    else:
        ohandle = sys.stdout

    ## output
    try:
        if header:
            lineElements = [fileData[files[0]]["HEADER"][0]]
            for name in files:
                lineElements += fileData[name]["HEADER"][1:]
            ohandle.write("%s\n" % (delim.join(lineElements)))
        for feature in features:
            lineElements = [feature]
            for name in files:
                if feature in fileData[name]:
                    lineElements.extend(str(c) for c in fileData[name][feature])
                else:
                    lineElements.extend([""] * fileWidth[name])
            ohandle.write("%s\n" % delim.join(lineElements))
    finally:
        # Never close stdout; only the file we opened ourselves.
        if ohandle is not sys.stdout:
            ohandle.close()


if __name__ == "__main__":
    main(sys.argv[1:])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/matrix_manipulate/matrix_join.xml Fri Dec 21 16:43:16 2012 -0500 @@ -0,0 +1,24 @@ +<tool id="matrix_join" name="Matrix Join" version="1.0.0"> + <description>Join Matrices using row labels</description> + <command interpreter="python">matrix_join.py -h +-o $out +#if $is_float: +-f +#end if +#for $a in $in_mats: +$a.file +#end for + </command> + <inputs> + <repeat name="in_mats" title="Input Matrix" min="1"> + <param name="file" type="data" label="Matrix File"/> + </repeat> + <param name="is_float" type="boolean" label="Float Values" help="If all matrices are floating point numbers, use to save memory"/> + </inputs> + <outputs> + <data name="out" format="tabular" label="Joined Matrix" help="Joined Matrix"/> + </outputs> + <help> +Join matrices by row labels + </help> +</tool>
#!/usr/bin/env python
"""Merge tab-separated matrices by unique row and column labels.

Usage:
    matrix_merge.py file1 [file2 ...] > merged

When several files provide a value for the same row/column, the value from
the last file that has a non-blank cell is used.
"""

import sys
import csv


class ClinicalMatrix:
    """A string matrix addressed by row and column names.

    ``matrix`` is a list of rows; ``row_map`` / ``col_map`` map names to the
    row / column index.  Blank cells are stored as the empty string.
    """
    corner_name = "sample"

    def __init__(self):
        self.matrix = []
        self.col_map = {}
        self.row_map = {}

    def load(self, path):
        """Load a tab-separated matrix from `path`, replacing current content.

        The first non-blank line is the header.  Repeated column names get
        ``#1``, ``#2`` ... suffixes.  Short rows are padded with blanks and
        extra cells are ignored.  A repeated row name refers to its last row.
        """
        self.matrix = []
        self.col_map = {}
        self.row_map = {}
        header_seen = False
        with open(path) as handle:
            for row in csv.reader(handle, delimiter="\t"):
                if not row:
                    continue
                if not header_seen:
                    header_seen = True
                    for pos, orig_name in enumerate(row[1:]):
                        name = orig_name
                        i = 1
                        while name in self.col_map:
                            name = orig_name + "#" + str(i)
                            i += 1
                        self.col_map[name] = pos
                    continue
                width = len(self.col_map)
                values = row[1:width + 1]
                values.extend([""] * (width - len(values)))
                self.row_map[row[0]] = len(self.matrix)
                self.matrix.append(values)

    def get_row_list(self):
        """
        Returns names of rows, in matrix order
        """
        return sorted(self.row_map, key=self.row_map.get)

    def get_col_list(self):
        """
        Returns names of columns, in matrix order
        """
        return sorted(self.col_map, key=self.col_map.get)

    def get_row(self, row_name):
        """Return the list of cell values of the row called `row_name`."""
        return self.matrix[ self.row_map[row_name] ]

    def set_val(self, col_name, row_name, value):
        """
        Set cell value based on row and column names
        """
        self.matrix[self.row_map[row_name]][self.col_map[col_name]] = value

    def get_val(self, col_name, row_name):
        """
        Get cell value based on row and column names
        """
        return self.matrix[self.row_map[row_name]][self.col_map[col_name]]

    def init_blank(self, cols, rows):
        """
        Initialize the matrix with blank ("") cells using the row/column
        names provided by the user, in the order given.
        """
        cols = list(cols)
        rows = list(rows)
        self.matrix = [[""] * len(cols) for _ in rows]
        self.col_map = dict((c, i) for i, c in enumerate(cols))
        self.row_map = dict((r, i) for i, r in enumerate(rows))
        self.loaded = True

    def write(self, handle, missing=''):
        """Write the matrix to `handle` as TSV, blank cells shown as `missing`."""
        writer = csv.writer(handle, delimiter="\t", lineterminator='\n')
        col_list = self.get_col_list()

        writer.writerow([self.corner_name] + col_list)
        for row_name in self.get_row_list():
            row = self.get_row(row_name)
            out = [row_name]
            for col in col_list:
                val = row[self.col_map[col]]
                out.append(missing if val == "" else val)
            writer.writerow(out)

    def merge(self, other):
        """Return a new matrix holding the union of `self` and `other`.

        Rows and columns of `self` come first, followed by those only in
        `other`.  Where both define a cell, the non-blank value of `other`
        wins; a blank cell never erases a value.  `other` may be None, in
        which case a copy of `self` is returned.
        """
        sources = [self] if other is None else [self, other]
        rows = []
        cols = []
        seen_rows = set()
        seen_cols = set()
        for src in sources:
            for r in src.get_row_list():
                if r not in seen_rows:
                    seen_rows.add(r)
                    rows.append(r)
            for c in src.get_col_list():
                if c not in seen_cols:
                    seen_cols.add(c)
                    cols.append(c)
        out = ClinicalMatrix()
        out.init_blank(cols=cols, rows=rows)
        for src in sources:
            src_cols = src.get_col_list()
            for row in src.get_row_list():
                for col in src_cols:
                    value = src.get_val(col_name=col, row_name=row)
                    if value != "":
                        out.set_val(col_name=col, row_name=row, value=value)
        return out


if __name__ == "__main__" :

    paths = sys.argv[1:]
    if not paths:
        sys.stderr.write("usage: matrix_merge.py file1 [file2 ...]\n")
        sys.exit(2)

    matrix = None
    for p in paths:
        nmatrix = ClinicalMatrix()
        nmatrix.load(p)
        # Later files are merged on top so that their values take precedence.
        matrix = nmatrix if matrix is None else matrix.merge(nmatrix)

    matrix.write(sys.stdout, missing="")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/matrix_manipulate/matrix_merge.xml Fri Dec 21 16:43:16 2012 -0500 @@ -0,0 +1,21 @@ +<tool id="matrix_merge" name="Matrix Merge" version="1.0.0"> + <description>Merge multiple matrices</description> + <command interpreter="python">matrix_merge.py +#for $a in $in_mats: +$a.file +#end for +> $out + </command> + <inputs> + <repeat name="in_mats" title="Input Matrix" min="1"> + <param name="file" type="data" label="Matrix File"/> + </repeat> + </inputs> + <outputs> + <data name="out" format="tabular" label="Merged Matrix"/> + </outputs> + <help> +Join matrices by row and column labels. Unlike 'Matrix Join', which simply concatenates matrices +so that their row labels match (without enforcing unique column labels), 'Matrix Merge' joins all the values using unique row and column labels. The last value for a row/column combination is used. + </help> +</tool>
#!/usr/bin/env python
"""Rank-normalize every number of a tab-separated matrix into [0, 1)."""

import sys
import time
import array
import csv
import math
import ctypes
import ctypes.util
import argparse

try:
    libc = ctypes.cdll.LoadLibrary(ctypes.util.find_library("c"))
except (OSError, TypeError):
    # No usable C library: csort() falls back to Python's own sort.
    libc = None


def rankTransform(rank, total):
    """Scale a zero-based `rank` among `total` values to a fraction."""
    return rank / float(total)


def py_cmp_float(a_ptr, b_ptr):
    """qsort comparison callback for two pointers to C floats."""
    a = a_ptr.contents.value
    b = b_ptr.contents.value
    return (a > b) - (a < b)


CMPFUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.POINTER(ctypes.c_float), ctypes.POINTER(ctypes.c_float))

cmp_float = CMPFUNC(py_cmp_float)


def csort(buf):
    """
    This is an inplace sort of an array.array float class using the C qsort function
    (Python's sort is used when the C library is unavailable).
    """
    addr, count = buf.buffer_info()
    if count < 2:
        return
    if libc is None or not hasattr(libc, "qsort"):
        buf[:] = array.array(buf.typecode, sorted(buf))
        return
    libc.qsort( ctypes.cast(addr, ctypes.POINTER(ctypes.c_float)), count, ctypes.sizeof(ctypes.c_float), cmp_float)


def transformFile(fh, ofh, sep="\t", zero_drop=False, na2zero=False):
    """Replace each value of the matrix read from `fh` by its normalized rank.

    Parameters:
        fh: readable text handle; first line is the header, first column the
            row label.
        ofh: writable text handle for the result (tab separated).
        sep: input field separator.
        zero_drop: exclude zeros from ranking (they are written as NA).
        na2zero: write "0" instead of "NA" for unranked cells.

    Cells that do not parse as a number, and NaN cells, are unranked.  Equal
    values share the rank of their last occurrence in sorted order.

    Raises:
        ValueError: on a row of the wrong width, or if nothing can be ranked.
    Exits with status 10 on input without data rows.
    """
    cols = None
    numCols = 0
    rows = []
    floatValues = array.array('f')
    totalValues = 0
    for row in csv.reader(fh, delimiter=sep):
        if not row:
            continue
        if cols is None:
            cols = row[1:]
            numCols = len(cols)
            continue
        if len(row) - 1 != numCols:
            raise ValueError("row '%s' has %d values, expected %d"
                             % (row[0], len(row) - 1, numCols))
        rows.append(row[0])
        for val in row[1:]:
            try:
                v = float(val)
            except ValueError:
                v = float('nan')
            floatValues.append(v)
            # Count from the stored 32-bit value so the total always agrees
            # with what is ranked below; a literal "nan" cell is not a value.
            stored = floatValues[-1]
            if stored == stored and (not zero_drop or stored != 0.0):
                totalValues += 1

    numRows = len(rows)
    if numRows == 0:
        sys.stderr.write("Empty input\n")
        sys.exit(10)

    if totalValues == 0:
        raise ValueError("did not read any values")

    sortedValues = array.array('f')
    for f in floatValues:
        if f == f and (not zero_drop or f != 0.0):
            sortedValues.append(f)

    csort(sortedValues)

    rankDict = dict()
    for i, val in enumerate(sortedValues):
        rankDict[val] = rankTransform(i, totalValues)

    unranked = "0" if na2zero else "NA"

    def rowString(rowNum):
        base = rowNum * numCols
        out = []
        for colNum in range(numCols):
            val = floatValues[base + colNum]
            if val in rankDict:
                out.append("%5g" % rankDict[val])
            else:
                out.append(unranked)
        return "\t".join(out)

    ofh.write( "probe\t%s\n" % ("\t".join(cols)) )
    for j in range(numRows):
        ofh.write("%s\t%s\n" % (rows[j], rowString(j)))


def main():
    """Parse the command line and run transformFile on the chosen streams."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--sep", help="Seperator", default="\t")
    parser.add_argument("-o", "--out", help="Output", default=None)
    parser.add_argument("-n", "--na2zero", help="Change NAs to Zero", action="store_true", default=False)
    parser.add_argument("-z", "--zero-drop", help="Drop Zero Values", action="store_true", default=False)
    parser.add_argument("input", help="Input Matrix", default=None)

    args = parser.parse_args()

    fh = sys.stdin if args.input == "-" else open(args.input)
    try:
        ofh = sys.stdout if args.out is None else open(args.out, "w")
        try:
            transformFile(fh, ofh, args.sep, args.zero_drop, args.na2zero)
        finally:
            if ofh is not sys.stdout:
                ofh.close()
    finally:
        if fh is not sys.stdin:
            fh.close()


if __name__ == "__main__":
    main()
<tool id="rank_normalize" name="Rank Normalize" version="1.0.0">
  <description>Perform a rank normalized transform to a number matrix</description>
  <!-- The script shipped next to this file is matrix_rank_normalize.py. -->
  <command interpreter="python">matrix_rank_normalize.py
#if $dropZeros:
-z
#end if
#if $na2zero:
-n
#end if
$inMatrix -o $outMatrix
  </command>
  <inputs>
    <param name="inMatrix" type="data" format="tabular" label="Input Matrix"/>
    <param name="dropZeros" type="boolean" label="Drop Input Zeros" checked="True"/>
    <param name="na2zero" type="boolean" label="Set Output NAs to Zero" checked="False"/>
  </inputs>
  <outputs>
    <data name="outMatrix" format="tabular"/>
  </outputs>
  <help>
Determines the ranking of every number in a matrix and then normalizes them into a
distribution between 0 and 1.
  </help>
</tool>
#!/usr/bin/env python
"""Map every number of a tab-separated matrix to a discrete group label.

With boundaries ``b0 < b1 < ...`` and labels ``v0, v1, ...`` (one more label
than boundaries) a value ``x`` gets ``v0`` if ``x <= b0``, ``v1`` if
``b0 < x <= b1`` and so on; values above the last boundary get the last label.
"""

import sys
import time
import array
import csv
import math
import ctypes
import ctypes.util
import argparse
import bisect


def segment_value(cell, bounds, vals):
    """Return the label for `cell`, or `cell` itself if it is not a number.

    `bounds` must be sorted ascending and `vals` must hold one more entry
    than `bounds`.  Non-numeric and NaN cells are passed through unchanged.
    """
    try:
        v = float(cell)
    except ValueError:
        return cell
    if v != v:
        return cell
    # bisect_left counts the boundaries strictly below v, i.e. the group index.
    return vals[bisect.bisect_left(bounds, v)]


def segment_file(ihandle, ohandle, bounds, vals):
    """Copy the header line, then write every data row with segmented values."""
    head = True
    for line in ihandle:
        if head:
            ohandle.write(line)
            head = False
            continue
        # Only strip the line ending so trailing empty cells are preserved.
        row = line.rstrip("\r\n").split("\t")
        out = [row[0]]
        for cell in row[1:]:
            out.append(segment_value(cell, bounds, vals))
        ohandle.write("%s\n" % ("\t".join(out)))


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Input", default=None)
    parser.add_argument("-o", "--output", help="Output", default=None)

    parser.add_argument("-b", dest="bounds", help="Boundaries", type=float, nargs="*", default=None)
    parser.add_argument("-v", dest="vals", help="Group Values", nargs="*", default=None)

    args = parser.parse_args()

    if args.input is None or args.output is None:
        parser.error("both --input and --output are required")
    bounds = args.bounds if args.bounds is not None else []
    vals = args.vals if args.vals is not None else []

    if len(bounds) != len(vals) - 1:
        sys.stderr.write("Number of groups created by boundaries and group values does not match\n")
        sys.exit(1)
    if sorted(bounds) != bounds:
        sys.stderr.write("Boundaries must be given in ascending order\n")
        sys.exit(1)

    ihandle = open(args.input)
    try:
        ohandle = open(args.output, "w")
        try:
            segment_file(ihandle, ohandle, bounds, vals)
        finally:
            ohandle.close()
    finally:
        ihandle.close()
<tool id="rank_segment" name="Matrix Rank Segment" version="1.0.0">
  <description>Perform a rank normalized transform to a number matrix</description>
  <!-- The script shipped next to this file is matrix_rank_segment.py.
       Cheetah loop variables need the leading '$'. -->
  <command interpreter="python">matrix_rank_segment.py
-v $value
#for $v in $bounds
${v.value}
#end for
-b
#for $v in $bounds
${v.range}
#end for
-i $inMatrix -o $outMatrix
  </command>
  <inputs>
    <param name="inMatrix" type="data" format="tabular" label="Input Matrix"/>
    <param name="value" type="text" label="Below Label" value="0"/>
    <repeat name="bounds" title="Boundaries" min="1">
      <param name="range" type="float" label="Boundary" value="0.5"/>
      <param name="value" type="text" label="Above Label" value="1"/>
    </repeat>
  </inputs>
  <outputs>
    <data name="outMatrix" format="tabular"/>
  </outputs>
  <help>
Use a set of boundaries to transform all the values in a matrix into a set of discrete states.
  </help>
</tool>
#!/usr/bin/env python
"""Saturate a matrix: copy the values of parent rows down to their child rows.

The ID tree file has two tab-separated columns per line: ``parent``, ``child``.
"""

import sys
import csv
import argparse


class IDDag:
    """Parent/child relationships between ids, loaded from a two column file."""

    def __init__(self):
        self.graph = None
        self.rev_graph = None
        # NOTE: despite its name this maps a parent id to its list of children.
        self.parents = {}

    def load(self, path):
        """Load ``parent<TAB>child`` lines from `path`; malformed lines are skipped."""
        self.parents = {}
        self.graph = None
        self.rev_graph = None
        with open(path) as handle:
            for line in handle:
                tmp = line.rstrip("\r\n").split("\t")
                if len(tmp) < 2:
                    continue
                self.parents.setdefault(tmp[0], []).append(tmp[1])

    def get_key_list(self):
        """Return the ids that have at least one child."""
        return list(self.parents.keys())

    def get_by(self, key):
        """Return the list of children recorded for `key`."""
        return self.parents[key]

    def _build_graph(self):
        """Build the parent->children and child->parents lookup tables."""
        self.graph = {}
        self.rev_graph = {}
        for pid in self.get_key_list():
            children = self.graph.setdefault(pid, {})
            for cid in self.get_by(pid):
                children[cid] = True
                self.rev_graph.setdefault(cid, {})[pid] = True

    def is_descendant(self, parent, child):
        """Return True if `parent` is an ancestor of `child` (any path)."""
        if self.graph is None:
            self._build_graph()
        seen = set([child])
        stack = [child]
        while stack:
            for pid in self.rev_graph.get(stack.pop(), ()):
                if pid == parent:
                    return True
                if pid not in seen:
                    # The seen set keeps cyclic input from looping forever.
                    seen.add(pid)
                    stack.append(pid)
        return False

    def _desc_crawl(self, parent):
        """Return every id reachable below `parent` (never `parent` itself)."""
        found = []
        seen = set([parent])
        stack = [parent]
        while stack:
            for node in self.graph.get(stack.pop(), ()):
                if node and node not in seen:
                    seen.add(node)
                    found.append(node)
                    stack.append(node)
        return found

    def get_descendants(self, parent):
        """Return a list of all descendants of `parent`."""
        if self.graph is None:
            self._build_graph()
        return self._desc_crawl(parent)

    def get_children(self, node):
        """Return the direct children of `node` ([] if it has none)."""
        if self.graph is None:
            self._build_graph()
        return list(self.graph.get(node, ()))

    def get_parents(self, node):
        """Return the direct parents of `node` ([] if it has none)."""
        if self.graph is None:
            self._build_graph()
        return list(self.rev_graph.get(node, ()))

    def in_graph(self, name):
        """Return True if `name` occurs as a parent or as a child."""
        if self.graph is None:
            self._build_graph()
        return name in self.graph or name in self.rev_graph


class IDReducer(object):
    """
    The IDReducer class uses an IDDag to 'reduce' ids to their top-most
    parent id.

    Assume Matrix 1 has aliquot ids like
        - sample1-aliquot1
        - sample2-aliquot1
    And that Matrix 2 has aliquot ids like
        - sample1-aliquot2
        - sample2-aliquot2

    Both files deal with the same samples, but different aliquots were
    run on different machines, producing matrices of different datatypes.
    For data integration purposes, aliquots need to be referred to by their
    parent sample name.

    The idDag file for this data would be::

        sample1 sample1-aliquot1
        sample1 sample1-aliquot2
        sample2 sample2-aliquot1
        sample2 sample2-aliquot2

    If this file was loaded into an IDDag and used to initialize an IDReducer,
    the following transformations would be valid::

        > idReducer.reduce_id( 'sample1-aliquot1' )
        'sample1'
        > idReducer.reduce_id( 'sample1-aliquot2' )
        'sample1'

    """
    def __init__(self, idDag):
        # child id -> {parent id: edge type}.  The two column tree file has
        # no edge types, so every edge is stored with type None.
        self.revGraph = {}
        for pid in idDag.get_key_list():
            for cid in idDag.get_by(pid):
                self.revGraph.setdefault(cid, {})[pid] = None

    def reduce_id(self, id, edgeStop=None):
        """Follow parent links upward from `id` and return the id reached.

        Edges whose type equals `edgeStop` are not followed.  When an id has
        several usable parents the last one recorded is taken.
        """
        outID = id
        seen = set([outID])
        while outID in self.revGraph:
            pn = None
            for p in self.revGraph[outID]:
                if edgeStop is None or edgeStop != self.revGraph[outID][p]:
                    pn = p
            if pn is None or pn in seen:
                # No usable parent, or a cycle: stop where we are.
                return outID
            seen.add(pn)
            outID = pn
        return outID

    def reduce_matrix(self, matrix, edgeStop=None):
        """Return a matrix whose columns are the reduced ids of `matrix`.

        Columns reducing to the same id are averaged per row; cells that are
        not numeric are ignored and a cell without numeric input stays blank.
        """
        groups = {}
        order = []
        for col in matrix.get_col_list():
            rval = self.reduce_id(col, edgeStop)
            if rval not in groups:
                groups[rval] = []
                order.append(rval)
            groups[rval].append(col)
        rows = matrix.get_row_list()
        out = ClinicalMatrix()
        out.init_blank( cols=order, rows=rows )
        for row in rows:
            for col in order:
                nums = []
                for nc in groups[col]:
                    try:
                        nums.append(float(matrix.get_val( col_name=nc, row_name=row )))
                    except (TypeError, ValueError):
                        pass
                if nums:
                    out.set_val(row_name=row, col_name=col, value=sum(nums) / len(nums))
        return out


class IDExpander(object):
    """Expand ids to their descendants and push row values down the tree."""

    def __init__(self, idDag):
        # parent id -> list of direct children
        self.expGraph = {}
        for pid in idDag.get_key_list():
            self.expGraph.setdefault(pid, []).extend(idDag.get_by(pid))

    def _expand_depths(self, node, leaf_only):
        """Return {expanded id: distance below `node`}.

        An id without children expands to itself (distance 0).  Otherwise the
        result holds its descendants (only the leaves when `leaf_only`), and
        never `node` itself.
        """
        if not self.expGraph.get(node):
            return {node: 0}
        depths = {}
        visited = set([node])
        frontier = [node]
        depth = 0
        while frontier:
            depth += 1
            following = []
            for cur in frontier:
                for child in self.expGraph.get(cur, ()):
                    if child in visited:
                        continue
                    visited.add(child)
                    has_children = bool(self.expGraph.get(child))
                    if not leaf_only or not has_children:
                        depths[child] = depth
                    if has_children:
                        following.append(child)
            frontier = following
        return depths

    def expand_id(self, id, leaf_only=False):
        """Return the ids `id` expands to (see `_expand_depths`)."""
        return list(self._expand_depths(id, leaf_only).keys())

    def expand_matrix(self, matrix, leaf_only=False):
        """Return a matrix with one row per expanded id of `matrix`'s rows.

        Each output row receives the values of every input row that expands
        to it.  The nearest source wins (a row's own values first, then its
        closest ancestor) and blank cells never overwrite a value.
        """
        sources = {}
        for row in matrix.get_row_list():
            for e_val, depth in self._expand_depths(row, leaf_only).items():
                sources.setdefault(e_val, []).append((depth, row))
        cols = matrix.get_col_list()
        out = ClinicalMatrix()
        out.init_blank( rows=sorted(sources.keys()), cols=cols )

        for row, srcs in sources.items():
            # Apply the farthest ancestor first so nearer sources overwrite it.
            for depth, pid in sorted(srcs, key=lambda s: -s[0]):
                for col in cols:
                    value = matrix.get_val(row_name=pid, col_name=col)
                    if value != "":
                        out.set_val( row_name=row, col_name=col, value=value)
        return out


class ClinicalMatrix:
    """A string matrix addressed by row and column names.

    ``matrix`` is a list of rows; ``row_map`` / ``col_map`` map names to the
    row / column index.  Blank cells are stored as the empty string.
    """
    corner_name = "sample"

    def __init__(self):
        self.matrix = []
        self.col_map = {}
        self.row_map = {}

    def load(self, path):
        """Load a tab-separated matrix from `path`, replacing current content.

        The first non-blank line is the header.  Repeated column names get
        ``#1``, ``#2`` ... suffixes.  Short rows are padded with blanks and
        extra cells are ignored.  A repeated row name refers to its last row.
        """
        self.matrix = []
        self.col_map = {}
        self.row_map = {}
        header_seen = False
        with open(path) as handle:
            for row in csv.reader(handle, delimiter="\t"):
                if not row:
                    continue
                if not header_seen:
                    header_seen = True
                    for pos, orig_name in enumerate(row[1:]):
                        name = orig_name
                        i = 1
                        while name in self.col_map:
                            name = orig_name + "#" + str(i)
                            i += 1
                        self.col_map[name] = pos
                    continue
                width = len(self.col_map)
                values = row[1:width + 1]
                values.extend([""] * (width - len(values)))
                self.row_map[row[0]] = len(self.matrix)
                self.matrix.append(values)

    def get_row_list(self):
        """
        Returns names of rows, in matrix order
        """
        return sorted(self.row_map, key=self.row_map.get)

    def get_col_list(self):
        """
        Returns names of columns, in matrix order
        """
        return sorted(self.col_map, key=self.col_map.get)

    def get_row(self, row_name):
        """Return the list of cell values of the row called `row_name`."""
        return self.matrix[ self.row_map[row_name] ]

    def set_val(self, col_name, row_name, value):
        """
        Set cell value based on row and column names
        """
        self.matrix[self.row_map[row_name]][self.col_map[col_name]] = value

    def get_val(self, col_name, row_name):
        """
        Get cell value based on row and column names
        """
        return self.matrix[self.row_map[row_name]][self.col_map[col_name]]

    def init_blank(self, cols, rows):
        """
        Initialize the matrix with blank ("") cells using the row/column
        names provided by the user, in the order given.
        """
        cols = list(cols)
        rows = list(rows)
        self.matrix = [[""] * len(cols) for _ in rows]
        self.col_map = dict((c, i) for i, c in enumerate(cols))
        self.row_map = dict((r, i) for i, r in enumerate(rows))
        self.loaded = True

    def write(self, handle, missing=''):
        """Write the matrix to `handle` as TSV, blank cells shown as `missing`."""
        writer = csv.writer(handle, delimiter="\t", lineterminator='\n')
        col_list = self.get_col_list()

        writer.writerow([self.corner_name] + col_list)
        for row_name in self.get_row_list():
            row = self.get_row(row_name)
            out = [row_name]
            for col in col_list:
                val = row[self.col_map[col]]
                out.append(missing if val == "" else val)
            writer.writerow(out)


if __name__ == "__main__" :

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--col-matrix', help='Matrix to saturate by columns', dest="col_matrix", default=None)
    parser.add_argument('-d', '--iddag', help='IDDag to use for saturation', dest="iddag", default=None)
    parser.add_argument("-o", "--out", help="Output File", dest="output", default=None)
    parser.add_argument("-l", "--leaf-only", help="Leaf Only", dest="leaf_only", action="store_true", default=False)

    args = parser.parse_args()
    if args.col_matrix is None or args.iddag is None:
        parser.error("both --col-matrix and --iddag are required")

    matrix = ClinicalMatrix()
    matrix.load(args.col_matrix)
    iddag = IDDag()
    iddag.load(args.iddag)

    expander = IDExpander(iddag)
    out = expander.expand_matrix(matrix, args.leaf_only)
    if args.output is None:
        out.write(sys.stdout)
    else:
        handle = open(args.output, "w")
        try:
            out.write(handle)
        finally:
            handle.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/matrix_manipulate/matrix_saturate.xml Fri Dec 21 16:43:16 2012 -0500 @@ -0,0 +1,28 @@ +<tool id="matrix_saturate" name="Matrix Saturate" version="1.0.0"> + <description>Saturate all matrix values of a label with values from the parents</description> + <command interpreter="python">matrix_saturate.py +#if $leaf_only +--leaf-only +#end if +-c $matrix -d $idtree -o $outfile</command> + <inputs> + <param name="matrix" type="data" label="Genomic Matrix"/> + <param name="idtree" type="data" label="IDTree" help="A tree describing label relationships"/> + <param name="leaf_only" type="boolean" label="Leaf Only" help="Only output leaf nodes from the tree"/> + </inputs> + <outputs> + <data name="outfile" format="tabular"/> + </outputs> + <help> +Saturate all matrix values of a label with values from the parents. +An example would be an experiment where there are properties of a cell line, and multiple experiments (i.e. multiple time points) per cell line. Each of the experiments would have a unique sample label, but it would be connected to the 'parent' +cell line label. In order to fully populate the information about the experiment (so that timepoint experiments would inherit the +'tissue_of_origin' data from the cell line), a 'saturation' copies parent data down the tree to all of the child labels. + + +IDTree: + This is a description of the tree of sample id connections. + Format is two tab separated columns, in the form of 'parent' 'child' + + </help> +</tool>
#!/usr/bin/env python
"""Transpose a tab-separated matrix.

Usage:
    matrix_transpose.py [-l] [-f] infile outfile

Options:
    -l  do not print the original column labels (first output column)
    -f  store values as 32-bit floats to save memory; cells that are not
        numbers become nan

An outfile of "-" writes to stdout.
"""

import string,sys
import getopt
import array


def transpose(lines, label_print=True, use_float=False):
    """Return the transposed matrix as a list of tab-joined lines (no newline).

    Parameters:
        lines: iterable of text lines; the first non-blank one is the header.
        label_print: keep the corner cell and the original column labels.
        use_float: parse values into ``array.array('f')`` (bad cells -> nan).

    Rows shorter than the header are padded ("" or "nan"); extra cells are
    ignored.  Empty input yields ``[]``.
    """
    col_label = None
    row_label = []
    matrix = []
    for line in lines:
        # Only the line ending is removed so empty edge cells survive.
        line = line.rstrip("\r\n")
        if not line.strip():
            continue
        data = line.split('\t')
        if col_label is None:
            col_label = data
            continue
        row_label.append(data[0])
        if use_float:
            o = array.array('f')
            for i in data[1:]:
                try:
                    o.append(float(i))
                except ValueError:
                    o.append(float('nan'))
        else:
            o = data[1:]
        matrix.append(o)

    if col_label is None:
        return []

    filler = "nan" if use_float else ""
    out_lines = []

    #header
    if label_print:
        out = [col_label[0]] + row_label
    else:
        out = list(row_label)
    out_lines.append("\t".join(out))

    #body: one output line per data column (the header has one label more)
    for col in range(len(col_label) - 1):
        out = []
        if label_print:
            out.append(col_label[col+1])
        for row in matrix:
            out.append(str(row[col]) if col < len(row) else filler)
        out_lines.append("\t".join(out))
    return out_lines


if __name__ == "__main__":

    opts, args = getopt.getopt(sys.argv[1:], "lf")
    if (len(args))!=2:
        sys.stderr.write("python transpose.py extractDataIn transposeOut-Paradigm\n")
        sys.exit(2)

    label_print = True
    # Float mode is opt-in; otherwise text cells would all turn into nan.
    use_float = False
    for o, a in opts:
        if o == "-l":
            label_print = False
        if o == "-f":
            use_float = True

    fin = open(args[0],'r')
    try:
        result = transpose(fin, label_print=label_print, use_float=use_float)
    finally:
        fin.close()

    if args[1] == "-":
        fout = sys.stdout
    else:
        fout = open(args[1],'w')
    try:
        for text in result:
            fout.write(text + "\n")
    finally:
        if fout is not sys.stdout:
            fout.close()
#!/usr/bin/env python
"""Transpose a tab-separated matrix (editor backup copy of matrix_transpose.py).

Usage:
    matrix_transpose.py [-l] [-f] infile outfile

Options:
    -l  do not print the original column labels (first output column)
    -f  store values as 32-bit floats to save memory; cells that are not
        numbers become nan
"""

import string,sys
import getopt
import array


def transpose(lines, label_print=True, use_float=False):
    """Return the transposed matrix as a list of tab-joined lines (no newline).

    The first non-blank line is the header.  Rows shorter than the header are
    padded ("" or "nan" in float mode); extra cells are ignored.  Empty input
    yields ``[]``.
    """
    col_label = None
    row_label = []
    matrix = []
    for line in lines:
        # Only the line ending is removed so empty edge cells survive.
        line = line.rstrip("\r\n")
        if not line.strip():
            continue
        data = line.split('\t')
        if col_label is None:
            col_label = data
            continue
        row_label.append(data[0])
        if use_float:
            o = array.array('f')
            for i in data[1:]:
                try:
                    o.append(float(i))
                except ValueError:
                    o.append(float('nan'))
        else:
            o = data[1:]
        matrix.append(o)

    if col_label is None:
        return []

    filler = "nan" if use_float else ""
    out_lines = []

    #header
    if label_print:
        out = [col_label[0]] + row_label
    else:
        out = list(row_label)
    out_lines.append("\t".join(out))

    #body: one output line per data column (the header has one label more)
    for col in range(len(col_label) - 1):
        out = []
        if label_print:
            out.append(col_label[col+1])
        for row in matrix:
            out.append(str(row[col]) if col < len(row) else filler)
        out_lines.append("\t".join(out))
    return out_lines


if __name__ == "__main__":

    opts, args = getopt.getopt(sys.argv[1:], "lf")
    if (len(args))!=2:
        sys.stderr.write("python transpose.py extractDataIn transposeOut-Paradigm\n")
        sys.exit(2)

    label_print = True
    # Float mode is opt-in; otherwise text cells would all turn into nan.
    use_float = False
    for o, a in opts:
        if o == "-l":
            label_print = False
        if o == "-f":
            use_float = True

    fin = open(args[0],'r')
    try:
        result = transpose(fin, label_print=label_print, use_float=use_float)
    finally:
        fin.close()

    fout = open(args[1],'w')
    try:
        for text in result:
            fout.write(text + "\n")
    finally:
        fout.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/matrix_manipulate/matrix_transpose.xml Fri Dec 21 16:43:16 2012 -0500 @@ -0,0 +1,19 @@ +<tool id="matrix_transpose" name="Matrix Transpose" version="1.0.0"> + <description>Transpose a matrix</description> + <command interpreter="python">matrix_transpose.py +#if $isfloat: +-f +#end if +$infile $outfile +</command> + <inputs> + <param name="infile" type="data" label="Genomic Matrix"/> + <param name="isfloat" type="boolean" label="Is Float" help="If all matrix values are float, use memory efficient code"/> + </inputs> + <outputs> + <data name="outfile" format="tabular"/> + </outputs> + <help> + + </help> +</tool>
#!/usr/bin/env python
"""Keep only the whitelisted samples of a matrix (by column or row) or BED file.

Nothing is written, and the output file is not touched, when no sample matches.
"""

import sys
import csv
import argparse


def load_whitelist(lines):
    """Return the set of first tab-separated fields of `lines`, skipping blanks."""
    whitelist = set()
    for line in lines:
        key = line.rstrip().split("\t")[0]
        if key:
            whitelist.add(key)
    return whitelist


def filter_columns(rows, whitelist):
    """Yield `rows` reduced to the first column plus whitelisted columns.

    The first non-empty row is the header.  Nothing is yielded if no column
    is whitelisted.  Missing cells of short rows become "".
    """
    keep = None
    for row in rows:
        if not row:
            continue
        if keep is None:
            keep = [0] + [i for i, name in enumerate(row) if i > 0 and name in whitelist]
            if len(keep) < 2:
                return
        yield [row[i] if i < len(row) else "" for i in keep]


def filter_rows(rows, whitelist):
    """Yield the header followed by the rows whose first field is whitelisted.

    The header is only produced once a matching row has been found.
    """
    header = None
    header_sent = False
    for row in rows:
        if not row:
            continue
        if header is None:
            header = row
            continue
        if row[0] in whitelist:
            if not header_sent:
                header_sent = True
                yield header
            yield row


def filter_bed(rows, whitelist):
    """Yield the BED rows whose 4th field (name) is whitelisted."""
    for row in rows:
        if len(row) > 3 and row[3] in whitelist:
            yield row


def _emit(rows, output):
    """Write `rows` as TSV to the file `output` (stdout when None).

    The output file is opened on the first row only, and stdout is never closed.
    """
    out = None
    writer = None
    try:
        for row in rows:
            if writer is None:
                out = open(output, "w") if output is not None else sys.stdout
                writer = csv.writer(out, delimiter="\t", lineterminator="\n")
            writer.writerow(row)
    finally:
        if out is not None and out is not sys.stdout:
            out.close()


def _run(path, delim, row_filter, whitelist, output):
    """Filter the delimited file `path` with `row_filter` and emit the result."""
    handle = open(path)
    try:
        reader = csv.reader(handle, delimiter=delim)
        _emit(row_filter(reader, whitelist), output)
    finally:
        handle.close()


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--col-matrix', help='Matrix to censor by columns', dest="col_matrix", default=None)
    parser.add_argument('-r', '--row-matrix', help='Matrix to censor by rows', dest="row_matrix", default=None)
    parser.add_argument('-b', '--bed', help='BED file to censor', dest="bed", default=None)
    parser.add_argument('-w', '--whitelist', help='White list of samples', dest="white_list", default=None)
    parser.add_argument("-d", "--delim", help="Field Delimiter (Default \t)", dest="delim", default="\t")
    parser.add_argument("-o", "--out", help="Output File", dest="output", default=None)

    args = parser.parse_args()

    if args.white_list is None:
        sys.stderr.write("Must Provide whitelist\n")
        # A missing mandatory argument is an error, so do not report success.
        sys.exit(2)

    handle = open(args.white_list)
    try:
        whitelist = load_whitelist(handle)
    finally:
        handle.close()

    # NOTE: when several modes are given together with -o, each mode that
    # produces output rewrites the same output file.
    if args.col_matrix is not None:
        _run(args.col_matrix, args.delim, filter_columns, whitelist, args.output)

    if args.row_matrix is not None:
        _run(args.row_matrix, args.delim, filter_rows, whitelist, args.output)

    if args.bed is not None:
        _run(args.bed, args.delim, filter_bed, whitelist, args.output)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/matrix_manipulate/matrix_whitelist.xml Fri Dec 21 16:43:16 2012 -0500 @@ -0,0 +1,26 @@ +<tool id="matrix_whitelist" name="Matrix WhiteList" version="1.0.0"> + <description>Remove samples from matrix that aren't part of whitelist</description> + <command interpreter="python">matrix_whitelist.py -w $whiteList + -o $outfile +#if str($mode) == "col": +-c $matrix +#end if +#if str($mode) == "row": +-r $matrix +#end if + </command> + <inputs> + <param name="matrix" type="data" label="Matrix"/> + <param name="mode" type="select" label="Whitelist Mode"> + <option value="col">Column</option> + <option value="row">Row</option> + </param> + <param name="whiteList" type="data" label="Whitelist"/> + </inputs> + <outputs> + <data name="outfile" format="tabular"/> + </outputs> + <help> +Matrix Whitelist + </help> +</tool>