Mercurial > repos > kellrott > matrix_manipulate

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_flatten.py	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+
+import sys
+import csv
+from optparse import OptionParser
+
+if __name__ == "__main__":
+	parser = OptionParser()
+	parser.add_option("-m", "--missing", dest="missing", help="Missing", default='')
+	opts, args = parser.parse_args()
+
+	pivot_col = args[0]
+	rep_cols = args[1:]
+
+	reader = csv.reader(sys.stdin, delimiter="\t")
+	writer = csv.writer(sys.stdout, delimiter="\t", lineterminator="\n")
+
+	missing = 'Normal'
+
+	##Read in the matrix
+	header = None
+	cols = []
+	rows = []
+	col_pos = {}
+	for row in reader:
+		if header is None:
+			header = {}
+			for i,c in enumerate(row):
+				header[i] = c
+				cols.append(c)
+				col_pos[c] = i
+		else:
+			nrow = {}
+			for i,c in enumerate(cols):
+				nrow[c] = row[i]
+			rows.append(nrow)
+
+	##determine all values in the pivot column
+	pivot_vals = {}
+	for r in rows:
+		if len(r[cols[col_pos[pivot_col]]]):
+			pivot_vals[ r[cols[col_pos[pivot_col]]] ] = True
+
+	o_cols = [0]
+	o_names = [header[0]]
+	o_pivot = [None]
+	for p in pivot_vals:
+		for rc in rep_cols:
+			o_pivot.append(p)
+			if rc in col_pos:
+				o_cols.append(cols[col_pos[rc]])
+				o_names.append(p + ":" + rc)
+	writer.writerow(o_names)
+
+	ids = {}
+	for r in rows:
+		ids[r[cols[0]]] = True
+
+	for i in ids:
+		out_row = [opts.missing] * len(o_cols)
+		out_row[0] = i
+		for r in rows:
+			if r[cols[0]] == i:
+				for c in range(1, len(o_cols)):
+					if r[pivot_col] == o_pivot[c]:
+						out_row[c] = r[o_cols[c]]
+		#print out_row
+		writer.writerow(out_row)
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_flatten.xml	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,48 @@
+<tool id="matrix_flatten" name="Matrix Flatten" version="1.0.0">
+	<description> using a state column</description>
+	<command interpreter="python">matrix_flatten.py $input_tabular
+	</command>
+	<inputs>
+		<param name="input_tabular" type="data" label="Matrix File"/>
+		<param name="pivot_column" type="text" size="90" label="Pivot Column" value="c1"/>
+		<repeat name="flatten" title="Flatten Columns Rule" min="1">
+			<param name="column" type="text" size="90" label="Flatten Column" value="c2"/>
+		</repeat>
+		<param name="missing" type="text" label="Missing Value String" value=""/>
+	</inputs>
+	<outputs>
+		<data name="out" format="tabular" label="Flattened Matrix"/>
+	</outputs>
+	<help>
+Use the state of one column to flatten out other columns into discrete columns.
+The first column is the primary key, with the next column starting as c1. (So in the example below, c1 is DrugType).
+Columns can be referenced by their cN value or by their proper name (i.e. 'c1' or 'DrugType').
+
+Example File
+
++--------+----------+-----------+
+|#sample |  DrugType|  DrugDose |
++--------+----------+-----------+
+|sample_1|  DrugA   |  1        |
++--------+----------+-----------+
+|sample_1|  DrugB   |  2        |
++--------+----------+-----------+
+|sample_2|  DrugA   |  3        |
++--------+----------+-----------+
+|sample_2|  DrugC   |  5        |
++--------+----------+-----------+
+
+
+One might want to flatten the DrugDose column (c2), using the DrugType column (c1). Using Matrix Flatten, the matrix would become:
+
+
++--------+---------------+----------------+----------------+
+|#sample |DrugA:DrugDose |DrugB:DrugDose  |DrugC:DrugDose  |
++--------+---------------+----------------+----------------+
+|sample_1|  1            |   2            |                |
++--------+---------------+----------------+----------------+
+|sample_2|  3            |                |  5             |
++--------+---------------+----------------+----------------+
+
+	</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_join.py	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+"""join.py:
+
+Usage:
+  join.py [options] file1 file2 [file3 ...]
+
+Options:
+  -h            header
+  -i            output only features in common with all files
+  -f            use 'float' mode to save memory
+  -q            run quietly
+"""
+import os, os.path, sys, getopt, re
+import array
+
+delim = "\t"
+verbose = True
+
+def usage(code = 0):
+    sys.stderr.write( __doc__ )
+    if code != None: sys.exit(code)
+
+def log(msg, die = False):
+    if (verbose):
+        sys.stderr.write(msg)
+    if die:
+        sys.exit(1)
+
+def readFile(inFile, header = True, use_float = False):
+    dataWidth = None
+    dataMap = {}
+    f = open(inFile, "r")
+    if header:
+        line = f.readline()
+        if line.isspace():
+            log("ERROR: missing header\n", die = True)
+        pline = re.split(delim, line.rstrip("\n\r"))
+        dataMap["HEADER"] = pline
+        dataWidth = len(pline[1:])
+    for line in f:
+        if line.isspace():
+            continue
+        pline = re.split(delim, line.rstrip("\n\r"))
+        if dataWidth is None:
+            dataWidth = len(pline[1:])
+        assert(len(pline[1:]) == dataWidth)
+        if use_float:
+            out = array.array("f")
+            for a in pline[1:]:
+                try:
+                    out.append(float(a))
+                except ValueError:
+                    out.append(float('nan'))
+        else:
+            out = pline[1:]
+        dataMap[pline[0]] = out
+    f.close()
+    return (dataMap, dataWidth)
+
+def main(args):
+    ## parse arguments
+    try:
+        opts, args = getopt.getopt(args, "hiqfo:")
+    except getopt.GetoptError, err:
+        sys.stderr.write( str(err) + "\n" )
+        usage(2)
+
+    if len(args) > 0:
+        files = args
+    else:
+        files = []
+        for i in sys.stdin:
+           files.append(i.rstrip("\n\r"))
+
+    if len(files) < 2:
+        sys.stderr.write("incorrect number of arguments\n")
+        usage(1)
+
+    header = False
+    useIntersection = False
+    output = None
+    use_float = False
+    global verbose
+    for o, a in opts:
+        if o == "-h":
+            header = True
+        elif o == "-i":
+            useIntersection = True
+        elif o == "-q":
+            verbose = False
+        elif o == "-o":
+            output = a
+        elif o == "-f":
+            use_float = True
+
+    ## read files
+    fileData = {}
+    fileWidth = {}
+    for file in files:
+        (fileData[file], fileWidth[file]) = readFile(file, header = header, use_float = use_float)
+    features = list(set(fileData[files[0]].keys()) - set(["HEADER"]))
+    if useIntersection:
+        for file in files:
+            features = list(set(fileData[file].keys()) & set(features))
+    features.sort()
+
+    if output is not None:
+        ohandle = open(output, "w")
+    else:
+        ohandle  = sys.stdout
+
+    ## output
+    if header:
+        lineElements = [fileData[files[0]]["HEADER"][0]]
+        for file in files:
+            lineElements += fileData[file]["HEADER"][1:]
+        ohandle.write("%s\n" % (delim.join(lineElements)))
+    for feature in features:
+        lineElements = []
+        for file in files:
+            if feature in fileData[file]:
+                lineElements += fileData[file][feature]
+            else:
+                lineElements += ["" for i in range(fileWidth[file])]
+        ohandle.write("%s\n" % (feature + delim + delim.join( (str(c) for c in lineElements)) ))
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_join.xml	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,24 @@
+<tool id="matrix_join" name="Matrix Join" version="1.0.0">
+	<description>Join Matrices using row labels</description>
+	<command interpreter="python">matrix_join.py -h
+-o $out
+#if $is_float:
+-f
+#end if
+#for $a in $in_mats:
+$a.file
+#end for
+	</command>
+	<inputs>
+		<repeat name="in_mats" title="Input Matrix" min="1">
+			<param name="file" type="data" label="Matrix File"/>
+		</repeat>
+		<param name="is_float" type="boolean" label="Float Values" help="If all matrices are floating point numbers, use to save memory"/>
+	</inputs>
+	<outputs>
+		<data name="out" format="tabular" label="Joined Matrix" help="Joined Matrix"/>
+	</outputs>
+	<help>
+Join matrices by row labels.
+	</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_merge.py	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+
+import sys
+import csv
+
+class ClinicalMatrix:
+    corner_name = "sample"
+    def load(self, path):
+        self.col_map = {}
+        self.row_map = {}
+        pos_hash = None
+
+        handle = open(path)
+
+        self.matrix = []
+        for row in csv.reader(handle, delimiter="\t"):
+            if pos_hash is None:
+                pos_hash = {}
+                pos = 0
+                for name in row[1:]:
+                    i = 1
+                    orig_name = name
+                    while name in pos_hash:
+                        name = orig_name + "#" + str(i)
+                        i += 1
+                    pos_hash[name] = pos
+                    pos += 1
+            else:
+                newRow = []
+                newRow = [""] * (len(pos_hash))
+                for col in pos_hash:
+                    i = pos_hash[col] + 1
+                    newRow[i - 1] = row[i]
+                self.row_map[row[0]] = len(self.matrix)
+                self.matrix.append(newRow)
+
+        self.col_map = {}
+        for col in pos_hash:
+            self.col_map[col] = pos_hash[col]
+
+    def get_row_list(self):
+        """
+        Returns names of rows
+        """
+        out = self.row_map.keys()
+        out.sort( lambda x,y: self.row_map[x]-self.row_map[y])
+        return out
+
+    def get_col_list(self):
+        """
+        Returns names of columns
+        """
+        out = self.col_map.keys()
+        out.sort( lambda x,y: self.col_map[x]-self.col_map[y])
+        return out
+
+    def get_row(self, row_name):
+        return self.matrix[ self.row_map[row_name] ]
+
+    def set_val(self, col_name, row_name, value):
+        """
+        Set cell value based on row and column names
+        """
+        self.matrix[self.row_map[row_name]][self.col_map[col_name]] = value
+
+    def get_val(self, col_name, row_name):
+        """
+        Get cell value based on row and column names
+        """
+        return self.matrix[self.row_map[row_name]][self.col_map[col_name]]
+
+
+    def init_blank(self, cols, rows):
+        """
+        Initialize the matrix with empty-string cells using the row/column names
+        provided by the caller. Cells are stored as native python objects
+        (plain lists of strings), so cell string length is not fixed.
+        """
+        self.matrix = []
+        self.col_map = {}
+        self.row_map = {}
+        for i in range(len(rows)):
+            self.matrix.append([""]*len(cols))
+        for i, c in enumerate(cols):
+            self.col_map[c] = i
+        for i, r in enumerate(rows):
+            self.row_map[r] = i
+        self.loaded = True
+
+    def write(self, handle, missing=''):
+        write = csv.writer(handle, delimiter="\t", lineterminator='\n')
+        col_list = self.get_col_list()
+
+        write.writerow([self.corner_name] + col_list)
+        for rowName in self.row_map:
+            out = [rowName]
+            row = self.get_row(rowName)
+            for col in col_list:
+                val = row[self.col_map[col]]
+                out.append(val)
+            write.writerow(out)
+
+    def merge(self, other):
+        rows = {}
+        #collect the row names of this matrix and, if given, of the other matrix
+        for r in self.get_row_list():
+            rows[r] = None
+        if other is not None:
+            for r in other.get_row_list():
+                rows[r] = None
+        cols = {}
+        #collect the column names of this matrix and, if given, of the other matrix
+        for r in self.get_col_list():
+            cols[r] = None
+        if other is not None:
+            for r in other.get_col_list():
+                cols[r] = None
+        out = ClinicalMatrix()
+        out.init_blank(cols=cols, rows=rows)
+        for row in self.get_row_list():
+            for col in self.get_col_list():
+                out.set_val(col_name=col, row_name=row, value=self.get_val(col_name=col, row_name=row))
+        if other is not None:
+            for row in other.get_row_list():
+                for col in other.get_col_list():
+                    out.set_val(col_name=col, row_name=row, value=other.get_val(col_name=col, row_name=row))
+        return out
+
+
+if __name__ == "__main__" :
+
+	matrix = None
+	for p in sys.argv[1:]:
+		nmatrix = ClinicalMatrix()
+		nmatrix.load(p)
+		matrix = nmatrix.merge(matrix)
+
+	matrix.write(sys.stdout, missing="")
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_merge.xml	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,21 @@
+<tool id="matrix_merge" name="Matrix Merge" version="1.0.0">
+	<description>Merge multiple matrices</description>
+	<command interpreter="python">matrix_merge.py
+#for $a in $in_mats:
+$a.file
+#end for
+> $out
+	</command>
+	<inputs>
+		<repeat name="in_mats" title="Input Matrix" min="1">
+			<param name="file" type="data" label="Matrix File"/>
+		</repeat>
+	</inputs>
+	<outputs>
+		<data name="out" format="tabular" label="Merged Matrix"/>
+	</outputs>
+	<help>
+Join matrices by row and column labels. Unlike 'Matrix Join', which simply concatenates matrices
+so that their row labels match (without enforcing unique column labels), 'Matrix Merge' joins all the values using unique row and column labels. The last value for a row/column combination is used.
+	</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_rank_normalize.py	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+
+import sys
+import time
+import array
+import csv
+import math
+import ctypes
+import ctypes.util
+import argparse
+
+libc = ctypes.cdll.LoadLibrary(ctypes.util.find_library("c"))
+
+def rankTransform(rank, total):
+    return rank / float(total)
+
+def py_cmp_float(a_ptr, b_ptr):
+    a = a_ptr.contents.value
+    b = b_ptr.contents.value
+    return (a > b) - (a < b)
+
+CMPFUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.POINTER(ctypes.c_float), ctypes.POINTER(ctypes.c_float))
+
+cmp_float = CMPFUNC(py_cmp_float)
+
+def csort(buf):
+    """
+    This is an inplace sort of an array.array float class using the C qsort function
+    """
+    addr, count = buf.buffer_info()
+    libc.qsort( ctypes.cast(addr, ctypes.POINTER(ctypes.c_float)), count, ctypes.sizeof(ctypes.c_float), cmp_float)
+
+
+def transformFile(fh, ofh, sep="\t", zero_drop=False, na2zero=False):
+
+    floatValues = None
+    cols = None
+    rows = None
+    totalValues = 0
+    reader = csv.reader(fh, delimiter=sep)
+    for row in reader:
+        if cols is None:
+            cols = row[1:]
+            numCols = len(cols)
+            rows = []
+            floatValues = array.array('f')
+        else:
+            rows.append(row[0])
+            assert(len(row)-1 == numCols)
+            for val in row[1:]:
+                try:
+                    v = float(val)
+                    floatValues.append(v)
+                    if not zero_drop or v != 0.0:
+                        totalValues += 1
+                except ValueError:
+                    floatValues.append(float('nan'))
+
+    numRows = len(rows)
+    if (numRows == 0):
+        sys.stderr.write("Empty input\n")
+        exit(10)
+
+    if totalValues == 0:
+        assert False, "did not read any values"
+
+    sortedValues = array.array('f')
+    for f in floatValues:
+        if f == f:
+             sortedValues.append(f)
+
+    csort(sortedValues)
+
+    i = 0
+    rankDict = dict()
+    for val in sortedValues:
+        if not math.isnan(val) and (not zero_drop or val != 0.0):
+            rankDict[val] = rankTransform(i, totalValues)
+            i += 1
+
+    def rowString(rowNum):
+        def matrixVal(colNum):
+            val = floatValues[rowNum*numCols + colNum]
+            if val in rankDict:
+                return ("%5g" % rankDict[val])
+            else:
+                if na2zero:
+                    return "0"
+                else:
+                    return "NA"
+        return "\t".join(map(matrixVal, range(numCols)))
+
+    ofh.write( "probe\t%s\n" % ("\t".join(cols)) )
+    for j in range(numRows):
+        ofh.write("%s\t%s\n" % (rows[j], rowString(j)))
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-s", "--sep", help="Seperator", default="\t")
+    parser.add_argument("-o", "--out", help="Output", default=None)
+    parser.add_argument("-n", "--na2zero", help="Change NAs to Zero", action="store_true", default=False)
+    parser.add_argument("-z", "--zero-drop", help="Drop Zero Values", action="store_true", default=False)
+    parser.add_argument("input", help="Input Matrix", default=None)
+
+    args = parser.parse_args()
+
+    if args.input == "-":
+        fh = sys.stdin
+    else:
+        fh = open(args.input)
+    if args.out is None:
+        ofh = sys.stdout
+    else:
+        ofh = open(args.out, "w")
+
+    transformFile(fh, ofh, args.sep, args.zero_drop, args.na2zero)
+    fh.close()
+    ofh.close()
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_rank_normalize.xml	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,24 @@
+<tool id="rank_normalize" name="Rank Normalize" version="1.0.0">
+  <description>Perform a rank normalized transform to a number matrix</description>
+  <command interpreter="python">rank_normalize.py
+#if $dropZeros:
+-z
+#end if
+#if $na2zero:
+-n
+#end if
+$inMatrix -o $outMatrix
+  </command>
+  <inputs>
+	    <param name="inMatrix" type="data" format="tabular" label="Input Matrix"/>
+      <param name="dropZeros" type="boolean" label="Drop Input Zeros" checked="True"/>
+      <param name="na2zero" type="boolean" label="Set Output NAs to Zero" checked="False"/>
+  </inputs>
+  <outputs>
+      <data name="outMatrix" format="tabular"/>
+  </outputs>
+  <help>
+Determines the ranking of every number in a matrix and then normalizes them into a
+distribution between 0 and 1.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_rank_segment.py	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+
+import sys
+import time
+import array
+import csv
+import math
+import ctypes
+import ctypes.util
+import argparse
+
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input", help="Input", default=None)
+    parser.add_argument("-o", "--output", help="Output", default=None)
+
+    parser.add_argument("-b", dest="bounds", help="Boundaries", type=float, nargs="*", default=None)
+    parser.add_argument("-v", dest="vals", help="Group Values", nargs="*", default=None)
+
+    args = parser.parse_args()
+
+    if len(args.bounds) != len(args.vals) - 1:
+        sys.stderr.write("Number of groups created by boundaries and group values does not match\n")
+        sys.exit(1)
+
+    head = True
+    ihandle = open(args.input)
+    ohandle = open(args.output, "w")
+    for line in ihandle:
+        if head:
+            ohandle.write(line)
+            head = False
+        else:
+            row = line.rstrip().split("\t")
+            out = [row[0]]
+            for a in row[1:]:
+                v = float(a)
+                if v <= args.bounds[0]:
+                    o = args.vals[0]
+                if v > args.bounds[-1]:
+                    o = args.vals[-1]
+                for i in range(1, len(args.bounds)):
+                    if v > args.bounds[i-1] and v <= args.bounds[i]:
+                        o = args.vals[i]
+                out.append(o)
+            ohandle.write("%s\n" % ("\t".join(out)))
+    ihandle.close()
+    ohandle.close()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_rank_segment.xml	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,28 @@
+<tool id="rank_segment" name="Matrix Rank Segment" version="1.0.0">
+  <description>Segment matrix values into discrete states using a set of boundaries</description>
+  <command interpreter="python">rank_segment.py
+-v $value
+#for v in $bounds
+${v.value}
+#end for
+-b
+#for v in $bounds
+${v.range}
+#end for
+-i $inMatrix -o $outMatrix
+  </command>
+  <inputs>
+	    <param name="inMatrix" type="data" format="tabular" label="Input Matrix"/>
+      <param name="value" type="text" label="Below Label" value="0"/>
+      <repeat name="bounds" title="Boundaries" min="1">
+        <param name="range" type="float" label="Boundary" value="0.5"/>
+        <param name="value" type="text" label="Above Label" value="1"/>
+      </repeat>
+  </inputs>
+  <outputs>
+      <data name="outMatrix" format="tabular"/>
+  </outputs>
+  <help>
+Use a set of boundaries to transform all the values in a matrix into a set of discrete states.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_saturate.py	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,336 @@
+#!/usr/bin/env python
+
+import sys
+import csv
+import argparse
+
+
+class IDDag:
+    def __init__(self):
+        self.graph = None
+
+    def load(self, path):
+        handle = open(path)
+        self.parents = {}
+        for line in handle:
+            tmp = line.rstrip().split("\t")
+            if tmp[0] not in self.parents:
+                self.parents[tmp[0]] = [ tmp[1] ]
+            else:
+                self.parents[tmp[0]].append( tmp[1] )
+
+        handle.close()
+
+    def get_key_list(self):
+        return self.parents.keys()
+
+    def get_by(self, key):
+        return self.parents[key]
+
+    def _build_graph(self):
+        self.graph = {}
+        self.rev_graph = {}
+        for pid in self.get_key_list():
+            if pid not in self.graph:
+                self.graph[pid] = {}
+            p = self.get_by(pid)
+            for cid in p:
+                self.graph[pid][cid.child] = True
+                if cid.child not in self.rev_graph:
+                    self.rev_graph[cid.child] = {}
+                self.rev_graph[cid.child][cid.id] = True
+
+    def is_descendant(self, parent, child):
+        if self.graph is None:
+            self._build_graph()
+
+        cid = child
+        while cid in self.rev_graph:
+            cid = self.rev_graph[cid].keys()[0]
+            if cid == parent:
+                return True
+        return False
+
+    def _desc_crawl(self, parent):
+        out = {}
+        if parent in self.graph:
+            for node in self.graph[parent]:
+                if node is not None and len(node) and node != parent:
+                    out[node] = True
+                    for c in self._desc_crawl(node):
+                        out[c] = True
+        return out.keys()
+
+
+    def get_descendants(self, parent):
+        if self.graph is None:
+            self._build_graph()
+        return self._desc_crawl(parent)
+
+    def get_children(self, node):
+        if self.graph is None:
+            self._build_graph()
+        if node in self.graph:
+            return self.graph[node]
+        return []
+
+    def get_parents(self, node):
+        if self.graph is None:
+            self._build_graph()
+        if node in self.rev_graph:
+            return self.rev_graph[node]
+        return []
+
+    def in_graph(self, name):
+        if self.graph is None:
+            self._build_graph()
+
+        if name in self.graph or name in self.rev_graph:
+            return True
+        return False
+
+
+class IDReducer(object):
+    """
+    The IDReducer class uses an IDDag to 'reduce' id's and objects to
+    common parent objects.
+
+    Assume Matrix 1 has aliquot ids like
+        - sample1-aliquot1
+        - sample2-aliquot1
+        - sample2-aliquot1
+    And that Matrix 2 has aliquot ids like
+        - sample1-aliquot2
+        - sample2-aliquot2
+        - sample2-aliquot2
+
+    Both files deal with the same samples, but different aliquots were
+    run on different machines, producing matrices of different datatypes.
+    But for data integration purposes, we need to refer to aliquots by their
+    parent sample name.
+
+    The idDag file for this data would be::
+
+        sample1 sample1-aliquot1
+        sample1 sample1-aliquot2
+        sample2 sample2-aliquot1
+        sample2 sample2-aliquot2
+        sample3 sample2-aliquot1
+        sample3 sample2-aliquot2
+
+    If this file were loaded into an IDDag instance and used to initialize an IDReducer,
+    the following transformations would be valid::
+
+        > idReducer.reduce_id( 'sample1-aliquot1' )
+        'sample1'
+        > idReducer.reduce_id( 'sample1-aliquot2' )
+        'sample1'
+
+    """
+    def __init__(self, idDag):
+        self.revGraph = {}
+        for pid in idDag.get_key_list():
+            p = idDag.get_by(pid)
+            for cid in p:
+                if cid.child not in self.revGraph:
+                    self.revGraph[cid.child] = {}
+                self.revGraph[cid.child][cid.id] = cid.edgeType
+
+    def reduce_id(self, id, edgeStop=None):
+        outID = id
+        while outID in self.revGraph:
+            pn = None
+            for p in self.revGraph[outID]:
+                if edgeStop is None or edgeStop != self.revGraph[outID][p]:
+                    pn = p
+            if pn is None:
+                return outID
+            outID = pn
+        return outID
+
+    def reduce_matrix(self, matrix, edgeStop=None):
+        ncols = {}
+        rmap = {}
+        for col in matrix.get_col_list():
+            rval = self.reduce_id(col, edgeStop)
+            if rval not in ncols:
+                ncols[rval] = []
+            ncols[rval].append(col)
+            rmap[col] = rval
+        out = CGData.GenomicMatrix.GenomicMatrix()
+        out.init_blank( cols=ncols.keys(), rows=matrix.get_row_list() )
+        for row in matrix.get_row_list():
+            for col in ncols:
+                tmp = []
+                for nc in ncols[col]:
+                    tmp.append( matrix.get_val( col_name=nc, row_name=row ) )
+                v = sum(tmp) / float(len(tmp))
+                out.set_val(row_name=row, col_name=col, value=v)
+        return out
+
+class IDExpander(object):
+
+    def __init__(self, idDag):
+        self.expGraph = {}
+        for pid in idDag.get_key_list():
+            p = idDag.get_by(pid)
+            if pid not in self.expGraph:
+                self.expGraph[pid] = []
+            for cid in p:
+                self.expGraph[pid].append(cid)
+
+    def expand_id(self, id, leaf_only=False):
+        out = {}
+        if id not in self.expGraph or len(self.expGraph[id]) == 0:
+            return [id]
+
+        for c in self.expGraph[id]:
+            if not leaf_only:
+                out[c] = True
+            for gc in self.expand_id(c, leaf_only):
+                out[gc] = True
+        return out.keys()
+
+    def expand_matrix(self, matrix, leaf_only=False):
+        nrows = {}
+        for row in matrix.get_row_list():
+            #if row in self.expGraph:
+            for e_val in self.expand_id(row, leaf_only):
+                if e_val not in nrows:
+                    nrows[e_val] = []
+                nrows[e_val].append(row)
+        out = ClinicalMatrix()
+        out.init_blank( rows=sorted(nrows.keys()), cols=matrix.get_col_list() )
+
+        for row in nrows.keys():
+            for pid in nrows[row]:
+                for col in matrix.get_col_list():
+                    out.set_val( row_name=row, col_name=col, value=matrix.get_val(row_name=pid, col_name=col))
+
+        #print nrows
+        return out
+
+
+class ClinicalMatrix:
+    corner_name = "sample"
+    def load(self, path):
+        self.col_map = {}
+        self.row_map = {}
+        pos_hash = None
+
+        handle = open(path)
+
+        self.matrix = []
+        for row in csv.reader(handle, delimiter="\t"):
+            if pos_hash is None:
+                pos_hash = {}
+                pos = 0
+                for name in row[1:]:
+                    i = 1
+                    orig_name = name
+                    while name in pos_hash:
+                        name = orig_name + "#" + str(i)
+                        i += 1
+                    pos_hash[name] = pos
+                    pos += 1
+            else:
+                newRow = []
+                newRow = [""] * (len(pos_hash))
+                for col in pos_hash:
+                    i = pos_hash[col] + 1
+                    newRow[i - 1] = row[i]
+                self.row_map[row[0]] = len(self.matrix)
+                self.matrix.append(newRow)
+
+        self.col_map = {}
+        for col in pos_hash:
+            self.col_map[col] = pos_hash[col]
+
+    def get_row_list(self):
+        """
+        Returns names of rows
+        """
+        out = self.row_map.keys()
+        out.sort( lambda x,y: self.row_map[x]-self.row_map[y])
+        return out
+
+    def get_col_list(self):
+        """
+        Returns names of columns
+        """
+        out = self.col_map.keys()
+        out.sort( lambda x,y: self.col_map[x]-self.col_map[y])
+        return out
+
+    def get_row(self, row_name):
+        return self.matrix[ self.row_map[row_name] ]
+
+    def set_val(self, col_name, row_name, value):
+        """
+        Set cell value based on row and column names
+        """
+        self.matrix[self.row_map[row_name]][self.col_map[col_name]] = value
+
+    def get_val(self, col_name, row_name):
+        """
+        Get cell value based on row and column names
+        """
+        return self.matrix[self.row_map[row_name]][self.col_map[col_name]]
+
+
+    def init_blank(self, cols, rows):
+        """
+        Initialize the matrix with empty-string cells using the row/column names
+        provided by the caller. Cells are stored as native python objects
+        (plain lists of strings), so cell string length is not fixed.
+        """
+        self.matrix = []
+        self.col_map = {}
+        self.row_map = {}
+        for i in range(len(rows)):
+            self.matrix.append([""]*len(cols))
+        for i, c in enumerate(cols):
+            self.col_map[c] = i
+        for i, r in enumerate(rows):
+            self.row_map[r] = i
+        self.loaded = True
+
+    def write(self, handle, missing=''):
+        write = csv.writer(handle, delimiter="\t", lineterminator='\n')
+        col_list = self.get_col_list()
+
+        write.writerow([self.corner_name] + col_list)
+        for rowName in self.row_map:
+            out = [rowName]
+            row = self.get_row(rowName)
+            for col in col_list:
+                val = row[self.col_map[col]]
+                out.append(val)
+            write.writerow(out)
+
+
+if __name__ == "__main__" :
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--col-matrix', help='Matrix to saturate by columns', dest="col_matrix", default=None)
+    #parser.add_argument('-r', '--row-matrix', help='Matrix to censor by rows', dest="row_matrix", default=None)
+    parser.add_argument('-d', '--iddag', help='IDDag to use for saturation', dest="iddag", default=None)
+    parser.add_argument("-o", "--out", help="Output File", dest="output", default=None)
+    parser.add_argument("-l", "--leaf-only", help="Lead Only", dest="leaf_only", action="store_true", default=False)
+
+    args = parser.parse_args()
+
+
+    matrix = ClinicalMatrix()
+    matrix.load(args.col_matrix)
+    iddag = IDDag()
+    iddag.load(args.iddag)
+
+    expander = IDExpander(iddag)
+    out = expander.expand_matrix(matrix, args.leaf_only)
+    if args.output is None:
+        out.write(sys.stdout)
+    else:
+        handle = open(args.output, "w")
+        out.write(handle)
+        handle.close()
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_saturate.xml	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,28 @@
+<tool id="matrix_saturate" name="Matrix Saturate" version="1.0.0">
+  <description>Saturate all matrix values of a label with values from the parents</description>
+  <command interpreter="python">matrix_saturate.py
+#if $leaf_only
+--leaf-only
+#end if
+-c $matrix -d $idtree -o $outfile</command>
+  <inputs>
+	  <param name="matrix" type="data" label="Genomic Matrix"/>
+	  <param name="idtree" type="data" label="IDTree" help="A tree describing label relationships"/>
+    <param name="leaf_only" type="boolean" label="Leaf Only" help="Only output leaf nodes from the tree"/>
+  </inputs>
+  <outputs>
+      <data name="outfile" format="tabular"/>
+  </outputs>
+  <help>
+Saturate all matrix values of a label with values from the parents.
+An example would be an experiment where there are properties of a cell line, and multiple experiments (i.e. multiple time points) per cell line. Each of the experiments would have a unique label, but it would be connected to the 'parent'
+cell line label. In order to fully populate the information about the experiment (so that timepoint experiments would inherit the
+'tissue_of_origin' data from the cell line), a 'saturation' copies parent data down the tree to all of the child labels.
+
+
+IDTree:
+   This is a description of the tree of sample id connections.
+   Format is two tab-separated columns, in the form of 'parent' 'child'
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_transpose.py	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+
+import string,sys
+import getopt
+import array
+
+if __name__ == "__main__":
+    # Transpose a tab-separated matrix: transpose.py [-l] [-f] <infile> <outfile|->
+    opts, args = getopt.getopt(sys.argv[1:], "lf")
+    if len(args) != 2:
+        sys.stderr.write("python transpose.py extractDataIn transposeOut-Paradigm\n")
+        sys.exit(2)
+
+    flags = [o for o, a in opts]
+    # -l drops the label column from the output.
+    label_print = "-l" not in flags
+    # -f stores cells as 32-bit floats; it used to be always on, making the flag a no-op.
+    use_float = "-f" in flags
+
+    fin = open(args[0], 'r')
+    if args[1] == "-":
+        fout = sys.stdout
+    else:
+        fout = open(args[1], 'w')
+
+    col_label = None
+    row_label = []
+    matrix = []
+    for line in fin:
+        # Strip only the line terminator so empty edge cells keep their position.
+        data = line.rstrip('\r\n').split('\t')
+        if col_label is None:
+            col_label = data
+        else:
+            # Each row label is recorded once (it used to be appended twice).
+            row_label.append(data[0])
+            if use_float:
+                o = array.array('f')
+                for i in data[1:]:
+                    try:
+                        o.append(float(i))
+                    except ValueError:
+                        o.append(float('nan'))
+            else:
+                o = data[1:]
+            matrix.append(o)
+    fin.close()
+
+    # An empty input has no header row, so there is nothing to emit.
+    if col_label is not None:
+        #header
+        out = row_label
+        if label_print:
+            out = [col_label[0]] + row_label
+        fout.write("\t".join(out) + "\n")
+
+        #body: one output line per data column (col_label[0] is the corner cell)
+        for col in range(1, len(col_label)):
+            out = []
+            if label_print:
+                out.append(col_label[col])
+            for row in matrix:
+                out.append(str(row[col - 1]))
+            fout.write("\t".join(out) + "\n")
+    # Leave the interpreter's stdout open when writing to "-".
+    if fout is not sys.stdout:
+        fout.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_transpose.py~	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+
+import string,sys
+import getopt
+import array
+
+if __name__ == "__main__":
+    # Transpose a tab-separated matrix: transpose.py [-l] [-f] <infile> <outfile>
+    # The first input row supplies the column labels; the first field of every later row is its label.
+    opts, args = getopt.getopt(sys.argv[1:], "lf")
+    if len(args) != 2:
+        sys.stderr.write("python transpose.py extractDataIn transposeOut-Paradigm\n")
+        sys.exit(2)
+
+    flags = [o for o, a in opts]
+    # -l drops the label column from the output.
+    label_print = "-l" not in flags
+    # -f stores cells as 32-bit floats; it used to be always on, making the flag a no-op.
+    use_float = "-f" in flags
+
+    fin = open(args[0], 'r')
+    fout = open(args[1], 'w')
+
+    col_label = None
+    row_label = []
+    matrix = []
+    for line in fin:
+        # Strip only the line terminator so empty edge cells keep their position.
+        data = line.rstrip('\r\n').split('\t')
+        if col_label is None:
+            col_label = data
+        else:
+            # Each row label is recorded once (it used to be appended twice).
+            row_label.append(data[0])
+            if use_float:
+                o = array.array('f')
+                for i in data[1:]:
+                    try:
+                        o.append(float(i))
+                    except ValueError:
+                        o.append(float('nan'))
+            else:
+                o = data[1:]
+            matrix.append(o)
+    fin.close()
+
+    # An empty input has no header row, so there is nothing to emit.
+    if col_label is not None:
+        #header
+        out = row_label
+        if label_print:
+            out = [col_label[0]] + row_label
+        fout.write("\t".join(out) + "\n")
+
+        #body: one output line per data column (col_label[0] is the corner cell)
+        for col in range(1, len(col_label)):
+            out = []
+            if label_print:
+                out.append(col_label[col])
+            for row in matrix:
+                out.append(str(row[col - 1]))
+            fout.write("\t".join(out) + "\n")
+
+    fout.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_transpose.xml	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,19 @@
+<tool id="matrix_transpose" name="Matrix Transpose" version="1.0.0">
+  <description>Transpose a matrix</description>
+  <command interpreter="python">matrix_transpose.py
+#if $isfloat:
+-f
+#end if
+$infile $outfile
+</command>
+  <inputs>
+	  <param name="infile" type="data" label="Genomic Matrix"/>
+	  <param name="isfloat" type="boolean" label="Is Float" help="If all matrix values are float, use memory efficient code"/>
+  </inputs>
+  <outputs>
+      <data name="outfile" format="tabular"/>
+  </outputs>
+  <help>
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_whitelist.py	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+
+import sys
+import csv
+import argparse
+
+if __name__ == "__main__":
+    # Filter a matrix (by column or by row) or a BED file down to the ids named
+    # in the first tab-separated field of each whitelist line.
+    # Output is always tab-separated, whatever --delim says about the input.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--col-matrix', help='Matrix to censor by columns', dest="col_matrix", default=None)
+    parser.add_argument('-r', '--row-matrix', help='Matrix to censor by rows', dest="row_matrix", default=None)
+    parser.add_argument('-b', '--bed', help='BED file to censor', dest="bed", default=None)
+    parser.add_argument('-w', '--whitelist', help='White list of samples', dest="white_list", default=None)
+    parser.add_argument("-d", "--delim", help="Field Delimiter (Default \t)", dest="delim", default="\t")
+    parser.add_argument("-o", "--out", help="Output File", dest="output", default=None)
+
+    args = parser.parse_args()
+
+    if args.white_list is None:
+        sys.stderr.write("Must Provide whitelist\n")
+        # A missing whitelist is a usage error, so exit with a failing status.
+        sys.exit(1)
+
+    whitelist = set()
+    handle = open(args.white_list)
+    for line in handle:
+        # Only the first tab-separated field of each line is the id.
+        whitelist.add(line.rstrip().split("\t")[0])
+    handle.close()
+
+    def open_writer():
+        # Open the destination lazily so no file is created until there is output.
+        # Returns (csv writer, file to close afterwards or None when using stdout).
+        if args.output is None:
+            return csv.writer(sys.stdout, delimiter="\t", lineterminator="\n"), None
+        stream = open(args.output, "w")
+        return csv.writer(stream, delimiter="\t", lineterminator="\n"), stream
+
+    if args.col_matrix is not None:
+        handle = open(args.col_matrix)
+        reader = csv.reader(handle, delimiter=args.delim)
+        writer = None
+        opened = None
+        head = None
+        for row in reader:
+            if head is None:
+                # Column 0 holds the row labels and is always kept.
+                head = [0] + [i for i, a in enumerate(row) if i > 0 and a in whitelist]
+                if len(head) < 2:
+                    # No whitelisted column: emit nothing at all.
+                    break
+                writer, opened = open_writer()
+            # Header and data rows are both projected onto the kept columns.
+            writer.writerow([row[i] for i in head])
+        handle.close()
+        # Close only a file opened here; the old code could close sys.stdout.
+        if opened is not None:
+            opened.close()
+
+
+    if args.row_matrix is not None:
+        handle = open(args.row_matrix)
+        reader = csv.reader(handle, delimiter=args.delim)
+        writer = None
+        opened = None
+        header = None
+        for row in reader:
+            if header is None:
+                header = row
+            elif row[0] in whitelist:
+                if writer is None:
+                    # First match: open the output and emit the header.
+                    writer, opened = open_writer()
+                    writer.writerow(header)
+                writer.writerow(row)
+        handle.close()
+        if opened is not None:
+            opened.close()
+
+
+    if args.bed is not None:
+        handle = open(args.bed)
+        reader = csv.reader(handle, delimiter=args.delim)
+        writer = None
+        opened = None
+        for row in reader:
+            # The fourth field (index 3) is matched against the whitelist.
+            if row[3] in whitelist:
+                if writer is None:
+                    writer, opened = open_writer()
+                writer.writerow(row)
+        handle.close()
+        if opened is not None:
+            opened.close()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_whitelist.xml	Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,26 @@
+<tool id="matrix_whitelist" name="Matrix WhiteList" version="1.0.0">
+  <description>Remove samples from matrix that aren't part of whitelist</description>
+  <command interpreter="python">matrix_whitelist.py -w $whiteList
+  -o $outfile
+#if str($mode) == "col":
+-c $matrix
+#end if
+#if str($mode) == "row":
+-r $matrix
+#end if
+  </command>
+  <inputs>
+	  <param name="matrix" type="data" label="Matrix"/>
+    <param name="mode" type="select" label="Whitelist Mode">
+      <option value="col">Column</option>
+      <option value="row">Row</option>
+    </param>
+    <param name="whiteList" type="data" label="Whitelist"/>
+  </inputs>
+  <outputs>
+      <data name="outfile" format="tabular"/>
+  </outputs>
+  <help>
+Matrix Whitelist
+  </help>
+</tool>