# HG changeset patch
# User kellrott
# Date 1356126196 18000
# Node ID eeaa112c9ee07997afb9bd77bd41728241ee6a23
Uploaded
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_flatten.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_flatten.py Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+
+import sys
+import csv
+from optparse import OptionParser
+
+if __name__ == "__main__":
+ parser = OptionParser()
+    parser.add_option("-m", "--missing", dest="missing", help="Value used to fill missing cells", default='')
+ opts, args = parser.parse_args()
+
+ pivot_col = args[0]
+ rep_cols = args[1:]
+
+ reader = csv.reader(sys.stdin, delimiter="\t")
+ writer = csv.writer(sys.stdout, delimiter="\t", lineterminator="\n")
+
+
+ ##Read in the matrix
+ header = None
+ cols = []
+ rows = []
+ col_pos = {}
+ for row in reader:
+ if header is None:
+ header = {}
+ for i,c in enumerate(row):
+ header[i] = c
+ cols.append(c)
+ col_pos[c] = i
+ else:
+ nrow = {}
+ for i,c in enumerate(cols):
+ nrow[c] = row[i]
+ rows.append(nrow)
+
+ ##determine all values in the pivot column
+ pivot_vals = {}
+ for r in rows:
+ if len(r[cols[col_pos[pivot_col]]]):
+ pivot_vals[ r[cols[col_pos[pivot_col]]] ] = True
+
+ o_cols = [0]
+ o_names = [header[0]]
+ o_pivot = [None]
+ for p in pivot_vals:
+        for rc in rep_cols:
+            if rc in col_pos:
+                # keep the pivot value, source column and output name aligned
+                o_pivot.append(p)
+                o_cols.append(cols[col_pos[rc]])
+                o_names.append(p + ":" + rc)
+ writer.writerow(o_names)
+
+ ids = {}
+ for r in rows:
+ ids[r[cols[0]]] = True
+
+ for i in ids:
+ out_row = [opts.missing] * len(o_cols)
+ out_row[0] = i
+ for r in rows:
+ if r[cols[0]] == i:
+ for c in range(1, len(o_cols)):
+ if r[pivot_col] == o_pivot[c]:
+ out_row[c] = r[o_cols[c]]
+ writer.writerow(out_row)
+
+
+
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_flatten.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_flatten.xml Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,48 @@
+
+ using a state column
+ matrix_flatten.py $input_tabular
+
+
+
+
+
+
+
+
+
+
+
+
+
+Use the state of one column to flatten other columns out into discrete columns.
+The first column is the primary key; the remaining columns are numbered starting at c1 (so in the example below, c1 is DrugType).
+Columns can be referenced either by their cN value or by their proper name (i.e. 'c1' or 'DrugType').
+
+Example File
+
++--------+----------+-----------+
+|#sample | DrugType| DrugDose |
++--------+----------+-----------+
+|sample_1| DrugA | 1 |
++--------+----------+-----------+
+|sample_1| DrugB | 2 |
++--------+----------+-----------+
+|sample_2| DrugA | 3 |
++--------+----------+-----------+
+|sample_2| DrugC | 5 |
++--------+----------+-----------+
+
+
+One might want to flatten the DrugDose column (c2) using the DrugType column (c1). Using Matrix Flatten, the matrix becomes:
+
+
++--------+---------------+----------------+----------------+
+|#sample |DrugA:DrugDose |DrugB:DrugDose |DrugC:DrugDose |
++--------+---------------+----------------+----------------+
+|sample_1| 1 | 2 | |
++--------+---------------+----------------+----------------+
+|sample_2| 3 | | 5 |
++--------+---------------+----------------+----------------+
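+
+Outside of Galaxy the script can also be run directly. It reads the matrix on
+standard input and takes the pivot column followed by the columns to flatten
+as arguments (the file names below are only illustrative)::
+
+    python matrix_flatten.py DrugType DrugDose < drugs.tab > flattened.tab
+
+The optional -m/--missing flag sets the value written into cells that have no
+data (empty by default).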
+
+
+
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_join.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_join.py Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+"""join.py:
+
+Usage:
+ join.py [options] file1 file2 [file3 ...]
+
+Options:
+    -h       input files contain a header row
+    -i       output only features present in all files (intersection)
+    -f       use 'float' mode to save memory
+    -q       run quietly
+    -o FILE  write output to FILE instead of stdout
+"""
+import os, os.path, sys, getopt, re
+import array
+
+delim = "\t"
+verbose = True
+
+def usage(code = 0):
+ sys.stderr.write( __doc__ )
+ if code != None: sys.exit(code)
+
+def log(msg, die = False):
+ if (verbose):
+ sys.stderr.write(msg)
+ if die:
+ sys.exit(1)
+
+def readFile(inFile, header = True, use_float = False):
+ dataWidth = None
+ dataMap = {}
+ f = open(inFile, "r")
+ if header:
+ line = f.readline()
+ if line.isspace():
+ log("ERROR: missing header\n", die = True)
+ pline = re.split(delim, line.rstrip("\n\r"))
+ dataMap["HEADER"] = pline
+ dataWidth = len(pline[1:])
+ for line in f:
+ if line.isspace():
+ continue
+ pline = re.split(delim, line.rstrip("\n\r"))
+ if dataWidth is None:
+ dataWidth = len(pline[1:])
+ assert(len(pline[1:]) == dataWidth)
+ if use_float:
+ out = array.array("f")
+ for a in pline[1:]:
+ try:
+ out.append(float(a))
+ except ValueError:
+ out.append(float('nan'))
+ else:
+ out = pline[1:]
+ dataMap[pline[0]] = out
+ f.close()
+ return (dataMap, dataWidth)
+
+def main(args):
+ ## parse arguments
+ try:
+ opts, args = getopt.getopt(args, "hiqfo:")
+ except getopt.GetoptError, err:
+ sys.stderr.write( str(err) + "\n" )
+ usage(2)
+
+ if len(args) > 0:
+ files = args
+ else:
+ files = []
+ for i in sys.stdin:
+ files.append(i.rstrip("\n\r"))
+
+ if len(files) < 2:
+ sys.stderr.write("incorrect number of arguments\n")
+ usage(1)
+
+ header = False
+ useIntersection = False
+ output = None
+ use_float = False
+ global verbose
+ for o, a in opts:
+ if o == "-h":
+ header = True
+ elif o == "-i":
+ useIntersection = True
+ elif o == "-q":
+ verbose = False
+ elif o == "-o":
+ output = a
+ elif o == "-f":
+ use_float = True
+
+ ## read files
+ fileData = {}
+ fileWidth = {}
+ for file in files:
+ (fileData[file], fileWidth[file]) = readFile(file, header = header, use_float = use_float)
+ features = list(set(fileData[files[0]].keys()) - set(["HEADER"]))
+ if useIntersection:
+ for file in files:
+ features = list(set(fileData[file].keys()) & set(features))
+ features.sort()
+
+ if output is not None:
+ ohandle = open(output, "w")
+ else:
+ ohandle = sys.stdout
+
+ ## output
+ if header:
+ lineElements = [fileData[files[0]]["HEADER"][0]]
+ for file in files:
+ lineElements += fileData[file]["HEADER"][1:]
+ ohandle.write("%s\n" % (delim.join(lineElements)))
+ for feature in features:
+ lineElements = []
+ for file in files:
+ if feature in fileData[file]:
+ lineElements += fileData[file][feature]
+ else:
+ lineElements += ["" for i in range(fileWidth[file])]
+ ohandle.write("%s\n" % (feature + delim + delim.join( (str(c) for c in lineElements)) ))
+
+if __name__ == "__main__":
+ main(sys.argv[1:])
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_join.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_join.xml Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,24 @@
+
+ Join Matrices using row labels
+ matrix_join.py -h
+-o $out
+#if $is_float:
+-f
+#end if
+#for $a in $in_mats:
+$a.file
+#end for
+
+
+
+
+
+
+
+
+
+
+
+Join matrices by row labels.
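+
+A sketch of the behaviour with two small matrices (made-up values)::
+
+    matrix 1           matrix 2           joined output
+    id   A   B         id   C             id   A   B   C
+    g1   1   2         g1   7             g1   1   2   7
+    g2   3   4         g3   8             g2   3   4
+
+Row labels are taken from the first matrix (or, with the intersection
+option, only rows present in every matrix are kept); cells missing from a
+matrix are left blank.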
+
+
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_merge.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_merge.py Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+
+import sys
+import csv
+
+class ClinicalMatrix:
+ corner_name = "sample"
+ def load(self, path):
+ self.col_map = {}
+ self.row_map = {}
+ pos_hash = None
+
+ handle = open(path)
+
+ self.matrix = []
+ for row in csv.reader(handle, delimiter="\t"):
+ if pos_hash is None:
+ pos_hash = {}
+ pos = 0
+ for name in row[1:]:
+ i = 1
+ orig_name = name
+ while name in pos_hash:
+ name = orig_name + "#" + str(i)
+ i += 1
+ pos_hash[name] = pos
+ pos += 1
+ else:
+ newRow = []
+ newRow = [""] * (len(pos_hash))
+ for col in pos_hash:
+ i = pos_hash[col] + 1
+ newRow[i - 1] = row[i]
+ self.row_map[row[0]] = len(self.matrix)
+ self.matrix.append(newRow)
+
+ self.col_map = {}
+ for col in pos_hash:
+ self.col_map[col] = pos_hash[col]
+
+ def get_row_list(self):
+ """
+ Returns names of rows
+ """
+ out = self.row_map.keys()
+ out.sort( lambda x,y: self.row_map[x]-self.row_map[y])
+ return out
+
+ def get_col_list(self):
+ """
+ Returns names of columns
+ """
+ out = self.col_map.keys()
+ out.sort( lambda x,y: self.col_map[x]-self.col_map[y])
+ return out
+
+ def get_row(self, row_name):
+ return self.matrix[ self.row_map[row_name] ]
+
+ def set_val(self, col_name, row_name, value):
+ """
+ Set cell value based on row and column names
+ """
+ self.matrix[self.row_map[row_name]][self.col_map[col_name]] = value
+
+ def get_val(self, col_name, row_name):
+ """
+ Get cell value based on row and column names
+ """
+ return self.matrix[self.row_map[row_name]][self.col_map[col_name]]
+
+
+ def init_blank(self, cols, rows):
+ """
+        Initialize a blank matrix (cells set to the empty string) using the
+        row and column names provided by the caller.
+ """
+ self.matrix = []
+ self.col_map = {}
+ self.row_map = {}
+ for i in range(len(rows)):
+ self.matrix.append([""]*len(cols))
+ for i, c in enumerate(cols):
+ self.col_map[c] = i
+ for i, r in enumerate(rows):
+ self.row_map[r] = i
+ self.loaded = True
+
+ def write(self, handle, missing=''):
+ write = csv.writer(handle, delimiter="\t", lineterminator='\n')
+ col_list = self.get_col_list()
+
+ write.writerow([self.corner_name] + col_list)
+        for rowName in self.get_row_list():  # preserve input row order
+ out = [rowName]
+ row = self.get_row(rowName)
+ for col in col_list:
+ val = row[self.col_map[col]]
+ out.append(val)
+ write.writerow(out)
+
+ def merge(self, other):
+ rows = {}
+        # collect the row labels that are part of this matrix
+ for r in self.get_row_list():
+ rows[r] = None
+ if other is not None:
+ for r in other.get_row_list():
+ rows[r] = None
+ cols = {}
+        # collect the column labels that are part of this matrix
+ for r in self.get_col_list():
+ cols[r] = None
+ if other is not None:
+ for r in other.get_col_list():
+ cols[r] = None
+ out = ClinicalMatrix()
+ out.init_blank(cols=cols, rows=rows)
+ for row in self.get_row_list():
+ for col in self.get_col_list():
+ out.set_val(col_name=col, row_name=row, value=self.get_val(col_name=col, row_name=row))
+ if other is not None:
+ for row in other.get_row_list():
+ for col in other.get_col_list():
+ out.set_val(col_name=col, row_name=row, value=other.get_val(col_name=col, row_name=row))
+ return out
+
+
+if __name__ == "__main__" :
+
+ matrix = None
+ for p in sys.argv[1:]:
+ nmatrix = ClinicalMatrix()
+ nmatrix.load(p)
+ matrix = nmatrix.merge(matrix)
+
+ matrix.write(sys.stdout, missing="")
+
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_merge.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_merge.xml Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,21 @@
+
+ Merge multiple matrices
+ matrix_merge.py
+#for $a in $in_mats:
+$a.file
+#end for
+> $out
+
+
+
+
+
+
+
+
+
+
+Join matrices by row and column labels. Unlike 'Matrix Join', which simply concatenates matrices
+so that their row labels match (without enforcing unique column labels), 'Matrix Merge' joins all the values using unique row and column labels. The last value for a row/column combination is used.
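+
+A sketch of the behaviour with two small matrices (made-up values)::
+
+    matrix 1             matrix 2              merged output
+    sample  age          sample  stage         sample  age  stage
+    s1      40           s2      II            s1      40
+    s2      55           s3      I             s2      55   II
+                                               s3           I
+
+Rows and columns are matched by label, and cells that appear in none of the
+inputs are left blank.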
+
+
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_rank_normalize.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_rank_normalize.py Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+
+import sys
+import time
+import array
+import csv
+import math
+import ctypes
+import ctypes.util
+import argparse
+
+libc = ctypes.cdll.LoadLibrary(ctypes.util.find_library("c"))
+
+def rankTransform(rank, total):
+ return rank / float(total)
+
+def py_cmp_float(a_ptr, b_ptr):
+ a = a_ptr.contents.value
+ b = b_ptr.contents.value
+ return (a > b) - (a < b)
+
+CMPFUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.POINTER(ctypes.c_float), ctypes.POINTER(ctypes.c_float))
+
+cmp_float = CMPFUNC(py_cmp_float)
+
+def csort(buf):
+ """
+    In-place sort of an array.array('f') buffer using the C qsort function.
+ """
+ addr, count = buf.buffer_info()
+ libc.qsort( ctypes.cast(addr, ctypes.POINTER(ctypes.c_float)), count, ctypes.sizeof(ctypes.c_float), cmp_float)
+
+
+def transformFile(fh, ofh, sep="\t", zero_drop=False, na2zero=False):
+
+ floatValues = None
+ cols = None
+ rows = None
+ totalValues = 0
+ reader = csv.reader(fh, delimiter=sep)
+ for row in reader:
+ if cols is None:
+ cols = row[1:]
+ numCols = len(cols)
+ rows = []
+ floatValues = array.array('f')
+ else:
+ rows.append(row[0])
+ assert(len(row)-1 == numCols)
+ for val in row[1:]:
+ try:
+ v = float(val)
+ floatValues.append(v)
+ if not zero_drop or v != 0.0:
+ totalValues += 1
+ except ValueError:
+ floatValues.append(float('nan'))
+
+ numRows = len(rows)
+ if (numRows == 0):
+ sys.stderr.write("Empty input\n")
+        sys.exit(10)
+
+ if totalValues == 0:
+ assert False, "did not read any values"
+
+ sortedValues = array.array('f')
+ for f in floatValues:
+ if f == f:
+ sortedValues.append(f)
+
+ csort(sortedValues)
+
+ i = 0
+ rankDict = dict()
+ for val in sortedValues:
+ if not math.isnan(val) and (not zero_drop or val != 0.0):
+ rankDict[val] = rankTransform(i, totalValues)
+ i += 1
+
+ def rowString(rowNum):
+ def matrixVal(colNum):
+ val = floatValues[rowNum*numCols + colNum]
+ if val in rankDict:
+ return ("%5g" % rankDict[val])
+ else:
+ if na2zero:
+ return "0"
+ else:
+ return "NA"
+ return "\t".join(map(matrixVal, range(numCols)))
+
+ ofh.write( "probe\t%s\n" % ("\t".join(cols)) )
+ for j in range(numRows):
+ ofh.write("%s\t%s\n" % (rows[j], rowString(j)))
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-s", "--sep", help="Seperator", default="\t")
+ parser.add_argument("-o", "--out", help="Output", default=None)
+ parser.add_argument("-n", "--na2zero", help="Change NAs to Zero", action="store_true", default=False)
+ parser.add_argument("-z", "--zero-drop", help="Drop Zero Values", action="store_true", default=False)
+ parser.add_argument("input", help="Input Matrix", default=None)
+
+ args = parser.parse_args()
+
+ if args.input == "-":
+ fh = sys.stdin
+ else:
+ fh = open(args.input)
+ if args.out is None:
+ ofh = sys.stdout
+ else:
+ ofh = open(args.out, "w")
+
+ transformFile(fh, ofh, args.sep, args.zero_drop, args.na2zero)
+ fh.close()
+ ofh.close()
+
+if __name__ == "__main__":
+ main()
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_rank_normalize.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_rank_normalize.xml Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,24 @@
+
+ Perform a rank-normalization transform on a number matrix
+ matrix_rank_normalize.py
+#if $dropZeros:
+-z
+#end if
+#if $na2zero:
+-n
+#end if
+$inMatrix -o $outMatrix
+
+
+
+
+
+
+
+
+
+
+Determines the rank of every number in the matrix and then normalizes the ranks into a
+distribution between 0 and 1.
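+
+A small worked example with default options (made-up values)::
+
+    input row:   probeA  5  10  20
+    output row:  probeA  0  0.333333  0.666667
+
+Each value's zero-based rank in the sorted list of all matrix values (0, 1, 2)
+is divided by the total number of values (3). NA cells remain NA unless the
+NA-to-zero option is selected.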
+
+
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_rank_segment.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_rank_segment.py Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+
+import sys
+import argparse
+
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-i", "--input", help="Input", default=None)
+ parser.add_argument("-o", "--output", help="Output", default=None)
+
+ parser.add_argument("-b", dest="bounds", help="Boundaries", type=float, nargs="*", default=None)
+ parser.add_argument("-v", dest="vals", help="Group Values", nargs="*", default=None)
+
+ args = parser.parse_args()
+
+ if len(args.bounds) != len(args.vals) - 1:
+ sys.stderr.write("Number of groups created by boundaries and group values does not match\n")
+ sys.exit(1)
+
+ head = True
+ ihandle = open(args.input)
+ ohandle = open(args.output, "w")
+ for line in ihandle:
+ if head:
+ ohandle.write(line)
+ head = False
+ else:
+ row = line.rstrip().split("\t")
+ out = [row[0]]
+ for a in row[1:]:
+ v = float(a)
+ if v <= args.bounds[0]:
+ o = args.vals[0]
+ if v > args.bounds[-1]:
+ o = args.vals[-1]
+ for i in range(1, len(args.bounds)):
+ if v > args.bounds[i-1] and v <= args.bounds[i]:
+ o = args.vals[i]
+ out.append(o)
+ ohandle.write("%s\n" % ("\t".join(out)))
+ ihandle.close()
+ ohandle.close()
+
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_rank_segment.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_rank_segment.xml Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,28 @@
+
+ Segment the values of a number matrix into discrete states using boundaries
+ matrix_rank_segment.py
+-v $value
+#for v in $bounds
+${v.value}
+#end for
+-b
+#for v in $bounds
+${v.range}
+#end for
+-i $inMatrix -o $outMatrix
+
+
+
+
+
+
+
+
+
+
+
+
+
+Use a set of boundaries to transform all the values in a matrix into a set of discrete states.
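+
+For example (a sketch with made-up option values), using boundaries 0.33 and
+0.66 with group values low, mid and high assigns::
+
+    value <= 0.33          -> low
+    0.33 < value <= 0.66   -> mid
+    value > 0.66           -> high
+
+There must be exactly one more group value than there are boundaries.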
+
+
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_saturate.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_saturate.py Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,336 @@
+#!/usr/bin/env python
+
+import sys
+import csv
+import argparse
+
+
+class IDDag:
+ def __init__(self):
+ self.graph = None
+
+ def load(self, path):
+ handle = open(path)
+ self.parents = {}
+ for line in handle:
+ tmp = line.rstrip().split("\t")
+ if tmp[0] not in self.parents:
+ self.parents[tmp[0]] = [ tmp[1] ]
+ else:
+ self.parents[tmp[0]].append( tmp[1] )
+
+ handle.close()
+
+ def get_key_list(self):
+ return self.parents.keys()
+
+ def get_by(self, key):
+ return self.parents[key]
+
+ def _build_graph(self):
+ self.graph = {}
+ self.rev_graph = {}
+ for pid in self.get_key_list():
+ if pid not in self.graph:
+ self.graph[pid] = {}
+ p = self.get_by(pid)
+ for cid in p:
+                # cid is the child label string; record forward and reverse edges
+                self.graph[pid][cid] = True
+                if cid not in self.rev_graph:
+                    self.rev_graph[cid] = {}
+                self.rev_graph[cid][pid] = True
+
+ def is_descendant(self, parent, child):
+ if self.graph is None:
+ self._build_graph()
+
+ cid = child
+ while cid in self.rev_graph:
+ cid = self.rev_graph[cid].keys()[0]
+ if cid == parent:
+ return True
+ return False
+
+ def _desc_crawl(self, parent):
+ out = {}
+ if parent in self.graph:
+ for node in self.graph[parent]:
+ if node is not None and len(node) and node != parent:
+ out[node] = True
+ for c in self._desc_crawl(node):
+ out[c] = True
+ return out.keys()
+
+
+ def get_descendants(self, parent):
+ if self.graph is None:
+ self._build_graph()
+ return self._desc_crawl(parent)
+
+ def get_children(self, node):
+ if self.graph is None:
+ self._build_graph()
+ if node in self.graph:
+ return self.graph[node]
+ return []
+
+ def get_parents(self, node):
+ if self.graph is None:
+ self._build_graph()
+ if node in self.rev_graph:
+ return self.rev_graph[node]
+ return []
+
+ def in_graph(self, name):
+ if self.graph is None:
+ self._build_graph()
+
+ if name in self.graph or name in self.rev_graph:
+ return True
+ return False
+
+
+class IDReducer(object):
+ """
+ The IDReducer class uses an IDDag to 'reduce' id's and objects to
+ common parent objects.
+
+ Assume Matrix 1 has aliquot ids like
+      - sample1-aliquot1
+      - sample2-aliquot1
+      - sample3-aliquot1
+    And that Matrix 2 has aliquot ids like
+      - sample1-aliquot2
+      - sample2-aliquot2
+      - sample3-aliquot2
+
+    Both files describe the same samples, but different aliquots were
+    run on different machines, producing matrices of different data types.
+    For data integration purposes, we need to refer to aliquots by their
+    parent sample name.
+
+ The idDag file for this data would be::
+
+        sample1 sample1-aliquot1
+        sample1 sample1-aliquot2
+        sample2 sample2-aliquot1
+        sample2 sample2-aliquot2
+        sample3 sample3-aliquot1
+        sample3 sample3-aliquot2
+
+    If this file were loaded into an IDDag and used to initialize an IDReducer,
+    the following transformations would be valid::
+
+ > idReducer.reduce_id( 'sample1-aliquot1' )
+ 'sample1'
+ > idReducer.reduce_id( 'sample1-aliquot2' )
+ 'sample1'
+
+ """
+ def __init__(self, idDag):
+ self.revGraph = {}
+ for pid in idDag.get_key_list():
+ p = idDag.get_by(pid)
+ for cid in p:
+                if cid not in self.revGraph:
+                    self.revGraph[cid] = {}
+                # the IDDag stores plain child-name strings, so no edge type is
+                # available; store None as the edge value
+                self.revGraph[cid][pid] = None
+
+ def reduce_id(self, id, edgeStop=None):
+ outID = id
+ while outID in self.revGraph:
+ pn = None
+ for p in self.revGraph[outID]:
+ if edgeStop is None or edgeStop != self.revGraph[outID][p]:
+ pn = p
+ if pn is None:
+ return outID
+ outID = pn
+ return outID
+
+ def reduce_matrix(self, matrix, edgeStop=None):
+ ncols = {}
+ rmap = {}
+ for col in matrix.get_col_list():
+ rval = self.reduce_id(col, edgeStop)
+ if rval not in ncols:
+ ncols[rval] = []
+ ncols[rval].append(col)
+ rmap[col] = rval
+        out = ClinicalMatrix()
+ out.init_blank( cols=ncols.keys(), rows=matrix.get_row_list() )
+ for row in matrix.get_row_list():
+ for col in ncols:
+ tmp = []
+ for nc in ncols[col]:
+ tmp.append( matrix.get_val( col_name=nc, row_name=row ) )
+ v = sum(tmp) / float(len(tmp))
+ out.set_val(row_name=row, col_name=col, value=v)
+ return out
+
+class IDExpander(object):
+
+ def __init__(self, idDag):
+ self.expGraph = {}
+ for pid in idDag.get_key_list():
+ p = idDag.get_by(pid)
+ if pid not in self.expGraph:
+ self.expGraph[pid] = []
+ for cid in p:
+ self.expGraph[pid].append(cid)
+
+ def expand_id(self, id, leaf_only=False):
+ out = {}
+ if id not in self.expGraph or len(self.expGraph[id]) == 0:
+ return [id]
+
+ for c in self.expGraph[id]:
+ if not leaf_only:
+ out[c] = True
+ for gc in self.expand_id(c, leaf_only):
+ out[gc] = True
+ return out.keys()
+
+ def expand_matrix(self, matrix, leaf_only=False):
+ nrows = {}
+ for row in matrix.get_row_list():
+ #if row in self.expGraph:
+ for e_val in self.expand_id(row, leaf_only):
+ if e_val not in nrows:
+ nrows[e_val] = []
+ nrows[e_val].append(row)
+ out = ClinicalMatrix()
+ out.init_blank( rows=sorted(nrows.keys()), cols=matrix.get_col_list() )
+
+ for row in nrows.keys():
+ for pid in nrows[row]:
+ for col in matrix.get_col_list():
+ out.set_val( row_name=row, col_name=col, value=matrix.get_val(row_name=pid, col_name=col))
+
+ #print nrows
+ return out
+
+
+class ClinicalMatrix:
+ corner_name = "sample"
+ def load(self, path):
+ self.col_map = {}
+ self.row_map = {}
+ pos_hash = None
+
+ handle = open(path)
+
+ self.matrix = []
+ for row in csv.reader(handle, delimiter="\t"):
+ if pos_hash is None:
+ pos_hash = {}
+ pos = 0
+ for name in row[1:]:
+ i = 1
+ orig_name = name
+ while name in pos_hash:
+ name = orig_name + "#" + str(i)
+ i += 1
+ pos_hash[name] = pos
+ pos += 1
+ else:
+ newRow = []
+ newRow = [""] * (len(pos_hash))
+ for col in pos_hash:
+ i = pos_hash[col] + 1
+ newRow[i - 1] = row[i]
+ self.row_map[row[0]] = len(self.matrix)
+ self.matrix.append(newRow)
+
+ self.col_map = {}
+ for col in pos_hash:
+ self.col_map[col] = pos_hash[col]
+
+ def get_row_list(self):
+ """
+ Returns names of rows
+ """
+ out = self.row_map.keys()
+ out.sort( lambda x,y: self.row_map[x]-self.row_map[y])
+ return out
+
+ def get_col_list(self):
+ """
+ Returns names of columns
+ """
+ out = self.col_map.keys()
+ out.sort( lambda x,y: self.col_map[x]-self.col_map[y])
+ return out
+
+ def get_row(self, row_name):
+ return self.matrix[ self.row_map[row_name] ]
+
+ def set_val(self, col_name, row_name, value):
+ """
+ Set cell value based on row and column names
+ """
+ self.matrix[self.row_map[row_name]][self.col_map[col_name]] = value
+
+ def get_val(self, col_name, row_name):
+ """
+ Get cell value based on row and column names
+ """
+ return self.matrix[self.row_map[row_name]][self.col_map[col_name]]
+
+
+ def init_blank(self, cols, rows):
+ """
+        Initialize a blank matrix (cells set to the empty string) using the
+        row and column names provided by the caller.
+ """
+ self.matrix = []
+ self.col_map = {}
+ self.row_map = {}
+ for i in range(len(rows)):
+ self.matrix.append([""]*len(cols))
+ for i, c in enumerate(cols):
+ self.col_map[c] = i
+ for i, r in enumerate(rows):
+ self.row_map[r] = i
+ self.loaded = True
+
+ def write(self, handle, missing=''):
+ write = csv.writer(handle, delimiter="\t", lineterminator='\n')
+ col_list = self.get_col_list()
+
+ write.writerow([self.corner_name] + col_list)
+        for rowName in self.get_row_list():  # preserve input row order
+ out = [rowName]
+ row = self.get_row(rowName)
+ for col in col_list:
+ val = row[self.col_map[col]]
+ out.append(val)
+ write.writerow(out)
+
+
+if __name__ == "__main__" :
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-c', '--col-matrix', help='Matrix to saturate by columns', dest="col_matrix", default=None)
+ #parser.add_argument('-r', '--row-matrix', help='Matrix to censor by rows', dest="row_matrix", default=None)
+ parser.add_argument('-d', '--iddag', help='IDDag to use for saturation', dest="iddag", default=None)
+ parser.add_argument("-o", "--out", help="Output File", dest="output", default=None)
+ parser.add_argument("-l", "--leaf-only", help="Lead Only", dest="leaf_only", action="store_true", default=False)
+
+ args = parser.parse_args()
+
+
+ matrix = ClinicalMatrix()
+ matrix.load(args.col_matrix)
+ iddag = IDDag()
+ iddag.load(args.iddag)
+
+ expander = IDExpander(iddag)
+ out = expander.expand_matrix(matrix, args.leaf_only)
+ if args.output is None:
+ out.write(sys.stdout)
+ else:
+ handle = open(args.output, "w")
+ out.write(handle)
+ handle.close()
\ No newline at end of file
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_saturate.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_saturate.xml Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,28 @@
+
+ Saturate all matrix values of a label with values from the parents
+ matrix_saturate.py
+#if $leaf_only
+--leaf-only
+#end if
+-c $matrix -d $idtree -o $outfile
+
+
+
+
+
+
+
+
+
+Saturate all matrix values of a label with values from its parents.
+An example would be an experiment where properties are recorded per cell line, with multiple experiments (i.e. multiple time points) per cell line. Each experiment would have a unique sample label, but it would be connected to the 'parent'
+cell line label. In order to fully populate the information about each experiment (so that the time-point experiments inherit the
+'tissue_of_origin' value from their cell line), 'saturation' copies parent data down the tree to all of the child labels.
+
+
+IDTree:
+    This is a description of the tree of sample id connections.
+    The format is two tab-separated columns, in the form of 'parent' 'child'.
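+
+A small sketch (made-up labels): given the IDTree::
+
+    cellLineA    cellLineA-t0
+    cellLineA    cellLineA-t1
+
+and a matrix whose rows are labelled by cell line, saturation writes output
+rows for cellLineA-t0 and cellLineA-t1 that each carry the values recorded
+for cellLineA.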
+
+
+
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_transpose.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_transpose.py Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+
+import string,sys
+import getopt
+import array
+
+if __name__ == "__main__":
+
+ opts, args = getopt.getopt(sys.argv[1:], "lf")
+    if len(args) != 2:
+        sys.stderr.write("usage: matrix_transpose.py [-l] [-f] input_matrix output_matrix\n")
+ sys.exit(2)
+
+ label_print = True
+    use_float = False  # off by default; enabled with -f
+ for o, a in opts:
+ if o == "-l":
+ label_print = False
+ if o == "-f":
+ use_float = True
+
+ fin= open(args[0],'r')
+ if args[1] == "-":
+ fout = sys.stdout
+ else:
+ fout= open(args[1],'w')
+
+ col_label = None
+ row_label = []
+ matrix=[]
+ for line in fin.readlines():
+ data = string.split(line.strip(),'\t')
+ if col_label is None:
+ col_label = data
+ else:
+ row_label.append(data[0])
+ if use_float:
+ o = array.array('f')
+ for i in data[1:]:
+ try:
+ o.append(float(i))
+ except ValueError:
+ o.append(float('nan'))
+ else:
+ o = data[1:]
+ matrix.append(o)
+
+ #header
+ out = []
+ if label_print:
+ out = [col_label[0]] + row_label
+ else:
+ out = row_label
+ fout.write("\t".join(out) + "\n")
+
+ #body
+    for col in range(0, len(col_label) - 1):  # one output line per data column
+ out = []
+ if label_print:
+ out.append(col_label[col+1])
+ for row in matrix:
+ out.append(str(row[col]))
+ fout.write("\t".join(out) + "\n")
+
+ fin.close()
+ fout.close()
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_transpose.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_transpose.xml Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,19 @@
+
+ Transpose a matrix
+ matrix_transpose.py
+#if $isfloat:
+-f
+#end if
+$infile $outfile
+
+
+
+
+
+
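+
+A minimal sketch of the transformation (made-up values)::
+
+    input                 output
+    id   s1   s2          id   g1   g2
+    g1   1    2           s1   1    3
+    g2   3    4           s2   2    4
+
+The float option parses values as floating point numbers to reduce memory
+use; non-numeric cells become nan.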
+
+
+
+
+
+
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_whitelist.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_whitelist.py Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+
+import sys
+import csv
+import argparse
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-c', '--col-matrix', help='Matrix to censor by columns', dest="col_matrix", default=None)
+ parser.add_argument('-r', '--row-matrix', help='Matrix to censor by rows', dest="row_matrix", default=None)
+ parser.add_argument('-b', '--bed', help='BED file to censor', dest="bed", default=None)
+ parser.add_argument('-w', '--whitelist', help='White list of samples', dest="white_list", default=None)
+ parser.add_argument("-d", "--delim", help="Field Delimiter (Default \t)", dest="delim", default="\t")
+ parser.add_argument("-o", "--out", help="Output File", dest="output", default=None)
+
+ args = parser.parse_args()
+
+ if args.white_list is None:
+ sys.stderr.write("Must Provide whitelist\n")
+ sys.exit(0)
+
+ whitelist = {}
+ handle = open(args.white_list)
+ for line in handle:
+ key = line.rstrip().split("\t")[0]
+ whitelist[key] = True
+
+ if args.col_matrix is not None:
+ handle = open(args.col_matrix)
+ reader = csv.reader(handle, delimiter=args.delim)
+ out = sys.stdout
+ writer = None
+ head = None
+ for row in reader:
+ if head is None:
+ head = [0]
+ orow = [row[0]]
+ for i, a in enumerate(row[1:]):
+ if a in whitelist:
+ head.append(i+1)
+ orow.append(a)
+
+ if len(orow) < 2:
+ break
+ if args.output is not None:
+ out = open(args.output, "w")
+ writer = csv.writer(out, delimiter="\t", lineterminator="\n")
+ writer.writerow(orow)
+ else:
+ orow = []
+ for i in head:
+ orow.append(row[i])
+ writer.writerow(orow)
+ handle.close()
+ if args.output is not None:
+ out.close()
+
+
+ if args.row_matrix is not None:
+ handle = open(args.row_matrix)
+ reader = csv.reader(handle, delimiter=args.delim)
+ out = sys.stdout
+ writer = None
+ header = None
+ for row in reader:
+ if header is None:
+ header = row
+ else:
+ if row[0] in whitelist:
+ if writer is None:
+ if args.output is not None:
+ out = open(args.output, "w")
+ writer = csv.writer(out, delimiter="\t", lineterminator="\n")
+ writer.writerow(header)
+ writer.writerow(row)
+ handle.close()
+ if writer is not None:
+ out.close()
+
+
+ if args.bed is not None:
+ handle = open(args.bed)
+ reader = csv.reader(handle, delimiter=args.delim)
+ out = sys.stdout
+ writer = None
+ for row in reader:
+ if row[3] in whitelist:
+ if writer is None:
+ if args.output is not None:
+ out = open(args.output, "w")
+ writer = csv.writer(out, delimiter="\t", lineterminator="\n")
+ writer.writerow(row)
+ handle.close()
+ if writer is not None:
+ out.close()
+
diff -r 000000000000 -r eeaa112c9ee0 matrix_manipulate/matrix_whitelist.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_whitelist.xml Fri Dec 21 16:43:16 2012 -0500
@@ -0,0 +1,26 @@
+
+ Remove samples from a matrix that are not in the whitelist
+ matrix_whitelist.py -w $whiteList
+ -o $outfile
+#if str($mode) == "col":
+-c $matrix
+#end if
+#if str($mode) == "row":
+-r $matrix
+#end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+Matrix Whitelist
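+
+The whitelist is a text file with one entry per line; only the first
+tab-separated field of each line is used as the id. In column mode the
+matrix keeps only whitelisted columns (plus the row-label column); in row
+mode it keeps only rows whose first field is whitelisted. A sketch of a
+direct invocation (the file names are only illustrative)::
+
+    python matrix_whitelist.py -w keep.txt -c matrix.tab -o filtered.tab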
+
+