Mercurial > repos > kellrott > matrix_manipulate

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/aggregate.pl~	Thu Jun 13 16:54:15 2013 -0400
@@ -0,0 +1,219 @@
+#!/usr/bin/perl
+
+##############################################################################
+##############################################################################
+##
+## aggregate.pl
+##
+##############################################################################
+##############################################################################
+##
+## Written by Josh Stuart in the lab of Stuart Kim, Stanford University.
+##
+##  Email address: jstuart@stanford.edu
+##          Phone: (650) 725-7612
+##
+## Postal address: Department of Developmental Biology
+##                 Beckman Center Room B314
+##                 279 Campus Dr.
+##                 Stanford, CA 94305
+##
+##       Web site: http://www.smi.stanford.edu/people/stuart
+##
+##############################################################################
+##############################################################################
+##
+## Written: 00/00/02
+## Updated: 00/00/02
+##
+##############################################################################
+##############################################################################
+
+## Support for MEDIAN calculation added by Alex Williams, 2007
+
+require "$ENV{MYPERLDIR}/lib/libfile.pl";
+require "$ENV{MYPERLDIR}/lib/libstats.pl";
+
+use strict;
+use warnings;
+
+my @flags   = (
+                  [    '-q', 'scalar',     0,     1]
+                , [    '-k', 'scalar',     1, undef]
+                , [    '-d', 'scalar',  "\t", undef]
+                , [    '-h', 'scalar',     0, undef]
+                , [  '-sig', 'scalar',     3, undef]
+                , [    '-f', 'scalar','mean', undef]
+                , ['--file', 'scalar',   '-', undef]
+			    , ['--emptyval', 'scalar',     'NaN',     undef]
+			    , ['--test', 'scalar',     0,     1]
+              );
+
+my %args = %{&parseArgs(\@ARGV, \@flags)};
+
+if(exists($args{'--help'}))
+{
+   print STDOUT <DATA>;
+   exit(0);
+}
+
+my $emptyVal = $args{'--emptyval'};
+my $runTest  = $args{'--test'}; # <-- this doesn't do anything right now. Theoretically it should run a test to make sure the values actually work for a few known cases (sanity check for this program)
+my $verbose  = not($args{'-q'});
+my $col      = int($args{'-k'}) - 1;
+my $delim    = $args{'-d'};
+my $function = lc($args{'-f'}); # lower-case whatever the function name was
+my $headers  = $args{'-h'};
+my $sigs     = $args{'-sig'};
+my $file     = $args{'--file'};
+
+my $sprintf_ctrl = '%.' . $sigs . 'f';
+
+
+if ($function ne 'mean' && $function ne 'median') {
+	die "ERROR in aggregate.pl: You must specify a function ( -f  FUNCTION_NAME ). The supported functions are mean and median.\n";
+}
+
+# my ($ids, $rows) = &readIds($file, $col, $delim);
+# my $data = &readDataMatrix($file, $col, $delim, \$max_cols);
+my ($data, $ids, $rows, $max_cols) = &readDataAndIds($file, $col, $delim);
+
+for(my $i = 0; $i < scalar(@{$rows}) and $i < $headers; $i++) {
+   print $$ids[$i], $delim, join($delim, @{$$data[$i]}), "\n";
+}
+
+for(my $i = $headers; $i < scalar(@{$rows}); $i++)
+{
+   my $id = $$ids[$i]; # key printed in the first column of this output line
+
+   my $useMedian = ($function eq 'median');
+   my $useMean   = ($function eq 'mean');
+
+   my @sum;   # per-column total of the numeric entries seen for this key
+   my @count; # per-column number of numeric entries seen for this key
+   my @medianCalcArray; # this is actually just a list of all the items in the column for each key
+   # Note: medianCalcArray is an array of ARRAYS.
+
+   # A cell counts as data only if it is a well-formed decimal number: optional
+   # sign, digits with an optional fraction (or a bare fraction), optional exponent.
+   my $numeric = qr/^\s*[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?\s*$/;
+
+   for(my $j = 0; $j < $max_cols; $j++) {
+      $sum[$j] = 0;
+      $count[$j] = 0;
+   }
+
+   # $$rows[$i] lists the indices into @$data of every row sharing this key.
+
+   for(my $k = 0; $k < scalar(@{$$rows[$i]}); $k++) {
+	   my $row = $$rows[$i][$k];
+	   # Fold every column of this row into the running aggregates.
+	   for(my $j = 0; $j < $max_cols; $j++) {
+		   my $thisEntry = $$data[$row][$j];
+		   if(defined($thisEntry)) {
+			   if($thisEntry =~ $numeric) {
+				   $count[$j]++;
+				   $sum[$j] += $thisEntry;
+				   if ($useMedian) {
+					   if (!defined($medianCalcArray[$j])) {
+						   @{$medianCalcArray[$j]} = ();
+					   }
+					   push(@{$medianCalcArray[$j]}, $thisEntry);
+				   }
+			   }
+		   }
+	   }
+   }
+
+   my @agg;
+   for(my $j = 0; $j < $max_cols; $j++) {
+      $agg[$j] = ${emptyVal};
+      if($useMean) {
+         $agg[$j] = ($count[$j] > 0) ? sprintf($sprintf_ctrl, ($sum[$j] / $count[$j])) : ${emptyVal};
+      }
+	  if($useMedian) {
+		  # Only calculate the median if we actually specifically want it... otherwise it slows us down
+		  if (defined($medianCalcArray[$j]) && (scalar(@{$medianCalcArray[$j]}) > 0) ) {
+			  $agg[$j] = vec_median(\@{$medianCalcArray[$j]}); # <-- vec_median is in libstats.pl
+		  }
+	  }
+	  if ($useMean && $useMedian) { die "Error in arguments to aggregate.pl: You cannot specify both *mean* AND *median* at the same time! (We would be overwriting the storage variable!) You will have to run the program twice, once with each option.\n"; }
+   }
+
+   print STDOUT $id, (($max_cols > 0) ? ($delim . join($delim, @agg)) : ""), "\n";
+}
+
+exit(0);
+
+
+__DATA__
+syntax: aggregate.pl [OPTIONS]
+
+Combines the numeric data across rows with the same key. Useful if you have experiments
+with replicates. See below for a complete example.
+
+Example usage:  aggregate.pl -f median MYFILE
+
+OPTIONS are:
+
+-q: Quiet mode (default is verbose)
+
+-k COL: Use the column COL as the key column. The script uses the entries found
+        on each line of this column as keys. Duplicates are merged by applying
+        an aggregation function for each value in their records.
+
+-d DELIM: Set the field delimiter to DELIM (default is tab).
+
+-h NUM: The number of headers in the input file (default is 0).
+
+-f FUNCTION: Set the aggregation function to FUNCTION (default is mean). The
+             possible values are:
+
+                 mean: The mean of the values (default)  (-f mean)
+
+                 median: The median of the values.  (-f median)
+
+--emptyval VALUE: Sets the "empty"/"no data" values to VALUE. (default is NaN)
+             If an output value has no input data, then this will be the output.
+
+EXAMPLE:
+
+Works like this:
+
+If the input file INPUTFILE has five columns like below (tab-delimited, although
+spaces are shown here): (Note that Column 3 (C3) is blank for all rows except
+for the last one)
+
+Column_1  Column_2  C3  C4  C5
+v                v   v   v   v
+-------------------------------- (Sample file is below)
+Experiment_Alpha 1       0   77
+Experiment_Alpha 2       0
+Expr_Beta        10
+Expr_Beta        30
+Experiment_Alpha 3       6
+Expr_Beta            5
+
+
+And you type:   aggregate.pl -f mean THE_FILE
+
+Then the output will be the *mean* values for each experiment, across all rows:
+
+Experiment_Alpha   2.0   NaN   2.0   77
+Expr_Beta          20    5     NaN   NaN
+
+Note that the "77" case (the last item in the first row) is the correct mean,
+because the other Experiment_Alpha items for that column do not have any data at
+all. Even though there are 3 rows labeled "Experiment_Alpha", only one of them
+has data for the last column (column 4), so 77 is the mean. The output is always
+a matrix (although it could be a single-column matrix). Empty values are padded
+with NaN (although you can change this with --emptyval).
+
+
+TO DO / FUTURE WORK:
+
+Future possibility (NOT IMPLEMENTED YET): smean: Standardized mean (mean/stddev).
+
+KNOWN BUGS / ISSUES:
+
+None so far.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/floatMatrix.py	Thu Jun 13 16:54:15 2013 -0400
@@ -0,0 +1,335 @@
+import array
+import math
+import csv
+
+def union(*input):  # collect the distinct items found in any of the given iterables
+    o = {}  # used as a set: the keys are the items seen so far
+    for i in input:
+        for a in i:
+            o[a]= True
+    return o.keys()
+
+def intersect(a, b):  # items of `a` that are also in `b`, in the iteration order of `a`
+    o = []
+    for i in a:
+        if i in b:
+            o.append(i)
+    return o
+
+class DataException(Exception):
+    pass
+
+class NamedVector(dict):
+
+    def init_blank(self, names):
+        for a in names:
+            self[a] = float("nan")
+
+    def __getitem__(self, key):
+        if isinstance(key, NamedVector):
+            o = NamedVector()
+            for a in key:
+                if key[a]:
+                    o[a] = self.get(a)
+            return o
+        else:
+            return self.get(key)
+
+
+    def __add__(self, other):
+        o = NamedVector()
+        if isinstance(other, NamedVector):
+            for a in union(self, other):
+                o[a] = self.get(a, 0.0) + other.get(a, 0.0)
+        else:
+            for a in self:
+                o[a] = self[a] + other
+        return o
+
+    def __div__(self, other):
+        o = NamedVector()
+        if isinstance(other, NamedVector):
+            for a in intersect(self, other):
+                o[a] = self.get(a) / other.get(a)
+        else:
+            for a in self:
+                o[a] = self[a] / other
+        return o
+
+
+    def __mul__(self, other):
+        o = NamedVector()
+        if isinstance(other, NamedVector):
+            for a in intersect(self, other):
+                o[a] = self.get(a) * other.get(a)
+        else:
+            for a in self:
+                o[a] = self[a] * other
+        return o
+
+    def __sub__(self, other):
+        o = NamedVector()
+        if isinstance(other, NamedVector):
+            for a in intersect(self, other):
+                o[a] = self.get(a) - other.get(a)
+        else:
+            for a in self:
+                o[a] = self[a] - other
+        return o
+
+    def __gt__(self, other):
+        o = NamedVector()
+        if isinstance(other, NamedVector):
+            for a in intersect(self, other):
+                o[a] = self.get(a) > other.get(a)
+        else:
+            for a in self:
+                o[a] = self[a] > other
+        return o
+
+    def __eq__(self, other):
+        o = NamedVector()
+        if isinstance(other, NamedVector):
+            for a in intersect(self, other):
+                o[a] = self.get(a) == other.get(a)
+        else:
+            for a in self:
+                o[a] = self[a] == other
+        return o
+
+    def __ne__(self, other):
+        o = NamedVector()
+        if isinstance(other, NamedVector):
+            for a in intersect(self, other):
+                o[a] = self.get(a) != other.get(a)
+        else:
+            for a in self:
+                o[a] = self[a] != other
+        return o
+
+
+    def __lt__(self, other):
+        o = NamedVector()
+        if isinstance(other, NamedVector):
+            for a in intersect(self, other):
+                o[a] = self.get(a) < other.get(a)
+        else:
+            for a in self:
+                o[a] = self[a] < other
+        return o
+
+    def pow(self, other):
+        o = NamedVector()
+        if isinstance(other, NamedVector):
+            for a in intersect(self, other):
+                o[a] = math.pow(self.get(a), other.get(a))
+        else:
+            for a in self:
+                o[a] = math.pow(self[a], other)
+        return o
+
+    def log(self, base=math.e):
+        o = NamedVector()
+        for a in self:
+            o[a] = math.log(self[a], base)
+        return o
+
+    def sqrt(self):
+        o = NamedVector()
+        for a in self:
+            o[a] = math.sqrt(self[a])
+        return o
+
+    def array(self, names):
+        o = array.array('f')
+        for a in names:
+            o.append( self.get(a, float('nan')) )
+        return o
+
+    def set_nan(self, value=0.0):
+        for a in self:
+            if math.isnan(self[a]):
+                self[a] = value
+
+    def sum(self):
+        return sum(self.values())
+
+
+
+class FloatMatrix:
+    """
+    array.array based float matrix class
+    """
+    def __init__(self):
+        self.corner_name = "probe"
+        self.data = None
+        self.nrows = None
+        self.ncols = None
+        self.rowmap = None
+        self.colmap = None
+
+    def read(self, handle):
+        header = None
+        for line in handle:
+            row = line.rstrip().split("\t")
+            if header is None:
+                header = row
+                self.data = array.array("f")
+                self.colmap = {}
+                self.rowmap = {}
+                self.ncols = len(row) - 1
+                self.nrows = 0
+                for i, c in enumerate(row[1:]):
+                    self.colmap[c] = i
+            else:
+                if len(row) - 1 != self.ncols:
+                    raise DataException("Misformed matrix")
+                self.rowmap[row[0]] = len(self.rowmap)
+                a = []
+                for v in row[1:]:
+                    try:
+                        a.append(float(v))
+                    except ValueError:
+                        a.append(float('Nan'))
+                self.data.extend(a)
+                self.nrows += 1
+
+    def init_blank(self, rows, cols):
+        self.data = array.array("f")
+        self.colmap = {}
+        for i,c in enumerate(cols):
+            self.colmap[c] = i
+        self.rowmap = {}
+        for i,r in enumerate(rows):
+            self.rowmap[r] = i
+        self.ncols = len(cols)
+        self.nrows = len(rows)
+        for i in range(self.nrows):
+            self.data.extend([float('nan')] * self.ncols)
+
+
+    def size(self):
+        return {'rows' : len(self.rowmap), 'cols' : len(self.colmap)}
+
+    def has_row(self, row_name):
+        return row_name in self.rowmap
+
+    def has_col(self, col_name):
+        return col_name in self.colmap
+
+    def values(self):
+        return self.data
+
+    def get_value(self, row_name, col_name):
+        return self.data[ self.rowmap[row_name] * self.ncols + self.colmap[col_name] ]
+
+    def set_value(self, row_name, col_name, value):
+        self.data[ self.rowmap[row_name] * self.ncols + self.colmap[col_name] ] = value
+
+    def get_row(self, row_name):
+        if row_name not in self.rowmap:
+            raise KeyError
+        out = NamedVector()
+        out.init_blank( self.colmap )
+        for c in self.colmap:
+            out[c] = self.data[ self.rowmap[row_name] * self.ncols + self.colmap[c] ]
+        return out
+
+    def get_col(self, col_name):
+        if col_name not in self.colmap:
+            raise KeyError
+        out = NamedVector()
+        out.init_blank( self.rowmap )
+        for r in self.rowmap:
+            out[r] = self.data[ self.rowmap[r] * self.ncols + self.colmap[col_name] ]
+        return out
+
+
+    def set_row(self, row_name, row_data):
+        if row_name not in self.rowmap:
+            raise KeyError
+        row_offset = self.rowmap[row_name] * self.ncols
+        for c in row_data:
+            if c in self.colmap:
+                self.data[ row_offset + self.colmap[c] ] = row_data[c]
+
+    def get_cols(self):
+        if self.colmap is None:
+            return None
+        return self.colmap.keys()
+
+    def get_rows(self):
+        if self.rowmap is None:
+            return None
+        return self.rowmap.keys()
+
+    def select(self, rows=None, cols=None):
+        if rows is None:
+            rows = self.get_rows()
+        if cols is None:
+            cols = self.get_cols()
+        out = FloatMatrix()
+        out.init_blank(rows=rows, cols=cols)
+        for row in rows:
+            if self.has_row(row):
+                r = self.get_row(row)
+                out.set_row(row, r)
+        return out
+
+
+    def write(self, handle, missing='NA', row_select=None, col_select=None):
+        write = csv.writer(handle, delimiter="\t", lineterminator='\n')
+
+        col_list = []
+        if col_select is None:
+            col_list = self.get_cols()
+        else:
+            for c in self.get_cols():
+                if c in col_select:
+                    col_list.append(c)
+
+        write.writerow([self.corner_name] + col_list)
+        for rowName in self.rowmap:
+            if row_select is None or rowName in row_select:
+                out = [rowName]
+                row = self.get_row(rowName)
+                for col in col_list:
+                    val = row[col]
+                    if val is None or math.isnan(val):
+                        val = missing
+                    else:
+                        val = "%.5f" % (val)
+                    out.append(val)
+                write.writerow(out)
+
+    def set_nan(self, value=0.0):
+        for i in range(len(self.data)):
+            if math.isnan(self.data[i]):
+                self.data[i] = value
+
+    def row_sums(self):
+        out = NamedVector()
+        for rowName in self.rowmap:
+            row = self.get_row(rowName)
+            out[rowName] = row.sum()
+        return out
+
+    def merge(self, other):
+        out = FloatMatrix()
+        out.init_blank( rows=union(self.get_rows(), other.get_rows()), cols=union( self.get_cols(), other.get_cols()))
+
+        for rowName in self.rowmap:
+            for colName in self.colmap:
+                out.set_value( row_name=rowName, col_name=colName, value=self.get_value(row_name=rowName, col_name=colName))
+
+        for rowName in other.rowmap:
+            for colName in other.colmap:
+                out.set_value( row_name=rowName, col_name=colName, value=other.get_value(row_name=rowName, col_name=colName))
+
+        return out
+
+    def toRmatrix(self, r):
+        out = r.matrix(self.data, ncol=self.ncols, dimnames=[ self.get_rows(), self.get_cols() ], byrow=True)
+        return out
+
+
Binary file matrix_manipulate/floatMatrix.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_calc.py	Thu Jun 13 16:54:15 2013 -0400
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import csv
+import re
+import math
+import argparse
+import floatMatrix
+
+def value_eval(code, value):  # evaluate the expression string `code` with `value` bound to the given vector
+    funcmap = {  # the only names visible to the evaluated expression
+        "len":len,
+        "value" : value,
+        "re" : re,
+        "math" : math,
+        "sum" : sum,
+        "float" : float
+    }
+    return eval(code,{"__builtins__":None},funcmap)  # NOTE(review): eval of user-supplied code; clearing __builtins__ is not a real sandbox
+
+def dict_dict_2_table(indata):
+    all_labels = {}
+    for r in indata:
+        for c in indata[r]:
+            all_labels[c] = True
+    head = all_labels.keys()
+    out = []
+    out.append( ["#"] + head )
+    for r in indata:
+        o = [r]
+        for h in head:
+            o.append( str(indata[r].get(h,"")) )
+        out.append(o)
+    return out
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--col-eval', help='Column Eval', dest="col_eval", default=None)
+    parser.add_argument('-r', '--row-eval', help='Row Eval', dest="row_eval", default=None)
+
+    parser.add_argument("-o", "--out", help="Output File", dest="output", default=None)
+    parser.add_argument("input", help="Input Matrix", default=None)
+
+    args = parser.parse_args()
+
+    if args.input == "-":
+        ihandle = sys.stdin
+    else:
+        ihandle = open(args.input)
+
+    if args.output is None:
+        ohandle = sys.stdout
+    else:
+        ohandle = open(args.output, "w")
+
+    matrix = floatMatrix.FloatMatrix()
+    matrix.read(ihandle)
+    ihandle.close()
+
+    out = {}
+
+    if args.col_eval is not None and len(args.col_eval):
+        for col in matrix.get_cols():
+            value = matrix.get_col(col)
+            out[col] = value_eval(args.col_eval, value)
+        table = dict_dict_2_table(out)
+        writer = csv.writer(ohandle, delimiter="\t", lineterminator="\n")
+        for i in range(len(table[0])):
+            o = []
+            for r in table:
+                o.append(r[i])
+            writer.write_row(o)
+    elif args.row_eval is not None and len(args.row_eval):
+        for row in matrix.get_rows():
+            value = matrix.get_row(row)
+            out[row] = value_eval(args.row_eval, value)
+        table = dict_dict_2_table(out)
+        writer = csv.writer(ohandle, delimiter="\t", lineterminator="\n")
+        writer.writerows(table)
+    ohandle.close()
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_calc.xml	Thu Jun 13 16:54:15 2013 -0400
@@ -0,0 +1,48 @@
+<tool id="matrix_calc" name="Matrix Calculate" version="1.0.0">
+	<description>Perform calculations on matrix rows and columns using python statements</description>
+	<command interpreter="python">./matrix_calc.py
+#if len($row_txt) > 0:
+--row-filter-file $row_eval_txt
+#end if
+#if len($col_txt) > 0:
+--col-filter-file $col_eval_txt
+#end if
+$matrix
+-o ${outfile}
+	</command>
+	<inputs>
+		<param name="row_txt" type="text" area="True" size="5x35" label="Row Eval Code" optional="True">
+			<sanitizer>
+				<valid initial="string.printable">
+					<remove value="&quot;"/>
+				</valid>
+				<mapping initial="none">
+					<add source="&quot;" target="\&quot;"/>
+					<add source="\" target="\\"/>
+				</mapping>
+			</sanitizer>
+		</param>
+		<param name="col_txt" type="text" area="True" size="5x35" label="Column Eval Code" optional="True">
+			<sanitizer>
+				<valid initial="string.printable">
+					<remove value="&quot;"/>
+				</valid>
+				<mapping initial="none">
+					<add source="&quot;" target="\&quot;"/>
+				</mapping>
+			</sanitizer>
+		</param>
+		<param name="matrix" type="data" format="tabular" label="Matrix"/>
+	</inputs>
+	<outputs>
+		<data format="tabular" name="outfile" />
+	</outputs>
+	<configfiles>
+       	<configfile name="row_eval_txt">${row_txt}</configfile>
+       	<configfile name="col_eval_txt">${col_txt}</configfile>
+	</configfiles>
+
+	<help>
+This is a utility to perform calculations on the rows and columns of a matrix file.
+	</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_cat.py	Thu Jun 13 16:54:15 2013 -0400
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+"""matrix_cat.py:
+
+Usage:
+  matrix_cat.py [options] file1 file2 [file3 ...]
+
+Options:
+  -h            skip header
+  -i            output only features in common with all files
+  -f            use 'float' mode to save memory
+  -o <outfile>  Output file
+  -m <missing>  Missing value string
+  -q            run quietly
+"""
+import os, os.path, sys, getopt, re
+import array
+
+delim = "\t"
+verbose = True
+
+def usage(code = 0):
+    sys.stderr.write( __doc__ )
+    if code != None: sys.exit(code)
+
+def log(msg, die = False):
+    if (verbose):
+        sys.stderr.write(msg)
+    if die:
+        sys.exit(1)
+
+def readFile(inFile, header = True, use_float = False):
+    dataWidth = None
+    dataHeader = None
+    dataMap = {}
+    f = open(inFile, "r")
+    if header:
+        line = f.readline()
+        if line.isspace():
+            log("ERROR: missing header\n", die = True)
+        pline = re.split(delim, line.rstrip("\n\r"))
+        dataHeader = pline
+        dataWidth = len(pline[1:])
+    for line in f:
+        if line.isspace():
+            continue
+        pline = re.split(delim, line.rstrip("\n\r"))
+        if dataWidth is None:
+            dataWidth = len(pline[1:])
+        assert(len(pline[1:]) == dataWidth)
+        if use_float:
+            out = array.array("f")
+            for a in pline[1:]:
+                try:
+                    out.append(float(a))
+                except ValueError:
+                    out.append(float('nan'))
+        else:
+            out = pline[1:]
+        dataMap[pline[0]] = out
+    f.close()
+    return (dataMap, dataHeader, dataWidth)
+
+def main(args):
+    ## parse arguments
+    try:
+        opts, args = getopt.getopt(args, "hiqfo:m:")
+    except getopt.GetoptError, err:
+        sys.stderr.write( str(err) + "\n" )
+        usage(2)
+
+    if len(args) > 0:
+        files = args
+    else:
+        files = []
+        for i in sys.stdin:
+           files.append(i.rstrip("\n\r"))
+
+    if len(files) < 1:
+        sys.stderr.write("incorrect number of arguments\n")
+        usage(1)
+
+    header = True
+    useIntersection = False
+    output = None
+    use_float = False
+    missing = ""
+    global verbose
+    for o, a in opts:
+        if o == "-h":
+            header = False
+        elif o == "-i":
+            useIntersection = True
+        elif o == "-q":
+            verbose = False
+        elif o == "-o":
+            output = a
+        elif o == "-f":
+            use_float = True
+        elif o == "-m":
+            missing = a
+
+    ## read files
+    fileData = {}
+    fileWidth = {}
+    fileHeader = {}
+    for file in files:
+        (fileData[file], fileHeader[file], fileWidth[file]) = readFile(file, header = header, use_float = use_float)
+    header_dict = {}
+    for file in files:
+        for f in fileHeader[file][1:]:
+            header_dict[f] = True
+
+    headers = list(header_dict.keys())
+    if useIntersection:
+        for file in files:
+            features = list(set(fileHeader[file].keys()) & set(headers))
+    headers.sort()
+
+    if output is not None:
+        ohandle = open(output, "w")
+    else:
+        ohandle  = sys.stdout
+
+    ## output
+    if header:
+        ohandle.write("#\t%s\n" % (delim.join(headers)))
+    for file in files:
+        for feature in fileData[file]:
+            lineElements = []
+            for h in headers:
+                try:
+                    index = fileHeader[file].index(h)
+                    lineElements.append(fileData[file][feature][index-1])
+                except ValueError:
+                    lineElements.append(missing)
+            ohandle.write("%s\n" % (feature + delim + delim.join( (str(c) for c in lineElements)) ))
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_cat.xml	Thu Jun 13 16:54:15 2013 -0400
@@ -0,0 +1,26 @@
+<tool id="matrix_cat" name="Matrix Cat" version="1.0.0">
+	<description>Join Matrices using column labels</description>
+	<command interpreter="python">matrix_cat.py
+-o $out
+-m '$missing_value'
+#if $is_float:
+-f
+#end if
+#for $a in $in_mats:
+$a.file
+#end for
+	</command>
+	<inputs>
+		<repeat name="in_mats" title="Input Matrix" min="1">
+			<param name="file" type="data" label="Matrix File"/>
+		</repeat>
+		<param name="missing_value" type="text" label="Missing Value" value="NA"/>
+		<param name="is_float" type="boolean" label="Float Values" help="If all matrices are floating point numbers, use to save memory"/>
+	</inputs>
+	<outputs>
+		<data name="out" format="tabular" label="Cat Matrix" help="Cat Matrix"/>
+	</outputs>
+	<help>
+Join matrices by column labels: the rows of every input are stacked and columns are aligned by their header label.
+	</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_edit.py	Thu Jun 13 16:54:15 2013 -0400
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import csv
+import re
+import math
+import argparse
+
+def value_eval(code, value):
+    funcmap = {
+        "len":len,
+        "value" : value,
+        "re" : re,
+        "math" : math,
+        "float" : float
+    }
+    out = eval(code,{"__builtins__":None},funcmap)
+    return str(out)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--col-eval', help='Column Eval', dest="col_eval", default=None)
+    parser.add_argument('-r', '--row-eval', help='Row Eval', dest="row_eval", default=None)
+    parser.add_argument('-m', '--cell-eval', help='Cell Eval', dest="cell_eval", default=None)
+
+    parser.add_argument("-o", "--out", help="Output File", dest="output", default=None)
+    parser.add_argument("input", help="Input Matrix", default=None)
+
+    args = parser.parse_args()
+
+    if args.input == "-":
+        ihandle = sys.stdin
+    else:
+        ihandle = open(args.input)
+
+    if args.output is None:
+        ohandle = sys.stdout
+    else:
+        ohandle = open(args.output, "w")
+
+    reader = csv.reader(ihandle, delimiter="\t")
+    writer = csv.writer(ohandle, delimiter="\t", lineterminator="\n")
+
+    header = True
+    for row in reader:
+        if header:
+            if args.col_eval is not None and len(args.col_eval):
+                for i, val in enumerate(row[1:]):
+                    row[i+1] = value_eval(args.col_eval, val)
+            header = False
+        else:
+            if args.row_eval is not None and len(args.row_eval):
+                row[0] = value_eval(args.row_eval, row[0])
+            if args.cell_eval is not None and len(args.cell_eval):
+                for i in range(1,len(row)):
+                    row[i] = value_eval(args.cell_eval,row[i])
+        writer.writerow(row)
+
+    ihandle.close()
+    ohandle.close()
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_filter.py	Thu Jun 13 16:54:15 2013 -0400
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+
+import sys
+import argparse
+import array
+import math
+import csv
+import floatMatrix
+
+def value_eval(code, values, label, label_set):
+    funcmap = {
+        "len":len,
+        "values" : values,
+        "label" : label,
+        "label_set" : label_set,
+        "math" : math,
+        "sum" : sum,
+        "min" : min,
+        "max" : max
+     }
+    return eval(code,{"__builtins__":None},funcmap)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--col-filter', help='Column Filter', dest="col_filter", default=None)
+    parser.add_argument('--col-filter-file', help='Column Filter File', default=None)
+    parser.add_argument('-r', '--row-filter', help='Row Filter', dest="row_filter", default=None)
+    parser.add_argument('--row-filter-file', help='Row Filter File', default=None)
+
+    parser.add_argument("-o", "--out", help="Output File", dest="output", default=None)
+    parser.add_argument("input", help="Input Matrix", default=None)
+
+    args = parser.parse_args()
+
+    if args.input == "-":
+        handle = sys.stdin
+    else:
+        handle = open(args.input)
+
+    fm = floatMatrix.FloatMatrix()
+    fm.read(handle)
+
+    col_filter = None
+    row_filter = None
+
+    if args.col_filter:
+        col_filter = args.col_filter
+    if args.col_filter_file:
+        handle = open(args.col_filter_file)
+        col_filter = handle.read()
+        handle.close()
+
+    if args.row_filter:
+        row_filter = args.row_filter
+    if args.row_filter_file:
+        handle = open(args.row_filter_file)
+        row_filter = handle.read()
+        handle.close()
+
+
+    if col_filter:
+        cols = []
+        for c in fm.get_cols():
+            v = fm.get_col(c)
+            if value_eval(col_filter, v.values(), c, fm.get_cols()):
+                cols.append(c)
+    else:
+        cols = fm.get_cols()
+
+    if row_filter:
+        rows = []
+        for c in fm.get_rows():
+            v = fm.get_row(c)
+            if value_eval(row_filter, v.values(), c, fm.get_rows()):
+                rows.append(c)
+    else:
+        rows = fm.get_rows()
+
+    ohandle = sys.stdout
+    if args.output is not None:
+        ohandle = open(args.output, "w")
+
+    fm.write(ohandle, row_select=rows, col_select=cols)
+    ohandle.close()
+
+
+
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_filter.xml	Thu Jun 13 16:54:15 2013 -0400
@@ -0,0 +1,61 @@
+<tool id="matrix_filter" name="Matrix Filter" version="1.0.0">
+	<description>Filter row/column of a tabular file using python statements</description>
+	<command interpreter="python">./matrix_filter.py
+#if str($row_txt) != '':
+--row-filter-file $row_eval_txt
+#end if
+#if str($col_txt) != '':
+--col-filter-file $col_eval_txt
+#end if
+$matrix
+-o ${outfile}
+	</command>
+	<inputs>
+		<param name="row_txt" type="text" area="True" size="5x35" label="Row Eval Code" optional="True">
+			<sanitizer>
+				<valid initial="string.printable">
+					<remove value="&quot;"/>
+				</valid>
+				<mapping initial="none">
+					<add source="&quot;" target="\&quot;"/>
+					<add source="\" target="\\"/>
+				</mapping>
+			</sanitizer>
+		</param>
+		<param name="col_txt" type="text" area="True" size="5x35" label="Column Eval Code" optional="True">
+			<sanitizer>
+				<valid initial="string.printable">
+					<remove value="&quot;"/>
+				</valid>
+				<mapping initial="none">
+					<add source="&quot;" target="\&quot;"/>
+				</mapping>
+			</sanitizer>
+		</param>
+		<param name="matrix" type="data" format="tabular" label="Matrix"/>
+	</inputs>
+	<outputs>
+		<data format="tabular" name="outfile" />
+	</outputs>
+	<!-- Galaxy expects a single configfiles element per tool, so both eval scripts are declared together. -->
+	<configfiles>
+		<!-- The command line passes these as the row-filter-file and col-filter-file arguments. -->
+		<configfile name="row_eval_txt">${row_txt}</configfile>
+		<configfile name="col_eval_txt">${col_txt}</configfile>
+	</configfiles>
+
+	<help>
+This is a utility to perform filtering operations on the rows and columns of a tabular file.
+
+- The 'Column Eval Code' operations occur on the first line.
+- The 'Row Eval Code' operations occur on the first cell of every line
+- If any of the code blocks are empty, the operation is skipped and assumed to be true
+
+Available values
+
+- values : The array of values across the row or column
+- label  : The label for the row or column
+- label_set : The full list of row or column labels in the matrix
+
+	</help>
+</tool>
--- a/matrix_manipulate/matrix_join.py	Fri May 10 14:38:51 2013 -0400
+++ b/matrix_manipulate/matrix_join.py	Thu Jun 13 16:54:15 2013 -0400
@@ -2,12 +2,14 @@
 """join.py:

 Usage:
-  join.py [options] file1 file2 [file3 ...]
+  matrix_join.py [options] file1 file2 [file3 ...]

 Options:
-  -h            header
+  -h            skip header
   -i            output only features in common with all files
   -f            use 'float' mode to save memory
+  -o <outfile>  Output file
+  -m <missing>  Missing value string
   -q            run quietly
 """
 import os, os.path, sys, getopt, re
@@ -28,6 +30,7 @@

 def readFile(inFile, header = True, use_float = False):
     dataWidth = None
+    dataHeader = None
     dataMap = {}
     f = open(inFile, "r")
     if header:
@@ -35,7 +38,7 @@
         if line.isspace():
             log("ERROR: missing header\n", die = True)
         pline = re.split(delim, line.rstrip("\n\r"))
-        dataMap["HEADER"] = pline
+        dataHeader = pline
         dataWidth = len(pline[1:])
     for line in f:
         if line.isspace():
@@ -55,12 +58,12 @@
             out = pline[1:]
         dataMap[pline[0]] = out
     f.close()
-    return (dataMap, dataWidth)
+    return (dataMap, dataHeader, dataWidth)

 def main(args):
     ## parse arguments
     try:
-        opts, args = getopt.getopt(args, "hiqfo:")
+        opts, args = getopt.getopt(args, "hiqfo:m:")
     except getopt.GetoptError, err:
         sys.stderr.write( str(err) + "\n" )
         usage(2)
@@ -72,18 +75,19 @@
         for i in sys.stdin:
            files.append(i.rstrip("\n\r"))

-    if len(files) < 2:
+    if len(files) < 1:
         sys.stderr.write("incorrect number of arguments\n")
         usage(1)

-    header = False
+    header = True
     useIntersection = False
     output = None
     use_float = False
+    missing = ""
     global verbose
     for o, a in opts:
         if o == "-h":
-            header = True
+            header = False
         elif o == "-i":
             useIntersection = True
         elif o == "-q":
@@ -92,13 +96,21 @@
             output = a
         elif o == "-f":
             use_float = True
+        elif o == "-m":
+            missing = a

     ## read files
     fileData = {}
     fileWidth = {}
+    fileHeader = {}
     for file in files:
-        (fileData[file], fileWidth[file]) = readFile(file, header = header, use_float = use_float)
-    features = list(set(fileData[files[0]].keys()) - set(["HEADER"]))
+        (fileData[file], fileHeader[file], fileWidth[file]) = readFile(file, header = header, use_float = use_float)
+    feature_dict = {}
+    for file in files:
+        for f in fileData[file]:
+            feature_dict[f] = True
+
+    features = list(feature_dict.keys())
     if useIntersection:
         for file in files:
             features = list(set(fileData[file].keys()) & set(features))
@@ -111,9 +123,9 @@

     ## output
     if header:
-        lineElements = [fileData[files[0]]["HEADER"][0]]
+        lineElements = [fileHeader[files[0]][0]]
         for file in files:
-            lineElements += fileData[file]["HEADER"][1:]
+            lineElements += fileHeader[file][1:]
         ohandle.write("%s\n" % (delim.join(lineElements)))
     for feature in features:
         lineElements = []
@@ -121,7 +133,7 @@
             if feature in fileData[file]:
                 lineElements += fileData[file][feature]
             else:
-                lineElements += ["" for i in range(fileWidth[file])]
+                lineElements += [missing for i in range(fileWidth[file])]
         ohandle.write("%s\n" % (feature + delim + delim.join( (str(c) for c in lineElements)) ))

 if __name__ == "__main__":
--- a/matrix_manipulate/matrix_join.xml	Fri May 10 14:38:51 2013 -0400
+++ b/matrix_manipulate/matrix_join.xml	Thu Jun 13 16:54:15 2013 -0400
@@ -1,7 +1,8 @@
 <tool id="matrix_join" name="Matrix Join" version="1.0.0">
 	<description>Join Matrices using row labels</description>
-	<command interpreter="python">matrix_join.py -h
+	<command interpreter="python">matrix_join.py
 -o $out
+-m '$missing_value'
 #if $is_float:
 -f
 #end if
@@ -13,6 +14,7 @@
 		<repeat name="in_mats" title="Input Matrix" min="1">
 			<param name="file" type="data" label="Matrix File"/>
 		</repeat>
+		<param name="missing_value" type="text" label="Missing Value" value="NA"/>
 		<param name="is_float" type="boolean" label="Float Values" help="If all matrices are floating point numbers, use to save memory"/>
 	</inputs>
 	<outputs>
--- a/matrix_manipulate/matrix_rank_normalize.py	Fri May 10 14:38:51 2013 -0400
+++ b/matrix_manipulate/matrix_rank_normalize.py	Thu Jun 13 16:54:15 2013 -0400
@@ -17,7 +17,9 @@
 def py_cmp_float(a_ptr, b_ptr):
     a = a_ptr.contents.value
     b = b_ptr.contents.value
-    return (a > b) - (a < b)
+    if a < b:
+        return -1
+    return a > b

 CMPFUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.POINTER(ctypes.c_float), ctypes.POINTER(ctypes.c_float))
--- a/matrix_manipulate/matrix_rank_normalize.xml	Fri May 10 14:38:51 2013 -0400
+++ b/matrix_manipulate/matrix_rank_normalize.xml	Thu Jun 13 16:54:15 2013 -0400
@@ -11,7 +11,7 @@
   </command>
   <inputs>
 	    <param name="inMatrix" type="data" format="tabular" label="Input Matrix"/>
-      <param name="dropZeros" type="boolean" label="Drop Input Zeros" checked="True"/>
+      <param name="dropZeros" type="boolean" label="Drop Input Zeros" checked="False"/>
       <param name="na2zero" type="boolean" label="Set Output NAs to Zero" checked="False"/>
   </inputs>
   <outputs>
--- a/matrix_manipulate/matrix_transpose.py~	Fri May 10 14:38:51 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,64 +0,0 @@
-#!/usr/bin/env python
-
-import string,sys
-import getopt
-import array
-
-if __name__ == "__main__":
-
-    opts, args = getopt.getopt(sys.argv[1:], "lf")
-    if (len(args))!=2:
-        sys.stderr.write("python transpose.py extractDataIn transposeOut-Paradigm\n")
-        sys.exit(2)
-
-    label_print = True
-    use_float = True
-    for o, a in opts:
-            if o == "-l":
-                label_print = False
-            if o == "-f":
-                use_float = True
-
-    fin= open(args[0],'r')
-    fout= open(args[1],'w')
-
-    col_label = None
-    row_label = []
-    matrix=[]
-    for line in fin.readlines():
-        data = string.split(line.strip(),'\t')
-        if col_label is None:
-            col_label = data
-        else:
-            row_label.append(data[0])
-            if use_float:
-                o = array.array('f')
-                for i in data[1:]:
-                    try:
-                        o.append(float(i))
-                    except ValueError:
-                        o.append(float('nan'))
-            else:
-                o = data[1:]
-            row_label.append(data[0])
-            matrix.append(o)
-
-    #header
-    out = []
-    if label_print:
-        out = [col_label[0]] + row_label
-    else:
-        out = row_label
-    fout.write("\t".join(out) + "\n")
-
-    #body
-    for col in range(0, len(col_label)):
-        out = []
-        if label_print:
-            out.append(col_label[col+1])
-        for row in matrix:
-            out.append(str(row[col]))
-        fout.write("\t".join(out) + "\n")
-
-    fin.close()
-    fout.close()
--- a/matrix_manipulate/matrix_whitelist.xml	Fri May 10 14:38:51 2013 -0400
+++ b/matrix_manipulate/matrix_whitelist.xml	Thu Jun 13 16:54:15 2013 -0400
@@ -21,6 +21,6 @@
       <data name="outfile" format="tabular"/>
   </outputs>
   <help>
-Matrix Whitelist
+Whitelist the rows or columns of a matrix using a file with a list of valid labels.
   </help>
 </tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_zscore.py	Thu Jun 13 16:54:15 2013 -0400
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+
+import sys
+import floatMatrix
+import math
+
+def matrix2zscore(matrix):
+    """Return a new FloatMatrix with each cell rescaled as (value - mean) / std, both taken over matrix.values() (std divides by N)."""
+    cells = matrix.values()
+    ave = sum(cells) / float(len(cells))
+    std = math.sqrt(sum((x - ave) ** 2 for x in cells) / float(len(cells)))
+    out = floatMatrix.FloatMatrix()
+    out.init_blank(rows=matrix.get_rows(), cols=matrix.get_cols())
+    for row in matrix.get_rows():
+        for col in matrix.get_cols():
+            out.set_value(row_name=row, col_name=col, value=(matrix.get_value(row_name=row, col_name=col) - ave) / std)
+    return out
+
+
+if __name__ == "__main__":
+    # Usage: matrix_zscore.py <matrix file | ->   ("-" reads the matrix from stdin)
+    path = sys.argv[1]
+    if path == "-":
+        handle = sys.stdin
+    else:
+        handle = open(path)
+    matrix = floatMatrix.FloatMatrix()
+    matrix.read(handle)
+    handle.close()
+    # The z-scored matrix is always written to stdout.
+    out = matrix2zscore(matrix)
+    out.write(sys.stdout)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/matrix_zscore.xml	Thu Jun 13 16:54:15 2013 -0400
@@ -0,0 +1,14 @@
+<tool id="matrix_zscore" name="Matrix Zscore" version="1.0.0">
+  <description>Convert values in matrix to zscores</description>
+  <command interpreter="python">matrix_zscore.py $inMatrix > $outMatrix
+  </command>
+  <inputs>
+		<param name="inMatrix" type="data" format="tabular" label="Input Matrix"/>
+  </inputs>
+  <outputs>
+      <data name="outMatrix" format="tabular"/>
+  </outputs>
+  <help>
+Scale all values in the matrix according to their relative zscore.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matrix_manipulate/quartile_norm.pl	Thu Jun 13 16:54:15 2013 -0400
@@ -0,0 +1,102 @@
+#!/usr/bin/perl
+
+use strict;
+use Getopt::Long;
+
+# Defaults; each is overridden by the matching command-line option below.
+my $out = '-';
+my $q = 75;
+my @col;
+my @also;
+my $target = 1000;
+my $skip = 0;
+my $min=1;
+GetOptions("quant=i"=>\$q, "target=i"=>\$target, "col=i@"=>\@col, "out=s"=>\$out, "also=i@"=>\@also, "skip=i"=>\$skip, "min=i"=>\$min);
+
+my $in = shift @ARGV;
+
+die usage() unless $in && @col;
+# '-' selects the standard stream; STDOUT has to be dup'ed in write mode ('>&').
+open(OUT, ($out eq '-') ? '>&STDOUT' : ">$out") || die "Can't open $out\n";
+open(IN, ($in eq '-') ? '<&STDIN' : $in) || die "Can't open $in\n";
+# With no -a given, column 1 is also output unless it is one of the normalized columns.
+@also = (1) if !@also && !grep {$_ eq '1'} @col;
+# Convert the 1-based column numbers to 0-based array indices.
+$_-- for @col;
+$_-- for @also;
+
+my @d;          # $d[column index] = list of that column's values, one per data row
+my $cnt = 0;    # number of data rows read
+my $head ='';   # skipped header lines, echoed verbatim before the data
+while(<IN>) {
+        if ($skip) {
+                --$skip;
+                $head .= $_;
+                next;
+        }
+        chomp;
+        my @f = split /\t/;
+        if ($col[0] eq '-2') {          # "-c -1" (now -2 after the shift) means every column but the first
+                @col = (1..$#f);
+        }
+        for (@col) {
+                push @{$d[$_]}, $f[$_];
+        }
+        for (@also) {
+                push @{$d[$_]}, $f[$_];
+        }
+        ++$cnt;
+}
+for (@col) {
+        my @t = grep {$_>=$min} @{$d[$_]};      # values below --min do not take part in the quantile
+        @t = sort {$a <=> $b} @t;
+        my $t=quantile(\@t, $q/100);
+        for (@{$d[$_]}) {                       # every value of the column is rescaled, kept or not
+                $_= sprintf "%.4f", $target*$_/$t;
+        }
+}
+# Output columns are emitted in ascending column order.
+my @out = (sort {$a <=> $b} (@col, @also));
+
+print OUT $head;
+
+for (my $i=0;$i<$cnt;++$i) {
+        for my $j (@out) {
+                print OUT "\t" unless $j == $out[0];
+                print OUT $d[$j][$i];
+        }
+        print OUT "\n";
+}
+
+
+sub usage {     # Returns the help text as a single string; the caller passes it to die.
+return <<EOF;
+Usage: $0 -c COL [opts] FILE
+
+Returns an upper quartile normalization of data in column(s) COL
+of file FILE.
+
+Col is 1-based; values below the -m minimum (zeroes, by default) are ignored when calculating the quantile
+
+Options:
+   -c|col COL    normalize this column of data (can specify more than once, or -1 for all but first col)
+   -q|quant INT  quantile to use (75)
+   -t|target INT target to use (1000)
+   -a|also COL   output these columns also
+   -o|out FILE   output to this file instead of stdout
+   -m|min INT    minimum value (1)
+   -s|skip INT   skip header rows
+EOF
+}
+
+# Quantile of a numerically sorted list, linearly interpolated between neighbouring ranks.
+#   args: reference to the sorted array, quantile as a fraction (e.g. 0.75)
+sub quantile {
+        my ($vals,$p) = @_;
+        my $t = (scalar(@{$vals})-1)*$p;        # fractional 0-based rank
+        my $lo = int($t);
+        my $v = $vals->[$lo];
+        return $v unless $t > $lo;
+        # Weight the step to the next rank by the fractional part of $t, not by $p.
+        return $v + ($t-$lo) * ($vals->[$lo+1] - $v);
+}