Mercurial > repos > kellrott > matrix_manipulate
changeset 4:d0e3b5778e17 draft
Uploaded
author | kellrott |
---|---|
date | Thu, 13 Jun 2013 16:54:15 -0400 |
parents | 2bb8c4bb7348 |
children | 83f2acca2387 |
files | matrix_manipulate/aggregate.pl~ matrix_manipulate/floatMatrix.py matrix_manipulate/floatMatrix.pyc matrix_manipulate/matrix_calc.py matrix_manipulate/matrix_calc.xml matrix_manipulate/matrix_cat.py matrix_manipulate/matrix_cat.xml matrix_manipulate/matrix_edit.py matrix_manipulate/matrix_filter.py matrix_manipulate/matrix_filter.xml matrix_manipulate/matrix_join.py matrix_manipulate/matrix_join.xml matrix_manipulate/matrix_rank_normalize.py matrix_manipulate/matrix_rank_normalize.xml matrix_manipulate/matrix_transpose.py matrix_manipulate/matrix_transpose.py~ matrix_manipulate/matrix_whitelist.xml matrix_manipulate/matrix_zscore.py matrix_manipulate/matrix_zscore.xml matrix_manipulate/quartile_norm.pl |
diffstat | 19 files changed, 1250 insertions(+), 81 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/perl

# File: matrix_manipulate/aggregate.pl~ (added in changeset 4:d0e3b5778e17)

##############################################################################
##############################################################################
##
## aggregate.pl
##
##############################################################################
##############################################################################
##
## Written by Josh Stuart in the lab of Stuart Kim, Stanford University.
##
##  Email address: jstuart@stanford.edu
##          Phone: (650) 725-7612
##
## Postal address: Department of Developmental Biology
##                 Beckman Center Room B314
##                 279 Campus Dr.
##                 Stanford, CA 94305
##
##       Web site: http://www.smi.stanford.edu/people/stuart
##
##############################################################################
##############################################################################
##
## Written: 00/00/02
## Updated: 00/00/02
##
##############################################################################
##############################################################################

## Support for MEDIAN calculation added by Alex Williams, 2007

use strict;
use warnings;

# The helper libraries are loaded from $MYPERLDIR. Fail with a clear message
# when it is unset instead of the confusing "Can't locate /lib/libfile.pl".
defined $ENV{MYPERLDIR}
    or die "ERROR in aggregate.pl: the MYPERLDIR environment variable is not set.\n";

# NOTE(review): parseArgs() and readDataAndIds() are not defined in this file;
# presumably they come from libfile.pl -- confirm. vec_median() is in libstats.pl.
require "$ENV{MYPERLDIR}/lib/libfile.pl";
require "$ENV{MYPERLDIR}/lib/libstats.pl";

# Command-line flag table handed to parseArgs(). The third field is the default
# value; NOTE(review): the exact meaning of the 2nd and 4th fields is defined by
# parseArgs(), which is not visible here.
my @flags = (
    [         '-q', 'scalar',      0,     1 ],
    [         '-k', 'scalar',      1, undef ],
    [         '-d', 'scalar',   "\t", undef ],
    [         '-h', 'scalar',      0, undef ],
    [       '-sig', 'scalar',      3, undef ],
    [         '-f', 'scalar', 'mean', undef ],
    [     '--file', 'scalar',    '-', undef ],
    [ '--emptyval', 'scalar',  'NaN', undef ],
    [     '--test', 'scalar',      0,     1 ],
);

my %args = %{ parseArgs(\@ARGV, \@flags) };

if (exists $args{'--help'}) {
    print STDOUT <DATA>;
    exit(0);
}

my $emptyVal = $args{'--emptyval'};
my $runTest  = $args{'--test'};      # <-- this doesn't do anything right now. Theoretically it should run a test to make sure the values actually work for a few known cases (sanity check for this program)
my $verbose  = not($args{'-q'});     # parsed but not consulted anywhere below
my $col      = int($args{'-k'}) - 1; # key column, converted from 1-based to 0-based
my $delim    = $args{'-d'};
my $function = lc($args{'-f'});      # lower-case whatever the function name was
my $headers  = $args{'-h'};
my $sigs     = $args{'-sig'};
my $file     = $args{'--file'};

my $sprintf_ctrl = '%.' . $sigs . 'f';

if ($function ne 'mean' && $function ne 'median') {
    die "ERROR in aggregate.pl: You must specify a function ( -f FUNCTION_NAME ). The supported functions are mean and median.\n";
}

# Exactly one of these is true because $function was validated above.
my $use_mean   = ($function eq 'mean');
my $use_median = ($function eq 'median');

# A cell takes part in the aggregation only if it is a plain decimal number
# (optional sign, optional fraction, optional exponent). The previous test,
# /^\s*[\d+\.eE-]+\s*$/, also accepted strings such as "-", "e" or "1.2.3",
# which were then counted and added as 0 (or a truncated value) and skewed the
# result. Scalar::Util::looks_like_number is deliberately not used: it accepts
# "NaN" and "Inf", and "NaN" is the default empty-value marker of this tool.
my $NUMBER_RE = qr{
    \A \s*
    [-+]?
    (?: \d+ \.? \d* | \. \d+ )
    (?: [eE] [-+]? \d+ )?
    \s* \z
}x;

my ($data, $ids, $rows, $max_cols) = readDataAndIds($file, $col, $delim);

# Header rows are echoed unchanged.
for (my $i = 0; $i < scalar(@{$rows}) and $i < $headers; $i++) {
    print $$ids[$i], $delim, join($delim, @{ $$data[$i] }), "\n";
}

# One output line per key: every column is aggregated over all rows of the key.
for (my $i = $headers; $i < scalar(@{$rows}); $i++) {
    my $id = $$ids[$i];

    my @sum   = (0) x $max_cols;
    my @count = (0) x $max_cols;
    my @values_for_col;    # array of arrays: the numeric cells seen per column

    for my $row (@{ $$rows[$i] }) {
        for my $j (0 .. $max_cols - 1) {
            my $entry = $$data[$row][$j];
            next unless defined($entry) && $entry =~ $NUMBER_RE;

            $count[$j]++;
            $sum[$j] += $entry;

            # Only keep the individual values if the median was requested...
            # otherwise it slows us down.
            push @{ $values_for_col[$j] }, $entry if $use_median;
        }
    }

    # Columns without any numeric data keep the empty-value marker.
    my @agg = ($emptyVal) x $max_cols;
    for my $j (0 .. $max_cols - 1) {
        if ($use_mean) {
            $agg[$j] = sprintf($sprintf_ctrl, $sum[$j] / $count[$j])
                if $count[$j] > 0;
        }
        elsif ($use_median
            && defined($values_for_col[$j])
            && scalar(@{ $values_for_col[$j] }) > 0)
        {
            $agg[$j] = vec_median($values_for_col[$j]);    # <-- vec_median is in libstats.pl
        }
    }

    print STDOUT $id, (($max_cols > 0) ? ($delim . join($delim, @agg)) : ""), "\n";
}

exit(0);


__DATA__
syntax: aggregate.pl [OPTIONS]

Combines the numeric data across rows with the same key. Useful if you have experiments
with replicates. See below for a complete example.

Example usage: aggregate.pl -f median MYFILE

OPTIONS are:

-q: Quiet mode (default is verbose)

-k COL: Use the column COL as the key column. The script uses the entries found
 on each line of this column as keys. Duplicates are merged by applying
 an aggregation function for each value in their records.

-d DELIM: Set the field delimiter to DELIM (default is tab).

-h NUM: The number of headers in the input file (default is 0).

-f FUNCTION: Set the aggregation function to FUNCTION (default is mean). The
 possible values are:

 mean: The mean of the values (default) (-f mean)

 median: The median of the values. (-f median)

--emptyval VALUE: Sets the "empty"/"no data" values to VALUE. (default is NaN)
 If an output value has no input data, then this will be the output.

EXAMPLE:

Works like this:

If the input file INPUTFILE has five columns like below (tab-delimited, although
spaces are shown here): (Note that Column 3 (C3) is blank for all rows except
for the last one)

Column_1 Column_2 C3 C4 C5
v v v v v
-------------------------------- (Sample file is below)
Experiment_Alpha 1 0 77
Experiment_Alpha 2 0
Expr_Beta 10
Expr_Beta 30
Experiment_Alpha 3 6
Expr_Beta 5


And you type: aggregate.pl -f mean THE_FILE

Then the output will be the *mean* values for each experiment, across all rows:

Experiment_Alpha 2.0 NaN 3.0 77
Expr_Beta 20 5 NaN NaN

Note that the "77" case (the last item in the first row) is the corect mean,
because the other Experiment_Alpha items for that column do not have any data ta
all. Even though there are 3 rows labeled "Experiment_Alpha", only one of them
has data for the last column (column 4), so 77 is the mean. The output is always
a matrix (although it could be a single-column matrix). Empty values are padded
with NaN (although you can change this with --emptyval).


TO DO / FUTURE WORK:

Future possibility (NOT IMPLEMENTED YET): smean: Standardized mean (mean/stddev).

KNOWN BUGS / ISSUES:

None so far.
# File: matrix_manipulate/floatMatrix.py (added in changeset 4:d0e3b5778e17)
"""Named float vectors and an ``array.array`` backed, label-indexed float matrix.

The code is written to run unchanged on Python 2.7 and Python 3.
"""
import array
import math
import csv


def union(*input):
    """Return the distinct items of all given iterables as a list.

    Items are returned in first-seen order, so matrices built from the result
    keep a reproducible row/column order.
    """
    seen = {}
    out = []
    for i in input:
        for a in i:
            if a not in seen:
                seen[a] = True
                out.append(a)
    return out


def intersect(a, b):
    """Return the items of ``a`` (in ``a``'s order) that are also in ``b``."""
    return [i for i in a if i in b]


class DataException(Exception):
    """Raised by FloatMatrix.read when a row does not match the header width."""
    pass


class NamedVector(dict):
    """A dict of name -> value with element-wise arithmetic and comparisons.

    Binary operators accept either another NamedVector (combined name by name)
    or a scalar (applied to every element) and always return a new NamedVector.
    Note that ``==``, ``!=``, ``<`` and ``>`` are element-wise too: they return
    a NamedVector of booleans, not a single boolean.
    """

    def init_blank(self, names):
        """Set every name in ``names`` to NaN."""
        for a in names:
            self[a] = float("nan")

    def __getitem__(self, key):
        """Look up one name, or select by a boolean NamedVector mask.

        A missing name yields None instead of raising KeyError. When ``key`` is
        a NamedVector, the result holds the entries whose mask value is true.
        """
        if isinstance(key, NamedVector):
            o = NamedVector()
            for a in key:
                if key[a]:
                    o[a] = self.get(a)
            return o
        else:
            return self.get(key)

    def _elementwise(self, other, op):
        """Apply ``op(x, y)`` over the names shared with ``other``.

        If ``other`` is not a NamedVector it is used as the right operand for
        every element of this vector.
        """
        o = NamedVector()
        if isinstance(other, NamedVector):
            for a in intersect(self, other):
                o[a] = op(self.get(a), other.get(a))
        else:
            for a in self:
                o[a] = op(self[a], other)
        return o

    def __add__(self, other):
        # Addition is the one operator defined over the union of names:
        # a name missing on either side contributes 0.0.
        o = NamedVector()
        if isinstance(other, NamedVector):
            for a in union(self, other):
                o[a] = self.get(a, 0.0) + other.get(a, 0.0)
        else:
            for a in self:
                o[a] = self[a] + other
        return o

    def __div__(self, other):
        return self._elementwise(other, lambda x, y: x / y)

    # Python 3 dispatches "/" to __truediv__; without this alias division
    # raised TypeError there.
    __truediv__ = __div__

    def __mul__(self, other):
        return self._elementwise(other, lambda x, y: x * y)

    def __sub__(self, other):
        return self._elementwise(other, lambda x, y: x - y)

    def __gt__(self, other):
        return self._elementwise(other, lambda x, y: x > y)

    def __eq__(self, other):
        return self._elementwise(other, lambda x, y: x == y)

    def __ne__(self, other):
        return self._elementwise(other, lambda x, y: x != y)

    def __lt__(self, other):
        return self._elementwise(other, lambda x, y: x < y)

    def pow(self, other):
        """Element-wise ``math.pow(self, other)``."""
        return self._elementwise(other, math.pow)

    def log(self, base=math.e):
        """Element-wise logarithm to ``base`` (natural log by default)."""
        o = NamedVector()
        for a in self:
            o[a] = math.log(self[a], base)
        return o

    def sqrt(self):
        """Element-wise square root."""
        o = NamedVector()
        for a in self:
            o[a] = math.sqrt(self[a])
        return o

    def array(self, names):
        """Return the values for ``names`` as an ``array.array('f')``.

        Names that are not in the vector are filled with NaN.
        """
        o = array.array('f')
        for a in names:
            o.append(self.get(a, float('nan')))
        return o

    def set_nan(self, value=0.0):
        """Replace every NaN element with ``value`` in place."""
        for a in self:
            if math.isnan(self[a]):
                self[a] = value

    def sum(self):
        """Return the sum of all values."""
        return sum(self.values())


class FloatMatrix:
    """
    array.array based float matrix class

    Values are stored row-major in one ``array.array('f')``; ``rowmap`` and
    ``colmap`` map row/column labels to their index in that layout.
    """
    def __init__(self):
        self.corner_name = "probe"   # top-left header cell written by write()
        self.data = None
        self.nrows = None
        self.ncols = None
        self.rowmap = None
        self.colmap = None

    def read(self, handle):
        """Load a tab-separated matrix (header line, then one row per line).

        Cells that cannot be parsed as float (including empty cells) become
        NaN. Whitespace-only lines are skipped.

        Raises:
            DataException: a row has a different number of cells than the header.
        """
        header = None
        for line in handle:
            if not line.strip():
                continue
            # Strip only the line terminator. A bare rstrip() also removed
            # trailing tabs, so rows ending in empty (missing) cells were
            # rejected as "Misformed matrix".
            row = line.rstrip("\r\n").split("\t")
            if header is None:
                header = row
                self.data = array.array("f")
                self.colmap = {}
                self.rowmap = {}
                self.ncols = len(row) - 1
                self.nrows = 0
                for i, c in enumerate(row[1:]):
                    self.colmap[c] = i
            else:
                if len(row) - 1 != self.ncols:
                    raise DataException("Misformed matrix")
                # Index by the physical row count, not len(rowmap): with a
                # repeated row label len(rowmap) stops growing and every later
                # label would point at the wrong data row. The last occurrence
                # of a repeated label wins.
                self.rowmap[row[0]] = self.nrows
                a = []
                for v in row[1:]:
                    try:
                        a.append(float(v))
                    except ValueError:
                        a.append(float('Nan'))
                self.data.extend(a)
                self.nrows += 1

    def init_blank(self, rows, cols):
        """Allocate a ``len(rows)`` x ``len(cols)`` matrix filled with NaN."""
        self.data = array.array("f")
        self.colmap = {}
        for i, c in enumerate(cols):
            self.colmap[c] = i
        self.rowmap = {}
        for i, r in enumerate(rows):
            self.rowmap[r] = i
        self.ncols = len(cols)
        self.nrows = len(rows)
        for i in range(self.nrows):
            self.data.extend([float('nan')] * self.ncols)

    def size(self):
        """Return ``{'rows': n, 'cols': m}`` counting distinct labels."""
        return {'rows': len(self.rowmap), 'cols': len(self.colmap)}

    def has_row(self, row_name):
        return row_name in self.rowmap

    def has_col(self, col_name):
        return col_name in self.colmap

    def values(self):
        """Return the underlying row-major value array (not a copy)."""
        return self.data

    def get_value(self, row_name, col_name):
        return self.data[self.rowmap[row_name] * self.ncols + self.colmap[col_name]]

    def set_value(self, row_name, col_name, value):
        self.data[self.rowmap[row_name] * self.ncols + self.colmap[col_name]] = value

    def get_row(self, row_name):
        """Return one row as a NamedVector keyed by column label.

        Raises:
            KeyError: ``row_name`` is not a row of the matrix.
        """
        if row_name not in self.rowmap:
            raise KeyError
        out = NamedVector()
        out.init_blank(self.colmap)
        row_offset = self.rowmap[row_name] * self.ncols
        for c in self.colmap:
            out[c] = self.data[row_offset + self.colmap[c]]
        return out

    def get_col(self, col_name):
        """Return one column as a NamedVector keyed by row label.

        Raises:
            KeyError: ``col_name`` is not a column of the matrix.
        """
        if col_name not in self.colmap:
            raise KeyError
        out = NamedVector()
        out.init_blank(self.rowmap)
        col_index = self.colmap[col_name]
        for r in self.rowmap:
            out[r] = self.data[self.rowmap[r] * self.ncols + col_index]
        return out

    def set_row(self, row_name, row_data):
        """Overwrite the cells of ``row_name`` with the matching entries of ``row_data``.

        Entries of ``row_data`` whose key is not a column label are ignored.

        Raises:
            KeyError: ``row_name`` is not a row of the matrix.
        """
        if row_name not in self.rowmap:
            raise KeyError
        row_offset = self.rowmap[row_name] * self.ncols
        for c in row_data:
            if c in self.colmap:
                self.data[row_offset + self.colmap[c]] = row_data[c]

    def get_cols(self):
        """Return the column labels as a list in storage order (None if unset).

        Returning the raw dict keys gave an arbitrary order on Python 2 and a
        non-list view on Python 3; both broke callers that pair the labels with
        ``self.data`` (toRmatrix) or concatenate them (write).
        """
        if self.colmap is None:
            return None
        return sorted(self.colmap, key=self.colmap.get)

    def get_rows(self):
        """Return the row labels as a list in storage order (None if unset)."""
        if self.rowmap is None:
            return None
        return sorted(self.rowmap, key=self.rowmap.get)

    def select(self, rows=None, cols=None):
        """Return a new matrix restricted to ``rows`` x ``cols``.

        Defaults to all rows / all columns. Requested labels that do not exist
        in this matrix are kept and filled with NaN.
        """
        if rows is None:
            rows = self.get_rows()
        if cols is None:
            cols = self.get_cols()
        out = FloatMatrix()
        out.init_blank(rows=rows, cols=cols)
        for row in rows:
            if self.has_row(row):
                r = self.get_row(row)
                out.set_row(row, r)
        return out

    def write(self, handle, missing='NA', row_select=None, col_select=None):
        """Write the matrix as tab-separated text, rows in storage order.

        Args:
            handle: writable text file object.
            missing: text written for NaN cells.
            row_select: container of row labels to keep (None keeps all).
            col_select: container of column labels to keep (None keeps all).
        """
        write = csv.writer(handle, delimiter="\t", lineterminator='\n')

        col_list = []
        if col_select is None:
            col_list = self.get_cols()
        else:
            for c in self.get_cols():
                if c in col_select:
                    col_list.append(c)

        write.writerow([self.corner_name] + col_list)
        for rowName in self.get_rows():
            if row_select is None or rowName in row_select:
                out = [rowName]
                row = self.get_row(rowName)
                for col in col_list:
                    val = row[col]
                    if val is None or math.isnan(val):
                        val = missing
                    else:
                        val = "%.5f" % (val)
                    out.append(val)
                write.writerow(out)

    def set_nan(self, value=0.0):
        """Replace every NaN cell with ``value`` in place."""
        for i in range(len(self.data)):
            if math.isnan(self.data[i]):
                self.data[i] = value

    def row_sums(self):
        """Return a NamedVector mapping each row label to the sum of its row."""
        out = NamedVector()
        for rowName in self.rowmap:
            row = self.get_row(rowName)
            out[rowName] = row.sum()
        return out

    def merge(self, other):
        """Return a new matrix over the union of both matrices' labels.

        Cells present in both matrices take the value from ``other``; cells in
        neither stay NaN.
        """
        out = FloatMatrix()
        out.init_blank(rows=union(self.get_rows(), other.get_rows()),
                       cols=union(self.get_cols(), other.get_cols()))

        for rowName in self.rowmap:
            for colName in self.colmap:
                out.set_value(row_name=rowName, col_name=colName,
                              value=self.get_value(row_name=rowName, col_name=colName))

        for rowName in other.rowmap:
            for colName in other.colmap:
                out.set_value(row_name=rowName, col_name=colName,
                              value=other.get_value(row_name=rowName, col_name=colName))

        return out

    def toRmatrix(self, r):
        """Build an R matrix through ``r.matrix`` with row/column dimnames.

        NOTE(review): ``r`` is presumably an rpy-style R interface object;
        confirm against the caller.
        """
        out = r.matrix(self.data, ncol=self.ncols,
                       dimnames=[self.get_rows(), self.get_cols()], byrow=True)
        return out
#!/usr/bin/env python
# File: matrix_manipulate/matrix_calc.py (added in changeset 4:d0e3b5778e17)
"""Evaluate a python expression on every row or every column of a matrix."""

import os
import sys
import csv
import re
import math
import argparse
import floatMatrix

def value_eval(code, value):
    """Evaluate the expression ``code`` with ``value`` bound to the row/column.

    The expression can also use ``len``, ``re``, ``math``, ``sum`` and ``float``.

    SECURITY NOTE: ``code`` comes from the tool user. Emptying ``__builtins__``
    is not a sandbox (attribute access on any object can still reach arbitrary
    code), so this must only run where the user is trusted to execute code.
    """
    funcmap = {
        "len": len,
        "value": value,
        "re": re,
        "math": math,
        "sum": sum,
        "float": float
    }
    return eval(code, {"__builtins__": None}, funcmap)

def dict_dict_2_table(indata):
    """Turn ``{row: {col: value}}`` into a list of string rows with a header.

    The header is ``["#"]`` followed by every inner key in first-seen order;
    cells missing from an inner dict become ``""``.
    """
    all_labels = {}
    head = []
    for r in indata:
        for c in indata[r]:
            if c not in all_labels:
                all_labels[c] = True
                head.append(c)
    # head is a real list: ["#"] + dict.keys() raises TypeError on Python 3.
    out = [["#"] + head]
    for r in indata:
        o = [r]
        for h in head:
            o.append(str(indata[r].get(h, "")))
        out.append(o)
    return out

def _load_code(inline, path):
    """Return the expression to evaluate, or None when none was supplied.

    Code read from ``path`` takes precedence over ``inline``. Surrounding
    whitespace is stripped so an empty config file counts as "no code".
    """
    code = inline
    if path:
        with open(path) as handle:
            code = handle.read()
    if code is None:
        return None
    code = code.strip()
    return code or None


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--col-eval', help='Column Eval', dest="col_eval", default=None)
    parser.add_argument('-r', '--row-eval', help='Row Eval', dest="row_eval", default=None)
    # matrix_calc.xml passes the code through config files using the
    # --col-filter-file / --row-filter-file spelling; accept it, together with
    # a consistently named alias, instead of failing on unknown arguments.
    parser.add_argument('--col-eval-file', '--col-filter-file', help='File holding the Column Eval code',
                        dest="col_eval_file", default=None)
    parser.add_argument('--row-eval-file', '--row-filter-file', help='File holding the Row Eval code',
                        dest="row_eval_file", default=None)

    parser.add_argument("-o", "--out", help="Output File", dest="output", default=None)
    parser.add_argument("input", help="Input Matrix", default=None)

    args = parser.parse_args()

    col_eval = _load_code(args.col_eval, args.col_eval_file)
    row_eval = _load_code(args.row_eval, args.row_eval_file)

    if args.input == "-":
        ihandle = sys.stdin
    else:
        ihandle = open(args.input)

    if args.output is None:
        ohandle = sys.stdout
    else:
        ohandle = open(args.output, "w")

    matrix = floatMatrix.FloatMatrix()
    matrix.read(ihandle)
    ihandle.close()

    out = {}

    # NOTE(review): dict_dict_2_table iterates each result like a dict, so the
    # expression is presumably expected to return a NamedVector/dict -- confirm.
    if col_eval is not None:
        for col in matrix.get_cols():
            value = matrix.get_col(col)
            out[col] = value_eval(col_eval, value)
        table = dict_dict_2_table(out)
        writer = csv.writer(ohandle, delimiter="\t", lineterminator="\n")
        # The table has one line per column; write it transposed so the
        # evaluated columns are columns of the output again.
        for i in range(len(table[0])):
            o = []
            for r in table:
                o.append(r[i])
            # csv writers have writerow(); the former write_row() call raised
            # AttributeError on the first line of output.
            writer.writerow(o)
    elif row_eval is not None:
        for row in matrix.get_rows():
            value = matrix.get_row(row)
            out[row] = value_eval(row_eval, value)
        table = dict_dict_2_table(out)
        writer = csv.writer(ohandle, delimiter="\t", lineterminator="\n")
        writer.writerows(table)
    ohandle.close()
<!-- File: matrix_manipulate/matrix_calc.xml (added in changeset 4:d0e3b5778e17) -->
<tool id="matrix_calc" name="Matrix Calculate" version="1.0.0">
  <description>Perform calculations on matrix rows and columns using python statements</description>
  <!-- The eval code is handed over through config files (see <configfiles>).
       NOTE(review): matrix_calc.py as committed in this changeset only defines
       -c/-r (inline code); confirm it accepts the two file options below. -->
  <command interpreter="python">./matrix_calc.py
#if str($row_txt) != '':
--row-filter-file $row_eval_txt
#end if
#if str($col_txt) != '':
--col-filter-file $col_eval_txt
#end if
$matrix
-o ${outfile}
  </command>
  <inputs>
    <param name="row_txt" type="text" area="True" size="5x35" label="Row Eval Code" optional="True">
      <sanitizer>
        <valid initial="string.printable">
          <remove value="&quot;"/>
        </valid>
        <mapping initial="none">
          <add source="&quot;" target="\&quot;"/>
          <add source="\" target="\\"/>
        </mapping>
      </sanitizer>
    </param>
    <param name="col_txt" type="text" area="True" size="5x35" label="Column Eval Code" optional="True">
      <sanitizer>
        <valid initial="string.printable">
          <remove value="&quot;"/>
        </valid>
        <mapping initial="none">
          <add source="&quot;" target="\&quot;"/>
        </mapping>
      </sanitizer>
    </param>
    <param name="matrix" type="data" format="tabular" label="Matrix"/>
  </inputs>
  <outputs>
    <data format="tabular" name="outfile" />
  </outputs>
  <configfiles>
    <configfile name="row_eval_txt">${row_txt}</configfile>
    <configfile name="col_eval_txt">${col_txt}</configfile>
  </configfiles>

  <help>
This is a utility to perform calculations on the rows and columns of a matrix file.

Available names in the eval code

- value : the values of the current row or column, keyed by label
- len, sum, float : the python builtins
- re, math : the python modules
  </help>
</tool>
#!/usr/bin/env python
# File: matrix_manipulate/matrix_cat.py (added in changeset 4:d0e3b5778e17)
"""matrix_cat.py:

Usage:
  matrix_cat.py [options] file1 file2 [file3 ...]

Options:
  -h            skip header
  -i            output only features in common with all files
  -f            use 'float' mode to save memory
  -o <outfile>  Output file
  -m <missing>  Missing value string
  -q            run quietly
"""
import os, os.path, sys, getopt, re
import array

delim = "\t"
verbose = True

def usage(code = 0):
    """Print the module docstring to stderr and exit with ``code`` (None: no exit)."""
    sys.stderr.write( __doc__ )
    if code != None: sys.exit(code)

def log(msg, die = False):
    """Write ``msg`` to stderr unless running quietly; exit(1) when ``die`` is set."""
    if (verbose):
        sys.stderr.write(msg)
    if die:
        sys.exit(1)

def readFile(inFile, header = True, use_float = False):
    """Read one tab-separated matrix file.

    Args:
        inFile: path of the file to read.
        header: when true the first line is the header (corner + column labels).
        use_float: store each row as ``array.array('f')`` (unparsable -> NaN)
            instead of a list of strings.

    Returns:
        ``(dataMap, dataHeader, dataWidth)``: row label -> row values, the split
        header line (None without header) and the number of value columns
        (None for a file without any line).
    """
    dataWidth = None
    dataHeader = None
    dataMap = {}
    f = open(inFile, "r")
    try:
        if header:
            line = f.readline()
            if line.isspace():
                log("ERROR: missing header\n", die = True)
            pline = re.split(delim, line.rstrip("\n\r"))
            dataHeader = pline
            dataWidth = len(pline[1:])
        for line in f:
            if line.isspace():
                continue
            pline = re.split(delim, line.rstrip("\n\r"))
            if dataWidth is None:
                dataWidth = len(pline[1:])
            assert len(pline[1:]) == dataWidth, \
                "%s: row '%s' has %d values, expected %d" % (inFile, pline[0], len(pline[1:]), dataWidth)
            if use_float:
                out = array.array("f")
                for a in pline[1:]:
                    try:
                        out.append(float(a))
                    except ValueError:
                        out.append(float('nan'))
            else:
                out = pline[1:]
            dataMap[pline[0]] = out
    finally:
        f.close()
    return (dataMap, dataHeader, dataWidth)

def main(args):
    """Concatenate the rows of all input files, aligning columns by header label.

    The output has one column per label found in any file (with ``-i``: only
    labels found in every file), sorted by label. Cells a file has no column
    for are written as the ``-m`` missing string. File names are read from
    stdin, one per line, when none are given as arguments.
    """
    global verbose

    ## parse arguments
    try:
        opts, args = getopt.getopt(args, "hiqfo:m:")
    except getopt.GetoptError as err:   # "as" parses on Python 2.6+ and 3
        sys.stderr.write( str(err) + "\n" )
        usage(2)

    if len(args) > 0:
        files = args
    else:
        files = []
        for i in sys.stdin:
            files.append(i.rstrip("\n\r"))

    if len(files) < 1:
        sys.stderr.write("incorrect number of arguments\n")
        usage(1)

    header = True
    useIntersection = False
    output = None
    use_float = False
    missing = ""
    for o, a in opts:
        if o == "-h":
            header = False
        elif o == "-i":
            useIntersection = True
        elif o == "-q":
            verbose = False
        elif o == "-o":
            output = a
        elif o == "-f":
            use_float = True
        elif o == "-m":
            missing = a

    if not header:
        # Columns are matched by header label, so there is nothing to align
        # without one. This used to fail later with a TypeError on None.
        sys.stderr.write("ERROR: matrix_cat.py aligns columns by header label and cannot be used with -h\n")
        sys.exit(1)

    ## read files
    fileData = {}
    fileWidth = {}
    fileHeader = {}
    for file in files:
        (fileData[file], fileHeader[file], fileWidth[file]) = readFile(file, header = header, use_float = use_float)

    # Per file: column label -> position in the row values. Built from the
    # labels only (header[1:]) so a column that happens to be named like the
    # corner cell is not resolved to position -1, and so each cell lookup is
    # O(1) instead of a list scan. The first of duplicated labels wins, as
    # list.index() did.
    columnIndex = {}
    header_dict = {}
    for file in files:
        index = {}
        for pos, name in enumerate(fileHeader[file][1:]):
            index.setdefault(name, pos)
            header_dict[name] = True
        columnIndex[file] = index

    headers = list(header_dict.keys())
    if useIntersection:
        # Keep only labels every file has. The previous code called .keys() on
        # the header list (AttributeError) and discarded the result.
        headers = [h for h in headers if all(h in columnIndex[file] for file in files)]
    headers.sort()

    if output is not None:
        ohandle = open(output, "w")
    else:
        ohandle = sys.stdout

    ## output
    try:
        if header:
            ohandle.write("#\t%s\n" % (delim.join(headers)))
        for file in files:
            index = columnIndex[file]
            for feature in fileData[file]:
                row = fileData[file][feature]
                lineElements = []
                for h in headers:
                    if h in index:
                        lineElements.append(row[index[h]])
                    else:
                        lineElements.append(missing)
                ohandle.write("%s\n" % (feature + delim + delim.join( (str(c) for c in lineElements)) ))
    finally:
        # Close (and thereby flush) a file we opened; leave stdout alone.
        if output is not None:
            ohandle.close()

if __name__ == "__main__":
    main(sys.argv[1:])
<!-- File: matrix_manipulate/matrix_cat.xml (added in changeset 4:d0e3b5778e17) -->
<tool id="matrix_cat" name="Matrix Cat" version="1.0.0">
  <description>Join Matrices using column labels</description>
  <command interpreter="python">matrix_cat.py
-o $out
-m '$missing_value'
#if $is_float:
-f
#end if
#for $a in $in_mats:
$a.file
#end for
  </command>
  <inputs>
    <repeat name="in_mats" title="Input Matrix" min="1">
      <param name="file" type="data" label="Matrix File"/>
    </repeat>
    <param name="missing_value" type="text" label="Missing Value" value="NA"/>
    <param name="is_float" type="boolean" label="Float Values" help="If all matrices are floating point numbers, use to save memory"/>
  </inputs>
  <outputs>
    <data name="out" format="tabular" label="Cat Matrix" help="Cat Matrix"/>
  </outputs>
  <help>
Concatenate the rows of several matrices, aligning their columns by column label.

Columns that a matrix does not have are filled with the 'Missing Value' text.
  </help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/matrix_manipulate/matrix_edit.py Thu Jun 13 16:54:15 2013 -0400 @@ -0,0 +1,64 @@ +#!/usr/bin/env python + +import os +import sys +import csv +import re +import math +import argparse + +def value_eval(code, value): + funcmap = { + "len":len, + "value" : value, + "re" : re, + "math" : math, + "float" : float + } + out = eval(code,{"__builtins__":None},funcmap) + return str(out) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--col-eval', help='Column Eval', dest="col_eval", default=None) + parser.add_argument('-r', '--row-eval', help='Row Eval', dest="row_eval", default=None) + parser.add_argument('-m', '--cell-eval', help='Cell Eval', dest="cell_eval", default=None) + + parser.add_argument("-o", "--out", help="Output File", dest="output", default=None) + parser.add_argument("input", help="Input Matrix", default=None) + + args = parser.parse_args() + + if args.input == "-": + ihandle = sys.stdin + else: + ihandle = open(args.input) + + if args.output is None: + ohandle = sys.stdout + else: + ohandle = open(args.output, "w") + + reader = csv.reader(ihandle, delimiter="\t") + writer = csv.writer(ohandle, delimiter="\t", lineterminator="\n") + + header = True + for row in reader: + if header: + if args.col_eval is not None and len(args.col_eval): + for i, val in enumerate(row[1:]): + row[i+1] = value_eval(args.col_eval, val) + header = False + else: + if args.row_eval is not None and len(args.row_eval): + row[0] = value_eval(args.row_eval, row[0]) + if args.cell_eval is not None and len(args.cell_eval): + for i in range(1,len(row)): + row[i] = value_eval(args.cell_eval,row[i]) + writer.writerow(row) + + ihandle.close() + ohandle.close() + +
#!/usr/bin/env python
# File: matrix_manipulate/matrix_filter.py (added in changeset 4:d0e3b5778e17)
"""Keep only the rows/columns of a matrix for which a python expression is true."""

import sys
import argparse
import array
import math
import csv
import floatMatrix

def value_eval(code, values, label, label_set):
    """Evaluate the filter expression ``code`` for one row or column.

    Names available to the expression: ``values`` (the values of the row or
    column), ``label`` (its label), ``label_set`` (all labels of that axis) and
    ``len``, ``math``, ``sum``, ``min``, ``max``.

    SECURITY NOTE: ``code`` comes from the tool user. Emptying ``__builtins__``
    is not a sandbox, so this must only run where the user is trusted to
    execute code.
    """
    funcmap = {
        "len": len,
        "values": values,
        "label": label,
        "label_set": label_set,
        "math": math,
        "sum": sum,
        "min": min,
        "max": max
    }
    return eval(code, {"__builtins__": None}, funcmap)

def _load_filter(inline, path):
    """Return the filter expression, or None when none was supplied.

    Code read from ``path`` takes precedence over ``inline``. Surrounding
    whitespace is stripped so a blank config file means "no filter" instead of
    a SyntaxError from eval.
    """
    code = inline
    if path:
        handle = open(path)
        try:
            code = handle.read()
        finally:
            handle.close()
    if code is None:
        return None
    code = code.strip()
    return code or None


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--col-filter', help='Column Filter', dest="col_filter", default=None)
    parser.add_argument('--col-filter-file', help='Column Filter File', default=None)
    parser.add_argument('-r', '--row-filter', help='Row Filter', dest="row_filter", default=None)
    parser.add_argument('--row-filter-file', help='Row Filter File', default=None)

    parser.add_argument("-o", "--out", help="Output File", dest="output", default=None)
    parser.add_argument("input", help="Input Matrix", default=None)

    args = parser.parse_args()

    if args.input == "-":
        handle = sys.stdin
    else:
        handle = open(args.input)

    fm = floatMatrix.FloatMatrix()
    try:
        fm.read(handle)
    finally:
        # The matrix handle used to be overwritten by the filter-file handle
        # and was never closed.
        if handle is not sys.stdin:
            handle.close()

    col_filter = _load_filter(args.col_filter, args.col_filter_file)
    row_filter = _load_filter(args.row_filter, args.row_filter_file)

    # None means "keep everything" for FloatMatrix.write; a set keeps its
    # per-label membership test cheap on large matrices.
    cols = None
    if col_filter:
        all_cols = fm.get_cols()
        cols = set()
        for c in all_cols:
            v = fm.get_col(c)
            if value_eval(col_filter, v.values(), c, all_cols):
                cols.add(c)

    rows = None
    if row_filter:
        all_rows = fm.get_rows()
        rows = set()
        for c in all_rows:
            v = fm.get_row(c)
            if value_eval(row_filter, v.values(), c, all_rows):
                rows.add(c)

    if args.output is not None:
        ohandle = open(args.output, "w")
        try:
            fm.write(ohandle, row_select=rows, col_select=cols)
        finally:
            ohandle.close()
    else:
        fm.write(sys.stdout, row_select=rows, col_select=cols)
<!-- File: matrix_manipulate/matrix_filter.xml (added in changeset 4:d0e3b5778e17) -->
<tool id="matrix_filter" name="Matrix Filter" version="1.0.0">
  <description>Filter row/column of a tabular file using python statements</description>
  <command interpreter="python">./matrix_filter.py
#if str($row_txt) != '':
--row-filter-file $row_eval_txt
#end if
#if str($col_txt) != '':
--col-filter-file $col_eval_txt
#end if
$matrix
-o ${outfile}
  </command>
  <inputs>
    <param name="row_txt" type="text" area="True" size="5x35" label="Row Eval Code" optional="True">
      <sanitizer>
        <valid initial="string.printable">
          <remove value="&quot;"/>
        </valid>
        <mapping initial="none">
          <add source="&quot;" target="\&quot;"/>
          <add source="\" target="\\"/>
        </mapping>
      </sanitizer>
    </param>
    <param name="col_txt" type="text" area="True" size="5x35" label="Column Eval Code" optional="True">
      <sanitizer>
        <valid initial="string.printable">
          <remove value="&quot;"/>
        </valid>
        <mapping initial="none">
          <add source="&quot;" target="\&quot;"/>
        </mapping>
      </sanitizer>
    </param>
    <param name="matrix" type="data" format="tabular" label="Matrix"/>
  </inputs>
  <outputs>
    <data format="tabular" name="outfile" />
  </outputs>
  <!-- Both config files live in one <configfiles> element so that neither is
       dropped by a parser that reads only the first such element. -->
  <configfiles>
    <configfile name="row_eval_txt">${row_txt}</configfile>
    <configfile name="col_eval_txt">${col_txt}</configfile>
  </configfiles>

  <help>
This is a utility to perform filtering operations on the rows and columns of a tabular file.

- The 'Column Eval Code' is evaluated once for every column; the column is kept if the result is true
- The 'Row Eval Code' is evaluated once for every row; the row is kept if the result is true
- If any of the code blocks are empty, the operation is skipped and assumed to be true

Available values

- values : The array of values across the row or column
- label : The label for the row or column
- label_set : The list of all row labels (row code) or all column labels (column code)

The functions len, sum, min and max and the math module can be used as well.
  </help>
</tool>
--- a/matrix_manipulate/matrix_join.py Fri May 10 14:38:51 2013 -0400 +++ b/matrix_manipulate/matrix_join.py Thu Jun 13 16:54:15 2013 -0400 @@ -2,12 +2,14 @@ """join.py: Usage: - join.py [options] file1 file2 [file3 ...] + matrix_join.py [options] file1 file2 [file3 ...] Options: - -h header + -h skip header -i output only features in common with all files -f use 'float' mode to save memory + -o <outfile> Output file + -m <missing> Missing value string -q run quietly """ import os, os.path, sys, getopt, re @@ -28,6 +30,7 @@ def readFile(inFile, header = True, use_float = False): dataWidth = None + dataHeader = None dataMap = {} f = open(inFile, "r") if header: @@ -35,7 +38,7 @@ if line.isspace(): log("ERROR: missing header\n", die = True) pline = re.split(delim, line.rstrip("\n\r")) - dataMap["HEADER"] = pline + dataHeader = pline dataWidth = len(pline[1:]) for line in f: if line.isspace(): @@ -55,12 +58,12 @@ out = pline[1:] dataMap[pline[0]] = out f.close() - return (dataMap, dataWidth) + return (dataMap, dataHeader, dataWidth) def main(args): ## parse arguments try: - opts, args = getopt.getopt(args, "hiqfo:") + opts, args = getopt.getopt(args, "hiqfo:m:") except getopt.GetoptError, err: sys.stderr.write( str(err) + "\n" ) usage(2) @@ -72,18 +75,19 @@ for i in sys.stdin: files.append(i.rstrip("\n\r")) - if len(files) < 2: + if len(files) < 1: sys.stderr.write("incorrect number of arguments\n") usage(1) - header = False + header = True useIntersection = False output = None use_float = False + missing = "" global verbose for o, a in opts: if o == "-h": - header = True + header = False elif o == "-i": useIntersection = True elif o == "-q": @@ -92,13 +96,21 @@ output = a elif o == "-f": use_float = True + elif o == "-m": + missing = a ## read files fileData = {} fileWidth = {} + fileHeader = {} for file in files: - (fileData[file], fileWidth[file]) = readFile(file, header = header, use_float = use_float) - features = list(set(fileData[files[0]].keys()) - 
set(["HEADER"])) + (fileData[file], fileHeader[file], fileWidth[file]) = readFile(file, header = header, use_float = use_float) + feature_dict = {} + for file in files: + for f in fileData[file]: + feature_dict[f] = True + + features = list(feature_dict.keys()) if useIntersection: for file in files: features = list(set(fileData[file].keys()) & set(features)) @@ -111,9 +123,9 @@ ## output if header: - lineElements = [fileData[files[0]]["HEADER"][0]] + lineElements = [fileHeader[files[0]][0]] for file in files: - lineElements += fileData[file]["HEADER"][1:] + lineElements += fileHeader[file][1:] ohandle.write("%s\n" % (delim.join(lineElements))) for feature in features: lineElements = [] @@ -121,7 +133,7 @@ if feature in fileData[file]: lineElements += fileData[file][feature] else: - lineElements += ["" for i in range(fileWidth[file])] + lineElements += [missing for i in range(fileWidth[file])] ohandle.write("%s\n" % (feature + delim + delim.join( (str(c) for c in lineElements)) )) if __name__ == "__main__":
--- a/matrix_manipulate/matrix_join.xml Fri May 10 14:38:51 2013 -0400 +++ b/matrix_manipulate/matrix_join.xml Thu Jun 13 16:54:15 2013 -0400 @@ -1,7 +1,8 @@ <tool id="matrix_join" name="Matrix Join" version="1.0.0"> <description>Join Matrices using row labels</description> - <command interpreter="python">matrix_join.py -h + <command interpreter="python">matrix_join.py -o $out +-m '$missing_value' #if $is_float: -f #end if @@ -13,6 +14,7 @@ <repeat name="in_mats" title="Input Matrix" min="1"> <param name="file" type="data" label="Matrix File"/> </repeat> + <param name="missing_value" type="text" label="Missing Value" value="NA"/> <param name="is_float" type="boolean" label="Float Values" help="If all matrices are floating point numbers, use to save memory"/> </inputs> <outputs>
--- a/matrix_manipulate/matrix_rank_normalize.py Fri May 10 14:38:51 2013 -0400 +++ b/matrix_manipulate/matrix_rank_normalize.py Thu Jun 13 16:54:15 2013 -0400 @@ -17,7 +17,9 @@ def py_cmp_float(a_ptr, b_ptr): a = a_ptr.contents.value b = b_ptr.contents.value - return (a > b) - (a < b) + if a < b: + return -1 + return a > b CMPFUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.POINTER(ctypes.c_float), ctypes.POINTER(ctypes.c_float))
--- a/matrix_manipulate/matrix_rank_normalize.xml Fri May 10 14:38:51 2013 -0400 +++ b/matrix_manipulate/matrix_rank_normalize.xml Thu Jun 13 16:54:15 2013 -0400 @@ -11,7 +11,7 @@ </command> <inputs> <param name="inMatrix" type="data" format="tabular" label="Input Matrix"/> - <param name="dropZeros" type="boolean" label="Drop Input Zeros" checked="True"/> + <param name="dropZeros" type="boolean" label="Drop Input Zeros" checked="False"/> <param name="na2zero" type="boolean" label="Set Output NAs to Zero" checked="False"/> </inputs> <outputs>
--- a/matrix_manipulate/matrix_transpose.py~ Fri May 10 14:38:51 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -#!/usr/bin/env python - -import string,sys -import getopt -import array - -if __name__ == "__main__": - - opts, args = getopt.getopt(sys.argv[1:], "lf") - if (len(args))!=2: - sys.stderr.write("python transpose.py extractDataIn transposeOut-Paradigm\n") - sys.exit(2) - - label_print = True - use_float = True - for o, a in opts: - if o == "-l": - label_print = False - if o == "-f": - use_float = True - - fin= open(args[0],'r') - fout= open(args[1],'w') - - col_label = None - row_label = [] - matrix=[] - for line in fin.readlines(): - data = string.split(line.strip(),'\t') - if col_label is None: - col_label = data - else: - row_label.append(data[0]) - if use_float: - o = array.array('f') - for i in data[1:]: - try: - o.append(float(i)) - except ValueError: - o.append(float('nan')) - else: - o = data[1:] - row_label.append(data[0]) - matrix.append(o) - - #header - out = [] - if label_print: - out = [col_label[0]] + row_label - else: - out = row_label - fout.write("\t".join(out) + "\n") - - #body - for col in range(0, len(col_label)): - out = [] - if label_print: - out.append(col_label[col+1]) - for row in matrix: - out.append(str(row[col])) - fout.write("\t".join(out) + "\n") - - fin.close() - fout.close()
--- a/matrix_manipulate/matrix_whitelist.xml Fri May 10 14:38:51 2013 -0400 +++ b/matrix_manipulate/matrix_whitelist.xml Thu Jun 13 16:54:15 2013 -0400 @@ -21,6 +21,6 @@ <data name="outfile" format="tabular"/> </outputs> <help> -Matrix Whitelist +Whitelist the rows or columns of a matrix using a file with a list of valid labels. </help> </tool>
#!/usr/bin/env python
# matrix_manipulate/matrix_zscore.py
"""Convert every value of a matrix to a z-score.

Usage:
    matrix_zscore.py <matrix_file | ->

Reads a matrix through floatMatrix.FloatMatrix (from the named file, or from
stdin when the argument is "-"), rescales each value to
(value - mean) / std and writes the resulting matrix to stdout.
"""

import sys
import floatMatrix
import math


def matrix2zscore(matrix):
    """Return a new FloatMatrix holding the z-score of each cell of `matrix`.

    The mean and the standard deviation are computed once, from
    `matrix.values()`; the standard deviation divides by the number of values
    (population form), not by N - 1.

    When the standard deviation is zero (or there are no values at all) a
    z-score is undefined, so the affected cells are set to NaN instead of
    raising ZeroDivisionError.
    """
    v = matrix.values()
    count = len(v)
    if count:
        ave = sum(v) / float(count)
        std = math.sqrt(sum((x - ave) ** 2 for x in v) / float(count))
    else:
        # Nothing to average; keep the arithmetic below well defined.
        ave = 0.0
        std = 0.0

    out = floatMatrix.FloatMatrix()
    out.init_blank(rows=matrix.get_rows(), cols=matrix.get_cols())
    for row in matrix.get_rows():
        for col in matrix.get_cols():
            delta = matrix.get_value(row_name=row, col_name=col) - ave
            if std > 0:
                value = delta / std
            else:
                value = float("nan")
            out.set_value(row_name=row, col_name=col, value=value)
    return out


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.stderr.write("usage: matrix_zscore.py <matrix_file | ->\n")
        sys.exit(1)

    path = sys.argv[1]
    if path == "-":
        handle = sys.stdin
    else:
        # The original opened the undefined name `matrix_path` here, which
        # raised NameError for every real input file.
        handle = open(path)

    matrix = floatMatrix.FloatMatrix()
    try:
        matrix.read(handle)
    finally:
        # Only close what this script opened; leave stdin alone.
        if handle is not sys.stdin:
            handle.close()

    out = matrix2zscore(matrix)
    out.write(sys.stdout)
<tool id="matrix_zscore" name="Matrix Zscore" version="1.0.0">
	<!-- Galaxy wrapper for matrix_zscore.py: the script reads the input matrix
	     from the path given as its only argument and writes the result to
	     stdout, which is redirected into the output dataset. -->
	<description>Convert values in matrix to zscores</description>
	<command interpreter="python">matrix_zscore.py '$inMatrix' > '$outMatrix'
	</command>
	<inputs>
		<param name="inMatrix" type="data" format="tabular" label="Input Matrix"/>
	</inputs>
	<outputs>
		<data name="outMatrix" format="tabular"/>
	</outputs>
	<help>
Convert every value in the matrix to a z-score: (value - mean) / standard deviation,
where the mean and the standard deviation are computed once from the values of the input matrix.
	</help>
</tool>
#!/usr/bin/perl
#
# quartile_norm.pl  (matrix_manipulate/quartile_norm.pl)
#
# Quantile-normalize selected columns of a tab-separated file.  Every value
# in a selected column is rescaled to TARGET * value / Q, where Q is the
# requested quantile (75th percentile by default) of that column's values
# that are >= MIN.  Only the normalized columns and the "also" columns are
# written, in ascending column order; skipped header lines are echoed as-is.

use strict;
use warnings;
use Getopt::Long;

# Run as a script, but stay loadable via require/do so the helper subs can be
# exercised without triggering the command-line handling.
main() unless caller;

# Parse the command line, read the input, normalize and write the result.
sub main {
    my $out    = '-';
    my $q      = 75;
    my $target = 1000;
    my $skip   = 0;
    my $min    = 1;
    my ( @col, @also );

    GetOptions(
        "quant=i"  => \$q,
        "target=i" => \$target,
        "col=i@"   => \@col,
        "out=s"    => \$out,
        "also=i@"  => \@also,
        "skip=i"   => \$skip,
        "min=i"    => \$min,
    ) or die usage();

    my $in = shift @ARGV;
    die usage() unless defined $in && length $in && @col;

    # "-c -1" as the first column means "every column except the first".
    my $all_but_first = ( $col[0] == -1 );
    die "Column numbers are 1-based (use -c -1 for all but the first column)\n"
        if grep { $_ < 1 } @also, ( $all_but_first ? () : @col );

    # Open the input first so a bad input path does not truncate the output.
    my $in_fh;
    if ( $in eq '-' ) {
        $in_fh = \*STDIN;
    }
    else {
        open( $in_fh, '<', $in ) or die "Can't open $in\n";
    }

    # The original dup'ed STDOUT with '<&' (read mode), so every print to it
    # failed; use the real STDOUT handle for "-".
    my $out_fh;
    if ( $out eq '-' ) {
        $out_fh = \*STDOUT;
    }
    else {
        open( $out_fh, '>', $out ) or die "Can't open $out\n";
    }

    # By default also emit column 1, unless it is itself being normalized.
    @also = (1) if !@also && !grep { $_ == 1 } @col;

    # Switch to 0-based indices; duplicates are dropped so that a column is
    # stored (and normalized) exactly once.
    my @norm_cols = $all_but_first ? () : _unique_sorted( map { $_ - 1 } @col );
    my @also_cols = map { $_ - 1 } @also;

    my @data;         # $data[$column][$row]
    my @keep_cols;    # columns stored and printed, ascending
    my $rows = 0;
    my $head = '';
    while ( my $line = <$in_fh> ) {
        if ( $skip > 0 ) {
            --$skip;
            $head .= $line;
            next;
        }
        chomp $line;
        my @f = split /\t/, $line;
        if ( $rows == 0 ) {
            # The width of the first data row defines "all but first".
            @norm_cols = ( 1 .. $#f ) if $all_but_first;
            @keep_cols = _unique_sorted( @norm_cols, @also_cols );
        }
        for my $c (@keep_cols) {
            # Short rows yield empty cells rather than undef.
            push @{ $data[$c] }, ( defined $f[$c] ? $f[$c] : '' );
        }
        ++$rows;
    }
    close($in_fh) if $in ne '-';

    if ( $rows > 0 ) {
        for my $c (@norm_cols) {
            _normalize_column( $data[$c], $q / 100, $target, $min, $c + 1 );
        }
    }

    print {$out_fh} $head;
    for my $r ( 0 .. $rows - 1 ) {
        print {$out_fh} join( "\t", map { $data[$_][$r] } @keep_cols ), "\n";
    }
    if ( $out ne '-' ) {
        close($out_fh) or die "Can't write $out\n";
    }
    return;
}

# Rescale the values of one column in place to TARGET * value / quantile.
# $label is the 1-based column number, used only in the error message.
sub _normalize_column {
    my ( $values, $p, $target, $min, $label ) = @_;

    # Non-numeric or empty cells are deliberately treated as 0, exactly as
    # before warnings were enabled; keep that coercion silent.
    no warnings 'numeric';

    my @eligible = sort { $a <=> $b } grep { $_ >= $min } @{$values};
    my $scale = quantile( \@eligible, $p );
    die "Column $label: no usable quantile (no values >= $min, or the quantile is 0)\n"
        unless defined $scale && $scale != 0;

    $_ = sprintf( "%.4f", $target * $_ / $scale ) for @{$values};
    return;
}

# Return the distinct numbers of the argument list in ascending order.
sub _unique_sorted {
    my %seen;
    return sort { $a <=> $b } grep { !$seen{$_}++ } @_;
}

# Usage message, returned as a string.
sub usage {
    return <<"EOF";
Usage: $0 -c COL [opts] FILE

Returns an upper quartile normalization of data in column(s) COL
of file FILE.

Col is 1-based, zeroes are ignored when calculating upper quartile

Options:
  -c|col COL normalize this column of data (can specify more than once, or -1 for all but first col)
  -q|quant INT quantile to use (75)
  -t|target INT target to use (1000)
  -a|also COL output these columns also
  -o|out FILE output to this file instead of stdout
  -m|min INT minimum value (1)
  -s|skip INT skip header rows
EOF
}

# quantile(\@sorted, $p): the $p-quantile (0 <= $p <= 1) of an ascending
# numeric list, linearly interpolated between the two neighbouring order
# statistics.  Returns nothing (undef in scalar context) for an empty list.
sub quantile {
    my ( $sorted, $p ) = @_;
    my $n = scalar @{$sorted};
    return if $n == 0;

    # Clamp so that an out-of-range request cannot index past the list.
    $p = 0 if $p < 0;
    $p = 1 if $p > 1;

    my $pos   = ( $n - 1 ) * $p;
    my $lower = int($pos);
    my $frac  = $pos - $lower;
    my $value = $sorted->[$lower];
    return $value if $frac <= 0 || $lower + 1 >= $n;

    # Interpolate by the fractional part of the position (the original used
    # $p itself here, which is only right when $p == 0.5).
    return $value + $frac * ( $sorted->[ $lower + 1 ] - $value );
}

1;