Mercurial > repos > kellrott > siggenes
changeset 0:29573fbd1f67 draft default tip
Uploaded
| author | kellrott |
|---|---|
| date | Fri, 21 Dec 2012 16:41:30 -0500 |
| parents | |
| children | |
| files | siggenes/siggenes.py siggenes/siggenes.xml |
| diffstat | 2 files changed, 394 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/siggenes/siggenes.py Fri Dec 21 16:41:30 2012 -0500 @@ -0,0 +1,326 @@ +#!/usr/bin/env python + +import sys +import argparse +import array +import csv +import math +try: + from rpy2 import robjects + from rpy2.robjects.packages import importr +except ImportError: + pass + +def die(msg): + sys.stderr.write("%s\n" % (msg)) + sys.exit(1) + +def log(msg): + pass + +class DataException(Exception): + pass + +class FloatMatrix: + def __init__(self): + self.corner_name = "probe" + self.data = None + self.nrows = None + self.ncols = None + self.rowmap = None + self.colmap = None + + def read(self, handle): + header = None + for line in handle: + row = line.rstrip().split("\t") + if header is None: + header = row + self.data = array.array("f") + self.colmap = {} + self.rowmap = {} + self.ncols = len(row) - 1 + self.nrows = 0 + for i, c in enumerate(row[1:]): + self.colmap[c] = i + else: + if len(row) - 1 != self.ncols: + raise DataException("Misformed matrix") + self.rowmap[row[0]] = len(self.rowmap) + a = [] + for v in row[1:]: + try: + a.append(float(v)) + except ValueError: + a.append(float('Nan')) + self.data.extend(a) + self.nrows += 1 + + def init_blank(self, rows, cols): + self.data = array.array("f") + self.colmap = {} + for i,c in enumerate(cols): + self.colmap[c] = i + self.rowmap = {} + for i,r in enumerate(rows): + self.rowmap[r] = i + self.ncols = len(cols) + self.nrows = len(rows) + for i in range(self.nrows): + self.data.extend([float('nan')] * self.ncols) + + def get_value(self, row_name, col_name): + return self.data[ self.rowmap[row_name] * self.ncols + self.colmap[col_name] ] + + def set_value(self, row_name, col_name, value): + self.data[ self.rowmap[row_name] * self.ncols + self.colmap[col_name] ] = value + + def get_row(self, row_name): + return self.data[ self.rowmap[row_name] * self.ncols : (self.rowmap[row_name]+1) * self.ncols ] + + def get_cols(self): + out = self.colmap.keys() + return sorted(out, key=self.colmap.get) + + def has_row(self, row): + return row in self.rowmap + + def has_col(self, col): + return col in self.colmap + + def get_rows(self): + out = self.rowmap.keys() + return sorted(out, key=self.rowmap.get) + + def write(self, handle, missing='NA'): + write = csv.writer(handle, delimiter="\t", lineterminator='\n') + col_list = self.get_cols() + + write.writerow([self.corner_name] + col_list) + for rowName in self.rowmap: + out = [rowName] + row = self.get_row(rowName) + for col in col_list: + val = row[self.colmap[col]] + if val is None or math.isnan(val): + val = missing + else: + val = "%.5f" % (val) + out.append(val) + write.writerow(out) + + def toRmatrix(self, r): + out = r.matrix(self.data, ncol=self.ncols, dimnames=[ self.get_rows(), self.get_cols() ], byrow=True) + return out + + + + +class StringMatrix(FloatMatrix): + def __init__(self): + self.corner_name = "sample" + self.data = None + self.nrows = None + self.ncols = None + self.rowmap = None + self.colmap = None + + def init_blank(self, rows, cols): + self.data = [] + self.rowmap = {} + for r in rows: + self.rowmap[r] = len(self.rowmap) + if self.colmap is None: + self.colmap = {} + for c in cols: + self.colmap[c] = len(self.colmap) + self.ncols = len(self.colmap) + + for c in cols: + self.data.append(None) + self.nrows = len(self.rowmap) + + def read(self, handle): + header = None + for line in handle: + row = line.rstrip("\r\n").split("\t") + if header is None: + header = row + self.data = [] + self.colmap = {} + self.rowmap = {} + self.ncols = len(row) - 1 + self.nrows = 0 + for i, c in enumerate(row[1:]): + self.colmap[c] = i + else: + if len(row) - 1 != self.ncols: + raise DataException("Misformed matrix") + self.rowmap[row[0]] = len(self.rowmap) + a = [] + for v in row[1:]: + a.append(v) + self.data.extend(a) + self.nrows += 1 + + def write(self, handle, missing=''): + write = csv.writer(handle, delimiter="\t", lineterminator='\n') + col_list = self.get_cols() + + write.writerow([self.corner_name] + col_list) + for rowName in self.rowmap: + out = [rowName] + row = self.get_row(rowName) + for col in col_list: + val = row[self.colmap[col]] + if val is None: + val = missing + else: + val = "%s" % (val) + out.append(val) + write.writerow(out) + + +class SigGenes: + def __init__(self): + self.siggenes = importr("siggenes") + + def calc(self, method, genomicMatrix, phenotypeMatrix, phenotype, phenotypeEval): + r = robjects.r + + samples = [] + for s_name in genomicMatrix.get_cols(): + if phenotypeMatrix.has_row( s_name ): + samples.append(s_name) + + probes = genomicMatrix.get_rows() + f_sub = FloatMatrix() + f_sub.init_blank(rows=probes, cols=samples) + for row in probes: + for col in samples: + f_sub.set_value(row_name=row,col_name=col, value=genomicMatrix.get_value(row_name=row, col_name=col)) + + mat = f_sub.toRmatrix(r) + + cls = FloatMatrix() + cls.init_blank(rows=samples, cols=[phenotype]) + for row in samples: + val = phenotypeMatrix.get_value(row_name=row, col_name=phenotype) + try: + val = float(val) + except ValueError: + pass + if eval(phenotypeEval, {"__builtins__":None}, { 'float' : float, 'int' : int, 'value' : val } ): + cls.set_value(row_name=row, col_name=phenotype, value=1) + else: + cls.set_value(row_name=row, col_name=phenotype, value=0) + + cls_r = cls.toRmatrix(r) + if method == 'sam': + sam_out = self.siggenes.sam(mat, r.c(cls_r)) + sam_att = r.cbind( + r.c(r.attributes(sam_out).rx2("d")), + r.c(r.attributes(sam_out).rx2("vec.false")), + r.c(r.attributes(sam_out).rx2("q.value")), + r.c(r.attributes(sam_out).rx2("p.value")), + r.c(r.attributes(sam_out).rx2("s")) + ) + out = FloatMatrix() + ocols = [ "Score", "FalseCalls", "Q-value", + "P-value", "StdDev"] + out.init_blank( cols=ocols, + rows=probes) + + for i,n in enumerate(probes): + vals = list(sam_att.rx(i+1, True)) + for j, col in enumerate(ocols): + out.set_value( col_name=col, row_name=n, value=vals[j]) + return out + + if method == 'ebam': + e_a0 = r.get("find.a0")(mat, r.c(cls_r)) + ebam_out = self.siggenes.ebam(e_a0) + ebam_att = r.cbind( + r.c(r.attributes(ebam_out).rx2('z')), + r.c(r.attributes(ebam_out).rx2('vec.pos')), + r.c(r.attributes(ebam_out).rx2('vec.neg')) + ) + + out = FloatMatrix() + ocols = [ "Score", "PosCalls", "NegCalls" ] + out.init_blank( cols=ocols, + rows=probes) + + for i,n in enumerate(probes): + vals = list(ebam_att.rx(i+1, True)) + for j, col in enumerate(ocols): + out.set_value( col_name=col, row_name=n, value=vals[j]) + return out + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-m', '--method', help="SigGenes method (sam/ebam)", default='sam') + parser.add_argument('-g', '--genomic', help='GenomicMatrix', default=None) + parser.add_argument('-p', '--pheno', help='Phenotype Matrix', default=None) + parser.add_argument('-l', '--list', help="Sample List File", default=None) + parser.add_argument('-e', '--eval', help='Class Evalutation (val=="+")', default=None) + parser.add_argument('-o', '--out', help="Output Path", default="None") + + args = parser.parse_args() + + if args.genomic is None: + die("Need to define genomic matrix") + + handle = open(args.genomic) + genomicMatrix = FloatMatrix() + genomicMatrix.read(handle) + handle.close() + + eval_text = args.eval + + phenotypeMatrix = None + if args.pheno is not None: + handle = open(args.pheno) + phenotypeMatrix = StringMatrix() + phenotypeMatrix.read(handle) + handle.close() + + if args.list is not None: + in_list = {} + handle = open(args.list) + for line in handle: + in_list[line.rstrip()] = True + handle.close() + phenotypeMatrix = StringMatrix() + phenotypeMatrix.init_blank( cols=[ 'member' ], rows=genomicMatrix.get_cols() ) + for r in genomicMatrix.get_cols(): + if r in in_list: + phenotypeMatrix.set_value(row_name=r, col_name='member', value='+') + else: + phenotypeMatrix.set_value(row_name=r, col_name='member', value='-') + eval_text = "value == '+'" + + if phenotypeMatrix is None: + die("Need to define phenotypes") + if eval_text is None: + die("Need to define evaluation statement") + + + sig = SigGenes() + out = FloatMatrix() + out.init_blank(rows=genomicMatrix.get_rows(), cols=phenotypeMatrix.get_cols()) + for phenotype in phenotypeMatrix.get_cols(): + log("Calculating" + phenotype) + cur = sig.calc(args.method, genomicMatrix, phenotypeMatrix, phenotype, eval_text) + for row in cur.get_rows(): + out.set_value( row_name=row, col_name=phenotype, value=cur.get_value(row_name=row, col_name="Score") ) + if args.out is None: + out.write(sys.stdout) + else: + handle = open(args.out, "w") + out.write(handle) + handle.close() + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/siggenes/siggenes.xml Fri Dec 21 16:41:30 2012 -0500 @@ -0,0 +1,68 @@ +<tool id="siggenes" name="Siggenes" version="1.0.0"> + <description>Signficance test for genes in Microarray data</description> + <command interpreter="python">siggenes.py --genomic $genomic --out $out +#if $samples.mode == "matrix": +-p ${samples.phenotype} +-e "${samples.eval_text}" +#end if +#if $samples.mode == "list": +-l ${samples.sample_list} +#end if + </command> + <inputs> + <param name="genomic" type="data" label="Genomic Matrix" help="Matrix of Genomic Values"/> + + <conditional name="samples"> + <param name="mode" type="select" label="Set Mode"> + <option value="list">Sample List</option> + <option value="matrix">Phenotype Matrix</option> + </param> + <when value="list"> + <param name="sample_list" type="data" label="Sample List" help="List of positive samples"/> + </when> + <when value="matrix"> + <param name="phenotype" type="data" label="Phenotype Matrix" help="Matrix of Phenotypes"/> + <param name="eval_text" type="text" size="90" label="Evaluation Statement" help="evalation statement to select positive set" value="value=='+'"> + <sanitizer> + <valid initial="string.printable"> + <remove value="""/> + </valid> + <mapping initial="none"> + <add source=""" target="\""/> + </mapping> + </sanitizer> + </param> + </when> + </conditional> + </inputs> + <outputs> + <data name="out" format="tabular" label="Significance Matrix" help="Matrix of gene significance"/> + </outputs> + <help> +Wrapper for the siggenes R package. + +Genomic Matrix: + + Matrix of values from micro array, samples as columns, probe values as rows + +Phenotype Matrix: + + Matrix of values describing the samples class membership. Classes as columns, samples as rows. + All values will be evaluated using the 'Evaluation Statement'. + +Sample List: + + A list of samples that belong to the positive set. An alternative input to providing a phenotype matrix and + evaluation statement + +Evaluation Statement: + + A statement that will be evaluated against every cell in the phenotype matrix (it is ignored if a sample list is provided). A 'True' return value + means that the sample is part of the group. The value of the cell is 'value' in string format. Example statements:: + + value == '+' + + float(value) > 0.0 + + </help> +</tool>
