Mercurial > repos > melissacline > ucsc_cancer_utilities
view mergeGenomicMatrixFiles.py @ 12:eba1f6b111c4
Trying again to commit missing changes...
author | melissacline |
---|---|
date | Tue, 10 Mar 2015 19:25:49 -0700 |
parents | 30aab34424a9 |
children | 1d83dbbee373 |
line wrap: on
line source
#!/usr/bin/env python
"""Merge two tab-separated genomic matrices into a single matrix.

The input files are matrices in which columns represent samples and rows
represent genes or measurements.  The first line of each file is a header
whose first field is a row-label column and whose remaining fields are
sample names.

There are two output files: outMergedData contains the input data merged
into a single matrix, and outSourceMatrix is a two-column matrix indicating
which file each sample (or column label) came from.  This assumes that each
sample came from at most one file.
"""

import argparse
import os
import string
import sys


def _open_for_reading(path):
    """Open *path* as text with universal-newline handling.

    Python 2 needs the 'U' flag to translate '\\r' and '\\r\\n' line endings;
    on Python 3 that translation is the default and the 'U' flag was removed
    in 3.11, so it must not be passed there.
    """
    mode = 'r' if sys.version_info[0] >= 3 else 'rU'
    return open(path, mode)


def _split_header(line):
    """Split a header line into its tab-separated fields.

    Only trailing whitespace is removed.  Stripping leading whitespace as
    well would swallow the tab after an empty first-column label and shift
    every sample name one position to the left.
    """
    return line.rstrip().split('\t')


def header(samples, sourceFiles, infile, labelThisFile):
    """Register the sample names found in the header line of *infile*.

    samples       -- dict mapping sample name -> column index; each sample
                     not seen before is assigned the next free index.
    sourceFiles   -- dict mapping sample name -> source label; filled in
                     only for samples that were not already registered.
    infile        -- path of the matrix file to read.
    labelThisFile -- label recorded as the source of the new samples, or
                     None to use *infile* itself as the label.

    Both dicts are updated in place; nothing is returned.
    """
    labelToUse = infile if labelThisFile is None else labelThisFile

    with _open_for_reading(infile) as fin:
        # The first field is the row-label column, not a sample.
        newSamples = _split_header(fin.readline())[1:]

    for sample in newSamples:
        if sample not in samples:
            samples[sample] = len(samples)
            sourceFiles[sample] = labelToUse


def process(genes, samples, dataMatrix, infile):
    """Load the data rows of *infile* into *dataMatrix*.

    genes      -- dict mapping row label -> row index in dataMatrix; each
                  label not seen before is assigned the next free index.
    samples    -- dict mapping sample name -> column index.  It must already
                  contain every sample named in the header of *infile*
                  (see header()).
    dataMatrix -- list of rows, each a list of len(samples) strings.  New
                  rows start out filled with "" and cells are overwritten
                  with the values read from the file.
    infile     -- path of the matrix file to read.

    Blank lines are skipped.  A row with more fields than the header raises
    IndexError; a header sample missing from *samples* raises KeyError.
    genes and dataMatrix are updated in place; nothing is returned.
    """
    maxLength = len(samples)

    with _open_for_reading(infile) as fin:
        # Index 0 is the row-label column; it keeps field positions in the
        # data rows aligned with positions in newSamples.
        newSamples = _split_header(fin.readline())

        for line in fin:
            # Remove only the line terminator: the final line may lack one,
            # and trailing tabs are meaningful (empty trailing values).
            line = line.rstrip('\n')
            if line == "":
                continue

            data = line.split('\t')
            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * maxLength)

            row = dataMatrix[genes[gene]]
            for i in range(1, len(data)):
                row[samples[newSamples[i]]] = data[i]


def outputSourceMatrix(sourceData, outputFileName):
    """Write the sample -> source table to *outputFileName*.

    The output is tab-separated with the header line "Sample<TAB>Source"
    followed by one line per entry of the *sourceData* dict.
    """
    with open(outputFileName, "w") as fout:
        fout.write("Sample\tSource\n")
        for thisSample, source in sourceData.items():
            fout.write("%s\t%s\n" % (thisSample, source))


def outputMergedMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix to *outfile* as tab-separated text.

    dataMatrix -- list of rows of string values, indexed [row][column].
    samples    -- dict mapping sample name -> column index.
    genes      -- dict mapping row label -> row index.
    outfile    -- path of the file to write.

    The header line is "sample" followed by the sample names in column-index
    order.  Rows are written in row-index order so the output does not
    depend on dict iteration order.
    """
    # Invert the name -> index mapping to recover the column order.
    sList = [""] * len(samples)
    for sample, pos in samples.items():
        sList[pos] = sample
    columns = [samples[sample] for sample in sList]

    with open(outfile, "w") as fout:
        fout.write("sample" + "".join("\t" + sample for sample in sList) + "\n")
        for gene in sorted(genes, key=genes.get):
            row = dataMatrix[genes[gene]]
            fout.write(gene + "".join("\t" + row[col] for col in columns) + "\n")


def main():
    """Parse the command line and merge the two input matrices."""
    parser = argparse.ArgumentParser()
    parser.add_argument("inFileA", type=str, help="First input file")
    parser.add_argument("inFileB", type=str, help="Second input file")
    parser.add_argument("outMergedData", type=str,
                        help="Filename for the merged dataset")
    parser.add_argument("outSourceMatrix", type=str,
                        help="""Filename for a Nx2 matrix that indicates
                        the source file of each column""")
    parser.add_argument("--aLabel", type=str, default=None,
                        help="User-friendly label for the first input file")
    parser.add_argument("--bLabel", type=str, default=None,
                        help="User-friendly label for the second input file")
    args = parser.parse_args()

    genes = {}
    samples = {}
    sourceFiles = {}
    dataMatrix = []

    # Both headers must be read before any data so that every row is
    # allocated with the full, final number of sample columns.
    header(samples, sourceFiles, args.inFileA, args.aLabel)
    header(samples, sourceFiles, args.inFileB, args.bLabel)

    process(genes, samples, dataMatrix, args.inFileA)
    process(genes, samples, dataMatrix, args.inFileB)

    outputSourceMatrix(sourceFiles, args.outSourceMatrix)
    outputMergedMatrix(dataMatrix, samples, genes, args.outMergedData)


if __name__ == '__main__':
    main()