Mercurial > repos > melissacline > ucsc_cancer_utilities
view mergeGenomicMatrixFiles.py @ 3:12a1ea920524
Creating a tool to merge genomic datasets
author | melissacline |
---|---|
date | Wed, 11 Feb 2015 16:44:33 -0800 |
parents | |
children | d0674221a6ae |
line wrap: on
line source
import string,os,sys def header (samples, infile): fin= open(infile,'r') #header, samples newSamples = string.split(string.strip(fin.readline()),'\t')[1:] for sample in newSamples: if sample not in samples: samples[sample]= len(samples) fin.close() return def process(genes, samples, dataMatrix, infile): maxLength= len(samples) fin= open(infile,'r') #header newSamples = string.split(string.strip(fin.readline()),'\t') while 1: line = fin.readline()[:-1] if line =="": break data = string.split(line,"\t") gene = data[0] if gene not in genes: genes[gene]= len(genes) l=[] for i in range (0, maxLength): l.append("") dataMatrix.append(l) x = genes[gene] for i in range (1, len(data)): sample = newSamples[i] y = samples[sample] dataMatrix[x][y]= data[i] fin.close() return def outputMatrix(dataMatrix, samples, genes, outfile): fout = open(outfile,"w") maxLength= len(samples) sList=[] for i in range (0, maxLength): sList.append("") for sample in samples: pos =samples[sample] sList[pos] = sample fout.write("sample") for sample in sList: fout.write("\t"+sample) fout.write("\n") for gene in genes: fout.write(gene) for sample in sList: value = dataMatrix[genes[gene]][samples[sample]] fout.write("\t"+value) fout.write("\n") fout.close() return if __name__ == '__main__' : if len(sys.argv[:]) <4: print "python mergeFilesByColumn.py output inputfile(s)" print "**********memory intensive, not for very genomic data with hugo number of probes" print "this is merging data A+B=C\n" sys.exit() inFiles = sys.argv[2:] outfile = sys.argv[1] genes={} samples={} dataMatrix=[] for infile in inFiles: header (samples, infile) for infile in inFiles: process(genes, samples, dataMatrix, infile) outputMatrix(dataMatrix, samples, genes, outfile)