# HG changeset patch # User melissacline # Date 1423701873 28800 # Node ID 12a1ea92052445d455d90730807959207d4638a1 # Parent d1104ad3646a499fee0cc78891a60008003a11fe Creating a tool to merge genomic datasets diff -r d1104ad3646a -r 12a1ea920524 mergeGenomicFiles.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mergeGenomicFiles.xml Wed Feb 11 16:44:33 2015 -0800 @@ -0,0 +1,20 @@ + + + Given two genomic datasets, merge them to create a third dataset with the row and column identifiers from both datasets. + + + mergeGenomicMatrixFiles.py $outputC $inputA $inputB + + + + + + + + + + ***Merge Genomic Datasets*** + + Given two genomic datasets, merge them to produce a third dataset that is the union of the first two. The new dataset will contain all column labels from either dataset, and all row labels from either dataset. If a row label appears in both datasets, the output dataset will contain, for that row, all values for the first set of columns, plus all values for the second set of columns. 
# Tail of the mergeGenomicFiles.xml help text carried over from the patch
# (the sentence is cut off at this point in the source):
#   If a row label appears in the first dataset only, the output dataset will
#   contain the values for the columns of the first dataset, and blanks
#   (indicating missing values) for the columns of the second da
#
# Original patch header for this script:
#   diff -r d1104ad3646a -r 12a1ea920524 mergeGenomicMatrixFiles.py
#   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
#   +++ b/mergeGenomicMatrixFiles.py Wed Feb 11 16:44:33 2015 -0800
#   @@ -0,0 +1,86 @@
"""Merge tab-delimited genomic matrix files into one matrix.

Each input file has a header line (first field is a corner label, the
remaining fields are column/sample labels) followed by data lines (first
field is the row/gene label, the remaining fields are values).  The output
contains every column label and every row label seen in any input; cells
that no input supplies are written as empty strings.

Usage: python mergeGenomicMatrixFiles.py output inputA inputB [...]

The whole merged matrix is held in memory.  The code uses only constructs
that behave the same under Python 2 and Python 3.
"""
import string
import os
import sys


def _split_fields(line):
    """Return the tab-separated fields of one raw text line.

    Only the line terminator is removed.  Leading or trailing tabs are kept,
    because they delimit (possibly empty) fields; stripping them would shift
    every following column by one.
    """
    return line.rstrip('\r\n').split('\t')


def header(samples, infile):
    """Register the column labels found on the first line of ``infile``.

    ``samples`` maps column label -> column index and is updated in place:
    every label not seen before receives the next free index
    (``len(samples)``).  The first field of the header line is the corner
    cell and is ignored.  An empty file contributes no labels.

    Returns None.
    """
    with open(infile, 'r') as fin:
        newSamples = _split_fields(next(fin, ""))[1:]
    for sample in newSamples:
        if sample not in samples:
            samples[sample] = len(samples)
    return


def process(genes, samples, dataMatrix, infile):
    """Copy the values of ``infile`` into the shared ``dataMatrix``.

    ``genes`` maps row label -> row index in ``dataMatrix``; unseen labels
    get a new row of ``len(samples)`` empty strings.  ``samples`` maps column
    label -> column index and must already contain every column label of
    ``infile`` (see ``header``).  A value for a (row, column) pair that was
    filled earlier is overwritten.

    Empty lines are skipped.  A data line with more fields than the header
    line raises IndexError; a column label missing from ``samples`` raises
    KeyError.

    Returns None; ``genes`` and ``dataMatrix`` are modified in place.
    """
    # Row width is fixed at the number of columns known when processing starts.
    maxLength = len(samples)

    with open(infile, 'r') as fin:
        # Keep the corner cell here so newSamples[i] lines up with data[i].
        newSamples = _split_fields(next(fin, ""))

        for rawLine in fin:
            line = rawLine.rstrip('\r\n')
            if line == "":
                continue
            data = line.split('\t')
            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * maxLength)

            row = dataMatrix[genes[gene]]
            for i in range(1, len(data)):
                row[samples[newSamples[i]]] = data[i]
    return


def outputMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix to ``outfile`` as tab-delimited text.

    The first line is ``sample`` followed by the column labels ordered by
    their index in ``samples``.  Each following line is a row label followed
    by that row's values in the same column order.  Rows are written in
    ascending ``dataMatrix`` index, i.e. the order in which the row labels
    were first registered in ``genes``.

    Returns None.
    """
    # Invert label -> index into an index-ordered list of labels.
    sList = [""] * len(samples)
    for sample in samples:
        sList[samples[sample]] = sample

    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + sList) + "\n")
        # Sort by row index so the output order does not depend on dict
        # iteration order.
        for gene in sorted(genes, key=genes.get):
            row = dataMatrix[genes[gene]]
            values = [row[samples[sample]] for sample in sList]
            fout.write("\t".join([gene] + values) + "\n")
    return


if __name__ == '__main__':
    # Expect: script name, output file, and at least two input files.
    if len(sys.argv) < 4:
        print("python mergeGenomicMatrixFiles.py output inputfile(s)")
        print("**********memory intensive, not for very genomic data with hugo number of probes")
        print("this is merging data A+B=C\n")
        # Non-zero status so callers can tell the merge did not run.
        sys.exit(1)

    inFiles = sys.argv[2:]
    outfile = sys.argv[1]

    genes = {}
    samples = {}
    dataMatrix = []

    # First pass: collect every column label so all rows get the full width.
    for infile in inFiles:
        header(samples, infile)

    # Second pass: fill in the values.
    for infile in inFiles:
        process(genes, samples, dataMatrix, infile)

    outputMatrix(dataMatrix, samples, genes, outfile)