Mercurial > repos > melissacline > ucsc_cancer_utilities
changeset 3:12a1ea920524
Creating a tool to merge genomic datasets
author | melissacline |
---|---|
date | Wed, 11 Feb 2015 16:44:33 -0800 |
parents | d1104ad3646a |
children | d0674221a6ae |
files | mergeGenomicFiles.xml mergeGenomicMatrixFiles.py |
diffstat | 2 files changed, 106 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
<!-- mergeGenomicFiles.xml: Galaxy tool definition that runs
     mergeGenomicMatrixFiles.py on two tabular datasets and writes the
     merged matrix to a third tabular dataset. -->
<tool id="mergeGenomicFiles" name="mergeGenomicFiles" version="0.0.1">
  <description>
    Given two genomic datasets, merge them to create a third dataset with the row and column identifiers from both datasets.
  </description>
  <!-- Argument order matters: the script expects the output path first,
       followed by the input files. -->
  <command interpreter="python">
    mergeGenomicMatrixFiles.py $outputC $inputA $inputB
  </command>
  <inputs>
    <param name="inputA" format="tabular" type="data" label="Genomic Dataset A"/>
    <param name="inputB" format="tabular" type="data" label="Genomic Dataset B"/>
  </inputs>
  <outputs>
    <data name="outputC" format="tabular"/>
  </outputs>
  <help>
    **Merge Genomic Datasets**

    Given two genomic datasets, merge them to produce a third dataset that is the union of the first two. The new dataset will contain all column labels from either dataset, and all row labels from either dataset. If a row label appears in both datasets, the output dataset will contain, for that row, all values for the first set of columns, plus all values for the second set of columns. If a row label appears in the first dataset only, the output dataset will contain the values for the columns of the first dataset, and blanks (indicating missing values) for the columns of the second dataset. Likewise, if a row label appears in the second dataset only, the output dataset will contain blanks for the columns of the first dataset, and the values for the columns of the second dataset.
  </help>
</tool>
"""mergeGenomicMatrixFiles.py: merge tab-delimited genomic matrix files.

Each input file is a matrix whose first line is a header (an identifier
cell followed by one sample name per column) and whose remaining lines each
hold a row identifier ("gene") followed by one value per sample.

The output is the union of all inputs: every sample seen in any header
becomes a column and every gene seen in any file becomes a row.  Cells for
which no input supplied a value are written as empty strings.  When several
inputs supply a value for the same gene/sample pair, the value from the
file listed last wins.

Usage:
    python mergeGenomicMatrixFiles.py output inputfile inputfile [...]

The whole merged matrix is held in memory, so this is not suited to inputs
with a huge number of rows.

The code is written to run unchanged on both Python 2 and Python 3.
"""

import os
import string
import sys


def _parse_header(line):
    """Split a raw header line into its tab-separated cells.

    Only trailing whitespace (including the line terminator) is removed.
    Leading whitespace is kept on purpose: a header whose first cell is
    empty starts with a tab, and stripping that tab would shift every
    sample name one column to the left.
    """
    return line.rstrip().split("\t")


def header(samples, infile):
    """Register the sample (column) names found in the header of ``infile``.

    ``samples`` maps sample name -> column index in the merged matrix and is
    updated in place.  Names already present keep their index; each new name
    is given the next free index, in header order.  The first header cell
    (the label above the row identifiers) is ignored.

    Returns None.
    """
    with open(infile, 'r') as fin:
        newSamples = _parse_header(fin.readline())[1:]
    for sample in newSamples:
        if sample not in samples:
            samples[sample] = len(samples)
    return


def process(genes, samples, dataMatrix, infile):
    """Merge the data rows of ``infile`` into ``dataMatrix``.

    ``genes`` maps row identifier -> row index in ``dataMatrix`` and is
    extended in place for identifiers not seen before; each new row is
    created with ``len(samples)`` empty-string cells.  ``samples`` maps
    sample name -> column index and must already contain every sample named
    in this file's header (call ``header`` on all inputs first), otherwise a
    KeyError is raised.

    Blank lines are skipped rather than treated as end of file, and only the
    line terminator is removed from each row, so a final line without a
    trailing newline keeps its last character and "\\r\\n" line endings do
    not leak a "\\r" into the last value.

    Raises IndexError if a row has more value fields than the header has
    sample names.  Returns None.
    """
    maxLength = len(samples)

    with open(infile, 'r') as fin:
        newSamples = _parse_header(fin.readline())
        # Column index in the merged matrix for each of this file's columns.
        columns = [samples[sample] for sample in newSamples[1:]]

        # The header was line 1, so data rows are numbered from 2.
        for lineno, line in enumerate(fin, 2):
            line = line.rstrip("\r\n")
            if not line:
                continue
            data = line.split("\t")
            if len(data) - 1 > len(columns):
                raise IndexError(
                    "%s line %d: %d value fields but only %d samples in header"
                    % (infile, lineno, len(data) - 1, len(columns)))

            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * maxLength)

            row = dataMatrix[genes[gene]]
            # Rows shorter than the header simply leave the rest untouched.
            for y, value in zip(columns, data[1:]):
                row[y] = value
    return


def outputMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix to ``outfile`` as tab-delimited text.

    The first line is the literal cell "sample" followed by the sample names
    in column-index order.  Each following line is a row identifier followed
    by that row's values.  Rows are written in row-index order (the order in
    which identifiers were first seen) so the output is deterministic
    regardless of dict iteration order.

    Returns None.
    """
    sList = sorted(samples, key=samples.get)
    columns = [samples[sample] for sample in sList]

    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + sList) + "\n")
        for gene in sorted(genes, key=genes.get):
            row = dataMatrix[genes[gene]]
            fout.write("\t".join([gene] + [row[y] for y in columns]) + "\n")
    return


def main(argv=None):
    """Command-line entry point.

    ``argv`` defaults to ``sys.argv``; it must hold the program name, the
    output path and at least two input paths.  Returns the process exit
    status: 0 on success, 1 when the arguments are insufficient (after
    printing a usage message).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        print("python mergeGenomicMatrixFiles.py output inputfile(s)")
        print("**********memory intensive, not for genomic data with a huge number of probes")
        print("this is merging data A+B=C\n")
        return 1

    outfile = argv[1]
    inFiles = argv[2:]

    genes = {}
    samples = {}
    dataMatrix = []

    # First pass: collect every sample name so that each matrix row can be
    # allocated at its final width.
    for infile in inFiles:
        header(samples, infile)

    # Second pass: fill in the values.
    for infile in inFiles:
        process(genes, samples, dataMatrix, infile)

    outputMatrix(dataMatrix, samples, genes, outfile)
    return 0


if __name__ == '__main__':
    sys.exit(main())