comparison mergeGenomicMatrixFiles.py @ 7:1d150e860c4d

Expanded the functionality of the merge genomic datasets tool to generate an additional output dataset that records the file (or label) each column came from
author melissacline
date Mon, 09 Mar 2015 19:58:03 -0700
parents 2035405538b4
children 5d4538cb38db
comparison
equal deleted inserted replaced
6:2035405538b4 7:1d150e860c4d
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import argparse
3 import string,os,sys 4 import string,os,sys
4 5
def header(samples, sourceFiles, infile, labelThisFile):
    """Register the column labels found on the first line of ``infile``.

    Only the first line of the file is read.  It is stripped of surrounding
    whitespace, split on tabs, and the first field (the row-label column) is
    skipped.  Every remaining label that is not yet a key of ``samples`` is
    given the next free index (the current size of ``samples``) and is
    recorded in ``sourceFiles`` against the label chosen for this file.
    Labels already present in ``samples`` keep their existing index and
    source.

    Args:
        samples: dict of sample name -> column index; updated in place.
        sourceFiles: dict of sample name -> source label; updated in place.
        infile: path of the tab-delimited matrix file to read.
        labelThisFile: label to record as the source of new samples; when
            None, the value of ``infile`` is used instead.

    Returns:
        None.
    """
    labelToUse = infile if labelThisFile is None else labelThisFile

    # The context manager closes the file even if reading raises.
    with open(infile, 'r') as fin:
        # str methods replace the Python-2-only string.split/string.strip
        # helpers; the whitespace strip itself is kept unchanged.
        newSamples = fin.readline().strip().split('\t')[1:]

    for sample in newSamples:
        if sample not in samples:
            samples[sample] = len(samples)
            sourceFiles[sample] = labelToUse
    return
14 20
15 def process(genes, samples, dataMatrix, infile): 21 def process(genes, samples, dataMatrix, infile):
16 maxLength= len(samples) 22 maxLength= len(samples)
39 dataMatrix[x][y]= data[i] 45 dataMatrix[x][y]= data[i]
40 46
41 fin.close() 47 fin.close()
42 return 48 return
43 49
44 def outputMatrix(dataMatrix, samples, genes, outfile): 50
def outputSourceMatrix(sourceData, outputFileName):
    """Write the sample-to-source table as a two-column, tab-delimited file.

    The first line written is the header ``Sample<TAB>Source``; it is
    followed by one line per entry of ``sourceData``, in the dict's own
    iteration order.

    Args:
        sourceData: dict of sample name -> source label.
        outputFileName: path of the file to create or overwrite.

    Returns:
        None.
    """
    # The context manager closes the file even if a write raises.
    with open(outputFileName, "w") as fout:
        fout.write("Sample\tSource\n")
        for thisSample, thisSource in sourceData.items():
            fout.write("%s\t%s\n" % (thisSample, thisSource))
    return
58
59
60 def outputMergedMatrix(dataMatrix, samples, genes, outfile):
45 fout = open(outfile,"w") 61 fout = open(outfile,"w")
46 maxLength= len(samples) 62 maxLength= len(samples)
47 sList=[] 63 sList=[]
48 for i in range (0, maxLength): 64 for i in range (0, maxLength):
49 sList.append("") 65 sList.append("")
64 fout.write("\n") 80 fout.write("\n")
65 fout.close() 81 fout.close()
66 return 82 return
67 83
if __name__ == '__main__':
    #
    # The input files to this script are two matrices, in which columns
    # represent samples and rows represent genes or measurements.
    # There are two output files: outMergedData contains the input data merged
    # into a single matrix, and outSourceMatrix is a two-column matrix
    # indicating which file each sample (or column label) came from.  This
    # assumes that each sample came from at most one file.
    #
    parser = argparse.ArgumentParser()
    parser.add_argument("inFileA", type=str, help="First input file")
    parser.add_argument("inFileB", type=str, help="Second input file")
    parser.add_argument("outMergedData", type=str,
                        help="Filename for the merged dataset")
    parser.add_argument("outSourceMatrix", type=str,
                        help="Filename for a Nx2 matrix that indicates "
                             "the source file of each column")
    parser.add_argument("--aLabel", type=str, default=None,
                        help="User-friendly label for the first input file")
    parser.add_argument("--bLabel", type=str, default=None,
                        help="User-friendly label for the second input file")
    args = parser.parse_args()
    # Every path is taken from the parsed arguments; positional sys.argv
    # indexing is avoided because optional flags shift those positions.

    genes = {}
    samples = {}
    sourceFiles = {}
    dataMatrix = []

    # First pass: collect the column labels of both files so that every
    # sample has its column index before any data row is read.
    header(samples, sourceFiles, args.inFileA, args.aLabel)
    header(samples, sourceFiles, args.inFileB, args.bLabel)

    # Second pass: load the data rows of both files into the shared matrix.
    process(genes, samples, dataMatrix, args.inFileA)
    process(genes, samples, dataMatrix, args.inFileB)

    outputSourceMatrix(sourceFiles, args.outSourceMatrix)
    outputMergedMatrix(dataMatrix, samples, genes, args.outMergedData)