comparison mergeGenomicMatrixFiles.py @ 14:d0b8c8eee9d5

Committing changes that hadn't made it in yet, merge hell
author melissacline
date Tue, 10 Mar 2015 19:32:14 -0700
parents 30aab34424a9
children 1d83dbbee373
comparison
equal deleted inserted replaced
13:dd93e7d1bf01 14:d0b8c8eee9d5
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import argparse
3 import string,os,sys 4 import string,os,sys
4 5
5 def header (samples, infile): 6 def header (samples, sourceFiles, infile, labelThisFile):
6 fin= open(infile,'r') 7 if labelThisFile == None:
8 labelToUse = infile
9 else:
10 labelToUse = labelThisFile
11 fin= open(infile, 'U')
7 #header, samples 12 #header, samples
8 newSamples = string.split(string.strip(fin.readline()),'\t')[1:] 13 newSamples = string.split(string.strip(fin.readline()),'\t')[1:]
9 for sample in newSamples: 14 for sample in newSamples:
10 if sample not in samples: 15 if sample not in samples:
11 samples[sample]= len(samples) 16 samples[sample]= len(samples)
17 sourceFiles[sample] = labelToUse
12 fin.close() 18 fin.close()
13 return 19 return
14 20
15 def process(genes, samples, dataMatrix, infile): 21 def process(genes, samples, dataMatrix, infile):
16 maxLength= len(samples) 22 maxLength= len(samples)
17 23
18 fin= open(infile,'r') 24 fin= open(infile,'U')
19 #header 25 #header
20 newSamples = string.split(string.strip(fin.readline()),'\t') 26 newSamples = string.split(string.strip(fin.readline()),'\t')
21 27
22 while 1: 28 while 1:
23 line = fin.readline()[:-1] 29 line = fin.readline()[:-1]
39 dataMatrix[x][y]= data[i] 45 dataMatrix[x][y]= data[i]
40 46
41 fin.close() 47 fin.close()
42 return 48 return
43 49
44 def outputMatrix(dataMatrix, samples, genes, outfile): 50
51 def outputSourceMatrix(sourceData, outputFileName):
52 fout = open(outputFileName, "w")
53 fout.write("Sample\tSource\n")
54 for thisSample in sourceData.keys():
55 fout.write("%s\t%s\n" % (thisSample, sourceData[thisSample]))
56 fout.close()
57 return
58
59
60 def outputMergedMatrix(dataMatrix, samples, genes, outfile):
45 fout = open(outfile,"w") 61 fout = open(outfile,"w")
46 maxLength= len(samples) 62 maxLength= len(samples)
47 sList=[] 63 sList=[]
48 for i in range (0, maxLength): 64 for i in range (0, maxLength):
49 sList.append("") 65 sList.append("")
64 fout.write("\n") 80 fout.write("\n")
65 fout.close() 81 fout.close()
66 return 82 return
67 83
68 if __name__ == '__main__' : 84 if __name__ == '__main__' :
69 if len(sys.argv[:]) <4: 85 #
70 print "python mergeFilesByColumn.py output inputfile(s)" 86 # The input files to this script are two or more matrices, in which
71 print "**********memory intensive, not for very genomic data with hugo number of probes" 87 # columns represent samples and rows represent genes or measurements.
72 print "this is merging data A+B=C\n" 88 # There are two output files: outMergedData contains the input data merged
73 sys.exit() 89 # into a single matrix, and outSourceMatrix is a two-column matrix
74 90 # indicating which file each sample (or column label) came from. This
75 inFiles = sys.argv[2:] 91 # assumes that each sample came from at most one file.
76 outfile = sys.argv[1] 92 #
93 parser = argparse.ArgumentParser()
94 parser.add_argument("inFileA", type=str, help="First input file")
95 parser.add_argument("inFileB", type=str, help="Second input file")
96 parser.add_argument("outMergedData", type=str,
97 help="Filename for the merged dataset")
98 parser.add_argument("outSourceMatrix", type=str,
99 help="""Filename for a Nx2 matrix that indicates
100 the source file of each column""")
101 parser.add_argument("--aLabel", type=str, default=None,
102 help="User-friendly label for the first input file")
103 parser.add_argument("--bLabel", type=str, default=None,
104 help="User-friendly label for the second input file")
105 args = parser.parse_args()
77 106
78 genes={} 107 genes={}
79 samples={} 108 samples={}
109 sourceFiles = {}
80 dataMatrix=[] 110 dataMatrix=[]
81 111
82 for infile in inFiles: 112 header(samples, sourceFiles, args.inFileA, args.aLabel)
83 header (samples, infile) 113 header(samples, sourceFiles, args.inFileB, args.bLabel)
84 114
85 for infile in inFiles: 115 process(genes, samples, dataMatrix, args.inFileA)
86 process(genes, samples, dataMatrix, infile) 116 process(genes, samples, dataMatrix, args.inFileB)
87 117
88 outputMatrix(dataMatrix, samples, genes, outfile) 118 outputSourceMatrix(sourceFiles, args.outSourceMatrix)
119 outputMergedMatrix(dataMatrix, samples, genes, args.outMergedData)