Mercurial > repos > melissacline > ucsc_cancer_utilities
comparison mergeGenomicMatrixFiles.py @ 14:d0b8c8eee9d5
Committing changes that hadn't made it in yet, merge hell
author | melissacline |
---|---|
date | Tue, 10 Mar 2015 19:32:14 -0700 |
parents | 30aab34424a9 |
children | 1d83dbbee373 |
comparison
equal
deleted
inserted
replaced
13:dd93e7d1bf01 | 14:d0b8c8eee9d5 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 import argparse | |
3 import string,os,sys | 4 import string,os,sys |
4 | 5 |
5 def header (samples, infile): | 6 def header (samples, sourceFiles, infile, labelThisFile): |
6 fin= open(infile,'r') | 7 if labelThisFile == None: |
8 labelToUse = infile | |
9 else: | |
10 labelToUse = labelThisFile | |
11 fin= open(infile, 'U') | |
7 #header, samples | 12 #header, samples |
8 newSamples = string.split(string.strip(fin.readline()),'\t')[1:] | 13 newSamples = string.split(string.strip(fin.readline()),'\t')[1:] |
9 for sample in newSamples: | 14 for sample in newSamples: |
10 if sample not in samples: | 15 if sample not in samples: |
11 samples[sample]= len(samples) | 16 samples[sample]= len(samples) |
17 sourceFiles[sample] = labelToUse | |
12 fin.close() | 18 fin.close() |
13 return | 19 return |
14 | 20 |
15 def process(genes, samples, dataMatrix, infile): | 21 def process(genes, samples, dataMatrix, infile): |
16 maxLength= len(samples) | 22 maxLength= len(samples) |
17 | 23 |
18 fin= open(infile,'r') | 24 fin= open(infile,'U') |
19 #header | 25 #header |
20 newSamples = string.split(string.strip(fin.readline()),'\t') | 26 newSamples = string.split(string.strip(fin.readline()),'\t') |
21 | 27 |
22 while 1: | 28 while 1: |
23 line = fin.readline()[:-1] | 29 line = fin.readline()[:-1] |
39 dataMatrix[x][y]= data[i] | 45 dataMatrix[x][y]= data[i] |
40 | 46 |
41 fin.close() | 47 fin.close() |
42 return | 48 return |
43 | 49 |
44 def outputMatrix(dataMatrix, samples, genes, outfile): | 50 |
51 def outputSourceMatrix(sourceData, outputFileName): | |
52 fout = open(outputFileName, "w") | |
53 fout.write("Sample\tSource\n") | |
54 for thisSample in sourceData.keys(): | |
55 fout.write("%s\t%s\n" % (thisSample, sourceData[thisSample])) | |
56 fout.close() | |
57 return | |
58 | |
59 | |
60 def outputMergedMatrix(dataMatrix, samples, genes, outfile): | |
45 fout = open(outfile,"w") | 61 fout = open(outfile,"w") |
46 maxLength= len(samples) | 62 maxLength= len(samples) |
47 sList=[] | 63 sList=[] |
48 for i in range (0, maxLength): | 64 for i in range (0, maxLength): |
49 sList.append("") | 65 sList.append("") |
64 fout.write("\n") | 80 fout.write("\n") |
65 fout.close() | 81 fout.close() |
66 return | 82 return |
67 | 83 |
68 if __name__ == '__main__' : | 84 if __name__ == '__main__' : |
69 if len(sys.argv[:]) <4: | 85 # |
70 print "python mergeFilesByColumn.py output inputfile(s)" | 86 # The input files to this script are two or more matrices, in which |
71 print "**********memory intensive, not for very genomic data with hugo number of probes" | 87 # columns represent samples and rows represent genes or measurements. |
72 print "this is merging data A+B=C\n" | 88 # There are two output files: outMergedData contains the input data merged |
73 sys.exit() | 89 # into a single matrix, and outSourceMatrix is a two-column matrix |
74 | 90 # indicating which file each sample (or column label) came from. This |
75 inFiles = sys.argv[2:] | 91 # assumes that each sample came from at most one file. |
76 outfile = sys.argv[1] | 92 # |
93 parser = argparse.ArgumentParser() | |
94 parser.add_argument("inFileA", type=str, help="First input file") | |
95 parser.add_argument("inFileB", type=str, help="Second input file") | |
96 parser.add_argument("outMergedData", type=str, | |
97 help="Filename for the merged dataset") | |
98 parser.add_argument("outSourceMatrix", type=str, | |
99 help="""Filename for a Nx2 matrix that indicates | |
100 the source file of each column""") | |
101 parser.add_argument("--aLabel", type=str, default=None, | |
102 help="User-friendly label for the first input file") | |
103 parser.add_argument("--bLabel", type=str, default=None, | |
104 help="User-friendly label for the second input file") | |
105 args = parser.parse_args() | |
77 | 106 |
78 genes={} | 107 genes={} |
79 samples={} | 108 samples={} |
109 sourceFiles = {} | |
80 dataMatrix=[] | 110 dataMatrix=[] |
81 | 111 |
82 for infile in inFiles: | 112 header(samples, sourceFiles, args.inFileA, args.aLabel) |
83 header (samples, infile) | 113 header(samples, sourceFiles, args.inFileB, args.bLabel) |
84 | 114 |
85 for infile in inFiles: | 115 process(genes, samples, dataMatrix, args.inFileA) |
86 process(genes, samples, dataMatrix, infile) | 116 process(genes, samples, dataMatrix, args.inFileB) |
87 | 117 |
88 outputMatrix(dataMatrix, samples, genes, outfile) | 118 outputSourceMatrix(sourceFiles, args.outSourceMatrix) |
119 outputMergedMatrix(dataMatrix, samples, genes, args.outMergedData) |