18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import argparse
|
|
4 import string, os, sys
|
|
5
|
|
6 requiredCOLs = ["chr", "start","end","reference","alt","gene","effect"]
|
|
7
|
|
8 def headerError(filename, column, ferror):
|
|
9 ferror.write(filename +" does not have column " + column+"\n")
|
|
10 ferror.close()
|
|
11 sys.exit(1)
|
|
12
|
|
13 def findAnyValueInList (values, dataList):
|
|
14 for value in values:
|
|
15 for i in range(0,len(dataList)):
|
|
16 if value == dataList[i]:
|
|
17 return i
|
|
18 return -1
|
|
19
|
|
20 def header (infile, ferror):
|
|
21 fin= open(infile,'U')
|
|
22
|
|
23 columnDic ={}
|
|
24 #header
|
|
25 line = fin.readline()
|
|
26 fin.close()
|
|
27 if line [0]=="#":
|
|
28 line = line[1:-1]
|
|
29 data = string.split(line,"\t")
|
|
30
|
|
31 columnDic["chr"]= findAnyValueInList (["chr","chrom"], data)
|
|
32 if columnDic["chr"] ==-1:
|
|
33 headerError(infile, "chr", ferror)
|
|
34
|
|
35 columnDic["start"]= findAnyValueInList (["start","chrStart"], data)
|
|
36 if columnDic["start"] == -1:
|
|
37 headerError(infile, "start", ferror)
|
|
38
|
|
39 columnDic["end"]= findAnyValueInList (["end","chrEnd"], data)
|
|
40 if columnDic["end"] == -1:
|
|
41 headerError(infile, "end", ferror)
|
|
42
|
|
43 columnDic["alt"]= findAnyValueInList (["alt"], data)
|
|
44 if columnDic["alt"] == -1:
|
|
45 headerError(infile, "alt", ferror)
|
|
46
|
|
47 columnDic["reference"]= findAnyValueInList (["reference","ref"], data)
|
|
48 if columnDic["reference"] == -1:
|
|
49 headerError(infile, "reference", ferror)
|
|
50
|
|
51 columnDic["gene"]= findAnyValueInList (["gene"], data)
|
|
52 if columnDic["gene"] == -1:
|
|
53 headerError(infile, "gene", ferror)
|
|
54
|
|
55 columnDic["effect"]= findAnyValueInList (["effect"], data)
|
|
56 if columnDic["effect"] == -1:
|
|
57 headerError(infile, "effect", ferror)
|
|
58
|
|
59 requiredCols = columnDic.keys()
|
|
60 requiredColsPos = columnDic.values()
|
|
61 for i in range(1,len(data)):
|
|
62 if i not in requiredColsPos:
|
|
63 columnDic [data[i]]=i
|
|
64
|
|
65 return columnDic
|
|
66
|
20
|
67 def summarizeColumns(inFiles, fileColumn, allCols, ferror):
|
18
|
68 for infile in inFiles:
|
|
69 columnDic = header (infile, ferror)
|
|
70 fileColumn [infile] = columnDic
|
|
71 for col in columnDic:
|
|
72 if col not in allCols:
|
|
73 allCols.append(col)
|
|
74 return
|
|
75
|
|
76 def outputHeader (requiredCOLs,allCols,fout):
|
|
77 fout.write("#sample")
|
|
78 for col in requiredCOLs:
|
|
79 fout.write("\t"+col)
|
|
80 for col in allCols:
|
|
81 if col not in requiredCOLs:
|
|
82 fout.write("\t"+col)
|
|
83 fout.write("\n")
|
|
84 fout.close()
|
|
85 return
|
|
86
|
|
87 def processAndOutput(infile,requiredCOLs,allCols,columnDic,fout):
|
|
88 fin = open(infile,'U')
|
|
89 fin.readline()
|
|
90 while 1:
|
|
91 line = fin.readline()[:-1]
|
|
92 if line =="":
|
|
93 break
|
|
94 data = string.split(line,'\t')
|
|
95 fout.write(data[0])
|
|
96 for col in requiredCOLs:
|
|
97 pos = columnDic[col]
|
|
98 fout.write("\t"+ data[pos])
|
|
99 for col in allCols:
|
|
100 if col not in requiredCOLs:
|
|
101 if col in columnDic:
|
|
102 pos = columnDic[col]
|
|
103 fout.write("\t"+ data[pos])
|
|
104 else:
|
|
105 fout.write("\t")
|
|
106 fout.write("\n")
|
|
107 fin.close()
|
|
108 return
|
|
109
|
|
110 def collectSource(inFile, label, sampleDic):
|
|
111 fin = open(inFile,'U')
|
|
112 fin.readline()
|
|
113 while 1:
|
|
114 line = fin.readline()[:-1]
|
|
115 if line =="":
|
|
116 break
|
|
117 sample = string.split(line,'\t')[0]
|
|
118 if sample not in sampleDic:
|
|
119 sampleDic[sample]=[]
|
|
120 if inFile not in sampleDic[sample]:
|
|
121 sampleDic[sample].append(label)
|
|
122 fin.close()
|
|
123 return
|
|
124
|
|
125 def outputSampleDic (sampleDic, outPhenotypeFile):
|
|
126 fout = open(outPhenotypeFile,'w')
|
|
127 fout.write("sample\tsource\n")
|
|
128 for sample in sampleDic:
|
|
129 source = sampleDic[sample]
|
|
130 source.sort()
|
|
131 fout.write(sample+"\t"+string.join(source,", ")+"\n")
|
|
132 fout.close()
|
|
133 return
|
|
134
|
|
135 if __name__ == '__main__' :
|
|
136 if len(sys.argv[:]) <6:
|
|
137 print "python mergeMultipleXenaMutation.py outputXenaMutation outputPhenotypeMatrix errorLog inputfile(s)"
|
|
138 print "this is merging data A+B=C for mutation by position type of data\n"
|
|
139 sys.exit(1)
|
|
140
|
20
|
141 #
|
|
142 # The input files to this script are two or more matrices, in which
|
|
143 # columns represent samples and rows represent genes or measurements.
|
|
144 # There are two output files: outMergedData contains the input data merged
|
|
145 # into a single matrix, and outSourceMatrix is a two-column matrix
|
|
146 # indicating which file each sample (or column label) came from. This
|
|
147 # assumes that each sample came from at most one file.
|
|
148 #
|
|
149 parser = argparse.ArgumentParser()
|
|
150 parser.add_argument("outMergedData", type=str,
|
|
151 help="Filename for the merged dataset")
|
|
152 parser.add_argument("outSourceMatrix", type=str,
|
|
153 help="""Filename for a Nx2 matrix that indicates
|
18
|
154 the source file of each column""")
|
20
|
155 parser.add_argument("errorLog", type=str,
|
|
156 help="""Error log""")
|
|
157 parser.add_argument("inFileA", type=str, help="First input file")
|
|
158 parser.add_argument("inFileB", type=str, help="Second input file")
|
|
159 parser.add_argument("--aLabel", type=str, default=None,
|
|
160 help="User-friendly label for the first input file")
|
|
161 parser.add_argument("--bLabel", type=str, default=None,
|
|
162 help="User-friendly label for the second input file")
|
|
163 args = parser.parse_args()
|
18
|
164
|
|
165
|
20
|
166 #inFiles = sys.argv[4:]
|
|
167 inFiles = list()
|
|
168 inFiles.append(args.inFileA)
|
|
169 inFiles.append(args.inFileB)
|
|
170 errofile = args.errorLog
|
|
171 outfile = args.outMergedData
|
|
172 #print outfile
|
|
173 outPhenotypeFile = args.outSourceMatrix
|
|
174 #print outPhenotypeFile
|
18
|
175
|
20
|
176 ferror = open(errofile,'w')
|
|
177
|
|
178 #get all the columns, build fileColumn dictionary
|
|
179 fileColumn={}
|
|
180 allCols =[]
|
|
181 summarizeColumns(inFiles, fileColumn, allCols, ferror)
|
|
182 ferror.close()
|
18
|
183
|
20
|
184 #output header line
|
|
185 fout = open(outfile,'w')
|
|
186 outputHeader (requiredCOLs,allCols,fout)
|
|
187
|
|
188 #process and output combined mutationXena file
|
|
189 fout = open(outfile,'a')
|
18
|
190
|
20
|
191 columnDic = fileColumn[args.inFileA]
|
|
192 processAndOutput(args.inFileA,requiredCOLs,allCols,columnDic,fout)
|
|
193 columnDic = fileColumn[args.inFileB]
|
|
194 processAndOutput(args.inFileB,requiredCOLs,allCols,columnDic,fout)
|
|
195 fout.close()
|
|
196
|
|
197 #collect sample from source information
|
|
198 sampleDic ={}
|
|
199 if args.aLabel is None:
|
|
200 collectSource(args.inFileA, args.inFileA, sampleDic)
|
|
201 else:
|
|
202 collectSource(args.inFileA, args.aLabel, sampleDic)
|
|
203 if args.bLabel is None:
|
|
204 collectSource(args.inFileB, args.inFileB, sampleDic)
|
|
205 else:
|
|
206 collectSource(args.inFileB, args.bLabel, sampleDic)
|
18
|
207
|
|
208
|
20
|
209 #output sample source information as phenotype matrix
|
|
210 outputSampleDic (sampleDic, outPhenotypeFile)
|
18
|
211
|
|
212
|
|
213
|
|
214
|