comparison mergeXenaMutation.py @ 18:15cb5a49cdbc

Uploaded
author melissacline
date Fri, 20 Mar 2015 18:08:54 -0400
parents
children 914bc8ee6222
comparison
equal deleted inserted replaced
6:2035405538b4 18:15cb5a49cdbc
1 #!/usr/bin/env python
2
3 import argparse
4 import string, os, sys
5
6 requiredCOLs = ["chr", "start","end","reference","alt","gene","effect"]
7
8 def headerError(filename, column, ferror):
9 ferror.write(filename +" does not have column " + column+"\n")
10 ferror.close()
11 sys.exit(1)
12
13 def findAnyValueInList (values, dataList):
14 for value in values:
15 for i in range(0,len(dataList)):
16 if value == dataList[i]:
17 return i
18 return -1
19
20 def header (infile, ferror):
21 fin= open(infile,'U')
22
23 columnDic ={}
24 #header
25 line = fin.readline()
26 fin.close()
27 if line [0]=="#":
28 line = line[1:-1]
29 data = string.split(line,"\t")
30
31 columnDic["chr"]= findAnyValueInList (["chr","chrom"], data)
32 if columnDic["chr"] ==-1:
33 headerError(infile, "chr", ferror)
34
35 columnDic["start"]= findAnyValueInList (["start","chrStart"], data)
36 if columnDic["start"] == -1:
37 headerError(infile, "start", ferror)
38
39 columnDic["end"]= findAnyValueInList (["end","chrEnd"], data)
40 if columnDic["end"] == -1:
41 headerError(infile, "end", ferror)
42
43 columnDic["alt"]= findAnyValueInList (["alt"], data)
44 if columnDic["alt"] == -1:
45 headerError(infile, "alt", ferror)
46
47 columnDic["reference"]= findAnyValueInList (["reference","ref"], data)
48 if columnDic["reference"] == -1:
49 headerError(infile, "reference", ferror)
50
51 columnDic["gene"]= findAnyValueInList (["gene"], data)
52 if columnDic["gene"] == -1:
53 headerError(infile, "gene", ferror)
54
55 columnDic["effect"]= findAnyValueInList (["effect"], data)
56 if columnDic["effect"] == -1:
57 headerError(infile, "effect", ferror)
58
59 requiredCols = columnDic.keys()
60 requiredColsPos = columnDic.values()
61 for i in range(1,len(data)):
62 if i not in requiredColsPos:
63 columnDic [data[i]]=i
64
65 return columnDic
66
67 def summarizeColumns(infiles, fileColumn, allCols, ferror):
68 for infile in inFiles:
69 columnDic = header (infile, ferror)
70 fileColumn [infile] = columnDic
71 for col in columnDic:
72 if col not in allCols:
73 allCols.append(col)
74 return
75
76 def outputHeader (requiredCOLs,allCols,fout):
77 fout.write("#sample")
78 for col in requiredCOLs:
79 fout.write("\t"+col)
80 for col in allCols:
81 if col not in requiredCOLs:
82 fout.write("\t"+col)
83 fout.write("\n")
84 fout.close()
85 return
86
87 def processAndOutput(infile,requiredCOLs,allCols,columnDic,fout):
88 fin = open(infile,'U')
89 fin.readline()
90 while 1:
91 line = fin.readline()[:-1]
92 if line =="":
93 break
94 data = string.split(line,'\t')
95 fout.write(data[0])
96 for col in requiredCOLs:
97 pos = columnDic[col]
98 fout.write("\t"+ data[pos])
99 for col in allCols:
100 if col not in requiredCOLs:
101 if col in columnDic:
102 pos = columnDic[col]
103 fout.write("\t"+ data[pos])
104 else:
105 fout.write("\t")
106 fout.write("\n")
107 fin.close()
108 return
109
110 def collectSource(inFile, label, sampleDic):
111 fin = open(inFile,'U')
112 fin.readline()
113 while 1:
114 line = fin.readline()[:-1]
115 if line =="":
116 break
117 sample = string.split(line,'\t')[0]
118 if sample not in sampleDic:
119 sampleDic[sample]=[]
120 if inFile not in sampleDic[sample]:
121 sampleDic[sample].append(label)
122 fin.close()
123 return
124
125 def outputSampleDic (sampleDic, outPhenotypeFile):
126 fout = open(outPhenotypeFile,'w')
127 fout.write("sample\tsource\n")
128 for sample in sampleDic:
129 source = sampleDic[sample]
130 source.sort()
131 fout.write(sample+"\t"+string.join(source,", ")+"\n")
132 fout.close()
133 return
134
135 if __name__ == '__main__' :
136 if len(sys.argv[:]) <6:
137 print "python mergeMultipleXenaMutation.py outputXenaMutation outputPhenotypeMatrix errorLog inputfile(s)"
138 print "this is merging data A+B=C for mutation by position type of data\n"
139 sys.exit(1)
140
141 #
142 # The input files to this script are two or more matrices, in which
143 # columns represent samples and rows represent genes or measurements.
144 # There are two output files: outMergedData contains the input data merged
145 # into a single matrix, and outSourceMatrix is a two-column matrix
146 # indicating which file each sample (or column label) came from. This
147 # assumes that each sample came from at most one file.
148 #
149 parser = argparse.ArgumentParser()
150 parser.add_argument("inFileA", type=str, help="First input file")
151 parser.add_argument("inFileB", type=str, help="Second input file")
152 parser.add_argument("outMergedData", type=str,
153 help="Filename for the merged dataset")
154 parser.add_argument("outSourceMatrix", type=str,
155 help="""Filename for a Nx2 matrix that indicates
156 the source file of each column""")
157 parser.add_argument("errorLog", type=str,
158 help="""Error log""")
159 parser.add_argument("--aLabel", type=str, default=None,
160 help="User-friendly label for the first input file")
161 parser.add_argument("--bLabel", type=str, default=None,
162 help="User-friendly label for the second input file")
163 args = parser.parse_args()
164
165
166 #inFiles = sys.argv[4:]
167 print inFiles
168 errofile = args.errorLog
169 outfile = args.outMergedData
170 print outfile
171 outPhenotypeFile = args.outSourceMatrix
172 print outPhenotypeFile
173
174 ferror = open(errofile,'w')
175
176 #get all the columns, build fileColumn dictionary
177 fileColumn={}
178 allCols =[]
179 summarizeColumns(inFiles, fileColumn, allCols, ferror)
180 ferror.close()
181
182 #output header line
183 fout = open(outfile,'w')
184 outputHeader (requiredCOLs,allCols,fout)
185
186 #process and output combined mutationXena file
187 fout = open(outfile,'a')
188
189 columnDic = fileColumn[args.inFileA]
190 processAndOutput(args.inFileA,requiredCOLs,allCols,columnDic,fout)
191 columnDic = fileColumn[args.inFileB]
192 processAndOutput(args.inFileB,requiredCOLs,allCols,columnDic,fout)
193 fout.close()
194
195 #collect sample from source information
196 sampleDic ={}
197 if args.aLabel is None:
198 collectSource(args.inFileA, args.inFileA, sampleDic)
199 else:
200 collectSource(args.inFileA, args.aLabel, sampleDic
201 if args.bLabel is None:
202 collectSource(args.inFileB, args.inFileB, sampleDic)
203 else:
204 collectSource(args.inFileB, args.bLabel, sampleDic
205
206
207 #output sample source information as phenotype matrix
208 outputSampleDic (sampleDic, outPhenotypeFile)
209
210
211
212