annotate mergeXenaMutation.py @ 19:371579dd9bc6

Uploaded
author melissacline
date Fri, 20 Mar 2015 18:09:15 -0400 (2015-03-20)
parents 15cb5a49cdbc
children 914bc8ee6222
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
1 #!/usr/bin/env python
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
2
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
3 import argparse
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
4 import string, os, sys
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
5
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
# Columns that every input mutation file must provide (header() aborts when
# one is missing).  In the merged output they are written in this order,
# immediately after the leading sample column.
requiredCOLs = [
    "chr",
    "start",
    "end",
    "reference",
    "alt",
    "gene",
    "effect",
]
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
7
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
def headerError(filename, column, ferror):
    """Log a missing-column error and terminate the script.

    Writes "<filename> does not have column <column>" followed by a newline
    to ``ferror``, closes ``ferror`` and exits the process with status 1.

    filename -- name of the input file whose header is incomplete
    column   -- name of the required column that was not found
    ferror   -- writable, closable error-log object
    """
    message = "".join([filename, " does not have column ", column, "\n"])
    ferror.write(message)
    ferror.close()
    sys.exit(1)
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
12
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
def findAnyValueInList(values, dataList):
    """Return the position in ``dataList`` of the first matching candidate.

    The candidates in ``values`` are tried in order; for the first candidate
    that occurs in ``dataList`` the index of its first occurrence is
    returned.  Returns -1 when none of the candidates is present.
    """
    for wanted in values:
        for position, entry in enumerate(dataList):
            if wanted == entry:
                return position
    return -1
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
19
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
def header(infile, ferror):
    """Read the header line of ``infile`` and map column names to positions.

    The first line of the file is split on tabs; a leading "#" is ignored.
    Each required column is looked up under its accepted spellings
    (e.g. "chr" or "chrom") and stored under its canonical name.  Every
    other column, except the first one, is stored under the name found in
    the file.

    infile -- path of the tab-delimited mutation file
    ferror -- open error log, handed to headerError() when a required
              column is missing (headerError() is responsible for
              reporting the problem and stopping the script)

    Returns a dict {column name: zero-based column index}.
    """
    # 'U' (universal newlines) is needed on Python 2 but was removed in
    # Python 3.11, where plain text mode already translates newlines.
    readMode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(infile, readMode) as fin:
        line = fin.readline()

    # Strip the line terminator unconditionally: otherwise the last column
    # name keeps its "\n" whenever the header does not start with "#".
    # startswith() also copes with an empty file (no IndexError).
    line = line.rstrip("\r\n")
    if line.startswith("#"):
        line = line[1:]
    data = line.split("\t")

    # Canonical name -> accepted spellings, checked in this order so the
    # first missing column reported is the same as before.
    requiredAliases = (
        ("chr", ["chr", "chrom"]),
        ("start", ["start", "chrStart"]),
        ("end", ["end", "chrEnd"]),
        ("alt", ["alt"]),
        ("reference", ["reference", "ref"]),
        ("gene", ["gene"]),
        ("effect", ["effect"]),
    )

    columnDic = {}
    for name, spellings in requiredAliases:
        columnDic[name] = findAnyValueInList(spellings, data)
        if columnDic[name] == -1:
            headerError(infile, name, ferror)

    # Snapshot the required positions before adding the optional columns.
    requiredColsPos = set(columnDic.values())
    # Position 0 is skipped on purpose: it is never registered as an
    # optional column.
    for i in range(1, len(data)):
        if i not in requiredColsPos:
            columnDic[data[i]] = i

    return columnDic
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
66
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
def summarizeColumns(infiles, fileColumn, allCols, ferror):
    """Collect the column layout of every input file.

    For each path in ``infiles`` the header is parsed with header(); the
    resulting column dictionary is stored in ``fileColumn`` under the file's
    path, and every column name not seen before is appended to ``allCols``.
    Both ``fileColumn`` and ``allCols`` are modified in place.

    infiles    -- iterable of input file paths
    fileColumn -- dict filled with {path: column dictionary}
    allCols    -- list extended with the union of all column names
    ferror     -- open error log, passed through to header()
    """
    # Iterate over the *parameter*; the previous code read an unrelated
    # global called ``inFiles`` and ignored the argument entirely.
    for infile in infiles:
        columnDic = header(infile, ferror)
        fileColumn[infile] = columnDic
        for col in columnDic:
            if col not in allCols:
                allCols.append(col)
    return
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
75
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
def outputHeader(requiredCOLs, allCols, fout):
    """Write the header line of the merged file and close ``fout``.

    The line starts with "#sample", followed by the required columns in the
    given order and then by every entry of ``allCols`` that is not a
    required column; fields are tab-separated.

    Note that ``fout`` is closed before returning, so the caller has to
    reopen the file to write anything after the header.
    """
    optionalCols = [name for name in allCols if name not in requiredCOLs]

    fout.write("#sample")
    for name in list(requiredCOLs) + optionalCols:
        fout.write("\t" + name)
    fout.write("\n")
    fout.close()
    return
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
86
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
def processAndOutput(infile, requiredCOLs, allCols, columnDic, fout):
    """Append the data rows of ``infile`` to the merged output.

    The first line of ``infile`` (its header) is skipped.  Each following
    row is written to ``fout`` as: the row's first field, the required
    columns in ``requiredCOLs`` order, then one field for every remaining
    entry of ``allCols`` -- the row's value when this file has that column
    (per ``columnDic``) and an empty field otherwise.

    infile       -- path of the tab-delimited input file
    requiredCOLs -- names of the required columns, in output order
    allCols      -- names of all columns of the merged file
    columnDic    -- {column name: index} for this particular file
    fout         -- writable output object; it is left open
    """
    # Resolve the output layout once instead of once per row; None marks a
    # column this file does not have.
    positions = [columnDic[col] for col in requiredCOLs]
    for col in allCols:
        if col not in requiredCOLs:
            positions.append(columnDic.get(col))

    # 'U' (universal newlines) is needed on Python 2 but was removed in
    # Python 3.11, where plain text mode already translates newlines.
    readMode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(infile, readMode) as fin:
        fin.readline()  # skip the header line
        for rawLine in fin:
            # Strip only the line terminator.  Slicing off the last
            # character unconditionally corrupted a final row that has no
            # trailing newline.
            line = rawLine.rstrip("\r\n")
            if line == "":
                # As before, the first empty line ends the data section.
                # NOTE(review): rows after a blank line are ignored --
                # confirm that this is intended.
                break
            data = line.split('\t')
            fout.write(data[0])
            for pos in positions:
                if pos is None:
                    fout.write("\t")
                else:
                    fout.write("\t" + data[pos])
            fout.write("\n")
    return
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
109
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
def collectSource(inFile, label, sampleDic):
    """Record which source each sample of ``inFile`` comes from.

    The first line of ``inFile`` (its header) is skipped.  For every
    following row the first tab-separated field is taken as the sample name
    and ``label`` is added to that sample's list in ``sampleDic`` unless it
    is already there.  ``sampleDic`` is modified in place.

    inFile    -- path of the tab-delimited input file
    label     -- text identifying this file in the source matrix
    sampleDic -- dict {sample name: list of labels}
    """
    # 'U' (universal newlines) is needed on Python 2 but was removed in
    # Python 3.11, where plain text mode already translates newlines.
    readMode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(inFile, readMode) as fin:
        fin.readline()  # skip the header line
        for rawLine in fin:
            # Strip only the line terminator (not blindly the last
            # character) so a final row without newline stays intact.
            line = rawLine.rstrip("\r\n")
            if line == "":
                # As before, the first empty line ends the data section.
                break
            sample = line.split('\t')[0]
            labels = sampleDic.setdefault(sample, [])
            # Compare against the label that is actually stored.  The old
            # test looked for ``inFile``, so a custom label was appended
            # once per row of the sample.
            if label not in labels:
                labels.append(label)
    return
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
124
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
def outputSampleDic(sampleDic, outPhenotypeFile):
    """Write the sample-to-source matrix.

    Creates (or overwrites) ``outPhenotypeFile`` with a "sample<TAB>source"
    header line followed by one line per sample: the sample name, a tab,
    and the sample's labels sorted and joined with ", ".

    sampleDic        -- dict {sample name: list of source labels}
    outPhenotypeFile -- path of the file to write
    """
    with open(outPhenotypeFile, 'w') as fout:
        fout.write("sample\tsource\n")
        for sample in sampleDic:
            # sorted() leaves the caller's lists untouched, and str.join
            # replaces string.join, which no longer exists in Python 3.
            sources = sorted(sampleDic[sample])
            fout.write(sample + "\t" + ", ".join(sources) + "\n")
    return
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
134
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
if __name__ == '__main__':
    #
    # Merge two Xena "mutation by position" files (A + B = C).
    #
    # Each input is a tab-delimited file with one header line.  The first
    # column is copied to the "sample" column of the output, and the header
    # must name the columns listed in requiredCOLs (a few alternative
    # spellings are accepted, see header()).
    #
    # Two files are written: outMergedData holds the rows of both inputs
    # under one combined header, and outSourceMatrix is a two-column matrix
    # giving, for every sample, the label(s) of the input file(s) in which
    # it occurs.  A missing required column is reported in errorLog.
    #
    # Argument checking and the usage/--help text are left to argparse; the
    # former hand-written check described a different argument order.
    parser = argparse.ArgumentParser(
        description="Merge two Xena mutation-by-position files (A+B=C).")
    parser.add_argument("inFileA", type=str, help="First input file")
    parser.add_argument("inFileB", type=str, help="Second input file")
    parser.add_argument("outMergedData", type=str,
                        help="Filename for the merged dataset")
    parser.add_argument("outSourceMatrix", type=str,
                        help="""Filename for a Nx2 matrix that indicates
                        the source file of each column""")
    parser.add_argument("errorLog", type=str,
                        help="""Error log""")
    parser.add_argument("--aLabel", type=str, default=None,
                        help="User-friendly label for the first input file")
    parser.add_argument("--bLabel", type=str, default=None,
                        help="User-friendly label for the second input file")
    args = parser.parse_args()

    # Module-level list of the input files, in merge order.  It was used
    # below without ever being assigned.
    inFiles = [args.inFileA, args.inFileB]
    print(inFiles)
    errorLogFile = args.errorLog
    outfile = args.outMergedData
    print(outfile)
    outPhenotypeFile = args.outSourceMatrix
    print(outPhenotypeFile)

    # get all the columns, build fileColumn dictionary
    ferror = open(errorLogFile, 'w')
    fileColumn = {}
    allCols = []
    summarizeColumns(inFiles, fileColumn, allCols, ferror)
    ferror.close()

    # output header line
    fout = open(outfile, 'w')
    outputHeader(requiredCOLs, allCols, fout)

    # process and output combined mutationXena file; outputHeader() closes
    # the handle it is given, hence the reopen in append mode
    fout = open(outfile, 'a')
    try:
        for inFile in inFiles:
            processAndOutput(inFile, requiredCOLs, allCols,
                             fileColumn[inFile], fout)
    finally:
        fout.close()

    # collect sample from source information; a file is labelled with its
    # own name unless a label was given on the command line
    sampleDic = {}
    aLabel = args.inFileA if args.aLabel is None else args.aLabel
    bLabel = args.inFileB if args.bLabel is None else args.bLabel
    collectSource(args.inFileA, aLabel, sampleDic)
    collectSource(args.inFileB, bLabel, sampleDic)

    # output sample source information as phenotype matrix
    outputSampleDic(sampleDic, outPhenotypeFile)
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
209
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
210
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
211
15cb5a49cdbc Uploaded
melissacline
parents:
diff changeset
212