annotate mergeGenomicMatrixFiles.py @ 46:ebf3bc09c383

add snpEff code
author jingchunzhu
date Thu, 13 Aug 2015 21:49:03 -0700
parents eb5acf81e609
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
d0674221a6ae Added a proper first line
melissacline
parents: 3
diff changeset
1 #!/usr/bin/env python
d0674221a6ae Added a proper first line
melissacline
parents: 3
diff changeset
2
7
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
3 import argparse
43
eb5acf81e609 improve messages
jingchunzhu
parents: 24
diff changeset
4 import string,os,sys,json
3
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
5
7
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
def header(samples, sourceFiles, infile, labelThisFile):
    """Register the sample columns named in the header line of a matrix file.

    The first line of ``infile`` is split on tabs.  Its first field (the
    row-label column) is skipped; every remaining field is a sample name.

    Args:
        samples: dict mapping sample name -> column index.  Updated in
            place: each previously unseen sample gets the next free index
            (``len(samples)`` at the time it is added).
        sourceFiles: dict mapping sample name -> source label.  Updated in
            place for each previously unseen sample.
        infile: path of the tab-separated matrix file to read.
        labelThisFile: label recorded in ``sourceFiles`` for this file's new
            samples; when ``None`` the path ``infile`` is used instead.

    Returns:
        None.  A sample already present in ``samples`` is left untouched,
        so it keeps its existing index and source label.
    """
    labelToUse = infile if labelThisFile is None else labelThisFile

    # 'U' requests universal newlines on Python 2.  Python 3 text mode does
    # that by default and rejects 'U' from 3.11 on.
    readMode = 'U' if sys.version_info[0] < 3 else 'r'
    with open(infile, readMode) as fin:
        # Strip trailing whitespace only: stripping the left side would eat
        # the tab after an empty corner cell and shift every sample name
        # one column to the left.
        newSamples = fin.readline().rstrip().split('\t')[1:]

    for sample in newSamples:
        if sample not in samples:
            samples[sample] = len(samples)
            sourceFiles[sample] = labelToUse
    return
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
20
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
def process(genes, samples, dataMatrix, infile):
    """Copy the values of one matrix file into the merged data matrix.

    The first line of ``infile`` is the tab-separated header; every later
    non-empty line is a row whose first field is the gene (row) identifier
    and whose remaining fields are values aligned with the header columns.

    Args:
        genes: dict mapping gene identifier -> row index in ``dataMatrix``.
            Updated in place; an unseen gene gets the next free index.
        samples: dict mapping sample name -> column index.  Only read here;
            it must already contain every sample named in the header.
        dataMatrix: list of rows (lists of strings), updated in place.  A
            row created here has ``len(samples)`` cells initialised to "".
        infile: path of the tab-separated matrix file to read.

    Returns:
        None.

    Raises:
        KeyError: if a header column is not present in ``samples``.
        IndexError: if a data row has more fields than the header.
    """
    maxLength = len(samples)

    # 'U' requests universal newlines on Python 2.  Python 3 text mode does
    # that by default and rejects 'U' from 3.11 on.
    readMode = 'U' if sys.version_info[0] < 3 else 'r'
    with open(infile, readMode) as fin:
        # Strip trailing whitespace only, so that an empty corner cell
        # (header starting with a tab) keeps the columns aligned with the
        # data rows.
        newSamples = fin.readline().rstrip().split('\t')

        for rawLine in fin:
            # Drop just the line terminator.  Slicing off the last character
            # unconditionally would corrupt the final value of a file that
            # does not end with a newline.
            line = rawLine.rstrip('\n')
            if line == "":
                # A blank line is not end-of-file; skip it and keep reading.
                continue

            data = line.split('\t')
            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * maxLength)

            row = dataMatrix[genes[gene]]
            for i in range(1, len(data)):
                row[samples[newSamples[i]]] = data[i]
    return
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
49
7
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
50
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
def outputSourceMatrix(sourceData, outputFileName):
    """Write the two-column sample/source table.

    Args:
        sourceData: dict mapping sample name -> source label.
        outputFileName: path of the file to create or overwrite.

    The file starts with the header line "Sample<TAB>Source", followed by
    one "<sample><TAB><source>" line per entry, in the dict's iteration
    order.

    Returns:
        None.
    """
    # The context manager closes the file even if a write fails.
    with open(outputFileName, "w") as fout:
        fout.write("Sample\tSource\n")
        for thisSample, source in sourceData.items():
            fout.write("%s\t%s\n" % (thisSample, source))
    return
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
58
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
59
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
def outputMergedMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix as a tab-separated file.

    Args:
        dataMatrix: list of rows; ``dataMatrix[r][c]`` is the string value
            for the gene with row index ``r`` and the sample with column
            index ``c``.
        samples: dict mapping sample name -> column index.
        genes: dict mapping gene identifier -> row index.
        outfile: path of the file to create or overwrite.

    The first line is "sample" followed by the sample names in column-index
    order.  Each later line is a gene identifier followed by its values in
    the same column order.  Rows are written in row-index order, so the
    output does not depend on dict iteration order.

    Returns:
        None.
    """
    # Invert the name -> index mapping into a list ordered by column index.
    sList = [""] * len(samples)
    for sample, pos in samples.items():
        sList[pos] = sample

    # Resolve each output column once instead of once per cell.
    columns = [samples[sample] for sample in sList]

    # The context manager closes the file even if a write fails.
    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + sList) + "\n")
        for gene in sorted(genes, key=genes.get):
            row = dataMatrix[genes[gene]]
            fout.write("\t".join([gene] + [row[c] for c in columns]) + "\n")
    return
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
83
43
eb5acf81e609 improve messages
jingchunzhu
parents: 24
diff changeset
def outputMergedMatrixJson(output):
    """Write the JSON metadata file describing a merged matrix.

    Args:
        output: path of the file to create or overwrite.

    The file contains a single JSON object: {"type": "genomicMatrix"}.

    Returns:
        None.
    """
    j = {"type": "genomicMatrix"}
    # The context manager closes the file even if serialisation fails.
    with open(output, 'w') as fout:
        json.dump(j, fout)
eb5acf81e609 improve messages
jingchunzhu
parents: 24
diff changeset
90
3
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
if __name__ == '__main__':
    #
    # Command-line entry point.  Takes two input matrices (columns are
    # samples, rows are genes or measurements) and writes two files:
    #   outMergedData   - the two inputs merged into a single matrix
    #   outSourceMatrix - a two-column table giving, for each sample
    #                     (column label), the file or label it came from
    # Each sample is assumed to come from at most one input file.
    #
    argParser = argparse.ArgumentParser()
    argParser.add_argument("inFileA", type=str, help="First input file")
    argParser.add_argument("inFileB", type=str, help="Second input file")
    argParser.add_argument("outMergedData", type=str,
                           help="Filename for the merged dataset")
    argParser.add_argument("outSourceMatrix", type=str,
                           help="""Filename for a Nx2 matrix that indicates
                           the source file of each column""")
    argParser.add_argument("--aLabel", type=str, default=None,
                           help="User-friendly label for the first input file")
    argParser.add_argument("--bLabel", type=str, default=None,
                           help="User-friendly label for the second input file")
    options = argParser.parse_args()

    inputs = [(options.inFileA, options.aLabel),
              (options.inFileB, options.bLabel)]

    geneIndex = {}
    sampleIndex = {}
    sampleSource = {}
    merged = []

    # First pass: collect every sample name, so each matrix row is created
    # with its full width before any values are filled in.
    for path, label in inputs:
        header(sampleIndex, sampleSource, path, label)

    # Second pass: copy the values of each input into the merged matrix.
    for path, label in inputs:
        process(geneIndex, sampleIndex, merged, path)

    outputSourceMatrix(sampleSource, options.outSourceMatrix)
    outputMergedMatrix(merged, sampleIndex, geneIndex, options.outMergedData)
43
eb5acf81e609 improve messages
jingchunzhu
parents: 24
diff changeset
127