annotate intersect.py @ 0:f5d5eed73180 draft default tip

Imported from capsule None
author devteam
date Thu, 23 Jan 2014 12:31:34 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
1 #!/usr/bin/python
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
2
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
3 import os.path
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
4 import sys
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
5 import optparse
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
6
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
7 import bedClass
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
8 from bedClass import *
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
9
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
10 import vcfClass
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
11 from vcfClass import *
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
12
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
13 import tools
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
14 from tools import *
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
15
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
16 if __name__ == "__main__":
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
17 main()
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
18
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
19 # Intersect two vcf files. It is assumed that the two files are
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
20 # sorted by genomic coordinates and the reference sequences are
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
21 # in the same order.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
22 def intersectVcf(v1, v2, priority, outputFile):
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
23 success1 = v1.getRecord()
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
24 success2 = v2.getRecord()
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
25 currentReferenceSequence = v1.referenceSequence
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
26
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
27 # As soon as the end of either file is reached, there can be no
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
28 # more intersecting SNPs, so terminate.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
29 while success1 and success2:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
30 if v1.referenceSequence == v2.referenceSequence and v1.referenceSequence == currentReferenceSequence:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
31 if v1.position == v2.position:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
32 writeVcfRecord(priority, v1, v2, outputFile)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
33 success1 = v1.getRecord()
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
34 success2 = v2.getRecord()
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
35 elif v2.position > v1.position: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
36 elif v1.position > v2.position: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
37 else:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
38 if v1.referenceSequence == currentReferenceSequence: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
39 elif v2.referenceSequence == currentReferenceSequence: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
40
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
41 # If the last record for a reference sequence is the same for both vcf
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
42 # files, they will both have referenceSequences different from the
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
43 # current reference sequence. Change the reference sequence to reflect
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
44 # this and proceed.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
45 else:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
46 if v1.referenceSequence != v2.referenceSequence:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
47 print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different."
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
48 print >> sys.stderr, "Check that both files contain records for the following reference sequences:"
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
49 print >> sys.stderr, "\t", v1.referenceSequence, " and ", v2.referenceSequence
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
50 exit(1)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
51 currentReferenceSequence = v1.referenceSequence
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
52
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
53 # Intersect a vcf file and a bed file. It is assumed that the
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
54 # two files are sorted by genomic coordinates and the reference
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
55 # sequences are in the same order.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
56 def intersectVcfBed(v, b, outputFile):
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
57 successb = b.getRecord()
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
58 successv = v.getRecord()
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
59 currentReferenceSequence = v.referenceSequence
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
60
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
61 # As soon as the end of the first file is reached, there are no
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
62 # more intersections and the program can terminate.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
63 while successv:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
64 if v.referenceSequence == b.referenceSequence:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
65 if v.position < b.start: successv = v.parseVcf(b.referenceSequence, b.start, False, None)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
66 elif v.position > b.end: successb = b.parseBed(v.referenceSequence, v.position)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
67 else:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
68 outputFile.write(v.record)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
69 successv = v.getRecord()
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
70 else:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
71 if v.referenceSequence == currentReferenceSequence: successv = v.parseVcf(b.referenceSequence, b.start, False, None)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
72 if b.referenceSequence == currentReferenceSequence: successb = b.parseBed(v.referenceSequence, v.position)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
73 currentReferenceSequence = v.referenceSequence
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
74
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
75 def main():
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
76
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
77 # Parse the command line options
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
78 usage = "Usage: vcfPytools.py intersect [options]"
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
79 parser = optparse.OptionParser(usage = usage)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
80 parser.add_option("-i", "--in",
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
81 action="append", type="string",
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
82 dest="vcfFiles", help="input vcf files")
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
83 parser.add_option("-b", "--bed",
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
84 action="store", type="string",
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
85 dest="bedFile", help="input bed vcf file")
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
86 parser.add_option("-o", "--out",
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
87 action="store", type="string",
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
88 dest="output", help="output vcf file")
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
89 parser.add_option("-f", "--priority-file",
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
90 action="store", type="string",
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
91 dest="priorityFile", help="output records from this vcf file")
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
92
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
93 (options, args) = parser.parse_args()
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
94
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
95 # Check that a single vcf file is given.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
96 if options.vcfFiles == None:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
97 parser.print_help()
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
98 print >> sys.stderr, "\nAt least one vcf file (--in, -i) is required for performing intersection."
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
99 exit(1)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
100 elif len(options.vcfFiles) > 2:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
101 parser.print_help()
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
102 print >> sys.stderr, "\nAt most, two vcf files (--in, -i) can be submitted for performing intersection."
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
103 exit(1)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
104 elif len(options.vcfFiles) == 1 and not options.bedFile:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
105 parser.print_help()
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
106 print >> sys.stderr, "\nIf only one vcf file (--in, -i) is specified, a bed file is also required for performing intersection."
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
107 exit(1)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
108
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
109 # Set the output file to stdout if no output file was specified.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
110 outputFile, writeOut = setOutput(options.output) # tools.py
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
111
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
112 # If intersecting with a bed file, call the bed intersection routine.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
113 if options.bedFile:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
114 v = vcf() # Define vcf object.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
115 b = bed() # Define bed object.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
116
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
117 # Open the files.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
118 v.openVcf(options.vcfFiles[0])
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
119 b.openBed(options.bedFile)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
120
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
121 # Read in the header information.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
122 v.parseHeader(options.vcfFiles[0], writeOut)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
123 taskDescriptor = "##vcfPytools=intersect " + options.vcfFiles[0] + ", " + options.bedFile
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
124 writeHeader(outputFile, v, False, taskDescriptor) # tools.py
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
125
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
126 # Intersect the vcf file with the bed file.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
127 intersectVcfBed(v, b, outputFile)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
128
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
129 # Check that the input files had the same list of reference sequences.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
130 # If not, it is possible that there were some problems.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
131 checkReferenceSequenceLists(v.referenceSequenceList, b.referenceSequenceList) # tools.py
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
132
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
133 # Close the files.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
134 v.closeVcf(options.vcfFiles[0])
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
135 b.closeBed(options.bedFile)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
136
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
137 else:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
138 priority = setVcfPriority(options.priorityFile, options.vcfFiles)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
139 v1 = vcf() # Define vcf object.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
140 v2 = vcf() # Define vcf object.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
141
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
142 # Open the vcf files.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
143 v1.openVcf(options.vcfFiles[0])
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
144 v2.openVcf(options.vcfFiles[1])
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
145
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
146 # Read in the header information.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
147 v1.parseHeader(options.vcfFiles[0], writeOut)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
148 v2.parseHeader(options.vcfFiles[1], writeOut)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
149 if priority == 3:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
150 v3 = vcf() # Generate a new vcf object that will contain the header information of the new file.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
151 mergeHeaders(v1, v2, v3) # tools.py
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
152 v1.processInfo = True
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
153 v2.processInfo = True
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
154 else: checkDataSets(v1, v2)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
155
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
156 #print v1.samplesList
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
157 #print v2.samplesList
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
158
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
159 # Check that the header for the two files contain the same samples.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
160 if v1.samplesList != v2.samplesList:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
161 print >> sys.stderr, "vcf files contain different samples (or sample order)."
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
162 exit(1)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
163 else:
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
164 taskDescriptor = "##vcfPytools=intersect " + v1.filename + ", " + v2.filename
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
165 if priority == 3: writeHeader(outputFile, v3, False, taskDescriptor)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
166 elif (priority == 2 and v2.hasHeader) or not v1.hasHeader: writeHeader(outputFile, v2, False, taskDescriptor) # tools.py
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
167 else: writeHeader(outputFile, v1, False, taskDescriptor) # tools.py
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
168
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
169 # Intersect the two vcf files.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
170 intersectVcf(v1, v2, priority, outputFile)
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
171
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
172 # Check that the input files had the same list of reference sequences.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
173 # If not, it is possible that there were some problems.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
174 checkReferenceSequenceLists(v1.referenceSequenceList, v2.referenceSequenceList) # tools.py
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
175
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
176 # Close the vcf files.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
177 v1.closeVcf(options.vcfFiles[0])
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
178 v2.closeVcf(options.vcfFiles[1])
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
179
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
180 # End the program.
f5d5eed73180 Imported from capsule None
devteam
parents:
diff changeset
181 return 0