Mercurial > repos > devteam > vcf_extract
comparison extract.py @ 0:c6fb674dfda3 draft default tip
Imported from capsule None
author | devteam |
---|---|
date | Thu, 23 Jan 2014 12:31:12 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c6fb674dfda3 |
---|---|
1 #!/usr/bin/python | |
2 | |
3 import os.path | |
4 import sys | |
5 import optparse | |
6 | |
7 import vcfClass | |
8 from vcfClass import * | |
9 | |
10 import tools | |
11 from tools import * | |
12 | |
13 if __name__ == "__main__": | |
14 main() | |
15 | |
16 def main(): | |
17 | |
18 # Parse the command line options | |
19 usage = "Usage: vcfPytools.py extract [options]" | |
20 parser = optparse.OptionParser(usage = usage) | |
21 parser.add_option("-i", "--in", | |
22 action="store", type="string", | |
23 dest="vcfFile", help="input vcf file (stdin for piped vcf)") | |
24 parser.add_option("-o", "--out", | |
25 action="store", type="string", | |
26 dest="output", help="output validation file") | |
27 parser.add_option("-s", "--reference-sequence", | |
28 action="store", type="string", | |
29 dest="referenceSequence", help="extract records from this reference sequence") | |
30 parser.add_option("-r", "--region", | |
31 action="store", type="string", | |
32 dest="region", help="extract records from this region") | |
33 parser.add_option("-q", "--keep-quality", | |
34 action="append", type="string", nargs=2, | |
35 dest="keepQuality", help="keep records containing this quality") | |
36 parser.add_option("-k", "--keep-info", | |
37 action="append", type="string", | |
38 dest="infoKeep", help="keep records containing this info field") | |
39 parser.add_option("-d", "--discard-info", | |
40 action="append", type="string", | |
41 dest="infoDiscard", help="discard records containing this info field") | |
42 parser.add_option("-p", "--pass-filter", | |
43 action="store_true", default=False, | |
44 dest="passFilter", help="discard records whose filter field is not PASS") | |
45 | |
46 (options, args) = parser.parse_args() | |
47 | |
48 # Check that a vcf file is given. | |
49 if options.vcfFile == None: | |
50 parser.print_help() | |
51 print >> sys.stderr, "\nInput vcf file (--in, -i) is required." | |
52 exit(1) | |
53 | |
54 # Check that either a reference sequence or region is specified, | |
55 # but not both if not dealing with info fields. | |
56 if not options.infoKeep and not options.infoDiscard and not options.passFilter and not options.keepQuality: | |
57 if not options.referenceSequence and not options.region: | |
58 parser.print_help() | |
59 print >> sys.stderr, "\nA region (--region, -r) or reference sequence (--reference-sequence, -s) must be supplied" | |
60 print >> sys.stderr, "if not extracting records based on info strings." | |
61 exit(1) | |
62 if options.referenceSequence and options.region: | |
63 parser.print_help() | |
64 print >> sys.stderr, "\nEither a region (--region, -r) or reference sequence (--reference-sequence, -s) can be supplied, but not both." | |
65 exit(1) | |
66 | |
67 # If a region was supplied, check the format. | |
68 if options.region: | |
69 if options.region.find(":") == -1 or options.region.find("..") == -1: | |
70 print >> sys.stderr, "\nIncorrect format for region string. Required: ref:start..end." | |
71 exit(1) | |
72 regionList = options.region.split(":",1) | |
73 referenceSequence = regionList[0] | |
74 try: start = int(regionList[1].split("..")[0]) | |
75 except ValueError: | |
76 print >> sys.stderr, "region start coordinate is not an integer" | |
77 exit(1) | |
78 try: end = int(regionList[1].split("..")[1]) | |
79 except ValueError: | |
80 print >> sys.stderr, "region end coordinate is not an integer" | |
81 exit(1) | |
82 | |
83 # Ensure that discard-info and keep-info haven't both been defined. | |
84 if options.infoKeep and options.infoDiscard: | |
85 print >> sys.stderr, "Cannot specify fields to keep and discard simultaneously." | |
86 exit(1) | |
87 | |
88 # If the --keep-quality argument is used, check that a value and a logical | |
89 # argument are supplied and that the logical argument is valid. | |
90 | |
91 if options.keepQuality: | |
92 for value, logic in options.keepQuality: | |
93 if logic != "eq" and logic != "lt" and logic != "le" and logic != "gt" and logic != "ge": | |
94 print >> sys.stderr, "Error with --keep-quality (-q) argument. Must take the following form:" | |
95 print >> sys.stderr, "\npython vcfPytools extract --in <VCF> --keep-quality <value> <logic>" | |
96 print >> sys.stderr, "\nwhere logic is one of: eq, le, lt, ge or gt" | |
97 exit(1) | |
98 try: qualityValue = float(value) | |
99 except ValueError: | |
100 print >> sys.stderr, "Error with --keep-quality (-q) argument. Must take the following form:" | |
101 print >> sys.stderr, "Quality value must be an integer or float value." | |
102 exit(1) | |
103 qualityLogic = logic | |
104 | |
105 # Set the output file to stdout if no output file was specified. | |
106 outputFile, writeOut = setOutput(options.output) | |
107 | |
108 v = vcf() # Define vcf object. | |
109 | |
110 # Set process info to True if info strings need to be parsed. | |
111 if options.infoKeep or options.infoDiscard: v.processInfo = True | |
112 | |
113 # Open the file. | |
114 v.openVcf(options.vcfFile) | |
115 | |
116 # Read in the header information. | |
117 v.parseHeader(options.vcfFile, writeOut) | |
118 taskDescriptor = "##vcfPytools=extract data" | |
119 writeHeader(outputFile, v, False, taskDescriptor) # tools.py | |
120 | |
121 # Read through all the entries and write out records in the correct | |
122 # reference sequence. | |
123 while v.getRecord(): | |
124 writeRecord = True | |
125 if options.referenceSequence and v.referenceSequence != options.referenceSequence: writeRecord = False | |
126 elif options.region: | |
127 if v.referenceSequence != referenceSequence: writeRecord = False | |
128 elif v.position < start or v.position > end: writeRecord = False | |
129 | |
130 # Only consider these fields if the record is contained within the | |
131 # specified region. | |
132 if options.infoKeep and writeRecord: | |
133 for tag in options.infoKeep: | |
134 if v.infoTags.has_key(tag): | |
135 writeRecord = True | |
136 break | |
137 if not v.infoTags.has_key(tag): writeRecord = False | |
138 if options.infoDiscard and writeRecord: | |
139 for tag in options.infoDiscard: | |
140 if v.infoTags.has_key(tag): writeRecord = False | |
141 if options.passFilter and v.filters != "PASS" and writeRecord: writeRecord = False | |
142 if options.keepQuality: | |
143 if qualityLogic == "eq" and v.quality != qualityValue: writeRecord = False | |
144 if qualityLogic == "le" and v.quality > qualityValue: writeRecord = False | |
145 if qualityLogic == "lt" and v.quality >= qualityValue: writeRecord = False | |
146 if qualityLogic == "ge" and v.quality < qualityValue: writeRecord = False | |
147 if qualityLogic == "gt" and v.quality <= qualityValue: writeRecord = False | |
148 | |
149 if writeRecord: outputFile.write(v.record) | |
150 | |
151 # Close the file. | |
152 v.closeVcf(options.vcfFile) | |
153 | |
154 # Terminate the program cleanly. | |
155 return 0 |