Mercurial > repos > devteam > vcf_extract
comparison extract.py @ 0:c6fb674dfda3 draft default tip
Imported from capsule None
| author | devteam |
|---|---|
| date | Thu, 23 Jan 2014 12:31:12 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c6fb674dfda3 |
|---|---|
| 1 #!/usr/bin/python | |
| 2 | |
| 3 import os.path | |
| 4 import sys | |
| 5 import optparse | |
| 6 | |
| 7 import vcfClass | |
| 8 from vcfClass import * | |
| 9 | |
| 10 import tools | |
| 11 from tools import * | |
| 12 | |
| 13 if __name__ == "__main__": | |
| 14 main() | |
| 15 | |
| 16 def main(): | |
| 17 | |
| 18 # Parse the command line options | |
| 19 usage = "Usage: vcfPytools.py extract [options]" | |
| 20 parser = optparse.OptionParser(usage = usage) | |
| 21 parser.add_option("-i", "--in", | |
| 22 action="store", type="string", | |
| 23 dest="vcfFile", help="input vcf file (stdin for piped vcf)") | |
| 24 parser.add_option("-o", "--out", | |
| 25 action="store", type="string", | |
| 26 dest="output", help="output validation file") | |
| 27 parser.add_option("-s", "--reference-sequence", | |
| 28 action="store", type="string", | |
| 29 dest="referenceSequence", help="extract records from this reference sequence") | |
| 30 parser.add_option("-r", "--region", | |
| 31 action="store", type="string", | |
| 32 dest="region", help="extract records from this region") | |
| 33 parser.add_option("-q", "--keep-quality", | |
| 34 action="append", type="string", nargs=2, | |
| 35 dest="keepQuality", help="keep records containing this quality") | |
| 36 parser.add_option("-k", "--keep-info", | |
| 37 action="append", type="string", | |
| 38 dest="infoKeep", help="keep records containing this info field") | |
| 39 parser.add_option("-d", "--discard-info", | |
| 40 action="append", type="string", | |
| 41 dest="infoDiscard", help="discard records containing this info field") | |
| 42 parser.add_option("-p", "--pass-filter", | |
| 43 action="store_true", default=False, | |
| 44 dest="passFilter", help="discard records whose filter field is not PASS") | |
| 45 | |
| 46 (options, args) = parser.parse_args() | |
| 47 | |
| 48 # Check that a vcf file is given. | |
| 49 if options.vcfFile == None: | |
| 50 parser.print_help() | |
| 51 print >> sys.stderr, "\nInput vcf file (--in, -i) is required." | |
| 52 exit(1) | |
| 53 | |
| 54 # Check that either a reference sequence or region is specified, | |
| 55 # but not both if not dealing with info fields. | |
| 56 if not options.infoKeep and not options.infoDiscard and not options.passFilter and not options.keepQuality: | |
| 57 if not options.referenceSequence and not options.region: | |
| 58 parser.print_help() | |
| 59 print >> sys.stderr, "\nA region (--region, -r) or reference sequence (--reference-sequence, -s) must be supplied" | |
| 60 print >> sys.stderr, "if not extracting records based on info strings." | |
| 61 exit(1) | |
| 62 if options.referenceSequence and options.region: | |
| 63 parser.print_help() | |
| 64 print >> sys.stderr, "\nEither a region (--region, -r) or reference sequence (--reference-sequence, -s) can be supplied, but not both." | |
| 65 exit(1) | |
| 66 | |
| 67 # If a region was supplied, check the format. | |
| 68 if options.region: | |
| 69 if options.region.find(":") == -1 or options.region.find("..") == -1: | |
| 70 print >> sys.stderr, "\nIncorrect format for region string. Required: ref:start..end." | |
| 71 exit(1) | |
| 72 regionList = options.region.split(":",1) | |
| 73 referenceSequence = regionList[0] | |
| 74 try: start = int(regionList[1].split("..")[0]) | |
| 75 except ValueError: | |
| 76 print >> sys.stderr, "region start coordinate is not an integer" | |
| 77 exit(1) | |
| 78 try: end = int(regionList[1].split("..")[1]) | |
| 79 except ValueError: | |
| 80 print >> sys.stderr, "region end coordinate is not an integer" | |
| 81 exit(1) | |
| 82 | |
| 83 # Ensure that discard-info and keep-info haven't both been defined. | |
| 84 if options.infoKeep and options.infoDiscard: | |
| 85 print >> sys.stderr, "Cannot specify fields to keep and discard simultaneously." | |
| 86 exit(1) | |
| 87 | |
| 88 # If the --keep-quality argument is used, check that a value and a logical | |
| 89 # argument are supplied and that the logical argument is valid. | |
| 90 | |
| 91 if options.keepQuality: | |
| 92 for value, logic in options.keepQuality: | |
| 93 if logic != "eq" and logic != "lt" and logic != "le" and logic != "gt" and logic != "ge": | |
| 94 print >> sys.stderr, "Error with --keep-quality (-q) argument. Must take the following form:" | |
| 95 print >> sys.stderr, "\npython vcfPytools extract --in <VCF> --keep-quality <value> <logic>" | |
| 96 print >> sys.stderr, "\nwhere logic is one of: eq, le, lt, ge or gt" | |
| 97 exit(1) | |
| 98 try: qualityValue = float(value) | |
| 99 except ValueError: | |
| 100 print >> sys.stderr, "Error with --keep-quality (-q) argument. Must take the following form:" | |
| 101 print >> sys.stderr, "Quality value must be an integer or float value." | |
| 102 exit(1) | |
| 103 qualityLogic = logic | |
| 104 | |
| 105 # Set the output file to stdout if no output file was specified. | |
| 106 outputFile, writeOut = setOutput(options.output) | |
| 107 | |
| 108 v = vcf() # Define vcf object. | |
| 109 | |
| 110 # Set process info to True if info strings need to be parsed. | |
| 111 if options.infoKeep or options.infoDiscard: v.processInfo = True | |
| 112 | |
| 113 # Open the file. | |
| 114 v.openVcf(options.vcfFile) | |
| 115 | |
| 116 # Read in the header information. | |
| 117 v.parseHeader(options.vcfFile, writeOut) | |
| 118 taskDescriptor = "##vcfPytools=extract data" | |
| 119 writeHeader(outputFile, v, False, taskDescriptor) # tools.py | |
| 120 | |
| 121 # Read through all the entries and write out records in the correct | |
| 122 # reference sequence. | |
| 123 while v.getRecord(): | |
| 124 writeRecord = True | |
| 125 if options.referenceSequence and v.referenceSequence != options.referenceSequence: writeRecord = False | |
| 126 elif options.region: | |
| 127 if v.referenceSequence != referenceSequence: writeRecord = False | |
| 128 elif v.position < start or v.position > end: writeRecord = False | |
| 129 | |
| 130 # Only consider these fields if the record is contained within the | |
| 131 # specified region. | |
| 132 if options.infoKeep and writeRecord: | |
| 133 for tag in options.infoKeep: | |
| 134 if v.infoTags.has_key(tag): | |
| 135 writeRecord = True | |
| 136 break | |
| 137 if not v.infoTags.has_key(tag): writeRecord = False | |
| 138 if options.infoDiscard and writeRecord: | |
| 139 for tag in options.infoDiscard: | |
| 140 if v.infoTags.has_key(tag): writeRecord = False | |
| 141 if options.passFilter and v.filters != "PASS" and writeRecord: writeRecord = False | |
| 142 if options.keepQuality: | |
| 143 if qualityLogic == "eq" and v.quality != qualityValue: writeRecord = False | |
| 144 if qualityLogic == "le" and v.quality > qualityValue: writeRecord = False | |
| 145 if qualityLogic == "lt" and v.quality >= qualityValue: writeRecord = False | |
| 146 if qualityLogic == "ge" and v.quality < qualityValue: writeRecord = False | |
| 147 if qualityLogic == "gt" and v.quality <= qualityValue: writeRecord = False | |
| 148 | |
| 149 if writeRecord: outputFile.write(v.record) | |
| 150 | |
| 151 # Close the file. | |
| 152 v.closeVcf(options.vcfFile) | |
| 153 | |
| 154 # Terminate the program cleanly. | |
| 155 return 0 |
