Mercurial > repos > devteam > vcf_annotate
diff annotate.py @ 0:3eca7b2537aa draft default tip
Imported from capsule None
author | devteam |
---|---|
date | Thu, 23 Jan 2014 12:31:47 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/annotate.py Thu Jan 23 12:31:47 2014 -0500 @@ -0,0 +1,163 @@ +#!/usr/bin/python + +import os.path +import sys +import optparse + +import vcfClass +from vcfClass import * + +import tools +from tools import * + +if __name__ == "__main__": + main() + +# Check that the reference and alternate in the dbsnp vcf file match those +# from the input vcf file. +def checkRefAlt(vcfRef, vcfAlt, dbsnpRef, dbsnpAlt, ref, position, annotation): + text = "WARNING: ref and alt alleles differ between vcf and " + annotation + " " + ref + ":" + str(position) + " vcf: " + \ + vcfRef + "/" + vcfAlt + ", dbsnp: " + dbsnpRef + "/" + dbsnpAlt + + allelesAgree = True + if vcfRef.lower() != dbsnpRef.lower(): + if vcfRef.lower() != dbsnpAlt.lower(): + #print >> sys.stderr, text + allelesAgree = False + else: + if vcfAlt.lower() != dbsnpAlt.lower(): + #print >> sys.stderr, text + allelesAgree = False + + return allelesAgree + +# Intersect two vcf files. It is assumed that the two files are +# sorted by genomic coordinates and the reference sequences are +# in the same order. +def annotateVcf(v, d, outputFile, annotation): + success1 = v.getRecord() + success2 = d.getRecord() + currentReferenceSequence = v.referenceSequence + +# Finish when the end of the first file has been reached. + while success1: + +# If the end of the dbsnp vcf file is reached, write out the +# remaining records from the vcf file. + if not success2: + outputFile.write(v.record) + success1 = v.getRecord() + + if v.referenceSequence == d.referenceSequence and v.referenceSequence == currentReferenceSequence: + if v.position == d.position: + allelesAgree = checkRefAlt(v.ref, v.alt, d.ref, d.alt, v.referenceSequence, v.position, annotation) + if annotation == "dbsnp": v.rsid = d.getDbsnpInfo() + elif annotation == "hapmap": + if allelesAgree: v.info += ";HM3" + else: v.info += ";HM3A" + record = v.buildRecord(False) + outputFile.write(record) + + success1 = v.getRecord() + success2 = d.getRecord() + elif d.position > v.position: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile) + elif v.position > d.position: success2 = d.parseVcf(v.referenceSequence, v.position, False, None) + else: + if v.referenceSequence == currentReferenceSequence: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile) + elif d.referenceSequence == currentReferenceSequence: success2 = d.parseVcf(v.referenceSequence, v.position, False, None) + +# If the last record for a reference sequence is the same for both vcf +# files, they will both have referenceSequences different from the +# current reference sequence. Change the reference sequence to reflect +# this and proceed. + else: + if v.referenceSequence != d.referenceSequence: + print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different." + print >> sys.stderr, "Check that both files contain records for the following reference sequences:" + print >> sys.stderr, "\t", v.referenceSequence, " and ", d.referenceSequence + exit(1) + currentReferenceSequence = v.referenceSequence + +def main(): + +# Parse the command line options + usage = "Usage: vcfPytools.py annotate [options]" + parser = optparse.OptionParser(usage = usage) + parser.add_option("-i", "--in", + action="store", type="string", + dest="vcfFile", help="input vcf files") + parser.add_option("-d", "--dbsnp", + action="store", type="string", + dest="dbsnpFile", help="input dbsnp vcf file") + parser.add_option("-m", "--hapmap", + action="store", type="string", + dest="hapmapFile", help="input hapmap vcf file") + parser.add_option("-o", "--out", + action="store", type="string", + dest="output", help="output vcf file") + + (options, args) = parser.parse_args() + +# Check that a single vcf file is given. + if options.vcfFile == None: + parser.print_help() + print >> sys.stderr, "\nInput vcf file (--in, -i) is required for dbsnp annotation." + exit(1) + +# Check that either a hapmap or a dbsnp vcf file is included. + if options.dbsnpFile == None and options.hapmapFile == None: + parser.print_help() + print >> sys.stderr, "\ndbSNP or hapmap vcf file is required (--dbsnp, -d, --hapmap, -h)." + exit(1) + elif options.dbsnpFile != None and options.hapmapFile != None: + parser.print_help() + print >> sys.stderr, "\ndbSNP or hapmap vcf file is required, not both (--dbsnp, -d, --hapmap, -h)." + exit(1) + +# Set the output file to stdout if no output file was specified. + outputFile, writeOut = setOutput(options.output) # tools.py + + v = vcf() # Define vcf object. + d = vcf() # Define dbsnp/hapmap vcf object. + if options.dbsnpFile: + d.dbsnpVcf = True + annotationFile = options.dbsnpFile + annotation = "dbsnp" + elif options.hapmapFile: + d.hapmapVcf = True + annotationFile = options.hapmapFile + annotation = "hapmap" + +# Open the vcf files. + v.openVcf(options.vcfFile) + d.openVcf(annotationFile) + +# Read in the header information. + v.parseHeader(options.vcfFile, writeOut) + d.parseHeader(annotationFile, writeOut) + +# Add an extra line to the vcf header to indicate the file used for +# performing dbsnp annotation. + taskDescriptor = "##vcfPytools=annotated vcf file with " + if options.dbsnpFile: taskDescriptor += "dbSNP file " + options.dbsnpFile + elif options.hapmapFile: + taskDescriptor += "hapmap file " + options.hapmapFile + v.infoHeaderString["HM3"] = "##INFO=<ID=HM3,Number=0,Type=Flag,Description=\"Hapmap3.2 membership determined from file " + \ + options.hapmapFile + "\">" + v.infoHeaderString["HM3A"] = "##INFO=<ID=HM3A,Number=0,Type=Flag,Description=\"Hapmap3.2 membership (with different alleles)" + \ + ", determined from file " + options.hapmapFile + "\">" + writeHeader(outputFile, v, False, taskDescriptor) # tools.py + +# Annotate the vcf file. + annotateVcf(v, d, outputFile, annotation) + +# Check that the input files had the same list of reference sequences. +# If not, it is possible that there were some problems. + checkReferenceSequenceLists(v.referenceSequenceList, d.referenceSequenceList) # tools.py + +# Close the vcf files. + v.closeVcf(options.vcfFile) + d.closeVcf(annotationFile) + +# End the program. + return 0