Mercurial > repos > sanbi-uwc > vcf_to_alignment
changeset 3:62fbd3f96b30 draft
planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
author | sanbi-uwc |
---|---|
date | Fri, 03 Feb 2017 05:56:11 -0500 |
parents | a0c85f2d74a5 |
children | f58178c0f00d |
files | pyqver2.py test-data/output2.fasta vcf_to_alignment.xml vcf_to_msa.py |
diffstat | 4 files changed, 48 insertions(+), 370 deletions(-) [+] |
line wrap: on
line diff
--- a/pyqver2.py Wed Feb 01 08:45:12 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,366 +0,0 @@ -#!/usr/bin/env python - -import compiler -import platform -import sys - -StandardModules = { - "__future__": (2, 1), - "abc": (2, 6), - "argparse": (2, 7), - "ast": (2, 6), - "atexit": (2, 0), - "bz2": (2, 3), - "cgitb": (2, 2), - "collections": (2, 4), - "contextlib": (2, 5), - "cookielib": (2, 4), - "cProfile": (2, 5), - "csv": (2, 3), - "ctypes": (2, 5), - "datetime": (2, 3), - "decimal": (2, 4), - "difflib": (2, 1), - "DocXMLRPCServer": (2, 3), - "dummy_thread": (2, 3), - "dummy_threading": (2, 3), - "email": (2, 2), - "fractions": (2, 6), - "functools": (2, 5), - "future_builtins": (2, 6), - "hashlib": (2, 5), - "heapq": (2, 3), - "hmac": (2, 2), - "hotshot": (2, 2), - "HTMLParser": (2, 2), - "importlib": (2, 7), - "inspect": (2, 1), - "io": (2, 6), - "itertools": (2, 3), - "json": (2, 6), - "logging": (2, 3), - "modulefinder": (2, 3), - "msilib": (2, 5), - "multiprocessing": (2, 6), - "netrc": (1, 5, 2), - "numbers": (2, 6), - "optparse": (2, 3), - "ossaudiodev": (2, 3), - "pickletools": (2, 3), - "pkgutil": (2, 3), - "platform": (2, 3), - "pydoc": (2, 1), - "runpy": (2, 5), - "sets": (2, 3), - "shlex": (1, 5, 2), - "SimpleXMLRPCServer": (2, 2), - "spwd": (2, 5), - "sqlite3": (2, 5), - "ssl": (2, 6), - "stringprep": (2, 3), - "subprocess": (2, 4), - "sysconfig": (2, 7), - "tarfile": (2, 3), - "textwrap": (2, 3), - "timeit": (2, 3), - "unittest": (2, 1), - "uuid": (2, 5), - "warnings": (2, 1), - "weakref": (2, 1), - "winsound": (1, 5, 2), - "wsgiref": (2, 5), - "xml.dom": (2, 0), - "xml.dom.minidom": (2, 0), - "xml.dom.pulldom": (2, 0), - "xml.etree.ElementTree": (2, 5), - "xml.parsers.expat":(2, 0), - "xml.sax": (2, 0), - "xml.sax.handler": (2, 0), - "xml.sax.saxutils": (2, 0), - "xml.sax.xmlreader":(2, 0), - "xmlrpclib": (2, 2), - "zipfile": (1, 6), - "zipimport": (2, 3), - "_ast": (2, 5), - "_winreg": (2, 0), -} - -Functions = { - "all": (2, 5), - "any": (2, 5), - "collections.Counter": (2, 7), - "collections.defaultdict": (2, 5), - "collections.OrderedDict": (2, 7), - "enumerate": (2, 3), - "frozenset": (2, 4), - "itertools.compress": (2, 7), - "math.erf": (2, 7), - "math.erfc": (2, 7), - "math.expm1": (2, 7), - "math.gamma": (2, 7), - "math.lgamma": (2, 7), - "memoryview": (2, 7), - "next": (2, 6), - "os.getresgid": (2, 7), - "os.getresuid": (2, 7), - "os.initgroups": (2, 7), - "os.setresgid": (2, 7), - "os.setresuid": (2, 7), - "reversed": (2, 4), - "set": (2, 4), - "subprocess.check_call": (2, 5), - "subprocess.check_output": (2, 7), - "sum": (2, 3), - "symtable.is_declared_global": (2, 7), - "weakref.WeakSet": (2, 7), -} - -Identifiers = { - "False": (2, 2), - "True": (2, 2), -} - -def uniq(a): - if len(a) == 0: - return [] - else: - return [a[0]] + uniq([x for x in a if x != a[0]]) - -class NodeChecker(object): - def __init__(self): - self.vers = dict() - self.vers[(2,0)] = [] - def add(self, node, ver, msg): - if ver not in self.vers: - self.vers[ver] = [] - self.vers[ver].append((node.lineno, msg)) - def default(self, node): - for child in node.getChildNodes(): - self.visit(child) - def visitCallFunc(self, node): - def rollup(n): - if isinstance(n, compiler.ast.Name): - return n.name - elif isinstance(n, compiler.ast.Getattr): - r = rollup(n.expr) - if r: - return r + "." + n.attrname - name = rollup(node.node) - if name: - v = Functions.get(name) - if v is not None: - self.add(node, v, name) - self.default(node) - def visitClass(self, node): - if node.bases: - self.add(node, (2,2), "new-style class") - if node.decorators: - self.add(node, (2,6), "class decorator") - self.default(node) - def visitDictComp(self, node): - self.add(node, (2,7), "dictionary comprehension") - self.default(node) - def visitFloorDiv(self, node): - self.add(node, (2,2), "// operator") - self.default(node) - def visitFrom(self, node): - v = StandardModules.get(node.modname) - if v is not None: - self.add(node, v, node.modname) - for n in node.names: - name = node.modname + "." + n[0] - v = Functions.get(name) - if v is not None: - self.add(node, v, name) - def visitFunction(self, node): - if node.decorators: - self.add(node, (2,4), "function decorator") - self.default(node) - def visitGenExpr(self, node): - self.add(node, (2,4), "generator expression") - self.default(node) - def visitGetattr(self, node): - if (isinstance(node.expr, compiler.ast.Const) - and isinstance(node.expr.value, str) - and node.attrname == "format"): - self.add(node, (2,6), "string literal .format()") - self.default(node) - def visitIfExp(self, node): - self.add(node, (2,5), "inline if expression") - self.default(node) - def visitImport(self, node): - for n in node.names: - v = StandardModules.get(n[0]) - if v is not None: - self.add(node, v, n[0]) - self.default(node) - def visitName(self, node): - v = Identifiers.get(node.name) - if v is not None: - self.add(node, v, node.name) - self.default(node) - def visitSet(self, node): - self.add(node, (2,7), "set literal") - self.default(node) - def visitSetComp(self, node): - self.add(node, (2,7), "set comprehension") - self.default(node) - def visitTryFinally(self, node): - # try/finally with a suite generates a Stmt node as the body, - # but try/except/finally generates a TryExcept as the body - if isinstance(node.body, compiler.ast.TryExcept): - self.add(node, (2,5), "try/except/finally") - self.default(node) - def visitWith(self, node): - if isinstance(node.body, compiler.ast.With): - self.add(node, (2,7), "with statement with multiple contexts") - else: - self.add(node, (2,5), "with statement") - self.default(node) - def visitYield(self, node): - self.add(node, (2,2), "yield expression") - self.default(node) - -def get_versions(source): - """Return information about the Python versions required for specific features. - - The return value is a dictionary with keys as a version number as a tuple - (for example Python 2.6 is (2,6)) and the value are a list of features that - require the indicated Python version. - """ - tree = compiler.parse(source) - checker = compiler.walk(tree, NodeChecker()) - return checker.vers - -def v27(source): - if sys.version_info >= (2, 7): - return qver(source) - else: - print >>sys.stderr, "Not all features tested, run --test with Python 2.7" - return (2, 7) - -def qver(source): - """Return the minimum Python version required to run a particular bit of code. - - >>> qver('print "hello world"') - (2, 0) - >>> qver('class test(object): pass') - (2, 2) - >>> qver('yield 1') - (2, 2) - >>> qver('a // b') - (2, 2) - >>> qver('True') - (2, 2) - >>> qver('enumerate(a)') - (2, 3) - >>> qver('total = sum') - (2, 0) - >>> qver('sum(a)') - (2, 3) - >>> qver('(x*x for x in range(5))') - (2, 4) - >>> qver('class C:\\n @classmethod\\n def m(): pass') - (2, 4) - >>> qver('y if x else z') - (2, 5) - >>> qver('import hashlib') - (2, 5) - >>> qver('from hashlib import md5') - (2, 5) - >>> qver('import xml.etree.ElementTree') - (2, 5) - >>> qver('try:\\n try: pass;\\n except: pass;\\nfinally: pass') - (2, 0) - >>> qver('try: pass;\\nexcept: pass;\\nfinally: pass') - (2, 5) - >>> qver('from __future__ import with_statement\\nwith x: pass') - (2, 5) - >>> qver('collections.defaultdict(list)') - (2, 5) - >>> qver('from collections import defaultdict') - (2, 5) - >>> qver('"{0}".format(0)') - (2, 6) - >>> qver('memoryview(x)') - (2, 7) - >>> v27('{1, 2, 3}') - (2, 7) - >>> v27('{x for x in s}') - (2, 7) - >>> v27('{x: y for x in s}') - (2, 7) - >>> qver('from __future__ import with_statement\\nwith x:\\n with y: pass') - (2, 5) - >>> v27('from __future__ import with_statement\\nwith x, y: pass') - (2, 7) - >>> qver('@decorator\\ndef f(): pass') - (2, 4) - >>> qver('@decorator\\nclass test:\\n pass') - (2, 6) - - #>>> qver('0o0') - #(2, 6) - #>>> qver('@foo\\nclass C: pass') - #(2, 6) - """ - return max(get_versions(source).keys()) - -Verbose = False -MinVersion = (2, 3) -Lint = False - -files = [] -i = 1 -while i < len(sys.argv): - a = sys.argv[i] - if a == "--test": - import doctest - doctest.testmod() - sys.exit(0) - if a == "-v" or a == "--verbose": - Verbose = True - elif a == "-l" or a == "--lint": - Lint = True - elif a == "-m" or a == "--min-version": - i += 1 - MinVersion = tuple(map(int, sys.argv[i].split("."))) - else: - files.append(a) - i += 1 - -if not files: - print >>sys.stderr, """Usage: %s [options] source ... - - Report minimum Python version required to run given source files. - - -m x.y or --min-version x.y (default 2.3) - report version triggers at or above version x.y in verbose mode - -v or --verbose - print more detailed report of version triggers for each version -""" % sys.argv[0] - sys.exit(1) - -for fn in files: - try: - f = open(fn) - source = f.read() - f.close() - ver = get_versions(source) - if Verbose: - print fn - for v in sorted([k for k in ver.keys() if k >= MinVersion], reverse=True): - reasons = [x for x in uniq(ver[v]) if x] - if reasons: - # each reason is (lineno, message) - print "\t%s\t%s" % (".".join(map(str, v)), ", ".join([x[1] for x in reasons])) - elif Lint: - for v in sorted([k for k in ver.keys() if k >= MinVersion], reverse=True): - reasons = [x for x in uniq(ver[v]) if x] - for r in reasons: - # each reason is (lineno, message) - print "%s:%s: %s %s" % (fn, r[0], ".".join(map(str, v)), r[1]) - else: - print "%s\t%s" % (".".join(map(str, max(ver.keys()))), fn) - except SyntaxError, x: - print "%s: syntax error compiling with Python %s: %s" % (fn, platform.python_version(), x)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output2.fasta Fri Feb 03 05:56:11 2017 -0500 @@ -0,0 +1,8 @@ +>NC_000962.3 Mycobacterium tuberculosis H37Rv, complete genome +CATGGGCGCGCATGCCGGCCCGCTCCCCCTCGTCCCCCGAGCTATCACCAAGGCCCCC +>vcf_inputs.vcf1 +AGCGCCCACGGGCTGAACGCCGCTCCCCCCTCCGGTGACGATCGTCGTCAGGTCTTCC +>vcf_inputs.vcf2 +CGCCCCCATGCGCGCCAGGGGGTGTTCCCCCGTGCCCCCAGCCGAGGCTTGAGCCCGC +>vcf_inputs.vcf3 +CGCGCCGACACGCGCCAGGCCACTCCTTGCCGTGCCCCCAGCTATCACCAGGGTCCCT
--- a/vcf_to_alignment.xml Wed Feb 01 08:45:12 2017 -0500 +++ b/vcf_to_alignment.xml Fri Feb 03 05:56:11 2017 -0500 @@ -1,5 +1,5 @@ <?xml version="1.0" encoding="utf-8" ?> -<tool id="vcf_to_alignment" name="Generate FASTA alignment from VCF collection" version="0.2"> +<tool id="vcf_to_alignment" name="Generate FASTA alignment from VCF collection" version="0.3"> <description>Generate a multiple sequence alignment given a collection of variants and a reference sequence</description> <requirements> <requirement type="package" version="1.67">biopython</requirement> @@ -10,7 +10,9 @@ #if str($reference.source) == 'history': ln -s '${reference.history}' reference.fasta && #end if - python $__tool_directory__/vcf_to_msa.py --vcf_files + python $__tool_directory__/vcf_to_msa.py + $remove_invariant_sites + --vcf_files #for $vcf_file in $vcf_inputs: '${vcf_file.element_identifier}^^^${vcf_file}' #end for @@ -25,6 +27,7 @@ </command> <inputs> <param name="vcf_inputs" type="data_collection" format="vcf" collection_type="list" label="Variants (VCF format)" /> + <param name="remove_invariant_sites" type="boolean" truevalue="--remove_invariant" falsevalue="" label="Remove invariant sites from alignment" /> <conditional name="reference" label="Reference sequence source"> <param name="source" type="select"> <option value="history" selected="True">History</option> @@ -46,6 +49,7 @@ </outputs> <tests> <test> + <param name="remove_invariant_sites" value="False" /> <param name="vcf_inputs"> <collection type="list"> <element name="vcf_inputs.vcf1" value="vcf1.vcf" /> @@ -56,6 +60,18 @@ <param name="history" value="reference.fasta" ftype="fasta" /> <output name="output_alignment" value="output1.fasta" /> </test> + <test> + <param name="remove_invariant_sites" value="True" /> + <param name="vcf_inputs"> + <collection type="list"> + <element name="vcf_inputs.vcf1" value="vcf1.vcf" /> + <element name="vcf_inputs.vcf2" value="vcf2.vcf" /> + <element name="vcf_inputs.vcf3" value="vcf3.vcf" /> + </collection> + </param> + <param name="history" value="reference.fasta" ftype="fasta" /> + <output name="output_alignment" value="output2.fasta" /> + </test> </tests> <help><![CDATA[ Using the SNPs identified by the VCF files given as input, generates a sequence including the
--- a/vcf_to_msa.py Wed Feb 01 08:45:12 2017 -0500 +++ b/vcf_to_msa.py Fri Feb 03 05:56:11 2017 -0500 @@ -32,11 +32,14 @@ parser.add_argument('--vcf_files', nargs="+") parser.add_argument('--reference_file', type=argparse.FileType()) parser.add_argument('--output_file', type=argparse.FileType('w')) +parser.add_argument('--remove_invariant', action='store_true', default=False) args = parser.parse_args() do_inserts = False do_deletes = False do_snps = True +if (do_inserts or do_deletes) and args.remove_invariant: + exit("Cannot combine indel processing with 'remove_invariant' argument") # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq) # print(reference, file=open('/tmp/reference.txt', 'w')) # vcf_files_dir = os.path.expanduser("~/Data/vcf") @@ -50,6 +53,8 @@ tree = intervaltree.IntervalTree() sequence_names = [] sequences = {} +if args.remove_invariant: + variant_sites = set() for i, vcf_descriptor in enumerate(args.vcf_files): # seqname = os.path.splitext(os.path.basename(vcf_filename))[0] (seqname,vcf_filename) = vcf_descriptor.split('^^^') @@ -69,6 +74,8 @@ sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele except IndexError as e: print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) + if args.remove_invariant: + variant_sites.add(record.affected_start) count += 1 elif record.is_indel: length = record.affected_end - record.affected_start @@ -98,10 +105,18 @@ # new_interval = intervaltree.Interval(start, end, existing_interval.data + interval.data) # tree.add(new_interval) -SeqIO.write(reference_seq, args.output_file, "fasta") +if args.remove_invariant: + reference_str = ''.join([c for (i, c) in enumerate(reference) if i in variant_sites]) + reference_seq_variant = SeqRecord(Seq(reference_str), id=reference_seq.id, description=reference_seq.description) + SeqIO.write(reference_seq_variant, args.output_file, "fasta") +else: + SeqIO.write(reference_seq, args.output_file, "fasta") + offset = 0 +sequence_length = 0 for name in sequence_names: sequence = sequences[name] + sequence_length = len(sequence) for site in sorted(insertion_sites, key=itemgetter(0)): (start, end, allele, seqname) = site # print(start, allele, seqname) @@ -116,7 +131,12 @@ sequence[start:end] = ['-'] * length except IndexError as e: print("ins: Error assigning to {}:{}: {}".format(start, end, str(e)), file=sys.stderr) - SeqIO.write(SeqRecord(Seq(''.join(sequence), alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta") + + if args.remove_invariant: + seq_str = ''.join([c for (i, c) in enumerate(sequence) if i in variant_sites]) + else: + seq_str = ''.join(sequence) + SeqIO.write(SeqRecord(Seq(seq_str, alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta") # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii")) # output.close()