Mercurial > repos > peterjc > venn_list
view tools/venn_list/venn_list.py @ 25:faf1ea8d1c06 draft
v0.0.10 with fixed Biopython dependency
author | peterjc |
---|---|
date | Thu, 02 Feb 2017 11:44:52 -0500 |
parents | 2f2c62ec2d08 |
children | 2cb493fc9811 |
line wrap: on
line source
#!/usr/bin/env python """Plot up to 3-way Venn Diagram using R limma vennDiagram (via rpy) This script is copyright 2010 by Peter Cock, The James Hutton Institute (formerly SCRI), UK. All rights reserved. See accompanying text file for licence details (MIT/BSD style). This is version 0.0.8 of the script. """ import sys try: import rpy except ImportError: sys.exit("Requires the Python library rpy (to call R)") except RuntimeError, e: sys.exit("The Python library rpy is not availble for the current R version\n\n%s" % e) try: rpy.r.library("limma") except Exception: sys.exit("Requires the R library limma (for vennDiagram function)") if len(sys.argv) - 1 not in [7, 10, 13]: sys.exit("Expected 7, 10 or 13 arguments (for 1, 2 or 3 sets), not %i" % (len(sys.argv) - 1)) all_file, all_type, all_label = sys.argv[1:4] set_data = [] if len(sys.argv) - 1 >= 7: set_data.append(tuple(sys.argv[4:7])) if len(sys.argv) - 1 >= 10: set_data.append(tuple(sys.argv[7:10])) if len(sys.argv) - 1 >= 13: set_data.append(tuple(sys.argv[10:13])) pdf_file = sys.argv[-1] n = len(set_data) print "Doing %i-way Venn Diagram" % n def load_ids(filename, filetype): if filetype == "tabular": for line in open(filename): line = line.rstrip("\n") if line and not line.startswith("#"): yield line.split("\t", 1)[0] elif filetype == "fasta": for line in open(filename): if line.startswith(">"): yield line[1:].rstrip("\n").split(None, 1)[0] elif filetype.startswith("fastq"): # Use the Galaxy library not Biopython to cope with CS from galaxy_utils.sequence.fastq import fastqReader handle = open(filename, "rU") for record in fastqReader(handle): # The [1:] is because the fastaReader leaves the @ on the identifer. yield record.identifier.split()[0][1:] handle.close() elif filetype == "sff": try: from Bio.SeqIO import index except ImportError: sys.exit("Require Biopython 1.54 or later (to read SFF files)") # This will read the SFF index block if present (very fast) for name in index(filename, "sff"): yield name else: sys.exit("Unexpected file type %s" % filetype) def load_ids_whitelist(filename, filetype, whitelist): for name in load_ids(filename, filetype): if name in whitelist: yield name else: sys.exit("Unexpected ID %s in %s file %s" % (name, filetype, filename)) if all_file in ["", "-", '""', '"-"']: # Load without white list sets = [set(load_ids(f, t)) for (f, t, c) in set_data] # Take union all_ids = set() for s in sets: all_ids.update(s) print "Inferred total of %i IDs" % len(all_ids) else: all_ids = set(load_ids(all_file, all_type)) print "Total of %i IDs" % len(all_ids) sets = [set(load_ids_whitelist(f, t, all_ids)) for (f, t, c) in set_data] for s, (f, t, c) in zip(sets, set_data): print "%i in %s" % (len(s), c) # Now call R library to draw simple Venn diagram try: # Create dummy Venn diagram counts object for three groups cols = 'c("%s")' % '","'.join("Set%i" % (i + 1) for i in range(n)) rpy.r('groups <- cbind(%s)' % ','.join(['1'] * n)) rpy.r('colnames(groups) <- %s' % cols) rpy.r('vc <- vennCounts(groups)') # Populate the 2^n classes with real counts # Don't make any assumptions about the class order # print rpy.r('vc') for index, row in enumerate(rpy.r('vc[,%s]' % cols)): if isinstance(row, int) or isinstance(row, float): # Hack for rpy being too clever for single element row row = [row] names = all_ids for wanted, s in zip(row, sets): if wanted: names = names.intersection(s) else: names = names.difference(s) rpy.r('vc[%i,"Counts"] <- %i' % (index + 1, len(names))) # print rpy.r('vc') if n == 1: # Single circle, don't need to add (Total XXX) line names = [c for (t, f, c) in set_data] else: names = ["%s\n(Total %i)" % (c, len(s)) for s, (f, t, c) in zip(sets, set_data)] rpy.r.assign("names", names) rpy.r.assign("colors", ["red", "green", "blue"][:n]) rpy.r.pdf(pdf_file, 8, 8) rpy.r("""vennDiagram(vc, include="both", names=names, main="%s", sub="(Total %i)", circle.col=colors) """ % (all_label, len(all_ids))) rpy.r.dev_off() except Exception, exc: sys.exit("%s" % str(exc)) rpy.r.quit(save="no") print "Done"