Mercurial > repos > carlos-reyes > gu_pipeline_no
view GU_files/gu.py @ 0:5231cbbed6ca draft default tip
Upload GU tool
author | carlos-reyes |
---|---|
date | Fri, 15 Jun 2012 10:39:48 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python
#
# gu - Gene Unification Pipeline
#
# This pipeline compares two GFF input files for coordinate overlaps, and
# bucketizes the overlapping feature pairs. The comparison step takes two lists
# of features [a1, a2, ...], [b1, b2, ...], outputs all feature pairs [ai, bj] such
# that ai overlaps bj.
#
# These feature pairs form the edges in a bipartite graph of overlapping
# a's and b's. Bucketization finds the connected components of the graph
# and segregates those components (or clusters) into buckets, according to
# how many a's and b's there are in the cluster (one-to-one, one-to-many, etc).
# The buckets are output to separate files (bucket_1-1.txt, bucket_1-n.txt, ...)
#
# There is one line output per overlapping pair [ai,bj]. This line contains
# all the columns of ai and bj plus three additional columns about the cluster
# to which this pair belongs. In detail:
#   1. The id of the cluster (connected component) to which this pair belongs.
#   2. The bucket label (e.g. "n-1").
#   3. The actual counts of a's and b's in the cluster, e.g. "3-1" for 3 a's and 1 b.
#   4- All of the columns of ai followed by all of the columns of bj.
#
# GU operates in one of two modes. The above description corresponds to 'no-aggregate'
# mode: the overlapping features are bucketized directly. Sometimes, however, you want
# the results aggregated to a higher level. For example, you may have files of exons,
# but what you want is buckets of overlapping genes. For this, there is 'aggregate' mode,
# in which lists of overlapping exons (for example) are turned into lists of overlapping genes
# before being bucketized. Because aggregation loses the details of the underlying features,
# the output format is somewhat different from no-aggregate mode.
#
#--------------------------------------------
import sys
import time
import string
from tempfile import mkstemp
import os
import os.path
from TableTools import TA, TB, TD, TF, TI, TJ, TP, TS, TU, TX, FJ
from optparse import OptionParser
import re

USAGE = "%prog [options] --f1 file1.gff --f2 file2.gff\n(For help, use -h option.)"


def now():
    """Return the current local time as a human-readable timestamp string."""
    return time.asctime(time.localtime(time.time()))


class GUPipeline:
    """Drives the Gene Unification pipeline: parses command-line options,
    then chains TableTools steps (filter, fjoin, aggregate, join, bucketize)
    over two GFF inputs, writing bucket files into the output directory.
    """

    #----------------------------------------------------
    def __init__(self, argv):
        """Parse and validate command-line arguments.

        argv -- full argument vector (argv[0] included), as from sys.argv.
        Exits via OptionParser.error() on invalid arguments.
        """
        # 1. initialize the option parser
        self.argv = argv
        self.parser = OptionParser(USAGE)
        self.tempfiles = []      # temp file paths to remove at cleanup
        self.types1 = []         # reassigned below to a lowercased list-repr string
        self.types2 = []
        self.parser.add_option(
            "-1", "--f1", "--file1", dest="file1",
            metavar="FILE1", default=None,
            help="The first GFF file.")
        self.parser.add_option(
            "-2", "--f2", "--file2", dest="file2",
            metavar="FILE2", default=None,
            help="The second GFF file.")
        self.parser.add_option(
            "-k", "--minOverlap", dest="k",
            metavar="AMT", default="1",
            help="The minimum required overlap. (Default: 1)")
        #self.parser.set_defaults(mode="advanced")
        self.parser.add_option(
            "--t1", dest="types1", action="append",
            metavar="GFFTYPE", default=[],
            help="A GFF type to select from file 1. Repeatable. (Default = all types)")
        self.parser.add_option(
            "--t2", dest="types2", action="append",
            metavar="GFFTYPE", default=[],
            help="A GFF type to select from file 2. Repeatable.(Default = all types)")
        self.parser.add_option(
            "--nt1", dest="notTypes1", action="append",
            metavar="GFFTYPE", default=[],
            help="A GFF type to FILTER OUT of file 1. Repeatable. (Default = filters no types)")
        self.parser.add_option(
            "--nt2", dest="notTypes2", action="append",
            metavar="GFFTYPE", default=[],
            help="A GFF type to filter out of file 2. Repeatable.(Default = filters no types)")
        self.parser.add_option(
            "--ire", dest="ire", metavar="REGEX", default=None,
            help="A regular expression that will be used to extract ids from column 9. " +
                 "(Also: what you'll feel as you try to get this parameter right...) " +
                 "Only applicable in aggregate mode. You can specify different patterns for " +
                 "inputs 1 and 2 by specifying --ire1 and --ire2 instead. " +
                 "Example: to aggregate both inputs by MGI id: --ire 'MGI:[0-9]+'. " +
                 "Example: to aggregate input1 by MGI id and input 2 by VEGA id: " +
                 "--ire1 'MGI:[0-9]+' --ire2 'OTTMUSG[0-9]+'. " +
                 "Advanced usage: Sometimes you need to define a regular expression where the actual " +
                 "id you wish to extract is a sub-part of the whole pattern. This is called a 'capture'. " +
                 "To capture the id, surround that part of the pattern with the magic symbols " +
                 "'(?P<id>' and ')'. For example, suppose you want to capture the MGI id only when it's " +
                 "part of a dbref attribute like this: 'Dbxref=MGI:MGI:012345;'. You could use the following " +
                 "regular expression: 'Dbxref=MGI:(?P<id>MGI:[0-9]+);'.")
        self.parser.add_option(
            "--ire1", dest="ire1", metavar="REGEX", default=None,
            help="Specify regex for input 1 only. See --ire.")
        self.parser.add_option(
            "--ire2", dest="ire2", metavar="REGEX", default=None,
            help="Specify regex for input 2 only. See --ire.")
        self.parser.add_option(
            "-i", "--ignoreStrand", dest="ignoreStrand",
            action="store_true", default=False,
            help="Ignore strand when determining overlaps (Default = strands must match)")
        self.parser.add_option(
            "-n", "--noAggregate", dest="noAggregate",
            action="store_true", default=False,
            help="Do not aggregate features. Opposite of -a.")
        self.parser.add_option(
            "-C", "--chrMatchLoose", dest="chrMatchLoose",
            action="store_true", default=False,
            help="If specified, chromosome matching is 'loose'. Otherwise it is exact. " +
                 "In loose matching, leading 'chr' is removed from chromosome field, " +
                 "so that '19' matches 'Chr19'. ")
        self.parser.add_option(
            "-s", "--noSelfHits", dest="noSelfHits",
            action="store_true", default=False,
            help="Ignore self-hit in overlap detection. (default: reports self-hits)")
        self.parser.add_option(
            "-d", "--od", dest="outDir",
            metavar="DIRECTORY", default=".",
            help="Path of output directory. (Default = current directory)")
        self.parser.add_option(
            "-t", dest="template",
            metavar="TEMPLATE", default="bucket_%s.txt",
            help="Template string for naming output bucket files. " +
                 "Must contain the substring '%s', which is replaced by the bucket class. " +
                 "The classes are: '1-0', '0-1', '1-1', '1-n', 'n-1', 'n-m'. " +
                 "(Default: bucket_%s.txt)")
        self.parser.add_option(
            "-l", "--lf", "--logFile", dest="logFile",
            metavar="FILE", default=None,
            help="Log file. (Default = writes to stderr)")

        # 2. Parse the command line
        (self.options, _positional) = self.parser.parse_args(argv)

        # 3. Validate the args
        if self.options.logFile is not None:
            # Redirect all debug/log output (written to stderr) to the log file.
            sys.stderr = open(self.options.logFile, 'a')
        if self.options.file1 is None \
                or self.options.file2 is None:
            self.parser.error("Must specify both --f1 and --f2.")
        if not os.path.isdir(self.options.outDir):
            self.parser.error("Output directory " + self.options.outDir +
                              " does not exist or is not a directory.")
        # The type filters are rendered as lowercased Python list literals
        # (e.g. "['gene', 'exon']") so they can be spliced directly into
        # TableTools filter expressions, which evaluate them as list literals.
        self.types1 = str(self.options.types1).lower()
        self.types2 = str(self.options.types2).lower()
        self.notTypes1 = str(self.options.notTypes1).lower()
        self.notTypes2 = str(self.options.notTypes2).lower()
        self.minOverlap = self.options.k
        if '%s' not in self.options.template:
            self.options.template += '%s'
        self.options.template = \
            os.path.join(self.options.outDir, self.options.template)
        if self.options.chrMatchLoose:
            # Expression evaluated by TableTools per input row: strips a
            # leading chr/chrom/chromosome prefix (case-insensitively) from
            # the chromosome column. NOTE: the inline flag was moved to the
            # start of the pattern ('(?i)^chr...'); a mid-pattern '(?i)' is
            # an error on Python >= 3.11, and the match is identical.
            self.options.chrMatchExpr = "[re.sub('(?i)^chr(om(osome)?)?', '', IN[1])]+IN[2:]"
        else:
            self.options.chrMatchExpr = "IN[1:]"
        self.options.guDir = os.path.split(__file__)[0]
        # guUtil.py supplies extractID() to TF via --exec-file in aggregate mode.
        self.options.guUtilFile = os.path.join(self.options.guDir, 'guUtil.py')

        # local function: normalize an id-extraction regex option so that it
        # (a) has a (?P<id>...) capture group and (b) is rendered as a raw
        # string literal, ready to be spliced into an extractID(...) call.
        def groomRe(name, default):
            r = getattr(self.options, name)
            if r is None:
                r = default
            if "(?P<id>" not in r:
                r = "(?P<id>%s)" % r
            r = "r'%s'" % r
            setattr(self.options, name, r)
            self.debug('%s=%s\n' % (name, r))

        if self.options.ire is None:
            # Default pattern: grabs an id from attribute text such as
            # 'gene_id "Abc1"' or 'ID=Abc1'.
            self.options.ire = r' *(gene_?(id)?|id)? *[=:]? *"?(?P<id>[^"; ]+)"?'
        dflt = self.options.ire
        groomRe('ire1', dflt)
        groomRe('ire2', dflt)

    #----------------------------------------------------
    def mkTmp(self, preserve=False):
        """Create an empty temp file and return its path.

        Unless preserve is True, the path is recorded for later removal
        by cleanupTempFiles().
        """
        (fd, path) = mkstemp()
        os.close(fd)
        if not preserve:
            self.tempfiles.append(path)
        return path

    #----------------------------------------------------
    def cleanupTempFiles(self):
        """Remove every temp file recorded by mkTmp()."""
        for tf in self.tempfiles:
            os.remove(tf)

    #----------------------------------------------------
    def debug(self, s, ts=False):
        """Write s to stderr (or the log file); prefix a timestamp if ts."""
        if ts:
            sys.stderr.write(now() + ': ')
        sys.stderr.write(s)

    #----------------------------------------------------
    def execStep(self, tool, args):
        """Run one TableTools step: log it, execute it, and return the
        number of output rows the tool reports."""
        # was string.join(args, " ") -- deprecated string-module function
        self.debug(tool.__name__ + " " + " ".join(args) + "\n", True)
        t = tool(args)
        t.go()
        t.closeFiles()
        return t.nOutputRows

    #----------------------------------------------------
    def go_noAggregate(self):
        """No-aggregate mode: bucketize overlapping feature pairs directly."""
        ### Select rows from first file.
        args = [
            "--file1=" + self.options.file1,
            self.options.chrMatchExpr,
        ]
        if len(self.options.types1) > 0:
            args.append("?string.lower(IN[3]) in %s" % self.types1)
        if len(self.options.notTypes1) > 0:
            args.append("?string.lower(IN[3]) not in %s" % self.notTypes1)
        if len(args) > 2:
            # At least one type filter was given: run TF to a temp file.
            f1 = self.mkTmp()
            args.append("--out-file=" + f1)
            self.execStep(TF, args)
        else:
            f1 = self.options.file1

        ### Select rows from second file.
        args = [
            "--file1=" + self.options.file2,
            self.options.chrMatchExpr,
        ]
        if len(self.options.types2) > 0:
            args.append("?string.lower(IN[3]) in %s" % self.types2)
        if len(self.options.notTypes2) > 0:
            args.append("?string.lower(IN[3]) not in %s" % self.notTypes2)
        if len(args) > 2:
            f2 = self.mkTmp()
            args.append("--out-file=" + f2)
            self.execStep(TF, args)
        else:
            f2 = self.options.file2

        ### find overlapping features.
        overlaps = self.mkTmp()
        args = [
            "-1", f1,
            "-2", f2,
            "-s", "both",
            "-k", self.minOverlap,
            "-o", overlaps,
        ]
        if self.options.ignoreStrand:
            args = args + ["--columns1", "1,4,5"]
        novl = self.execStep(FJ, args)
        if self.options.noSelfHits:
            # Filter out pairs whose two id columns are equal.
            prevOverlaps = overlaps
            overlaps = self.mkTmp()
            args = [
                "?IN[10]!=IN[19]",
                "-1", prevOverlaps,
                "-o", overlaps,
            ]
            novl = self.execStep(TF, args)
        if novl == 0:
            self.debug("No overlapping features detected.\n")

        ### bucketize the pairs.
        bucketized = self.mkTmp()
        self.execStep(TB, [
            "--file1=" + overlaps,
            "--k1=10",
            "--k2=19",
            "-t" + bucketized,
            "IN[1:]",
            "int(IN[11]!=IN[20])",              ## compute column: 0==same strands 1==diff
            "int(string.lower(IN[7])!='gene')", ## compute column: 0==all genes 1==nongene
        ])
        # renamed from 'sorted' -- it shadowed the builtin of the same name
        sortedPairs = self.mkTmp()
        self.execStep(TS, [
            "--file1=" + bucketized,
            "-k 3",
            "-k 1",
            "--out-file=" + sortedPairs,
        ])
        self.execStep(TP, [
            "--file1=" + sortedPairs,
            "-o" + self.options.template,
            "-p 3",
        ])
        ### Bucketization did not generate 1-0 and 0-1 buckets
        ### (because we only fed it overlapping pairs).
        ### Generate these buckets by diff'ing the inputs
        ### against the fjoin output.
        self.execStep(TD, [
            "--file1=" + f1,
            "--k1=9",
            "--file2=" + bucketized,
            "--k2=13",
            "--out-file=" + (self.options.template % "1-0")
        ])
        self.execStep(TD, [
            "--file1=" + f2,
            "--k1=9",
            "--file2=" + bucketized,
            "--k2=22",
            "--out-file=" + (self.options.template % "0-1")
        ])

    #----------------------------------------------------
    def go_aggregate(self):
        """Aggregate mode: roll base features (e.g. exons) up to higher-level
        features (e.g. genes) by extracted id, then bucketize the overlaps."""
        # Select rows from first file, and extract feature ids.
        f1 = self.mkTmp()
        args = [
            "--file1=" + self.options.file1,
            "--exec-file=" + self.options.guUtilFile,
        ]
        if len(self.options.types1) > 0:
            args.append("?string.lower(IN[3]) in %s" % self.types1)
        if len(self.options.notTypes1) > 0:
            args.append("?string.lower(IN[3]) not in %s" % self.notTypes1)
        args += [
            "IN[1:9]",
            'extractID(IN[9],%s)' % self.options.ire1,
            "--out-file=" + f1
        ]
        self.execStep(TF, args)

        # Select rows from second file, and extract feature ids.
        f2 = self.mkTmp()
        args = [
            "--file1=" + self.options.file2,
            "--exec-file=" + self.options.guUtilFile,
        ]
        if len(self.options.types2) > 0:
            args.append("?string.lower(IN[3]) in %s" % self.types2)
        if len(self.options.notTypes2) > 0:
            args.append("?string.lower(IN[3]) not in %s" % self.notTypes2)
        args += [
            "IN[1:9]",
            'extractID(IN[9],%s)' % self.options.ire2,
            "--out-file=" + f2
        ]
        self.execStep(TF, args)

        # Find the distinct higher-level features in file1 and count the base features
        genes1 = os.path.join(self.options.outDir, "features1.txt")
        self.execStep(TA, [
            "--file1=" + f1,
            "-g9",        # id
            "-acount",    # num. lines w/ this id
            "-afirst:1",  # first chr
            "-amin:4",    # min start val
            "-amax:5",    # max end val
            "-afirst:7",  # first strand
            "--out-file=" + genes1
        ])
        # Find the unique genes in file2 and count the exons
        genes2 = os.path.join(self.options.outDir, "features2.txt")
        self.execStep(TA, [
            "--file1=" + f2,
            "-g9",
            "-acount",
            "-afirst:1",
            "-amin:4",
            "-amax:5",
            "-afirst:7",
            "--out-file=" + genes2
        ])

        # Find all overlapping feature pairs.
        ovlExons = self.mkTmp()
        args = [
            "-1", f1,
            "-2", f2,
            "-o", ovlExons,
            "-s", "both",
            "-k", self.minOverlap,
        ]
        if self.options.ignoreStrand:
            args = args + ["--columns1", "1,4,5"]
        novl = self.execStep(FJ, args)
        if self.options.noSelfHits:
            prevOvl = ovlExons
            ovlExons = self.mkTmp()
            args = [
                "?IN[10] != IN[19]",
                "-1", prevOvl,
                "-o", ovlExons,
            ]
            novl = self.execStep(TF, args)
        if novl == 0:
            self.debug("No overlapping features detected.\n")

        # Aggregate overlapping feature pairs into higher-level overlaps.
        # Count the base features involved.
        ovlGenes = self.mkTmp()
        self.execStep(TA, [
            "-g10,19",
            "-acount:5",
            "-acount:14",
            "--file1=" + ovlExons,
            "--out-file=" + ovlGenes
        ])
        # Join with genes1 to pull in total exon counts.
        # Do an outer join so that every gene in genes1 is
        # represented.
        tmp1 = self.mkTmp()
        self.execStep(TJ, [
            "--file1=" + ovlGenes,
            "--file2=" + genes1,
            "--k1=1",
            "--k2=1",
            "--right",
            "-n.",
            "--out-file=" + tmp1
        ])
        # Join with genes2 to pull in total exon counts.
        # Do a bidi-outer join so that every gene in genes2 is
        # represented, as is every gene in genes1.
        tmp2 = self.mkTmp()
        self.execStep(TJ, [
            "--file1=" + tmp1,
            "--file2=" + genes2,
            "--k1=2",
            "--k2=1",
            "--left",
            "--right",
            "-n.",
            "--out-file=" + tmp2
        ])
        # Filter for final output formatting.
        tmp3 = self.mkTmp()
        self.execStep(TF, [
            "--file1=" + tmp2,
            "--out-file=" + tmp3,
            "IN[7]=='.' and IN[13] or IN[7]",
            "IN[10]=='.' and IN[16] or IN[16]=='.' and IN[10] or IN[16]==IN[10] and IN[16] or '???'",
            "IN[5]",
            "IN[3]",
            "IN[6]",
            "IN[8]",
            "IN[9]",
            "IN[10]",
            "IN[11]",
            "IN[4]",
            "IN[12]",
            "IN[14]",
            "IN[15]",
            "IN[16]",
        ])
        # Bucketize the overlapping genes. Output separate file for
        # each bucket.
        self.execStep(TB, [
            "--file1=" + tmp3,
            "--k1=3",
            "--k2=9",
            "-n.",
            "-t" + self.options.template,
            "IN[1:3]",
            "IN[4:]",  ## remove the bucket id column
        ])

    #----------------------------------------------------
    def go(self):
        """Run the pipeline in the mode selected on the command line."""
        self.debug("======================================\n")
        self.debug("Starting GU pipeline\n", True)
        self.debug("Command line:\n%s\n" % (" ".join(self.argv)))
        if self.options.noAggregate:
            self.go_noAggregate()
        else:
            self.go_aggregate()
        self.debug("Pipeline completed.\n", True)
        self.debug("Cleaning up...\n")
        # Delete the temp files
        # NOTE(review): cleanup was already disabled in the original;
        # temp files are deliberately left behind (debugging aid?).
        #self.cleanupTempFiles()
        self.debug("Goodbye.\n\n")


#--------------------------------
if __name__ == "__main__":
    GUPipeline(sys.argv).go()