GU_files/gu.py @ 0:5231cbbed6ca (draft, default, tip)

Upload GU tool
author: carlos-reyes
date:   Fri, 15 Jun 2012 10:39:48 -0400
#!/usr/local/bin/python
#
# gu - Gene Unification Pipeline
#
# This pipeline compares two GFF input files for coordinate overlaps and
# bucketizes the overlapping feature pairs. The comparison step takes two
# lists of features [a1, a2, ...] and [b1, b2, ...], and outputs all feature
# pairs [ai, bj] such that ai overlaps bj.
#
# These feature pairs form the edges of a bipartite graph of overlapping
# a's and b's. Bucketization finds the connected components of that graph
# and segregates the components (or clusters) into buckets, according to
# how many a's and b's each cluster contains (one-to-one, one-to-many, etc.).
# The buckets are written to separate files (bucket_1-1.txt, bucket_1-n.txt, ...).
# (An illustrative sketch of the bucketization idea follows the imports below.)
#
# One line is output per overlapping pair [ai, bj]. The line contains
# all the columns of ai and bj plus three additional columns describing the
# cluster to which the pair belongs. In detail:
#   1. The id of the cluster (connected component) to which this pair belongs.
#   2. The bucket label (e.g. "n-1").
#   3. The actual counts of a's and b's in the cluster, e.g. "3-1" for 3 a's and 1 b.
#   4. All of the columns of ai, followed by all of the columns of bj.
#
# GU operates in one of two modes. The description above corresponds to
# 'no-aggregate' mode: the overlapping features are bucketized directly.
# Sometimes, however, you want the results aggregated to a higher level. For
# example, you may have files of exons, but what you want is buckets of
# overlapping genes. For this there is 'aggregate' mode, in which lists of
# overlapping exons (for example) are turned into lists of overlapping genes
# before being bucketized. Because aggregation loses the details of the
# underlying features, the output format differs somewhat from no-aggregate
# mode.
#
#--------------------------------------------

import sys
import time
import string
from tempfile import mkstemp
import os
import os.path
from TableTools import TA, TB, TD, TF, TI, TJ, TP, TS, TU, TX, FJ
from optparse import OptionParser
import re

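#--------------------------------------------
# A minimal, self-contained sketch of the bucketization idea described in the
# header comment: treat the overlap pairs as edges of a bipartite graph, find
# the connected components with a union-find, and label each pair by the number
# of distinct a's and b's in its component. This sketch is illustrative only;
# it is NOT called by the pipeline, which delegates the real work to the
# TableTools TB tool. The function and variable names here are assumptions.
#
def _sketch_bucketize(pairs):
    # pairs: a list of (aId, bId) overlap pairs.
    # Returns a dict mapping each pair to its bucket label:
    # '1-1', '1-n', 'n-1', or 'n-m'.
    parent = {}

    def find(x):
        while parent.setdefault(x, x) != x:
            parent[x] = parent[parent[x]]   # path halving
            x = parent[x]
        return x

    def union(x, y):
        parent[find(x)] = find(y)

    # Union the two endpoints of every edge. Prefix the ids so that an a and
    # a b that happen to share an id remain distinct nodes.
    for a, b in pairs:
        union(('a', a), ('b', b))

    # Collect the distinct a's and b's in each connected component.
    members = {}
    for a, b in pairs:
        aset, bset = members.setdefault(find(('a', a)), (set(), set()))
        aset.add(a)
        bset.add(b)

    def label(na, nb):
        if na == 1:
            return '1-1' if nb == 1 else '1-n'
        return 'n-1' if nb == 1 else 'n-m'

    result = {}
    for a, b in pairs:
        aset, bset = members[find(('a', a))]
        result[(a, b)] = label(len(aset), len(bset))
    return result
#
# For example, _sketch_bucketize([('a1','b1'), ('a2','b1'), ('a3','b2')])
# labels the first two pairs 'n-1' and the third '1-1'. (The '1-0' and '0-1'
# buckets come from features that appear in no pair at all, which the real
# pipeline derives separately.)
#--------------------------------------------
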
USAGE = "%prog [options] --f1 file1.gff --f2 file2.gff\n(For help, use -h option.)"

def now():
    # Current local time as a human-readable string, used for log timestamps.
    return time.asctime(time.localtime(time.time()))

class GUPipeline:
    #----------------------------------------------------
    def __init__(self, argv):
        # 1. Initialize the option parser.
        #
        self.argv = argv
        self.parser = OptionParser(USAGE)
        self.tempfiles = []
        self.types1 = []
        self.types2 = []

        self.parser.add_option("-1", "--f1", "--file1",
            dest="file1",
            metavar="FILE1",
            default=None,
            help="The first GFF file.")

        self.parser.add_option("-2", "--f2", "--file2",
            dest="file2",
            metavar="FILE2",
            default=None,
            help="The second GFF file.")

        self.parser.add_option("-k", "--minOverlap",
            dest="k",
            metavar="AMT",
            default="1",
            help="The minimum required overlap. (Default: 1)")

        #self.parser.set_defaults(mode="advanced")
        self.parser.add_option("--t1",
            dest="types1",
            action="append",
            metavar="GFFTYPE",
            default=[],
            help="A GFF type to select from file 1. Repeatable. (Default = all types)")

        self.parser.add_option("--t2",
            dest="types2",
            action="append",
            metavar="GFFTYPE",
            default=[],
            help="A GFF type to select from file 2. Repeatable. (Default = all types)")

        self.parser.add_option("--nt1",
            dest="notTypes1",
            action="append",
            metavar="GFFTYPE",
            default=[],
            help="A GFF type to filter out of file 1. Repeatable. (Default = filters no types)")

        self.parser.add_option("--nt2",
            dest="notTypes2",
            action="append",
            metavar="GFFTYPE",
            default=[],
            help="A GFF type to filter out of file 2. Repeatable. (Default = filters no types)")

        # (An illustrative extract-id sketch, _sketch_extract_id, appears near
        # the end of this file.)
        self.parser.add_option("--ire",
            dest="ire",
            metavar="REGEX",
            default=None,
            help="A regular expression that will be used to extract ids from column 9. " +
                "(Also: what you'll feel as you try to get this parameter right...) " +
                "Only applicable in aggregate mode. You can specify different patterns for " +
                "inputs 1 and 2 by specifying --ire1 and --ire2 instead. " +
                "Example: to aggregate both inputs by MGI id: --ire 'MGI:[0-9]+'. " +
                "Example: to aggregate input 1 by MGI id and input 2 by VEGA id: " +
                "--ire1 'MGI:[0-9]+' --ire2 'OTTMUSG[0-9]+'. " +
                "Advanced usage: Sometimes you need to define a regular expression where the actual " +
                "id you wish to extract is a sub-part of the whole pattern. This is called a 'capture'. " +
                "To capture the id, surround that part of the pattern with the magic symbols " +
                "'(?P<id>' and ')'. For example, suppose you want to capture the MGI id only when it's " +
                "part of a Dbxref attribute like this: 'Dbxref=MGI:MGI:012345;'. You could use the following " +
                "regular expression: 'Dbxref=MGI:(?P<id>MGI:[0-9]+);'."
            )

        self.parser.add_option("--ire1",
            dest="ire1",
            metavar="REGEX",
            default=None,
            help="Specify the regex for input 1 only. See --ire."
            )

        self.parser.add_option("--ire2",
            dest="ire2",
            metavar="REGEX",
            default=None,
            help="Specify the regex for input 2 only. See --ire."
            )

        self.parser.add_option("-i", "--ignoreStrand",
            dest="ignoreStrand",
            action="store_true",
            default=False,
            help="Ignore strand when determining overlaps. (Default = strands must match)")

        self.parser.add_option("-n", "--noAggregate",
            dest="noAggregate",
            action="store_true",
            default=False,
            help="Do not aggregate features. (Default = aggregate mode)")

        self.parser.add_option("-C", "--chrMatchLoose",
            dest="chrMatchLoose",
            action="store_true",
            default=False,
            help="If specified, chromosome matching is 'loose'. Otherwise it is exact. " + \
                "In loose matching, a leading 'chr' is removed from the chromosome field, " + \
                "so that '19' matches 'Chr19'.")

        self.parser.add_option("-s", "--noSelfHits",
            dest="noSelfHits",
            action="store_true",
            default=False,
            help="Ignore self-hits in overlap detection. (Default = reports self-hits)")

        self.parser.add_option("-d", "--od",
            dest="outDir",
            metavar="DIRECTORY",
            default=".",
            help="Path of the output directory. (Default = current directory)")

        self.parser.add_option("-t",
            dest="template",
            metavar="TEMPLATE",
            default="bucket_%s.txt",
            help="Template string for naming the output bucket files. " + \
                "Must contain the substring '%s', which is replaced by the bucket class. " + \
                "The classes are: '1-0', '0-1', '1-1', '1-n', 'n-1', 'n-m'. " + \
                "(Default: bucket_%s.txt)")

        self.parser.add_option("-l", "--lf", "--logFile",
            dest="logFile",
            metavar="FILE",
            default=None,
            help="Log file. (Default = writes to stderr)")

        # 2. Parse the command line.
        #
        (self.options, xxx) = self.parser.parse_args(argv)

        # 3. Validate the args.
        #
        if self.options.logFile is not None:
            sys.stderr = open(self.options.logFile, 'a')

        if self.options.file1 is None \
           or self.options.file2 is None:
            self.parser.error("Must specify both --f1 and --f2.")

        if not os.path.isdir(self.options.outDir):
            self.parser.error("Output directory " + self.options.outDir +
                " does not exist or is not a directory.")

        # Store the type lists as the lowercased string repr of a Python list,
        # so they can be interpolated directly into TableTools filter
        # expressions, e.g. "?string.lower(IN[3]) in ['gene', 'exon']".
        self.types1 = str(self.options.types1).lower()
        self.types2 = str(self.options.types2).lower()
        self.notTypes1 = str(self.options.notTypes1).lower()
        self.notTypes2 = str(self.options.notTypes2).lower()

        self.minOverlap = self.options.k

        if '%s' not in self.options.template:
            self.options.template += '%s'

        self.options.template = \
            os.path.join(self.options.outDir, self.options.template)

        if self.options.chrMatchLoose:
            # Strip a leading 'chr'/'chrom'/'chromosome' (case-insensitive)
            # from column 1, so that e.g. 'Chr19' compares equal to '19'.
            self.options.chrMatchExpr = "[re.sub('^(?i)chr(om(osome)?)?', '', IN[1])]+IN[2:]"
        else:
            self.options.chrMatchExpr = "IN[1:]"

        self.options.guDir = os.path.split(__file__)[0]
        self.options.guUtilFile = os.path.join(self.options.guDir, 'guUtil.py')

        # Local helper: normalize an id-extraction regex option. Ensure the
        # pattern has a named 'id' capture group, then wrap it as a raw string
        # literal so it can be embedded in a TableTools extractID() expression.
        def groomRe(name, default):
            r = getattr(self.options, name)
            if r is None:
                r = default
            if "(?P<id>" not in r:
                r = "(?P<id>%s)" % r
            r = "r'%s'" % r
            setattr(self.options, name, r)
            self.debug('%s=%s\n' % (name, r))

        if self.options.ire is None:
            self.options.ire = r' *(gene_?(id)?|id)? *[=:]? *"?(?P<id>[^"; ]+)"?'
        dflt = self.options.ire
        groomRe('ire1', dflt)
        groomRe('ire2', dflt)

    #----------------------------------------------------
    def mkTmp(self, preserve=False):
        # Create an empty temp file and return its path. Unless preserve is
        # True, the file is registered for removal by cleanupTempFiles().
        tf = mkstemp()
        os.close(tf[0])
        if not preserve:
            self.tempfiles.append(tf[1])
        return tf[1]

    #----------------------------------------------------
    def cleanupTempFiles(self):
        for tf in self.tempfiles:
            os.remove(tf)

    #----------------------------------------------------
    def debug(self, s, ts=False):
        # Write a message to the log (stderr), optionally prefixed with a timestamp.
        if ts:
            sys.stderr.write(now() + ': ')
        sys.stderr.write(s)

    #----------------------------------------------------
    def execStep(self, tool, args):
        # Run one TableTools step with the given argument list and return the
        # number of rows it wrote.
        self.debug(tool.__name__ + " " + string.join(args, " ") + "\n", True)
        t = tool(args)
        t.go()
        t.closeFiles()
        return t.nOutputRows

    #----------------------------------------------------
    def go_noAggregate(self):

        ### Select rows from the first file.

        args = [
            "--file1=" + self.options.file1,
            self.options.chrMatchExpr,
            ]

        if len(self.options.types1) > 0:
            args.append("?string.lower(IN[3]) in %s" % self.types1)

        if len(self.options.notTypes1) > 0:
            args.append("?string.lower(IN[3]) not in %s" % self.notTypes1)

        if len(args) > 2:
            f1 = self.mkTmp()
            args.append("--out-file=" + f1)
            self.execStep(TF, args)
        else:
            f1 = self.options.file1

        ### Select rows from the second file.

        args = [
            "--file1=" + self.options.file2,
            self.options.chrMatchExpr,
            ]

        if len(self.options.types2) > 0:
            args.append("?string.lower(IN[3]) in %s" % self.types2)

        if len(self.options.notTypes2) > 0:
            args.append("?string.lower(IN[3]) not in %s" % self.notTypes2)

        if len(args) > 2:
            f2 = self.mkTmp()
            args.append("--out-file=" + f2)
            self.execStep(TF, args)
        else:
            f2 = self.options.file2

        ### Find overlapping features.

        overlaps = self.mkTmp()
        args = [
            "-1", f1,
            "-2", f2,
            "-s", "both",
            "-k", self.minOverlap,
            "-o", overlaps,
            ]
        if self.options.ignoreStrand:
            args = args + ["--columns1", "1,4,5"]
        novl = self.execStep(FJ, args)

        if self.options.noSelfHits:
            xxx = overlaps
            overlaps = self.mkTmp()
            args = [
                "?IN[10]!=IN[19]",
                "-1", xxx,
                "-o", overlaps,
                ]
            novl = self.execStep(TF, args)
            #self.debug("FJ out: " + xxx)
            #self.debug("TF out: " + overlaps)
            #os.system("diff %s %s" % (xxx, overlaps))

        if novl == 0:
            self.debug("No overlapping features detected.\n")

        ### Bucketize the pairs.

        bucketized = self.mkTmp()
        self.execStep(TB, [
            "--file1=" + overlaps,
            "--k1=10",
            "--k2=19",
            "-t" + bucketized,
            "IN[1:]",
            "int(IN[11]!=IN[20])",              ## computed column: 0==same strands, 1==different
            "int(string.lower(IN[7])!='gene')", ## computed column: 0==all genes, 1==non-gene
            ])

        sortedFile = self.mkTmp()
        self.execStep(TS, [
            "--file1=" + bucketized,
            "-k 3",
            "-k 1",
            "--out-file=" + sortedFile,
            ])

        self.execStep(TP, [
            "--file1=" + sortedFile,
            "-o" + self.options.template,
            "-p 3",
            ])

        ### Bucketization did not generate the 1-0 and 0-1 buckets
        ### (because it was only fed overlapping pairs).
        ### Generate these buckets by diff'ing the inputs
        ### against the bucketized fjoin output.

        self.execStep(TD, [
            "--file1=" + f1,
            "--k1=9",
            "--file2=" + bucketized,
            "--k2=13",
            "--out-file=" + (self.options.template % "1-0")])

        self.execStep(TD, [
            "--file1=" + f2,
            "--k1=9",
            "--file2=" + bucketized,
            "--k2=22",
            "--out-file=" + (self.options.template % "0-1")])

    #----------------------------------------------------
    def go_aggregate(self):

        # Select rows from the first file, and extract feature ids.
        f1 = self.mkTmp()
        args = [
            "--file1=" + self.options.file1,
            "--exec-file=" + self.options.guUtilFile,
            ]
        if len(self.options.types1) > 0:
            args.append("?string.lower(IN[3]) in %s" % self.types1)
        if len(self.options.notTypes1) > 0:
            args.append("?string.lower(IN[3]) not in %s" % self.notTypes1)
        args += [
            "IN[1:9]",
            'extractID(IN[9],%s)' % self.options.ire1,
            "--out-file=" + f1
            ]

        self.execStep(TF, args)

        # Select rows from the second file, and extract feature ids.
        f2 = self.mkTmp()
        args = [
            "--file1=" + self.options.file2,
            "--exec-file=" + self.options.guUtilFile,
            ]
        if len(self.options.types2) > 0:
            args.append("?string.lower(IN[3]) in %s" % self.types2)
        if len(self.options.notTypes2) > 0:
            args.append("?string.lower(IN[3]) not in %s" % self.notTypes2)
        args += [
            "IN[1:9]",
            'extractID(IN[9],%s)' % self.options.ire2,
            "--out-file=" + f2
            ]
        self.execStep(TF, args)

        # Find the distinct higher-level features in file 1 and count the base features.
        genes1 = os.path.join(self.options.outDir, "features1.txt")
        self.execStep(TA, [
            "--file1=" + f1,
            "-g9",        # id
            "-acount",    # num. lines w/ this id
            "-afirst:1",  # first chr
            "-amin:4",    # min start val
            "-amax:5",    # max end val
            "-afirst:7",  # first strand
            "--out-file=" + genes1
            ])

        # Find the distinct higher-level features in file 2 and count the base features.
        genes2 = os.path.join(self.options.outDir, "features2.txt")
        self.execStep(TA, [
            "--file1=" + f2,
            "-g9",
            "-acount",
            "-afirst:1",
            "-amin:4",
            "-amax:5",
            "-afirst:7",
            "--out-file=" + genes2
            ])

        # Find all overlapping feature pairs.
        ovlExons = self.mkTmp()
        args = [
            "-1", f1,
            "-2", f2,
            "-o", ovlExons,
            "-s", "both",
            "-k", self.minOverlap,
            ]
        if self.options.ignoreStrand:
            args = args + ["--columns1", "1,4,5"]
        novl = self.execStep(FJ, args)

        if self.options.noSelfHits:
            xxx = ovlExons
            ovlExons = self.mkTmp()
            args = [
                "?IN[10] != IN[19]",
                "-1", xxx,
                "-o", ovlExons,
                ]
            novl = self.execStep(TF, args)

        if novl == 0:
            self.debug("No overlapping features detected.\n")

        # Aggregate overlapping feature pairs into higher-level overlaps.
        # Count the base features involved.
        ovlGenes = self.mkTmp()
        self.execStep(TA, [
            "-g10,19",
            "-acount:5",
            "-acount:14",
            "--file1=" + ovlExons,
            "--out-file=" + ovlGenes
            ])

        # Join with genes1 to pull in total base-feature (e.g. exon) counts.
        # Do an outer join so that every gene in genes1 is
        # represented.
        tmp1 = self.mkTmp()
        self.execStep(TJ, [
            "--file1=" + ovlGenes,
            "--file2=" + genes1,
            "--k1=1",
            "--k2=1",
            "--right",
            "-n.",
            "--out-file=" + tmp1
            ])

        # Join with genes2 to pull in total base-feature (e.g. exon) counts.
        # Do a bidirectional outer join so that every gene in genes2 is
        # represented, as is every gene in genes1.
        tmp2 = self.mkTmp()
        self.execStep(TJ, [
            "--file1=" + tmp1,
            "--file2=" + genes2,
            "--k1=2",
            "--k2=1",
            "--left",
            "--right",
            "-n.",
            "--out-file=" + tmp2
            ])

        # Filter for final output formatting.
        tmp3 = self.mkTmp()
        self.execStep(TF, [
            "--file1=" + tmp2,
            "--out-file=" + tmp3,
            "IN[7]=='.' and IN[13] or IN[7]",
            "IN[10]=='.' and IN[16] or IN[16]=='.' and IN[10] or IN[16]==IN[10] and IN[16] or '???'",
            "IN[5]", "IN[3]", "IN[6]", "IN[8]", "IN[9]", "IN[10]",
            "IN[11]", "IN[4]", "IN[12]", "IN[14]", "IN[15]", "IN[16]",
            ])

        # Bucketize the overlapping genes. Output a separate file for
        # each bucket.
        self.execStep(TB, [
            "--file1=" + tmp3,
            "--k1=3",
            "--k2=9",
            "-n.",
            "-t" + self.options.template,
            "IN[1:3]", "IN[4:]",   ## remove the bucket id column
            ])

    #----------------------------------------------------
    def go(self):
        self.debug("======================================\n")
        self.debug("Starting GU pipeline\n", True)
        self.debug("Command line:\n%s\n" % (" ".join(self.argv)))

        if self.options.noAggregate:
            self.go_noAggregate()
        else:
            self.go_aggregate()

        self.debug("Pipeline completed.\n", True)
        self.debug("Cleaning up...\n")

        # Delete the temp files
        #self.cleanupTempFiles()
        self.debug("Goodbye.\n\n")

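#--------------------------------------------
# A minimal, self-contained sketch of the kind of id extraction that the
# --ire/--ire1/--ire2 patterns drive (see the --ire help text above). The
# pipeline itself relies on the extractID() helper loaded from guUtil.py via
# --exec-file; that helper's real implementation is not shown in this file, so
# this sketch is an assumption for illustration only and is not called by the
# pipeline.
#
def _sketch_extract_id(col9, pattern):
    # Return the text matched by the named 'id' group of pattern within the
    # GFF column 9 string, or None if there is no match.
    m = re.search(pattern, col9)
    if m is None:
        return None
    return m.group('id')
#
# For example (hypothetical attribute text):
#   _sketch_extract_id('ID=x;Dbxref=MGI:MGI:012345;',
#                      r'Dbxref=MGI:(?P<id>MGI:[0-9]+);')
# returns 'MGI:012345'.
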
#--------------------------------
GUPipeline(sys.argv).go()