Mercurial > repos > carlos-reyes > gu_pipeline_no
comparison GU_files/gu.py @ 0:5231cbbed6ca draft default tip
Upload GU tool
author | carlos-reyes |
---|---|
date | Fri, 15 Jun 2012 10:39:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5231cbbed6ca |
---|---|
1 #!/usr/local/bin/python | |
2 # | |
3 # gu - Gene Unification Pipeline | |
4 # | |
5 # This pipeline compares two GFF input files for coordinate overlaps, and | |
6 # bucketizes the overlapping feature pairs. The comparison step takes two lists | |
7 # of features [a1, a2, ...], [b1, b2, ...], outputs all feature pairs [ai, bj] such | |
8 # that ai overlaps bj. | |
9 # | |
10 # These feature pairs form the edges in a bipartite graph of overlapping | |
11 # a's and b's. Bucketization finds the connected components of the graph | |
12 # and segregates those components (or clusters) into buckets, according to | |
13 # how many a's and b's there are in the cluster (one-to-one, one-to-many, etc). | |
14 # The buckets are output to separate files (bucket_1-1.txt, bucket_1-n.txt, ...) | |
15 # | |
16 # There is one line output per overlapping pair [ai,bj]. This line contains | |
17 # all the columns of ai and bj plus three additional columns about the cluster | |
18 # to which this pair belongs. In detail: | |
19 # 1. The id of the cluster (connected component) to which this pair belongs. | |
20 # 2. The bucket label (e.g. "n-1"). | |
21 # 3. The actual counts of a's and b's in the cluster, e.g. "3-1" for 3 a's and 1 b. | |
22 # 4- All of the columns of ai followed by all of the columns of bj. | |
23 # | |
24 # GU operates in one of two modes. The above description corresponds to 'no-aggregate' | |
25 # mode: the overlapping features are bucketized directly. Sometimes, however, you want | |
26 # the results aggregated to a higher level. For example, you may have files of exons, | |
27 # but what you want is buckets of overlapping genes. For this, there is 'aggregate' mode, | |
28 # in which lists of overlapping exons (for example) are turned into lists of overlapping genes | |
29 # before being bucketized. Because aggregation loses the details of the underlying features, | |
30 # the output format is somewhat different from no-aggregate mode. | |
31 # | |
32 #-------------------------------------------- | |
33 | |
34 import sys | |
35 import time | |
36 import string | |
37 from tempfile import mkstemp | |
38 import os | |
39 import os.path | |
40 from TableTools import TA,TB,TD,TF,TI,TJ,TP,TS,TU,TX,FJ | |
41 from optparse import OptionParser | |
42 import re | |
43 | |
44 USAGE="%prog [options] --f1 file1.gff --f2 file2.gff\n(For help, use -h option.)" | |
45 | |
46 def now(): | |
47 return time.asctime(time.localtime(time.time())) | |
48 | |
49 class GUPipeline : | |
50 #---------------------------------------------------- | |
51 def __init__(self, argv): | |
52 # 1. initialize the option parser | |
53 # | |
54 self.argv = argv | |
55 self.parser = OptionParser(USAGE) | |
56 self.tempfiles = [] | |
57 self.types1 = [] | |
58 self.types2 = [] | |
59 | |
60 self.parser.add_option("-1", "--f1", "--file1", | |
61 dest="file1", | |
62 metavar="FILE1", | |
63 default=None, | |
64 help="The first GFF file.") | |
65 | |
66 self.parser.add_option("-2", "--f2", "--file2", | |
67 dest="file2", | |
68 metavar="FILE2", | |
69 default=None, | |
70 help="The second GFF file.") | |
71 | |
72 self.parser.add_option("-k", "--minOverlap", | |
73 dest="k", | |
74 metavar=" AMT", | |
75 default="1", | |
76 help="The minimum required overlap. (Default: 1)") | |
77 | |
78 #self.parser.set_defaults(mode="advanced") | |
79 self.parser.add_option("--t1", | |
80 dest="types1", | |
81 action="append", | |
82 metavar="GFFTYPE", | |
83 default=[], | |
84 help="A GFF type to select from file 1. Repeatable. (Default = all types)" ) | |
85 | |
86 self.parser.add_option("--t2", | |
87 dest="types2", | |
88 action="append", | |
89 metavar="GFFTYPE", | |
90 default=[], | |
91 help="A GFF type to select from file 2. Repeatable.(Default = all types)" ) | |
92 | |
93 self.parser.add_option("--nt1", | |
94 dest="notTypes1", | |
95 action="append", | |
96 metavar="GFFTYPE", | |
97 default=[], | |
98 help="A GFF type to FILTER OUT of file 1. Repeatable. (Default = filters no types)" ) | |
99 | |
100 self.parser.add_option("--nt2", | |
101 dest="notTypes2", | |
102 action="append", | |
103 metavar="GFFTYPE", | |
104 default=[], | |
105 help="A GFF type to filter out of file 2. Repeatable.(Default = filters no types)" ) | |
106 | |
107 self.parser.add_option("--ire", | |
108 dest="ire", | |
109 metavar="REGEX", | |
110 default=None, | |
111 help="A regular expression that will be used to extract ids from column 9. " + | |
112 "(Also: what you'll feel as you try to get this parameter right...) " + | |
113 "Only applicable in aggregate mode. You can specify different patterns for " + | |
114 "inputs 1 and 2 by specifying --ire1 and --ire2 instead. " + | |
115 "Example: to aggregate both inputs by MGI id: --ire 'MGI:[0-9]+'. " + | |
116 "Example: to aggregate input1 by MGI id and input 2 by VEGA id: " + | |
117 "--ire1 'MGI:[0-9]+' --ire2 'OTTMUSG[0-9]+'. " + | |
118 "Advanced usage: Sometimes you need to define a regular expression where the actual " + | |
119 "id you wish to extract is a sub-part of the whole pattern. This is called a 'capture'. " + | |
120 "To capture the id, surround that part of the pattern with the magic symbols " + | |
121 "'(?P<id>' and ')'. For example, suppose you want to capture the MGI id only when it's " + | |
122 "part of a dbref attribute like this: 'Dbxref=MGI:MGI:012345;'. You could use the following " + | |
123 "regular expression: 'Dbxref=MGI:(?P<id>MGI:[0-9]+);'." | |
124 ) | |
125 | |
126 self.parser.add_option("--ire1", | |
127 dest="ire1", | |
128 metavar="REGEX", | |
129 default=None, | |
130 help="Specify regex for input 1 only. See --ire." | |
131 ) | |
132 | |
133 self.parser.add_option("--ire2", | |
134 dest="ire2", | |
135 metavar="REGEX", | |
136 default=None, | |
137 help="Specify regex for input 2 only. See --ire." | |
138 ) | |
139 | |
140 self.parser.add_option("-i", "--ignoreStrand", | |
141 dest="ignoreStrand", | |
142 action="store_true", | |
143 default = False, | |
144 help="Ignore strand when determining overlaps (Default = strands must match)") | |
145 | |
146 self.parser.add_option("-n", "--noAggregate", | |
147 dest="noAggregate", | |
148 action="store_true", | |
149 default = False, | |
150 help="Do not aggregate features. Opposite of -a.") | |
151 | |
152 self.parser.add_option("-C", "--chrMatchLoose", | |
153 dest="chrMatchLoose", | |
154 action="store_true", | |
155 default = False, | |
156 help="If specified, chromosome matching is 'loose'. Otherwise it is exact. " + \ | |
157 "In loose matching, leading 'chr' is removed from chromosome field, " + \ | |
158 "so that '19' matches 'Chr19'. ") | |
159 | |
160 self.parser.add_option("-s", "--noSelfHits", | |
161 dest="noSelfHits", | |
162 action="store_true", | |
163 default = False, | |
164 help="Ignore self-hit in overlap detection. (default: reports self-hits)") | |
165 | |
166 self.parser.add_option("-d", "--od", | |
167 dest="outDir", | |
168 metavar="DIRECTORY", | |
169 default=".", | |
170 help="Path of output directory. (Default = current directory)") | |
171 | |
172 self.parser.add_option("-t", | |
173 dest="template", | |
174 metavar="TEMPLATE", | |
175 default="bucket_%s.txt", | |
176 help="Template string for naming output bucket files. " + \ | |
177 "Must contain the substring '%s', which is replaced by the bucket class. " + \ | |
178 "The classes are: '1-0', '0-1', '1-1', '1-n', 'n-1', 'n-m'. " + \ | |
179 "(Default: bucket_%s.txt)") | |
180 | |
181 self.parser.add_option("-l", "--lf", "--logFile", | |
182 dest="logFile", | |
183 metavar="FILE", | |
184 default=None, | |
185 help="Log file. (Default = writes to stderr)") | |
186 | |
187 # 2. Parse the command line | |
188 # | |
189 (self.options,xxx) = self.parser.parse_args(argv) | |
190 | |
191 # 3. Validate the args | |
192 # | |
193 if self.options.logFile is not None: | |
194 sys.stderr = open(self.options.logFile, 'a') | |
195 | |
196 if self.options.file1 is None \ | |
197 or self.options.file2 is None: | |
198 self.parser.error("Must specify both --f1 and --f2.") | |
199 | |
200 if not os.path.isdir(self.options.outDir): | |
201 self.parser.error("Output directory " + self.options.outDir + \ | |
202 " does not exist or is not a directory.") | |
203 | |
204 self.types1 = str( self.options.types1 ).lower() | |
205 self.types2 = str( self.options.types2 ).lower() | |
206 self.notTypes1 = str( self.options.notTypes1 ).lower() | |
207 self.notTypes2 = str( self.options.notTypes2 ).lower() | |
208 | |
209 self.minOverlap = self.options.k | |
210 | |
211 if not '%s' in self.options.template: | |
212 self.options.template += '%s' | |
213 | |
214 self.options.template = \ | |
215 os.path.join( self.options.outDir, self.options.template) | |
216 | |
217 if self.options.chrMatchLoose: | |
218 self.options.chrMatchExpr = "[re.sub('^(?i)chr(om(osome)?)?', '', IN[1])]+IN[2:]" | |
219 else: | |
220 self.options.chrMatchExpr = "IN[1:]" | |
221 | |
222 self.options.guDir = os.path.split(__file__)[0] | |
223 self.options.guUtilFile = os.path.join(self.options.guDir,'guUtil.py') | |
224 | |
225 | |
226 # local function | |
227 def groomRe(name, default): | |
228 r = getattr(self.options, name) | |
229 if r is None: | |
230 r = default | |
231 if "(?P<id>" not in r: | |
232 r = "(?P<id>%s)" % r | |
233 r = "r'%s'" % r | |
234 setattr(self.options, name, r ) | |
235 self.debug('%s=%s\n'%(name,r)) | |
236 | |
237 if self.options.ire is None: | |
238 self.options.ire = r' *(gene_?(id)?|id)? *[=:]? *"?(?P<id>[^"; ]+)"?' | |
239 dflt = self.options.ire | |
240 groomRe('ire1',dflt) | |
241 groomRe('ire2',dflt) | |
242 | |
243 | |
244 #---------------------------------------------------- | |
245 def mkTmp(self,preserve=False): | |
246 tf = mkstemp() | |
247 os.close(tf[0]) | |
248 if not preserve: | |
249 self.tempfiles.append(tf[1]) | |
250 return tf[1] | |
251 | |
252 #---------------------------------------------------- | |
253 def cleanupTempFiles(self): | |
254 for tf in self.tempfiles: | |
255 os.remove(tf) | |
256 | |
257 #---------------------------------------------------- | |
258 def debug(self,s,ts=False): | |
259 if ts: | |
260 sys.stderr.write(now()+': ') | |
261 sys.stderr.write(s) | |
262 | |
263 #---------------------------------------------------- | |
264 def execStep(self, tool, args): | |
265 self.debug( tool.__name__ + " " + string.join(args, " ") + "\n", True) | |
266 t = tool(args) | |
267 t.go() | |
268 t.closeFiles() | |
269 return t.nOutputRows | |
270 | |
271 #---------------------------------------------------- | |
272 def go_noAggregate(self): | |
273 | |
274 ### Select rows from first file. | |
275 | |
276 args = [ | |
277 "--file1="+self.options.file1, | |
278 self.options.chrMatchExpr, | |
279 ] | |
280 | |
281 if len(self.options.types1) > 0: | |
282 args.append( "?string.lower(IN[3]) in %s" % self.types1 ) | |
283 | |
284 if len(self.options.notTypes1) > 0: | |
285 args.append( "?string.lower(IN[3]) not in %s" % self.notTypes1 ) | |
286 | |
287 if len(args) > 2: | |
288 f1 = self.mkTmp() | |
289 args.append( "--out-file=" + f1 ) | |
290 self.execStep(TF, args) | |
291 else: | |
292 f1 = self.options.file1 | |
293 | |
294 ### Select rows from second file. | |
295 | |
296 args = [ | |
297 "--file1="+self.options.file2, | |
298 self.options.chrMatchExpr, | |
299 ] | |
300 | |
301 if len(self.options.types2) > 0: | |
302 args.append( "?string.lower(IN[3]) in %s" % self.types2 ) | |
303 | |
304 if len(self.options.notTypes2) > 0: | |
305 args.append( "?string.lower(IN[3]) not in %s" % self.notTypes2 ) | |
306 | |
307 if len(args) > 2: | |
308 f2 = self.mkTmp() | |
309 args.append( "--out-file=" + f2 ) | |
310 self.execStep(TF, args) | |
311 else: | |
312 f2 = self.options.file2 | |
313 | |
314 ### find overlapping features. | |
315 | |
316 overlaps=self.mkTmp() | |
317 args = [ | |
318 "-1", f1, | |
319 "-2", f2, | |
320 "-s", "both", | |
321 "-k", self.minOverlap, | |
322 "-o", overlaps, | |
323 ] | |
324 if self.options.ignoreStrand: | |
325 args = args + ["--columns1", "1,4,5"] | |
326 novl = self.execStep(FJ, args) | |
327 | |
328 if self.options.noSelfHits: | |
329 xxx = overlaps | |
330 overlaps = self.mkTmp() | |
331 args = [ | |
332 "?IN[10]!=IN[19]", | |
333 "-1", xxx, | |
334 "-o", overlaps, | |
335 ] | |
336 novl = self.execStep(TF, args) | |
337 #self.debug("FJ out: " + xxx) | |
338 #self.debug("TF out: " + overlaps) | |
339 #os.system("diff %s %s" %(xxx,overlaps)) | |
340 | |
341 | |
342 if novl == 0: | |
343 self.debug("No overlapping features detected.\n") | |
344 | |
345 ### bucketize the pairs. | |
346 | |
347 bucketized=self.mkTmp() | |
348 self.execStep(TB, [ | |
349 "--file1=" + overlaps, | |
350 "--k1=10", | |
351 "--k2=19", | |
352 "-t"+bucketized, | |
353 "IN[1:]", | |
354 "int(IN[11]!=IN[20])", ## compute column: 0==same strands 1==diff | |
355 "int(string.lower(IN[7])!='gene')", ## compute column: 0==all genes 1==nongene | |
356 ]) | |
357 | |
358 sorted = self.mkTmp() | |
359 self.execStep(TS, [ | |
360 "--file1=" + bucketized, | |
361 "-k 3", | |
362 "-k 1", | |
363 "--out-file=" + sorted, | |
364 ]) | |
365 | |
366 self.execStep(TP, [ | |
367 "--file1=" + sorted, | |
368 "-o" + self.options.template, | |
369 "-p 3", | |
370 ]) | |
371 | |
372 ### Bucketization did not generate 1-0 and 0-1 buckets | |
373 ### (because we only fed it overlapping pairs). | |
374 ### Generate these buckets by diff'ing the inputs | |
375 ### against the fjoin output. | |
376 | |
377 self.execStep(TD, [ | |
378 "--file1=" + f1, | |
379 "--k1=9", | |
380 "--file2=" + bucketized, | |
381 "--k2=13", | |
382 "--out-file=" + (self.options.template%"1-0") ]) | |
383 | |
384 self.execStep(TD, [ | |
385 "--file1=" + f2, | |
386 "--k1=9", | |
387 "--file2=" + bucketized, | |
388 "--k2=22", | |
389 "--out-file=" + (self.options.template%"0-1") ]) | |
390 | |
391 #---------------------------------------------------- | |
392 def go_aggregate(self): | |
393 | |
394 # Select rows from first file, and extract feature ids. | |
395 f1 = self.mkTmp() | |
396 args = [ | |
397 "--file1="+self.options.file1, | |
398 "--exec-file="+self.options.guUtilFile, | |
399 ] | |
400 if len(self.options.types1) > 0: | |
401 args.append( "?string.lower(IN[3]) in %s" % self.types1 ) | |
402 if len(self.options.notTypes1) > 0: | |
403 args.append( "?string.lower(IN[3]) not in %s" % self.notTypes1 ) | |
404 args += [ | |
405 "IN[1:9]", | |
406 'extractID(IN[9],%s)'%self.options.ire1, | |
407 "--out-file=" + f1 | |
408 ] | |
409 | |
410 self.execStep(TF, args) | |
411 | |
412 # Select rows from second file, and extract feature ids. | |
413 f2 = self.mkTmp() | |
414 args = [ | |
415 "--file1="+self.options.file2, | |
416 "--exec-file="+self.options.guUtilFile, | |
417 ] | |
418 if len(self.options.types2) > 0: | |
419 args.append( "?string.lower(IN[3]) in %s" % self.types2 ) | |
420 if len(self.options.notTypes2) > 0: | |
421 args.append( "?string.lower(IN[3]) not in %s" % self.notTypes2 ) | |
422 args += [ | |
423 "IN[1:9]", | |
424 'extractID(IN[9],%s)'%self.options.ire2, | |
425 "--out-file=" + f2 | |
426 ] | |
427 self.execStep(TF, args) | |
428 | |
429 # Find the distinct higher-level features in file1 and count the base features | |
430 genes1=os.path.join(self.options.outDir, "features1.txt") | |
431 self.execStep(TA, [ | |
432 "--file1="+f1, | |
433 "-g9", # id | |
434 "-acount", # num. lines w/ this id | |
435 "-afirst:1", # first chr | |
436 "-amin:4", # min start val | |
437 "-amax:5", # max end val | |
438 "-afirst:7", # first strand | |
439 "--out-file=" + genes1 | |
440 ]) | |
441 | |
442 # Find the unique genes in file2 and count the exons | |
443 genes2=os.path.join(self.options.outDir, "features2.txt") | |
444 self.execStep(TA, [ | |
445 "--file1="+f2, | |
446 "-g9", | |
447 "-acount", | |
448 "-afirst:1", | |
449 "-amin:4", | |
450 "-amax:5", | |
451 "-afirst:7", | |
452 "--out-file=" + genes2 | |
453 ]) | |
454 | |
455 # Find all overlapping feature pairs. | |
456 ovlExons=self.mkTmp() | |
457 args = [ | |
458 "-1", f1, | |
459 "-2", f2, | |
460 "-o", ovlExons, | |
461 "-s", "both", | |
462 "-k", self.minOverlap, | |
463 ] | |
464 if self.options.ignoreStrand: | |
465 args = args + ["--columns1", "1,4,5"] | |
466 novl = self.execStep(FJ, args) | |
467 | |
468 if self.options.noSelfHits: | |
469 xxx = ovlExons | |
470 ovlExons = self.mkTmp() | |
471 args = [ | |
472 "?IN[10] != IN[19]", | |
473 "-1", xxx, | |
474 "-o", ovlExons, | |
475 ] | |
476 novl = self.execStep(TF, args) | |
477 | |
478 if novl == 0: | |
479 self.debug("No overlapping features detected.\n") | |
480 | |
481 # Aggregate overlapping feature pairs into higher-level overlaps. | |
482 # Count the base features involved. | |
483 ovlGenes = self.mkTmp() | |
484 self.execStep(TA, [ | |
485 "-g10,19", | |
486 "-acount:5", | |
487 "-acount:14", | |
488 "--file1="+ovlExons, | |
489 "--out-file=" + ovlGenes | |
490 ]) | |
491 | |
492 # Join with genes1 to pull in total exon counts. | |
493 # Do an outer join so that every gene in genes1 is | |
494 # represented. | |
495 tmp1 = self.mkTmp() | |
496 self.execStep(TJ, [ | |
497 "--file1=" + ovlGenes, | |
498 "--file2=" + genes1, | |
499 "--k1=1", | |
500 "--k2=1", | |
501 "--right", | |
502 "-n.", | |
503 "--out-file=" + tmp1 | |
504 ]) | |
505 | |
506 # Join with genes2 to pull in total exon counts. | |
507 # Do a bidi-outer join so that every gene in genes2 is | |
508 # represented, as is every gene in genes1.. | |
509 tmp2 = self.mkTmp() | |
510 self.execStep(TJ, [ | |
511 "--file1=" + tmp1, | |
512 "--file2=" + genes2, | |
513 "--k1=2", | |
514 "--k2=1", | |
515 "--left", | |
516 "--right", | |
517 "-n.", | |
518 "--out-file=" + tmp2 | |
519 ]) | |
520 | |
521 # Filter for final output formatting. | |
522 tmp3 = self.mkTmp() | |
523 self.execStep(TF, [ | |
524 "--file1=" + tmp2, | |
525 "--out-file=" + tmp3, | |
526 "IN[7]=='.' and IN[13] or IN[7]" , | |
527 "IN[10]=='.' and IN[16] or IN[16]=='.' and IN[10] or IN[16]==IN[10] and IN[16] or '???'", | |
528 "IN[5]", "IN[3]", "IN[6]", "IN[8]", "IN[9]", "IN[10]", | |
529 "IN[11]", "IN[4]", "IN[12]", "IN[14]", "IN[15]", "IN[16]", | |
530 ]) | |
531 | |
532 # Bucketize the overlapping genes. Output separate file for | |
533 # each bucket. | |
534 self.execStep(TB, [ | |
535 "--file1=" + tmp3, | |
536 "--k1=3", | |
537 "--k2=9", | |
538 "-n.", | |
539 "-t" + self.options.template, | |
540 "IN[1:3]", "IN[4:]", ## remove the bucket id column | |
541 ]) | |
542 | |
543 def go(self): | |
544 self.debug("======================================\n") | |
545 self.debug("Starting GU pipeline\n", True) | |
546 self.debug("Command line:\n%s\n" % (" ".join(self.argv))) | |
547 | |
548 if(self.options.noAggregate): | |
549 self.go_noAggregate() | |
550 else: | |
551 self.go_aggregate() | |
552 | |
553 self.debug("Pipeline completed.\n", True) | |
554 self.debug("Cleaning up...\n") | |
555 | |
556 # Delete the temp files | |
557 #self.cleanupTempFiles() | |
558 self.debug("Goodbye.\n\n") | |
559 | |
560 #-------------------------------- | |
561 GUPipeline(sys.argv).go() | |
562 |