#!/usr/local/bin/python
#
# gu - Gene Unification Pipeline
#
# This pipeline compares two GFF input files for coordinate overlaps, and
# bucketizes the overlapping feature pairs. The comparison step takes two lists
# of features [a1, a2, ...], [b1, b2, ...], and outputs all feature pairs [ai, bj]
# such that ai overlaps bj.
#
# These feature pairs form the edges in a bipartite graph of overlapping
# a's and b's. Bucketization finds the connected components of the graph
# and segregates those components (or clusters) into buckets, according to
# how many a's and b's there are in the cluster (one-to-one, one-to-many, etc.).
# The buckets are output to separate files (bucket_1-1.txt, bucket_1-n.txt, ...).
#
# There is one output line per overlapping pair [ai, bj]. This line contains
# all the columns of ai and bj plus three additional columns about the cluster
# to which this pair belongs. In detail:
# 1. The id of the cluster (connected component) to which this pair belongs.
# 2. The bucket label (e.g. "n-1").
# 3. The actual counts of a's and b's in the cluster, e.g. "3-1" for 3 a's and 1 b.
# 4. All of the columns of ai followed by all of the columns of bj.
#
# GU operates in one of two modes. The above description corresponds to 'no-aggregate'
# mode: the overlapping features are bucketized directly. Sometimes, however, you want
# the results aggregated to a higher level. For example, you may have files of exons,
# but what you want is buckets of overlapping genes. For this, there is 'aggregate' mode,
# in which lists of overlapping exons (for example) are turned into lists of overlapping
# genes before being bucketized. Because aggregation loses the details of the underlying
# features, the output format is somewhat different from no-aggregate mode.
#
#--------------------------------------------

import sys
import time
import string
from tempfile import mkstemp
import os
import os.path
from TableTools import TA,TB,TD,TF,TI,TJ,TP,TS,TU,TX,FJ
from optparse import OptionParser
import re

USAGE="%prog [options] --f1 file1.gff --f2 file2.gff\n(For help, use -h option.)"
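
# Example invocation (illustrative; paths and the MGI id pattern are placeholders):
#
#   %prog --f1 fileA.gff --f2 fileB.gff --t1 exon --t2 exon --ire 'MGI:[0-9]+' -d results
#
# selects exon features from both inputs, aggregates them by MGI id (the default mode,
# since -n/--noAggregate is not given), and writes the bucket files into 'results'.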

def now():
    return time.asctime(time.localtime(time.time()))

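# Illustration only: how a cluster's a/b counts map onto the bucket classes
# named in the -t option help ('1-0', '0-1', '1-1', '1-n', 'n-1', 'n-m').
# This sketch is not used by the pipeline itself, which delegates bucketization
# (connected components and splitting) to the TableTools TB/TP tools.
def exampleBucketClass(nA, nB):
    if nA == 0:
        return "0-1"
    if nB == 0:
        return "1-0"
    if nA == 1:
        if nB == 1:
            return "1-1"
        return "1-n"
    if nB == 1:
        return "n-1"
    return "n-m"
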
class GUPipeline :
    #----------------------------------------------------
    def __init__(self, argv):
        # 1. initialize the option parser
        #
        self.argv = argv
        self.parser = OptionParser(USAGE)
        self.tempfiles = []
        self.types1 = []
        self.types2 = []

        self.parser.add_option("-1", "--f1", "--file1",
            dest="file1",
            metavar="FILE1",
            default=None,
            help="The first GFF file.")

        self.parser.add_option("-2", "--f2", "--file2",
            dest="file2",
            metavar="FILE2",
            default=None,
            help="The second GFF file.")

        self.parser.add_option("-k", "--minOverlap",
            dest="k",
            metavar="AMT",
            default="1",
            help="The minimum required overlap. (Default: 1)")

        #self.parser.set_defaults(mode="advanced")
        self.parser.add_option("--t1",
            dest="types1",
            action="append",
            metavar="GFFTYPE",
            default=[],
            help="A GFF type to select from file 1. Repeatable. (Default = all types)" )

        self.parser.add_option("--t2",
            dest="types2",
            action="append",
            metavar="GFFTYPE",
            default=[],
            help="A GFF type to select from file 2. Repeatable. (Default = all types)" )

        self.parser.add_option("--nt1",
            dest="notTypes1",
            action="append",
            metavar="GFFTYPE",
            default=[],
            help="A GFF type to FILTER OUT of file 1. Repeatable. (Default = filters no types)" )

        self.parser.add_option("--nt2",
            dest="notTypes2",
            action="append",
            metavar="GFFTYPE",
            default=[],
            help="A GFF type to filter out of file 2. Repeatable. (Default = filters no types)" )

        self.parser.add_option("--ire",
            dest="ire",
            metavar="REGEX",
            default=None,
            help="A regular expression that will be used to extract ids from column 9. " +
                "(Also: what you'll feel as you try to get this parameter right...) " +
                "Only applicable in aggregate mode. You can specify different patterns for " +
                "inputs 1 and 2 by specifying --ire1 and --ire2 instead. " +
                "Example: to aggregate both inputs by MGI id: --ire 'MGI:[0-9]+'. " +
                "Example: to aggregate input 1 by MGI id and input 2 by VEGA id: " +
                "--ire1 'MGI:[0-9]+' --ire2 'OTTMUSG[0-9]+'. " +
                "Advanced usage: Sometimes you need to define a regular expression where the actual " +
                "id you wish to extract is a sub-part of the whole pattern. This is called a 'capture'. " +
                "To capture the id, surround that part of the pattern with the magic symbols " +
                "'(?P<id>' and ')'. For example, suppose you want to capture the MGI id only when it's " +
                "part of a Dbxref attribute like this: 'Dbxref=MGI:MGI:012345;'. You could use the following " +
                "regular expression: 'Dbxref=MGI:(?P<id>MGI:[0-9]+);'."
            )
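
        # For example (illustrative, matching the help text above): the pattern
        # 'Dbxref=MGI:(?P<id>MGI:[0-9]+);' applied to the column-9 text
        # 'Dbxref=MGI:MGI:012345;' matches, and the named group 'id' captures
        # 'MGI:012345', which becomes the aggregation id for that feature.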

        self.parser.add_option("--ire1",
            dest="ire1",
            metavar="REGEX",
            default=None,
            help="Specify regex for input 1 only. See --ire."
            )

        self.parser.add_option("--ire2",
            dest="ire2",
            metavar="REGEX",
            default=None,
            help="Specify regex for input 2 only. See --ire."
            )

        self.parser.add_option("-i", "--ignoreStrand",
            dest="ignoreStrand",
            action="store_true",
            default = False,
            help="Ignore strand when determining overlaps. (Default = strands must match)")

        self.parser.add_option("-n", "--noAggregate",
            dest="noAggregate",
            action="store_true",
            default = False,
            help="Do not aggregate features. (Default: aggregate mode)")

        self.parser.add_option("-C", "--chrMatchLoose",
            dest="chrMatchLoose",
            action="store_true",
            default = False,
            help="If specified, chromosome matching is 'loose'. Otherwise it is exact. " + \
                "In loose matching, a leading 'chr' is removed from the chromosome field, " + \
                "so that '19' matches 'Chr19'.")

        self.parser.add_option("-s", "--noSelfHits",
            dest="noSelfHits",
            action="store_true",
            default = False,
            help="Ignore self-hits in overlap detection. (Default: reports self-hits)")

        self.parser.add_option("-d", "--od",
            dest="outDir",
            metavar="DIRECTORY",
            default=".",
            help="Path of output directory. (Default = current directory)")

        self.parser.add_option("-t",
            dest="template",
            metavar="TEMPLATE",
            default="bucket_%s.txt",
            help="Template string for naming output bucket files. " + \
                "Must contain the substring '%s', which is replaced by the bucket class. " + \
                "The classes are: '1-0', '0-1', '1-1', '1-n', 'n-1', 'n-m'. " + \
                "(Default: bucket_%s.txt)")

        self.parser.add_option("-l", "--lf", "--logFile",
            dest="logFile",
            metavar="FILE",
            default=None,
            help="Log file. (Default = writes to stderr)")

        # 2. Parse the command line
        #
        (self.options, xxx) = self.parser.parse_args(argv)

        # 3. Validate the args
        #
        if self.options.logFile is not None:
            sys.stderr = open(self.options.logFile, 'a')

        if self.options.file1 is None \
           or self.options.file2 is None:
            self.parser.error("Must specify both --f1 and --f2.")

        if not os.path.isdir(self.options.outDir):
            self.parser.error("Output directory " + self.options.outDir + \
                " does not exist or is not a directory.")

        # The type lists are embedded verbatim (as Python list literals) in the
        # TableTools filter expressions built below, so convert them to strings here.
        self.types1 = str( self.options.types1 ).lower()
        self.types2 = str( self.options.types2 ).lower()
        self.notTypes1 = str( self.options.notTypes1 ).lower()
        self.notTypes2 = str( self.options.notTypes2 ).lower()

        self.minOverlap = self.options.k

        if '%s' not in self.options.template:
            self.options.template += '%s'

        self.options.template = \
            os.path.join( self.options.outDir, self.options.template)
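
        # For example (illustrative): with '-d results -t mybuckets_%s.txt', the
        # bucket files are written as results/mybuckets_1-1.txt,
        # results/mybuckets_1-n.txt, and so on.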

        if self.options.chrMatchLoose:
            # Loose matching: strip a leading 'chr'/'chrom'/'chromosome' (any case)
            # from the chromosome column before comparing.
            self.options.chrMatchExpr = "[re.sub('^(?i)chr(om(osome)?)?', '', IN[1])]+IN[2:]"
        else:
            self.options.chrMatchExpr = "IN[1:]"

        self.options.guDir = os.path.split(__file__)[0]
        self.options.guUtilFile = os.path.join(self.options.guDir,'guUtil.py')


        # local function
        def groomRe(name, default):
            r = getattr(self.options, name)
            if r is None:
                r = default
            if "(?P<id>" not in r:
                r = "(?P<id>%s)" % r
            r = "r'%s'" % r
            setattr(self.options, name, r )
            self.debug('%s=%s\n'%(name,r))

        if self.options.ire is None:
            self.options.ire = r' *(gene_?(id)?|id)? *[=:]? *"?(?P<id>[^"; ]+)"?'
        dflt = self.options.ire
        groomRe('ire1',dflt)
        groomRe('ire2',dflt)
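
        # For example (illustrative): given --ire1 'MGI:[0-9]+', groomRe stores the
        # string "r'(?P<id>MGI:[0-9]+)'" in self.options.ire1, which go_aggregate()
        # later pastes into the expression extractID(IN[9],r'(?P<id>MGI:[0-9]+)').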


    #----------------------------------------------------
    def mkTmp(self,preserve=False):
        tf = mkstemp()
        os.close(tf[0])
        if not preserve:
            self.tempfiles.append(tf[1])
        return tf[1]

    #----------------------------------------------------
    def cleanupTempFiles(self):
        for tf in self.tempfiles:
            os.remove(tf)

    #----------------------------------------------------
    def debug(self,s,ts=False):
        if ts:
            sys.stderr.write(now()+': ')
        sys.stderr.write(s)

    #----------------------------------------------------
    def execStep(self, tool, args):
        self.debug( tool.__name__ + " " + string.join(args, " ") + "\n", True)
        t = tool(args)
        t.go()
        t.closeFiles()
        return t.nOutputRows

    #----------------------------------------------------
    def go_noAggregate(self):

        ### Select rows from first file.

        args = [
            "--file1="+self.options.file1,
            self.options.chrMatchExpr,
            ]

        if len(self.options.types1) > 0:
            args.append( "?string.lower(IN[3]) in %s" % self.types1 )

        if len(self.options.notTypes1) > 0:
            args.append( "?string.lower(IN[3]) not in %s" % self.notTypes1 )

        if len(args) > 2:
            f1 = self.mkTmp()
            args.append( "--out-file=" + f1 )
            self.execStep(TF, args)
        else:
            f1 = self.options.file1

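        # For example (illustrative): with '--t1 gene --t1 pseudogene', self.types1
        # is the string "['gene', 'pseudogene']", so the filter argument appended
        # above is:  ?string.lower(IN[3]) in ['gene', 'pseudogene']
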
        ### Select rows from second file.

        args = [
            "--file1="+self.options.file2,
            self.options.chrMatchExpr,
            ]

        if len(self.options.types2) > 0:
            args.append( "?string.lower(IN[3]) in %s" % self.types2 )

        if len(self.options.notTypes2) > 0:
            args.append( "?string.lower(IN[3]) not in %s" % self.notTypes2 )

        if len(args) > 2:
            f2 = self.mkTmp()
            args.append( "--out-file=" + f2 )
            self.execStep(TF, args)
        else:
            f2 = self.options.file2

        ### Find overlapping features.

        overlaps=self.mkTmp()
        args = [
            "-1", f1,
            "-2", f2,
            "-s", "both",
            "-k", self.minOverlap,
            "-o", overlaps,
            ]
        if self.options.ignoreStrand:
            args = args + ["--columns1", "1,4,5"]
        novl = self.execStep(FJ, args)

        if self.options.noSelfHits:
            xxx = overlaps
            overlaps = self.mkTmp()
            args = [
                "?IN[10]!=IN[19]",
                "-1", xxx,
                "-o", overlaps,
                ]
            novl = self.execStep(TF, args)
            #self.debug("FJ out: " + xxx)
            #self.debug("TF out: " + overlaps)
            #os.system("diff %s %s" %(xxx,overlaps))


        if novl == 0:
            self.debug("No overlapping features detected.\n")

        ### Bucketize the pairs.

        bucketized=self.mkTmp()
        self.execStep(TB, [
            "--file1=" + overlaps,
            "--k1=10",
            "--k2=19",
            "-t"+bucketized,
            "IN[1:]",
            "int(IN[11]!=IN[20])", ## compute column: 0==same strands 1==diff
            "int(string.lower(IN[7])!='gene')", ## compute column: 0==all genes 1==nongene
            ])

        sortedFile = self.mkTmp()
        self.execStep(TS, [
            "--file1=" + bucketized,
            "-k 3",
            "-k 1",
            "--out-file=" + sortedFile,
            ])

        self.execStep(TP, [
            "--file1=" + sortedFile,
            "-o" + self.options.template,
            "-p 3",
            ])

        ### Bucketization did not generate 1-0 and 0-1 buckets
        ### (because we only fed it overlapping pairs).
        ### Generate these buckets by diff'ing the inputs
        ### against the fjoin output.

        self.execStep(TD, [
            "--file1=" + f1,
            "--k1=9",
            "--file2=" + bucketized,
            "--k2=13",
            "--out-file=" + (self.options.template%"1-0") ])

        self.execStep(TD, [
            "--file1=" + f2,
            "--k1=9",
            "--file2=" + bucketized,
            "--k2=22",
            "--out-file=" + (self.options.template%"0-1") ])

    #----------------------------------------------------
    def go_aggregate(self):

        # Select rows from first file, and extract feature ids.
        f1 = self.mkTmp()
        args = [
            "--file1="+self.options.file1,
            "--exec-file="+self.options.guUtilFile,
            ]
        if len(self.options.types1) > 0:
            args.append( "?string.lower(IN[3]) in %s" % self.types1 )
        if len(self.options.notTypes1) > 0:
            args.append( "?string.lower(IN[3]) not in %s" % self.notTypes1 )
        args += [
            "IN[1:9]",
            'extractID(IN[9],%s)'%self.options.ire1,
            "--out-file=" + f1
            ]

        self.execStep(TF, args)

        # Select rows from second file, and extract feature ids.
        f2 = self.mkTmp()
        args = [
            "--file1="+self.options.file2,
            "--exec-file="+self.options.guUtilFile,
            ]
        if len(self.options.types2) > 0:
            args.append( "?string.lower(IN[3]) in %s" % self.types2 )
        if len(self.options.notTypes2) > 0:
            args.append( "?string.lower(IN[3]) not in %s" % self.notTypes2 )
        args += [
            "IN[1:9]",
            'extractID(IN[9],%s)'%self.options.ire2,
            "--out-file=" + f2
            ]
        self.execStep(TF, args)

        # Find the distinct higher-level features in file 1 and count the base features.
        genes1=os.path.join(self.options.outDir, "features1.txt")
        self.execStep(TA, [
            "--file1="+f1,
            "-g9",          # id
            "-acount",      # num. lines w/ this id
            "-afirst:1",    # first chr
            "-amin:4",      # min start val
            "-amax:5",      # max end val
            "-afirst:7",    # first strand
            "--out-file=" + genes1
            ])
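
        # Illustrative note: per the per-argument comments above, each output row of
        # this aggregation step holds the id, the count of base features, the
        # chromosome, the min start, the max end, and the strand of one aggregated feature.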

        # Find the unique genes in file 2 and count the exons.
        genes2=os.path.join(self.options.outDir, "features2.txt")
        self.execStep(TA, [
            "--file1="+f2,
            "-g9",
            "-acount",
            "-afirst:1",
            "-amin:4",
            "-amax:5",
            "-afirst:7",
            "--out-file=" + genes2
            ])

        # Find all overlapping feature pairs.
        ovlExons=self.mkTmp()
        args = [
            "-1", f1,
            "-2", f2,
            "-o", ovlExons,
            "-s", "both",
            "-k", self.minOverlap,
            ]
        if self.options.ignoreStrand:
            args = args + ["--columns1", "1,4,5"]
        novl = self.execStep(FJ, args)

        if self.options.noSelfHits:
            xxx = ovlExons
            ovlExons = self.mkTmp()
            args = [
                "?IN[10] != IN[19]",
                "-1", xxx,
                "-o", ovlExons,
                ]
            novl = self.execStep(TF, args)

        if novl == 0:
            self.debug("No overlapping features detected.\n")

        # Aggregate overlapping feature pairs into higher-level overlaps.
        # Count the base features involved.
        ovlGenes = self.mkTmp()
        self.execStep(TA, [
            "-g10,19",
            "-acount:5",
            "-acount:14",
            "--file1="+ovlExons,
            "--out-file=" + ovlGenes
            ])

        # Join with genes1 to pull in total exon counts.
        # Do an outer join so that every gene in genes1 is
        # represented.
        tmp1 = self.mkTmp()
        self.execStep(TJ, [
            "--file1=" + ovlGenes,
            "--file2=" + genes1,
            "--k1=1",
            "--k2=1",
            "--right",
            "-n.",
            "--out-file=" + tmp1
            ])

        # Join with genes2 to pull in total exon counts.
        # Do a bidi-outer join so that every gene in genes2 is
        # represented, as is every gene in genes1.
        tmp2 = self.mkTmp()
        self.execStep(TJ, [
            "--file1=" + tmp1,
            "--file2=" + genes2,
            "--k1=2",
            "--k2=1",
            "--left",
            "--right",
            "-n.",
            "--out-file=" + tmp2
            ])

        # Filter for final output formatting. Note: the first two column expressions
        # use the old 'A and X or Y' idiom, i.e. "X if A else Y" (assuming X is never
        # false), to prefer a value that is not the '.' filler left by the outer joins;
        # the second also emits '???' when the two sides disagree.
        tmp3 = self.mkTmp()
        self.execStep(TF, [
            "--file1=" + tmp2,
            "--out-file=" + tmp3,
            "IN[7]=='.' and IN[13] or IN[7]",
            "IN[10]=='.' and IN[16] or IN[16]=='.' and IN[10] or IN[16]==IN[10] and IN[16] or '???'",
            "IN[5]", "IN[3]", "IN[6]", "IN[8]", "IN[9]", "IN[10]",
            "IN[11]", "IN[4]", "IN[12]", "IN[14]", "IN[15]", "IN[16]",
            ])

        # Bucketize the overlapping genes. Output a separate file for
        # each bucket.
        self.execStep(TB, [
            "--file1=" + tmp3,
            "--k1=3",
            "--k2=9",
            "-n.",
            "-t" + self.options.template,
            "IN[1:3]", "IN[4:]", ## remove the bucket id column
            ])

    #----------------------------------------------------
    def go(self):
        self.debug("======================================\n")
        self.debug("Starting GU pipeline\n", True)
        self.debug("Command line:\n%s\n" % (" ".join(self.argv)))

        if self.options.noAggregate:
            self.go_noAggregate()
        else:
            self.go_aggregate()

        self.debug("Pipeline completed.\n", True)
        self.debug("Cleaning up...\n")

        # Delete the temp files. (Currently disabled; uncomment to remove the
        # intermediate files.)
        #self.cleanupTempFiles()
        self.debug("Goodbye.\n\n")

#--------------------------------
if __name__ == "__main__":
    GUPipeline(sys.argv).go()