annotate GU_files/gu.py @ 0:a792b0548fe9 draft default tip

Upload GU tool
author carlos-reyes
date Fri, 15 Jun 2012 10:37:45 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
1 #!/usr/local/bin/python
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
2 #
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
3 # gu - Gene Unification Pipeline
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
4 #
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
5 # This pipeline compares two GFF input files for coordinate overlaps, and
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
6 # bucketizes the overlapping feature pairs. The comparison step takes two lists
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
7 # of features [a1, a2, ...], [b1, b2, ...], outputs all feature pairs [ai, bj] such
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
8 # that ai overlaps bj.
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
9 #
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
10 # These feature pairs form the edges in a bipartite graph of overlapping
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
11 # a's and b's. Bucketization finds the connected components of the graph
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
12 # and segregates those components (or clusters) into buckets, according to
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
13 # how many a's and b's there are in the cluster (one-to-one, one-to-many, etc).
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
14 # The buckets are output to separate files (bucket_1-1.txt, bucket_1-n.txt, ...)
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
15 #
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
16 # There is one line output per overlapping pair [ai,bj]. This line contains
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
17 # all the columns of ai and bj plus three additional columns about the cluster
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
18 # to which this pair belongs. In detail:
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
19 # 1. The id of the cluster (connected component) to which this pair belongs.
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
20 # 2. The bucket label (e.g. "n-1").
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
21 # 3. The actual counts of a's and b's in the cluster, e.g. "3-1" for 3 a's and 1 b.
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
22 # 4. All of the columns of ai followed by all of the columns of bj.
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
23 #
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
24 # GU operates in one of two modes. The above description corresponds to 'no-aggregate'
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
25 # mode: the overlapping features are bucketized directly. Sometimes, however, you want
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
26 # the results aggregated to a higher level. For example, you may have files of exons,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
27 # but what you want is buckets of overlapping genes. For this, there is 'aggregate' mode,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
28 # in which lists of overlapping exons (for example) are turned into lists of overlapping genes
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
29 # before being bucketized. Because aggregation loses the details of the underlying features,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
30 # the output format is somewhat different from no-aggregate mode.
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
31 #
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
32 #--------------------------------------------
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
33
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
34 import sys
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
35 import time
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
36 import string
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
37 from tempfile import mkstemp
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
38 import os
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
39 import os.path
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
40 from TableTools import TA,TB,TD,TF,TI,TJ,TP,TS,TU,TX,FJ
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
41 from optparse import OptionParser
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
42 import re
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
43
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
# Usage banner handed to OptionParser in GUPipeline.__init__; optparse
# substitutes the running program's name for "%prog" when printing it.
USAGE="%prog [options] --f1 file1.gff --f2 file2.gff\n(For help, use -h option.)"
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
45
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
def now():
    """Return the current local time as a human-readable timestamp string.

    The format is the fixed 24-character C ``asctime`` layout,
    e.g. ``'Fri Jun 15 10:37:45 2012'``.
    """
    # ctime() with no argument formats the current time in local time,
    # i.e. the same thing as asctime(localtime(time())).
    return time.ctime()
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
48
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
49 class GUPipeline :
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
50 #----------------------------------------------------
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
def __init__(self, argv):
    """Configure the pipeline from a command-line argument list.

    Builds the OptionParser, parses ``argv``, validates the required
    options and stores the derived settings on ``self`` and on
    ``self.options``.

    argv -- list of command-line arguments; handed unchanged to
            OptionParser.parse_args().

    Invalid input (a missing --f1/--f2, or an output directory that is
    not an existing directory) is reported through self.parser.error().

    Side effect: when a log file is requested, sys.stderr is rebound to
    that file (opened in append mode), so every later write to stderr in
    this process goes to the log.
    """
    # 1. initialize the option parser
    #
    self.argv = argv
    self.parser = OptionParser(USAGE)
    self.tempfiles = []     # temp files to be deleted by cleanupTempFiles()
    self.types1 = []
    self.types2 = []

    self.parser.add_option("-1", "--f1", "--file1",
        dest="file1",
        metavar="FILE1",
        default=None,
        help="The first GFF file.")

    self.parser.add_option("-2", "--f2", "--file2",
        dest="file2",
        metavar="FILE2",
        default=None,
        help="The second GFF file.")

    # NOTE: the value is kept as a string (no type="int"); it is forwarded
    # as-is as a command-line style argument to the overlap step.
    self.parser.add_option("-k", "--minOverlap",
        dest="k",
        metavar=" AMT",
        default="1",
        help="The minimum required overlap. (Default: 1)")

    #self.parser.set_defaults(mode="advanced")
    self.parser.add_option("--t1",
        dest="types1",
        action="append",
        metavar="GFFTYPE",
        default=[],
        help="A GFF type to select from file 1. Repeatable. (Default = all types)" )

    self.parser.add_option("--t2",
        dest="types2",
        action="append",
        metavar="GFFTYPE",
        default=[],
        help="A GFF type to select from file 2. Repeatable.(Default = all types)" )

    self.parser.add_option("--nt1",
        dest="notTypes1",
        action="append",
        metavar="GFFTYPE",
        default=[],
        help="A GFF type to FILTER OUT of file 1. Repeatable. (Default = filters no types)" )

    self.parser.add_option("--nt2",
        dest="notTypes2",
        action="append",
        metavar="GFFTYPE",
        default=[],
        help="A GFF type to filter out of file 2. Repeatable.(Default = filters no types)" )

    self.parser.add_option("--ire",
        dest="ire",
        metavar="REGEX",
        default=None,
        help="A regular expression that will be used to extract ids from column 9. "
            "(Also: what you'll feel as you try to get this parameter right...) "
            "Only applicable in aggregate mode. You can specify different patterns for "
            "inputs 1 and 2 by specifying --ire1 and --ire2 instead. "
            "Example: to aggregate both inputs by MGI id: --ire 'MGI:[0-9]+'. "
            "Example: to aggregate input1 by MGI id and input 2 by VEGA id: "
            "--ire1 'MGI:[0-9]+' --ire2 'OTTMUSG[0-9]+'. "
            "Advanced usage: Sometimes you need to define a regular expression where the actual "
            "id you wish to extract is a sub-part of the whole pattern. This is called a 'capture'. "
            "To capture the id, surround that part of the pattern with the magic symbols "
            "'(?P<id>' and ')'. For example, suppose you want to capture the MGI id only when it's "
            "part of a dbref attribute like this: 'Dbxref=MGI:MGI:012345;'. You could use the following "
            "regular expression: 'Dbxref=MGI:(?P<id>MGI:[0-9]+);'."
        )

    self.parser.add_option("--ire1",
        dest="ire1",
        metavar="REGEX",
        default=None,
        help="Specify regex for input 1 only. See --ire."
        )

    self.parser.add_option("--ire2",
        dest="ire2",
        metavar="REGEX",
        default=None,
        help="Specify regex for input 2 only. See --ire."
        )

    self.parser.add_option("-i", "--ignoreStrand",
        dest="ignoreStrand",
        action="store_true",
        default = False,
        help="Ignore strand when determining overlaps (Default = strands must match)")

    # NOTE(review): the help text mentions "-a", but no such option is
    # registered on this parser.
    self.parser.add_option("-n", "--noAggregate",
        dest="noAggregate",
        action="store_true",
        default = False,
        help="Do not aggregate features. Opposite of -a.")

    self.parser.add_option("-C", "--chrMatchLoose",
        dest="chrMatchLoose",
        action="store_true",
        default = False,
        help="If specified, chromosome matching is 'loose'. Otherwise it is exact. "
            "In loose matching, leading 'chr' is removed from chromosome field, "
            "so that '19' matches 'Chr19'. ")

    self.parser.add_option("-s", "--noSelfHits",
        dest="noSelfHits",
        action="store_true",
        default = False,
        help="Ignore self-hit in overlap detection. (default: reports self-hits)")

    self.parser.add_option("-d", "--od",
        dest="outDir",
        metavar="DIRECTORY",
        default=".",
        help="Path of output directory. (Default = current directory)")

    self.parser.add_option("-t",
        dest="template",
        metavar="TEMPLATE",
        default="bucket_%s.txt",
        help="Template string for naming output bucket files. "
            "Must contain the substring '%s', which is replaced by the bucket class. "
            "The classes are: '1-0', '0-1', '1-1', '1-n', 'n-1', 'n-m'. "
            "(Default: bucket_%s.txt)")

    self.parser.add_option("-l", "--lf", "--logFile",
        dest="logFile",
        metavar="FILE",
        default=None,
        help="Log file. (Default = writes to stderr)")

    # 2. Parse the command line
    #
    # Positional leftovers are not used by the pipeline.
    (self.options, _leftover) = self.parser.parse_args(argv)

    # 3. Validate the args
    #
    if self.options.logFile is not None:
        # Redirect all subsequent stderr output (including debug()) to
        # the log file; the handle intentionally lives for the process.
        sys.stderr = open(self.options.logFile, 'a')

    if self.options.file1 is None \
    or self.options.file2 is None:
        self.parser.error("Must specify both --f1 and --f2.")

    if not os.path.isdir(self.options.outDir):
        self.parser.error("Output directory " + self.options.outDir + \
            " does not exist or is not a directory.")

    # The type lists are stored as lower-cased list-literal *text*
    # (e.g. "['gene', 'exon']") because they are later pasted into
    # filter expression strings such as "... in %s".
    self.types1 = str( self.options.types1 ).lower()
    self.types2 = str( self.options.types2 ).lower()
    self.notTypes1 = str( self.options.notTypes1 ).lower()
    self.notTypes2 = str( self.options.notTypes2 ).lower()

    self.minOverlap = self.options.k

    # Guarantee the template has a slot for the bucket class.
    if '%s' not in self.options.template:
        self.options.template += '%s'

    self.options.template = \
        os.path.join( self.options.outDir, self.options.template)

    # Expression text used as the column-generator argument of the
    # row-selection steps (presumably evaluated by TableTools with `re`
    # and the input row `IN` in scope -- confirm against TableTools).
    if self.options.chrMatchLoose:
        # The global inline flag must be the first thing in the pattern:
        # '^(?i)...' is deprecated since Python 3.6 and rejected by
        # Python 3.11+, while '(?i)^...' matches identically everywhere.
        self.options.chrMatchExpr = "[re.sub('(?i)^chr(om(osome)?)?', '', IN[1])]+IN[2:]"
    else:
        self.options.chrMatchExpr = "IN[1:]"

    # guUtil.py is looked up in the same directory as this script.
    self.options.guDir = os.path.split(__file__)[0]
    self.options.guUtilFile = os.path.join(self.options.guDir,'guUtil.py')

    # local function
    def groomRe(name, default):
        """Normalize the regex option `name` in place on self.options.

        Falls back to `default` when the option was not given, wraps the
        pattern in a '(?P<id>...)' group when it has none, and stores the
        result as raw-string literal text (r'...').
        """
        r = getattr(self.options, name)
        if r is None:
            r = default
        if "(?P<id>" not in r:
            r = "(?P<id>%s)" % r
        # NOTE(review): a pattern containing a single quote would yield
        # malformed literal text here -- confirm whether that can occur.
        r = "r'%s'" % r
        setattr(self.options, name, r )
        self.debug('%s=%s\n'%(name,r))

    if self.options.ire is None:
        self.options.ire = r' *(gene_?(id)?|id)? *[=:]? *"?(?P<id>[^"; ]+)"?'
    dflt = self.options.ire
    groomRe('ire1',dflt)
    groomRe('ire2',dflt)
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
242
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
243
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
244 #----------------------------------------------------
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
def mkTmp(self,preserve=False):
    """Create an empty temporary file and return its path.

    preserve -- when False (the default) the path is recorded in
                self.tempfiles so that cleanupTempFiles() deletes it
                later; when True the file is left for the caller.
    """
    handle, path = mkstemp()
    # Only the name is needed; release the OS-level descriptor right away.
    os.close(handle)
    if preserve:
        return path
    self.tempfiles.append(path)
    return path
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
251
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
252 #----------------------------------------------------
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
def cleanupTempFiles(self):
    """Delete every temporary file recorded in self.tempfiles.

    Files that no longer exist are skipped, so one missing file does not
    abort the rest of the cleanup. After a successful pass the list is
    emptied in place, which makes repeated calls harmless.

    Raises OSError for any removal failure other than "file not found".
    """
    import errno    # local import: only needed by this method

    for path in self.tempfiles:
        try:
            os.remove(path)
        except OSError as err:
            # Already gone is fine; anything else is a real problem.
            if err.errno != errno.ENOENT:
                raise
    # Clear in place so other references to the list see it emptied too.
    del self.tempfiles[:]
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
256
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
257 #----------------------------------------------------
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
def debug(self,s,ts=False):
    """Write the message `s` to sys.stderr.

    s  -- text to write; no newline is appended.
    ts -- when true, the message is preceded by the current timestamp
          (as returned by now()) followed by ': '.
    """
    stream = sys.stderr
    if ts:
        stream.write(now()+': ')
    stream.write(s)
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
262
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
263 #----------------------------------------------------
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
def execStep(self, tool, args):
    """Run one pipeline step and return the number of rows it produced.

    tool -- a callable (here, a tool class) that is invoked with `args`
            and returns an object providing go(), closeFiles() and an
            nOutputRows attribute.
    args -- list of argument strings for the tool.

    The tool name and its arguments are logged, with a timestamp, via
    self.debug() before the step runs.
    """
    # str.join replaces string.join(), which exists only in Python 2's
    # string module; the resulting text is identical.
    self.debug( tool.__name__ + " " + " ".join(args) + "\n", True)
    t = tool(args)
    t.go()
    t.closeFiles()
    return t.nOutputRows
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
270
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
271 #----------------------------------------------------
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
272 def go_noAggregate(self):
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
273
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
274 ### Select rows from first file.
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
275
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
276 args = [
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
277 "--file1="+self.options.file1,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
278 self.options.chrMatchExpr,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
279 ]
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
280
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
281 if len(self.options.types1) > 0:
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
282 args.append( "?string.lower(IN[3]) in %s" % self.types1 )
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
283
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
284 if len(self.options.notTypes1) > 0:
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
285 args.append( "?string.lower(IN[3]) not in %s" % self.notTypes1 )
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
286
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
287 if len(args) > 2:
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
288 f1 = self.mkTmp()
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
289 args.append( "--out-file=" + f1 )
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
290 self.execStep(TF, args)
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
291 else:
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
292 f1 = self.options.file1
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
293
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
294 ### Select rows from second file.
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
295
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
296 args = [
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
297 "--file1="+self.options.file2,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
298 self.options.chrMatchExpr,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
299 ]
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
300
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
301 if len(self.options.types2) > 0:
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
302 args.append( "?string.lower(IN[3]) in %s" % self.types2 )
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
303
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
304 if len(self.options.notTypes2) > 0:
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
305 args.append( "?string.lower(IN[3]) not in %s" % self.notTypes2 )
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
306
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
307 if len(args) > 2:
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
308 f2 = self.mkTmp()
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
309 args.append( "--out-file=" + f2 )
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
310 self.execStep(TF, args)
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
311 else:
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
312 f2 = self.options.file2
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
313
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
314 ### find overlapping features.
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
315
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
316 overlaps=self.mkTmp()
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
317 args = [
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
318 "-1", f1,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
319 "-2", f2,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
320 "-s", "both",
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
321 "-k", self.minOverlap,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
322 "-o", overlaps,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
323 ]
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
324 if self.options.ignoreStrand:
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
325 args = args + ["--columns1", "1,4,5"]
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
326 novl = self.execStep(FJ, args)
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
327
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
328 if self.options.noSelfHits:
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
329 xxx = overlaps
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
330 overlaps = self.mkTmp()
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
331 args = [
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
332 "?IN[10]!=IN[19]",
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
333 "-1", xxx,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
334 "-o", overlaps,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
335 ]
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
336 novl = self.execStep(TF, args)
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
337 #self.debug("FJ out: " + xxx)
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
338 #self.debug("TF out: " + overlaps)
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
339 #os.system("diff %s %s" %(xxx,overlaps))
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
340
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
341
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
342 if novl == 0:
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
343 self.debug("No overlapping features detected.\n")
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
344
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
345 ### bucketize the pairs.
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
346
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
347 bucketized=self.mkTmp()
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
348 self.execStep(TB, [
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
349 "--file1=" + overlaps,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
350 "--k1=10",
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
351 "--k2=19",
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
352 "-t"+bucketized,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
353 "IN[1:]",
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
354 "int(IN[11]!=IN[20])", ## compute column: 0==same strands 1==diff
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
355 "int(string.lower(IN[7])!='gene')", ## compute column: 0==all genes 1==nongene
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
356 ])
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
357
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
358 sorted = self.mkTmp()
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
359 self.execStep(TS, [
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
360 "--file1=" + bucketized,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
361 "-k 3",
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
362 "-k 1",
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
363 "--out-file=" + sorted,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
364 ])
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
365
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
366 self.execStep(TP, [
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
367 "--file1=" + sorted,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
368 "-o" + self.options.template,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
369 "-p 3",
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
370 ])
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
371
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
372 ### Bucketization did not generate 1-0 and 0-1 buckets
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
373 ### (because we only fed it overlapping pairs).
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
374 ### Generate these buckets by diff'ing the inputs
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
375 ### against the fjoin output.
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
376
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
377 self.execStep(TD, [
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
378 "--file1=" + f1,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
379 "--k1=9",
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
380 "--file2=" + bucketized,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
381 "--k2=13",
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
382 "--out-file=" + (self.options.template%"1-0") ])
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
383
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
384 self.execStep(TD, [
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
385 "--file1=" + f2,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
386 "--k1=9",
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
387 "--file2=" + bucketized,
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
388 "--k2=22",
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
389 "--out-file=" + (self.options.template%"0-1") ])
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
390
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
391 #----------------------------------------------------
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
def go_aggregate(self):
    """Run the aggregating variant of the pipeline.

    Every stage is launched through self.execStep with one of the
    module-level step identifiers (TF, TA, FJ, TJ, TB) and a list of
    command-line style arguments; intermediate results live in files
    obtained from self.mkTmp().  The stages, in order:

      1. Select rows from each input file (optionally restricted by the
         types/notTypes options) and extract feature ids.
      2. Summarize each selection per id into features1.txt and
         features2.txt under self.options.outDir.
      3. Find all overlapping feature pairs, optionally discarding
         pairs whose two id columns are equal (noSelfHits).
      4. Aggregate the pairs per id pair, outer-join the per-id
         summaries of both files, reformat, and bucketize into the
         files named by self.options.template.

    NOTE(review): the meaning of the IN[...] column expressions is
    defined by the external steps, which are not visible from this
    method -- confirm against those tools before changing any of them.
    """
    opts = self.options

    # Select rows from first file, and extract feature ids.
    ids1 = self.mkTmp()
    selectArgs = [
        "--file1=" + opts.file1,
        "--exec-file=" + opts.guUtilFile,
    ]
    if len(opts.types1) > 0:
        selectArgs += ["?string.lower(IN[3]) in %s" % self.types1]
    if len(opts.notTypes1) > 0:
        selectArgs += ["?string.lower(IN[3]) not in %s" % self.notTypes1]
    selectArgs.extend([
        "IN[1:9]",
        'extractID(IN[9],%s)' % opts.ire1,
        "--out-file=" + ids1,
    ])
    self.execStep(TF, selectArgs)

    # Select rows from second file, and extract feature ids.
    ids2 = self.mkTmp()
    selectArgs = [
        "--file1=" + opts.file2,
        "--exec-file=" + opts.guUtilFile,
    ]
    if len(opts.types2) > 0:
        selectArgs += ["?string.lower(IN[3]) in %s" % self.types2]
    if len(opts.notTypes2) > 0:
        selectArgs += ["?string.lower(IN[3]) not in %s" % self.notTypes2]
    selectArgs.extend([
        "IN[1:9]",
        'extractID(IN[9],%s)' % opts.ire2,
        "--out-file=" + ids2,
    ])
    self.execStep(TF, selectArgs)

    # Both inputs are summarized with the same grouping/aggregation
    # options, so they are spelled out only once.
    summaryOpts = [
        "-g9",        # id
        "-acount",    # num. lines w/ this id
        "-afirst:1",  # first chr
        "-amin:4",    # min start val
        "-amax:5",    # max end val
        "-afirst:7",  # first strand
    ]

    # Find the distinct higher-level features in file1 and count the
    # base features.
    features1 = os.path.join(opts.outDir, "features1.txt")
    self.execStep(
        TA,
        ["--file1=" + ids1] + summaryOpts + ["--out-file=" + features1])

    # Same summary for file2.
    features2 = os.path.join(opts.outDir, "features2.txt")
    self.execStep(
        TA,
        ["--file1=" + ids2] + summaryOpts + ["--out-file=" + features2])

    # Find all overlapping feature pairs.
    pairs = self.mkTmp()
    joinArgs = [
        "-1", ids1,
        "-2", ids2,
        "-o", pairs,
        "-s", "both",
        "-k", self.minOverlap,
    ]
    if opts.ignoreStrand:
        joinArgs += ["--columns1", "1,4,5"]
    novl = self.execStep(FJ, joinArgs)

    if opts.noSelfHits:
        # Re-filter the join output into a fresh temp file, keeping only
        # rows whose two id columns differ.
        unfiltered, pairs = pairs, self.mkTmp()
        novl = self.execStep(TF, [
            "?IN[10] != IN[19]",
            "-1", unfiltered,
            "-o", pairs,
        ])

    # An empty result is only reported; the remaining steps still run.
    if novl == 0:
        self.debug("No overlapping features detected.\n")

    # Aggregate overlapping feature pairs into higher-level overlaps.
    # Count the base features involved.
    aggregated = self.mkTmp()
    self.execStep(TA, [
        "-g10,19",
        "-acount:5",
        "-acount:14",
        "--file1=" + pairs,
        "--out-file=" + aggregated,
    ])

    # Join with the file1 summary to pull in total counts.  This is an
    # outer join so that every feature of file1 is represented.
    joined1 = self.mkTmp()
    self.execStep(TJ, [
        "--file1=" + aggregated,
        "--file2=" + features1,
        "--k1=1",
        "--k2=1",
        "--right",
        "-n.",
        "--out-file=" + joined1,
    ])

    # Join with the file2 summary to pull in total counts.  This is a
    # bidirectional outer join so that every feature of file2 is
    # represented, as is every feature of file1.
    joined2 = self.mkTmp()
    self.execStep(TJ, [
        "--file1=" + joined1,
        "--file2=" + features2,
        "--k1=2",
        "--k2=1",
        "--left",
        "--right",
        "-n.",
        "--out-file=" + joined2,
    ])

    # Filter for final output formatting.
    formatted = self.mkTmp()
    self.execStep(TF, [
        "--file1=" + joined2,
        "--out-file=" + formatted,
        "IN[7]=='.' and IN[13] or IN[7]",
        "IN[10]=='.' and IN[16] or IN[16]=='.' and IN[10] or IN[16]==IN[10] and IN[16] or '???'",
        "IN[5]", "IN[3]", "IN[6]", "IN[8]", "IN[9]", "IN[10]",
        "IN[11]", "IN[4]", "IN[12]", "IN[14]", "IN[15]", "IN[16]",
    ])

    # Bucketize the overlapping features.  Output a separate file for
    # each bucket.
    self.execStep(TB, [
        "--file1=" + formatted,
        "--k1=3",
        "--k2=9",
        "-n.",
        "-t" + opts.template,
        "IN[1:3]", "IN[4:]",  ## remove the bucket id column
    ])
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
542
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
def go(self):
    """Run the pipeline from start to finish, logging via self.debug.

    Logs a banner and the command line (self.argv), then runs
    go_noAggregate() when self.options.noAggregate is true and
    go_aggregate() otherwise, and finally logs the closing messages.
    """
    banner = "======================================\n"
    commandLine = " ".join(self.argv)

    self.debug(banner)
    self.debug("Starting GU pipeline\n", True)
    self.debug("Command line:\n%s\n" % commandLine)

    if not self.options.noAggregate:
        self.go_aggregate()
    else:
        self.go_noAggregate()

    self.debug("Pipeline completed.\n", True)
    self.debug("Cleaning up...\n")

    # NOTE: temp files are currently left in place -- the call that
    # would delete them is disabled:
    #self.cleanupTempFiles()
    self.debug("Goodbye.\n\n")
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
559
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
560 #--------------------------------
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
# Script entry point: build the pipeline from the process command line
# and run it.
GUPipeline(sys.argv).go()
a792b0548fe9 Upload GU tool
carlos-reyes
parents:
diff changeset
562