0
|
1 '''
|
|
2 gff2coverage.py - compute genomic coverage of gff intervals
|
|
3 ===========================================================
|
|
4
|
|
5 :Author: Andreas Heger
|
|
6 :Release: $Id$
|
|
7 :Date: |today|
|
|
8 :Tags: Genomics Intervals Summary GFF
|
|
9
|
|
10 Purpose
|
|
11 -------
|
|
12
|
|
13 This script computes the genomic coverage of intervals
|
|
14 in a :term:`gff` formatted file. The coverage is computed
|
|
15 per feature.
|
|
16
|
|
17 Usage
|
|
18 -----
|
|
19
|
|
20 Example::
|
|
21
|
|
22 python <script_name>.py --help
|
|
23
|
|
24 Type::
|
|
25
|
|
26 python <script_name>.py --help
|
|
27
|
|
28 for command line help.
|
|
29
|
|
30 Command line options
|
|
31 --------------------
|
|
32
|
|
33 '''
|
|
34
|
|
35 import sys
|
|
36 import string
|
|
37 import re
|
|
38 import optparse
|
|
39 import time
|
|
40 import os
|
|
41 import shutil
|
|
42 import tempfile
|
|
43 import math
|
|
44 import collections
|
|
45
|
|
46 import CGAT.Experiment as E
|
|
47 import CGAT.IndexedFasta as IndexedFasta
|
|
48 import CGAT.IOTools as IOTools
|
|
49 import CGAT.GTF as GTF
|
|
50
|
|
51 def printValues( contig, max_size, window_size, values, options ):
|
|
52 """output values."""
|
|
53
|
|
54 outfile = E.openOutputFile( contig, "w" )
|
|
55
|
|
56 outfile.write( "abs_pos\trel_pos" )
|
|
57
|
|
58 for feature in options.features:
|
|
59 outfile.write("\tabs_%s\trel_%s" % (feature,feature) )
|
|
60 outfile.write("\n")
|
|
61
|
|
62 max_vv = []
|
|
63
|
|
64 for f in range(len(options.features)):
|
|
65 max_vv.append( float(max(map(lambda x: x[f], values ) )))
|
|
66
|
|
67 bin = 0
|
|
68 for vv in values:
|
|
69 outfile.write( "%i\t" % bin )
|
|
70 outfile.write( options.value_format % (float(bin) / max_size) )
|
|
71
|
|
72 for x in range(len(options.features)):
|
|
73 outfile.write( "\t%i\t%s" % (vv[x] ,
|
|
74 options.value_format % (vv[x] / max_vv[x]) ) )
|
|
75 outfile.write("\n" )
|
|
76 bin += window_size
|
|
77
|
|
78 outfile.close()
|
|
79
|
|
80 def processChunk( contig, chunk, options, fasta = None ):
|
|
81 """
|
|
82 This function requires segments to be non-overlapping.
|
|
83 """
|
|
84
|
|
85 if len(chunk) == 0: return
|
|
86
|
|
87 # check whether there are overlapping features or not
|
|
88 checked = []
|
|
89 for feature in chunk:
|
|
90 checked.append(feature)
|
|
91 others = [x for x in chunk if x not in checked]
|
|
92 for otherFeature in others:
|
|
93 if GTF.Overlap(feature, otherFeature):
|
|
94 raise ValueError( " Histogram could not be created since the file contains overlapping features! \n%s\n%s " % (feature, otherFeature) )
|
|
95 # clear auxiliary list
|
|
96 del checked[:]
|
|
97
|
|
98 # compute min_coordinate, max_coordinate for the histogram
|
|
99 min_coordinate = 1
|
|
100 max_coordinate = max( map( lambda x: x.end, chunk) )
|
|
101 ## compute window size
|
|
102 if options.window_size:
|
|
103 window_size = options.window_size
|
|
104 num_bins = int(math.ceil((float(max_coordinate) / window_size)))
|
|
105 elif options.num_bins and fasta:
|
|
106 contig_length = fasta.getLength( contig )
|
|
107 assert max_coordinate <= contig_length, "maximum coordinate (%i) larger than contig size (%i) for contig %s" % (max_coordinate, contig_length, contig )
|
|
108 max_coordinate = contig_length
|
|
109 window_size = int(math.floor( float(contig_length) / options.num_bins))
|
|
110 num_bins = options.num_bins
|
|
111 else:
|
|
112 raise ValueError("please specify a window size of provide genomic sequence with number of bins.")
|
|
113
|
|
114 values = [ [] for x in range(num_bins) ]
|
|
115
|
|
116 ## do several parses for each feature, slow, but easier to code
|
|
117 ## alternatively: sort by feature and location.
|
|
118 for feature in options.features:
|
|
119 total = 0
|
|
120 bin = 0
|
|
121 end = window_size
|
|
122 for entry in chunk:
|
|
123 if entry.feature != feature: continue
|
|
124
|
|
125 while end < entry.start:
|
|
126 values[bin].append( total )
|
|
127 bin += 1
|
|
128 end += window_size
|
|
129
|
|
130 while entry.end > end:
|
|
131 seg_start = max(entry.start, end - window_size)
|
|
132 seg_end = min(entry.end, end)
|
|
133 total += seg_end - seg_start
|
|
134 values[bin].append( total )
|
|
135 end += window_size
|
|
136 bin += 1
|
|
137 else:
|
|
138 seg_start = max(entry.start, end - window_size)
|
|
139 seg_end = min(entry.end, end)
|
|
140 total += seg_end - seg_start
|
|
141
|
|
142 while bin < num_bins:
|
|
143 values[bin].append(total)
|
|
144 bin += 1
|
|
145
|
|
146 printValues( contig, max_coordinate, window_size, values, options )
|
|
147
|
|
148 def main( argv = None ):
|
|
149 """script main.
|
|
150
|
|
151 parses command line options in sys.argv, unless *argv* is given.
|
|
152 """
|
|
153
|
|
154 if argv == None: argv = sys.argv
|
|
155
|
|
156 parser = E.OptionParser( version = "%prog version: $Id: gff2coverage.py 2781 2009-09-10 11:33:14Z andreas $", usage = globals()["__doc__"])
|
|
157
|
|
158 parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
|
|
159 help="filename with genome [default=%default]" )
|
|
160
|
|
161 parser.add_option( "-f", "--features", dest="features", type="string", action="append",
|
|
162 help="features to collect "
|
|
163 "[default=%default]" )
|
|
164
|
|
165 parser.add_option( "-w", "--window-size", dest="window_size", type="int",
|
|
166 help="window size in bp for histogram computation. "
|
|
167 "Determines the bin size. "
|
|
168 "[default=%default]" )
|
|
169
|
|
170 parser.add_option( "-b", "--num-bins", dest="num_bins", type="int",
|
|
171 help="number of bins for histogram computation "
|
|
172 "if window size is not given. "
|
|
173 "[default=%default]" )
|
|
174
|
|
175 parser.add_option( "-m", "--method", dest="method", type="choice",
|
|
176 choices = ("genomic", "histogram", ),
|
|
177 help="methods to apply. "
|
|
178 "[default=%default]" )
|
|
179
|
|
180 parser.set_defaults(
|
|
181 genome_file = None,
|
|
182 window_size = None,
|
|
183 num_bins = 1000,
|
|
184 value_format = "%6.4f",
|
|
185 features = [],
|
|
186 method = "genomic",
|
|
187 )
|
|
188
|
|
189 (options, args) = E.Start( parser, add_output_options = True )
|
|
190
|
|
191 if options.genome_file:
|
|
192 fasta = IndexedFasta.IndexedFasta( options.genome_file )
|
|
193 else:
|
|
194 fasta = None
|
|
195
|
|
196 if options.method == "histogram":
|
|
197
|
|
198 gff = GTF.readFromFile( sys.stdin )
|
|
199
|
|
200 gff.sort( lambda x,y: cmp( (x.contig, x.start), (y.contig, y.start) ) )
|
|
201
|
|
202 chunk = []
|
|
203 last_contig = None
|
|
204
|
|
205 for entry in gff:
|
|
206
|
|
207 if last_contig != entry.contig:
|
|
208 processChunk( last_contig, chunk, options, fasta )
|
|
209 last_contig = entry.contig
|
|
210 chunk = []
|
|
211
|
|
212 chunk.append( entry )
|
|
213
|
|
214 processChunk( last_contig, chunk, options, fasta )
|
|
215
|
|
216 elif options.method == "genomic":
|
|
217 intervals = collections.defaultdict( int )
|
|
218 bases = collections.defaultdict( int )
|
|
219 total = 0
|
|
220 for entry in GTF.iterator( sys.stdin ):
|
|
221 intervals[ (entry.contig, entry.source, entry.feature) ] += 1
|
|
222 bases[ (entry.contig, entry.source, entry.feature) ] += entry.end - entry.start
|
|
223 total += entry.end - entry.start
|
|
224
|
|
225 options.stdout.write( "contig\tsource\tfeature\tintervals\tbases" )
|
|
226 if fasta:
|
|
227 options.stdout.write( "\tpercent_coverage\ttotal_percent_coverage\n" )
|
|
228 else:
|
|
229 options.stdout.write( "\n" )
|
|
230
|
|
231 total_genome_size = sum( fasta.getContigSizes( with_synonyms = False ).values() )
|
|
232
|
|
233 for key in sorted (intervals.keys() ):
|
|
234 nbases = bases[key]
|
|
235 nintervals = intervals[key]
|
|
236 contig, source, feature = key
|
|
237 options.stdout.write( "\t".join( ( "\t".join(key),
|
|
238 str(nintervals),
|
|
239 str(nbases) ) ) )
|
|
240 if fasta:
|
|
241 options.stdout.write( "\t%f" % ( 100.0 * float(nbases) / fasta.getLength( contig )))
|
|
242 options.stdout.write( "\t%f\n" % ( 100.0 * float(nbases) / total_genome_size ))
|
|
243 else: options.stdout.write( "\n" )
|
|
244
|
|
245
|
|
246 E.Stop()
|
|
247
|
|
248 if __name__ == "__main__":
|
|
249 sys.exit( main( sys.argv) )
|
|
250
|
|
251
|