comparison gff2coverage.py @ 0:0d66ec694153 draft

Uploaded test first test script
author sebastian
date Fri, 21 Feb 2014 08:04:58 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0d66ec694153
1 '''
2 gff2coverage.py - compute genomic coverage of gff intervals
3 ===========================================================
4
5 :Author: Andreas Heger
6 :Release: $Id$
7 :Date: |today|
8 :Tags: Genomics Intervals Summary GFF
9
10 Purpose
11 -------
12
13 This script computes the genomic coverage of intervals
14 in a :term:`gff` formatted file. The coverage is computed
15 per feature.
16
17 Usage
18 -----
19
20 Example::
21
22 python <script_name>.py --help
23
24 Type::
25
26 python <script_name>.py --help
27
28 for command line help.
29
30 Command line options
31 --------------------
32
33 '''
34
35 import sys
36 import string
37 import re
38 import optparse
39 import time
40 import os
41 import shutil
42 import tempfile
43 import math
44 import collections
45
46 import CGAT.Experiment as E
47 import CGAT.IndexedFasta as IndexedFasta
48 import CGAT.IOTools as IOTools
49 import CGAT.GTF as GTF
50
51 def printValues( contig, max_size, window_size, values, options ):
52 """output values."""
53
54 outfile = E.openOutputFile( contig, "w" )
55
56 outfile.write( "abs_pos\trel_pos" )
57
58 for feature in options.features:
59 outfile.write("\tabs_%s\trel_%s" % (feature,feature) )
60 outfile.write("\n")
61
62 max_vv = []
63
64 for f in range(len(options.features)):
65 max_vv.append( float(max(map(lambda x: x[f], values ) )))
66
67 bin = 0
68 for vv in values:
69 outfile.write( "%i\t" % bin )
70 outfile.write( options.value_format % (float(bin) / max_size) )
71
72 for x in range(len(options.features)):
73 outfile.write( "\t%i\t%s" % (vv[x] ,
74 options.value_format % (vv[x] / max_vv[x]) ) )
75 outfile.write("\n" )
76 bin += window_size
77
78 outfile.close()
79
80 def processChunk( contig, chunk, options, fasta = None ):
81 """
82 This function requires segments to be non-overlapping.
83 """
84
85 if len(chunk) == 0: return
86
87 # check whether there are overlapping features or not
88 checked = []
89 for feature in chunk:
90 checked.append(feature)
91 others = [x for x in chunk if x not in checked]
92 for otherFeature in others:
93 if GTF.Overlap(feature, otherFeature):
94 raise ValueError( " Histogram could not be created since the file contains overlapping features! \n%s\n%s " % (feature, otherFeature) )
95 # clear auxiliary list
96 del checked[:]
97
98 # compute min_coordinate, max_coordinate for the histogram
99 min_coordinate = 1
100 max_coordinate = max( map( lambda x: x.end, chunk) )
101 ## compute window size
102 if options.window_size:
103 window_size = options.window_size
104 num_bins = int(math.ceil((float(max_coordinate) / window_size)))
105 elif options.num_bins and fasta:
106 contig_length = fasta.getLength( contig )
107 assert max_coordinate <= contig_length, "maximum coordinate (%i) larger than contig size (%i) for contig %s" % (max_coordinate, contig_length, contig )
108 max_coordinate = contig_length
109 window_size = int(math.floor( float(contig_length) / options.num_bins))
110 num_bins = options.num_bins
111 else:
112 raise ValueError("please specify a window size of provide genomic sequence with number of bins.")
113
114 values = [ [] for x in range(num_bins) ]
115
116 ## do several parses for each feature, slow, but easier to code
117 ## alternatively: sort by feature and location.
118 for feature in options.features:
119 total = 0
120 bin = 0
121 end = window_size
122 for entry in chunk:
123 if entry.feature != feature: continue
124
125 while end < entry.start:
126 values[bin].append( total )
127 bin += 1
128 end += window_size
129
130 while entry.end > end:
131 seg_start = max(entry.start, end - window_size)
132 seg_end = min(entry.end, end)
133 total += seg_end - seg_start
134 values[bin].append( total )
135 end += window_size
136 bin += 1
137 else:
138 seg_start = max(entry.start, end - window_size)
139 seg_end = min(entry.end, end)
140 total += seg_end - seg_start
141
142 while bin < num_bins:
143 values[bin].append(total)
144 bin += 1
145
146 printValues( contig, max_coordinate, window_size, values, options )
147
148 def main( argv = None ):
149 """script main.
150
151 parses command line options in sys.argv, unless *argv* is given.
152 """
153
154 if argv == None: argv = sys.argv
155
156 parser = E.OptionParser( version = "%prog version: $Id: gff2coverage.py 2781 2009-09-10 11:33:14Z andreas $", usage = globals()["__doc__"])
157
158 parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
159 help="filename with genome [default=%default]" )
160
161 parser.add_option( "-f", "--features", dest="features", type="string", action="append",
162 help="features to collect "
163 "[default=%default]" )
164
165 parser.add_option( "-w", "--window-size", dest="window_size", type="int",
166 help="window size in bp for histogram computation. "
167 "Determines the bin size. "
168 "[default=%default]" )
169
170 parser.add_option( "-b", "--num-bins", dest="num_bins", type="int",
171 help="number of bins for histogram computation "
172 "if window size is not given. "
173 "[default=%default]" )
174
175 parser.add_option( "-m", "--method", dest="method", type="choice",
176 choices = ("genomic", "histogram", ),
177 help="methods to apply. "
178 "[default=%default]" )
179
180 parser.set_defaults(
181 genome_file = None,
182 window_size = None,
183 num_bins = 1000,
184 value_format = "%6.4f",
185 features = [],
186 method = "genomic",
187 )
188
189 (options, args) = E.Start( parser, add_output_options = True )
190
191 if options.genome_file:
192 fasta = IndexedFasta.IndexedFasta( options.genome_file )
193 else:
194 fasta = None
195
196 if options.method == "histogram":
197
198 gff = GTF.readFromFile( sys.stdin )
199
200 gff.sort( lambda x,y: cmp( (x.contig, x.start), (y.contig, y.start) ) )
201
202 chunk = []
203 last_contig = None
204
205 for entry in gff:
206
207 if last_contig != entry.contig:
208 processChunk( last_contig, chunk, options, fasta )
209 last_contig = entry.contig
210 chunk = []
211
212 chunk.append( entry )
213
214 processChunk( last_contig, chunk, options, fasta )
215
216 elif options.method == "genomic":
217 intervals = collections.defaultdict( int )
218 bases = collections.defaultdict( int )
219 total = 0
220 for entry in GTF.iterator( sys.stdin ):
221 intervals[ (entry.contig, entry.source, entry.feature) ] += 1
222 bases[ (entry.contig, entry.source, entry.feature) ] += entry.end - entry.start
223 total += entry.end - entry.start
224
225 options.stdout.write( "contig\tsource\tfeature\tintervals\tbases" )
226 if fasta:
227 options.stdout.write( "\tpercent_coverage\ttotal_percent_coverage\n" )
228 else:
229 options.stdout.write( "\n" )
230
231 total_genome_size = sum( fasta.getContigSizes( with_synonyms = False ).values() )
232
233 for key in sorted (intervals.keys() ):
234 nbases = bases[key]
235 nintervals = intervals[key]
236 contig, source, feature = key
237 options.stdout.write( "\t".join( ( "\t".join(key),
238 str(nintervals),
239 str(nbases) ) ) )
240 if fasta:
241 options.stdout.write( "\t%f" % ( 100.0 * float(nbases) / fasta.getLength( contig )))
242 options.stdout.write( "\t%f\n" % ( 100.0 * float(nbases) / total_genome_size ))
243 else: options.stdout.write( "\n" )
244
245
246 E.Stop()
247
248 if __name__ == "__main__":
249 sys.exit( main( sys.argv) )
250
251