Mercurial > repos > sebastian > my_first_test
comparison gff2coverage.py @ 0:0d66ec694153 draft
Uploaded test first test script
author | sebastian |
---|---|
date | Fri, 21 Feb 2014 08:04:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0d66ec694153 |
---|---|
1 ''' | |
2 gff2coverage.py - compute genomic coverage of gff intervals | |
3 =========================================================== | |
4 | |
5 :Author: Andreas Heger | |
6 :Release: $Id$ | |
7 :Date: |today| | |
8 :Tags: Genomics Intervals Summary GFF | |
9 | |
10 Purpose | |
11 ------- | |
12 | |
13 This script computes the genomic coverage of intervals | |
14 in a :term:`gff` formatted file. The coverage is computed | |
15 per feature. | |
16 | |
17 Usage | |
18 ----- | |
19 | |
20 Example:: | |
21 | |
22 python <script_name>.py --help | |
23 | |
24 Type:: | |
25 | |
26 python <script_name>.py --help | |
27 | |
28 for command line help. | |
29 | |
30 Command line options | |
31 -------------------- | |
32 | |
33 ''' | |
34 | |
35 import sys | |
36 import string | |
37 import re | |
38 import optparse | |
39 import time | |
40 import os | |
41 import shutil | |
42 import tempfile | |
43 import math | |
44 import collections | |
45 | |
46 import CGAT.Experiment as E | |
47 import CGAT.IndexedFasta as IndexedFasta | |
48 import CGAT.IOTools as IOTools | |
49 import CGAT.GTF as GTF | |
50 | |
51 def printValues( contig, max_size, window_size, values, options ): | |
52 """output values.""" | |
53 | |
54 outfile = E.openOutputFile( contig, "w" ) | |
55 | |
56 outfile.write( "abs_pos\trel_pos" ) | |
57 | |
58 for feature in options.features: | |
59 outfile.write("\tabs_%s\trel_%s" % (feature,feature) ) | |
60 outfile.write("\n") | |
61 | |
62 max_vv = [] | |
63 | |
64 for f in range(len(options.features)): | |
65 max_vv.append( float(max(map(lambda x: x[f], values ) ))) | |
66 | |
67 bin = 0 | |
68 for vv in values: | |
69 outfile.write( "%i\t" % bin ) | |
70 outfile.write( options.value_format % (float(bin) / max_size) ) | |
71 | |
72 for x in range(len(options.features)): | |
73 outfile.write( "\t%i\t%s" % (vv[x] , | |
74 options.value_format % (vv[x] / max_vv[x]) ) ) | |
75 outfile.write("\n" ) | |
76 bin += window_size | |
77 | |
78 outfile.close() | |
79 | |
80 def processChunk( contig, chunk, options, fasta = None ): | |
81 """ | |
82 This function requires segments to be non-overlapping. | |
83 """ | |
84 | |
85 if len(chunk) == 0: return | |
86 | |
87 # check whether there are overlapping features or not | |
88 checked = [] | |
89 for feature in chunk: | |
90 checked.append(feature) | |
91 others = [x for x in chunk if x not in checked] | |
92 for otherFeature in others: | |
93 if GTF.Overlap(feature, otherFeature): | |
94 raise ValueError( " Histogram could not be created since the file contains overlapping features! \n%s\n%s " % (feature, otherFeature) ) | |
95 # clear auxiliary list | |
96 del checked[:] | |
97 | |
98 # compute min_coordinate, max_coordinate for the histogram | |
99 min_coordinate = 1 | |
100 max_coordinate = max( map( lambda x: x.end, chunk) ) | |
101 ## compute window size | |
102 if options.window_size: | |
103 window_size = options.window_size | |
104 num_bins = int(math.ceil((float(max_coordinate) / window_size))) | |
105 elif options.num_bins and fasta: | |
106 contig_length = fasta.getLength( contig ) | |
107 assert max_coordinate <= contig_length, "maximum coordinate (%i) larger than contig size (%i) for contig %s" % (max_coordinate, contig_length, contig ) | |
108 max_coordinate = contig_length | |
109 window_size = int(math.floor( float(contig_length) / options.num_bins)) | |
110 num_bins = options.num_bins | |
111 else: | |
112 raise ValueError("please specify a window size of provide genomic sequence with number of bins.") | |
113 | |
114 values = [ [] for x in range(num_bins) ] | |
115 | |
116 ## do several parses for each feature, slow, but easier to code | |
117 ## alternatively: sort by feature and location. | |
118 for feature in options.features: | |
119 total = 0 | |
120 bin = 0 | |
121 end = window_size | |
122 for entry in chunk: | |
123 if entry.feature != feature: continue | |
124 | |
125 while end < entry.start: | |
126 values[bin].append( total ) | |
127 bin += 1 | |
128 end += window_size | |
129 | |
130 while entry.end > end: | |
131 seg_start = max(entry.start, end - window_size) | |
132 seg_end = min(entry.end, end) | |
133 total += seg_end - seg_start | |
134 values[bin].append( total ) | |
135 end += window_size | |
136 bin += 1 | |
137 else: | |
138 seg_start = max(entry.start, end - window_size) | |
139 seg_end = min(entry.end, end) | |
140 total += seg_end - seg_start | |
141 | |
142 while bin < num_bins: | |
143 values[bin].append(total) | |
144 bin += 1 | |
145 | |
146 printValues( contig, max_coordinate, window_size, values, options ) | |
147 | |
148 def main( argv = None ): | |
149 """script main. | |
150 | |
151 parses command line options in sys.argv, unless *argv* is given. | |
152 """ | |
153 | |
154 if argv == None: argv = sys.argv | |
155 | |
156 parser = E.OptionParser( version = "%prog version: $Id: gff2coverage.py 2781 2009-09-10 11:33:14Z andreas $", usage = globals()["__doc__"]) | |
157 | |
158 parser.add_option( "-g", "--genome-file", dest="genome_file", type="string", | |
159 help="filename with genome [default=%default]" ) | |
160 | |
161 parser.add_option( "-f", "--features", dest="features", type="string", action="append", | |
162 help="features to collect " | |
163 "[default=%default]" ) | |
164 | |
165 parser.add_option( "-w", "--window-size", dest="window_size", type="int", | |
166 help="window size in bp for histogram computation. " | |
167 "Determines the bin size. " | |
168 "[default=%default]" ) | |
169 | |
170 parser.add_option( "-b", "--num-bins", dest="num_bins", type="int", | |
171 help="number of bins for histogram computation " | |
172 "if window size is not given. " | |
173 "[default=%default]" ) | |
174 | |
175 parser.add_option( "-m", "--method", dest="method", type="choice", | |
176 choices = ("genomic", "histogram", ), | |
177 help="methods to apply. " | |
178 "[default=%default]" ) | |
179 | |
180 parser.set_defaults( | |
181 genome_file = None, | |
182 window_size = None, | |
183 num_bins = 1000, | |
184 value_format = "%6.4f", | |
185 features = [], | |
186 method = "genomic", | |
187 ) | |
188 | |
189 (options, args) = E.Start( parser, add_output_options = True ) | |
190 | |
191 if options.genome_file: | |
192 fasta = IndexedFasta.IndexedFasta( options.genome_file ) | |
193 else: | |
194 fasta = None | |
195 | |
196 if options.method == "histogram": | |
197 | |
198 gff = GTF.readFromFile( sys.stdin ) | |
199 | |
200 gff.sort( lambda x,y: cmp( (x.contig, x.start), (y.contig, y.start) ) ) | |
201 | |
202 chunk = [] | |
203 last_contig = None | |
204 | |
205 for entry in gff: | |
206 | |
207 if last_contig != entry.contig: | |
208 processChunk( last_contig, chunk, options, fasta ) | |
209 last_contig = entry.contig | |
210 chunk = [] | |
211 | |
212 chunk.append( entry ) | |
213 | |
214 processChunk( last_contig, chunk, options, fasta ) | |
215 | |
216 elif options.method == "genomic": | |
217 intervals = collections.defaultdict( int ) | |
218 bases = collections.defaultdict( int ) | |
219 total = 0 | |
220 for entry in GTF.iterator( sys.stdin ): | |
221 intervals[ (entry.contig, entry.source, entry.feature) ] += 1 | |
222 bases[ (entry.contig, entry.source, entry.feature) ] += entry.end - entry.start | |
223 total += entry.end - entry.start | |
224 | |
225 options.stdout.write( "contig\tsource\tfeature\tintervals\tbases" ) | |
226 if fasta: | |
227 options.stdout.write( "\tpercent_coverage\ttotal_percent_coverage\n" ) | |
228 else: | |
229 options.stdout.write( "\n" ) | |
230 | |
231 total_genome_size = sum( fasta.getContigSizes( with_synonyms = False ).values() ) | |
232 | |
233 for key in sorted (intervals.keys() ): | |
234 nbases = bases[key] | |
235 nintervals = intervals[key] | |
236 contig, source, feature = key | |
237 options.stdout.write( "\t".join( ( "\t".join(key), | |
238 str(nintervals), | |
239 str(nbases) ) ) ) | |
240 if fasta: | |
241 options.stdout.write( "\t%f" % ( 100.0 * float(nbases) / fasta.getLength( contig ))) | |
242 options.stdout.write( "\t%f\n" % ( 100.0 * float(nbases) / total_genome_size )) | |
243 else: options.stdout.write( "\n" ) | |
244 | |
245 | |
246 E.Stop() | |
247 | |
248 if __name__ == "__main__": | |
249 sys.exit( main( sys.argv) ) | |
250 | |
251 |