Mercurial > repos > devteam > subtract
comparison utils/gff_util.py @ 5:0d97b11ed3d5 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/subtract commit cae3e05d02e60f595bb8b6d77a84f030e9bd1689
author | devteam |
---|---|
date | Thu, 22 Jun 2017 18:51:24 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
4:a43e5a6390c1 | 5:0d97b11ed3d5 |
---|---|
1 """ | |
2 Provides utilities for working with GFF files. | |
3 """ | |
4 import copy | |
5 | |
6 from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper | |
7 from bx.tabular.io import Comment, Header, ParseError | |
8 | |
9 from .odict import odict | |
10 | |
11 | |
12 class GFFInterval( GenomicInterval ): | |
13 """ | |
14 A GFF interval, including attributes. If file is strictly a GFF file, | |
15 only attribute is 'group.' | |
16 """ | |
17 def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, | |
18 strand_col=6, score_col=5, default_strand='.', fix_strand=False ): | |
19 # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this, | |
20 # temporarily set strand and then unset after initing GenomicInterval. | |
21 unknown_strand = False | |
22 if not fix_strand and fields[ strand_col ] == '.': | |
23 unknown_strand = True | |
24 fields[ strand_col ] = '+' | |
25 GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, | |
26 default_strand, fix_strand=fix_strand ) | |
27 if unknown_strand: | |
28 self.strand = '.' | |
29 self.fields[ strand_col ] = '.' | |
30 | |
31 # Handle feature, score column. | |
32 self.feature_col = feature_col | |
33 if self.feature_col >= self.nfields: | |
34 raise MissingFieldError( "No field for feature_col (%d)" % feature_col ) | |
35 self.feature = self.fields[ self.feature_col ] | |
36 self.score_col = score_col | |
37 if self.score_col >= self.nfields: | |
38 raise MissingFieldError( "No field for score_col (%d)" % score_col ) | |
39 self.score = self.fields[ self.score_col ] | |
40 | |
41 # GFF attributes. | |
42 self.attributes = parse_gff_attributes( fields[8] ) | |
43 | |
44 def copy( self ): | |
45 return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col, | |
46 self.end_col, self.strand_col, self.score_col, self.strand) | |
47 | |
48 | |
49 class GFFFeature( GFFInterval ): | |
50 """ | |
51 A GFF feature, which can include multiple intervals. | |
52 """ | |
53 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, | |
54 strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], | |
55 raw_size=0 ): | |
56 # Use copy so that first interval and feature do not share fields. | |
57 GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, | |
58 start_col, end_col, strand_col, score_col, default_strand, | |
59 fix_strand=fix_strand ) | |
60 self.intervals = intervals | |
61 self.raw_size = raw_size | |
62 # Use intervals to set feature attributes. | |
63 for interval in self.intervals: | |
64 # Error checking. NOTE: intervals need not share the same strand. | |
65 if interval.chrom != self.chrom: | |
66 raise ValueError( "interval chrom does not match self chrom: %s != %s" % | |
67 ( interval.chrom, self.chrom ) ) | |
68 # Set start, end of interval. | |
69 if interval.start < self.start: | |
70 self.start = interval.start | |
71 if interval.end > self.end: | |
72 self.end = interval.end | |
73 | |
74 def name( self ): | |
75 """ Returns feature's name. """ | |
76 name = None | |
77 # Preference for name: GTF, GFF3, GFF. | |
78 for attr_name in ['gene_id', 'transcript_id', # GTF | |
79 'ID', 'id', # GFF3 | |
80 'group' ]: # GFF (TODO) | |
81 name = self.attributes.get( attr_name, None ) | |
82 if name is not None: | |
83 break | |
84 return name | |
85 | |
86 def copy( self ): | |
87 intervals_copy = [] | |
88 for interval in self.intervals: | |
89 intervals_copy.append( interval.copy() ) | |
90 return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col, self.strand_col, | |
91 self.score_col, self.strand, intervals=intervals_copy ) | |
92 | |
93 def lines( self ): | |
94 lines = [] | |
95 for interval in self.intervals: | |
96 lines.append( '\t'.join( interval.fields ) ) | |
97 return lines | |
98 | |
99 | |
100 class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ): | |
101 """ | |
102 Reader wrapper that reads GFF intervals/lines and automatically converts | |
103 them to BED format. | |
104 """ | |
105 | |
106 def parse_row( self, line ): | |
107 # HACK: this should return a GFF interval, but bx-python operations | |
108 # require GenomicInterval objects and subclasses will not work. | |
109 interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, | |
110 self.end_col, self.strand_col, self.default_strand, | |
111 fix_strand=self.fix_strand ) | |
112 interval = convert_gff_coords_to_bed( interval ) | |
113 return interval | |
114 | |
115 | |
116 class GFFReaderWrapper( NiceReaderWrapper ): | |
117 """ | |
118 Reader wrapper for GFF files. | |
119 | |
120 Wrapper has two major functions: | |
121 | |
122 1. group entries for GFF file (via group column), GFF3 (via id attribute), | |
123 or GTF (via gene_id/transcript id); | |
124 2. convert coordinates from GFF format--starting and ending coordinates | |
125 are 1-based, closed--to the 'traditional'/BED interval format--0 based, | |
126 half-open. This is useful when using GFF files as inputs to tools that | |
127 expect traditional interval format. | |
128 """ | |
129 | |
130 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, | |
131 end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ): | |
132 NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, | |
133 strand_col=strand_col, fix_strand=fix_strand, **kwargs ) | |
134 self.feature_col = feature_col | |
135 self.score_col = score_col | |
136 self.convert_to_bed_coord = convert_to_bed_coord | |
137 self.last_line = None | |
138 self.cur_offset = 0 | |
139 self.seed_interval = None | |
140 self.seed_interval_line_len = 0 | |
141 | |
142 def parse_row( self, line ): | |
143 interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, | |
144 self.start_col, self.end_col, self.strand_col, self.score_col, | |
145 self.default_strand, fix_strand=self.fix_strand ) | |
146 return interval | |
147 | |
148 def __next__( self ): | |
149 """ Returns next GFFFeature. """ | |
150 | |
151 # | |
152 # Helper function. | |
153 # | |
154 | |
155 def handle_parse_error( parse_error ): | |
156 """ Actions to take when ParseError found. """ | |
157 if self.outstream: | |
158 if self.print_delegate and hasattr(self.print_delegate, "__call__"): | |
159 self.print_delegate( self.outstream, e, self ) | |
160 self.skipped += 1 | |
161 # no reason to stuff an entire bad file into memmory | |
162 if self.skipped < 10: | |
163 self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) | |
164 | |
165 # For debugging, uncomment this to propogate parsing exceptions up. | |
166 # I.e. the underlying reason for an unexpected StopIteration exception | |
167 # can be found by uncommenting this. | |
168 # raise e | |
169 | |
170 # | |
171 # Get next GFFFeature | |
172 # | |
173 raw_size = self.seed_interval_line_len | |
174 | |
175 # If there is no seed interval, set one. Also, if there are no more | |
176 # intervals to read, this is where iterator dies. | |
177 if not self.seed_interval: | |
178 while not self.seed_interval: | |
179 try: | |
180 self.seed_interval = GenomicIntervalReader.next( self ) | |
181 except ParseError as e: | |
182 handle_parse_error( e ) | |
183 # TODO: When no longer supporting python 2.4 use finally: | |
184 # finally: | |
185 raw_size += len( self.current_line ) | |
186 | |
187 # If header or comment, clear seed interval and return it with its size. | |
188 if isinstance( self.seed_interval, ( Header, Comment ) ): | |
189 return_val = self.seed_interval | |
190 return_val.raw_size = len( self.current_line ) | |
191 self.seed_interval = None | |
192 self.seed_interval_line_len = 0 | |
193 return return_val | |
194 | |
195 # Initialize feature identifier from seed. | |
196 feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF | |
197 # For GFF3 | |
198 feature_id = self.seed_interval.attributes.get( 'ID', None ) | |
199 # For GTF. | |
200 feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) | |
201 | |
202 # Read all intervals associated with seed. | |
203 feature_intervals = [] | |
204 feature_intervals.append( self.seed_interval ) | |
205 while True: | |
206 try: | |
207 interval = GenomicIntervalReader.next( self ) | |
208 raw_size += len( self.current_line ) | |
209 except StopIteration as e: | |
210 # No more intervals to read, but last feature needs to be | |
211 # returned. | |
212 interval = None | |
213 raw_size += len( self.current_line ) | |
214 break | |
215 except ParseError as e: | |
216 handle_parse_error( e ) | |
217 raw_size += len( self.current_line ) | |
218 continue | |
219 # TODO: When no longer supporting python 2.4 use finally: | |
220 # finally: | |
221 # raw_size += len( self.current_line ) | |
222 | |
223 # Ignore comments. | |
224 if isinstance( interval, Comment ): | |
225 continue | |
226 | |
227 # Determine if interval is part of feature. | |
228 part_of = False | |
229 group = interval.attributes.get( 'group', None ) | |
230 # GFF test: | |
231 if group and feature_group == group: | |
232 part_of = True | |
233 # GFF3 test: | |
234 parent_id = interval.attributes.get( 'Parent', None ) | |
235 cur_id = interval.attributes.get( 'ID', None ) | |
236 if ( cur_id and cur_id == feature_id ) or ( parent_id and parent_id == feature_id ): | |
237 part_of = True | |
238 # GTF test: | |
239 transcript_id = interval.attributes.get( 'transcript_id', None ) | |
240 if transcript_id and transcript_id == feature_transcript_id: | |
241 part_of = True | |
242 | |
243 # If interval is not part of feature, clean up and break. | |
244 if not part_of: | |
245 # Adjust raw size because current line is not part of feature. | |
246 raw_size -= len( self.current_line ) | |
247 break | |
248 | |
249 # Interval associated with feature. | |
250 feature_intervals.append( interval ) | |
251 | |
252 # Last interval read is the seed for the next interval. | |
253 self.seed_interval = interval | |
254 self.seed_interval_line_len = len( self.current_line ) | |
255 | |
256 # Return feature. | |
257 feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, | |
258 self.end_col, self.strand_col, self.score_col, | |
259 self.default_strand, fix_strand=self.fix_strand, | |
260 intervals=feature_intervals, raw_size=raw_size ) | |
261 | |
262 # Convert to BED coords? | |
263 if self.convert_to_bed_coord: | |
264 convert_gff_coords_to_bed( feature ) | |
265 | |
266 return feature | |
267 next = __next__ # This line should be removed once the bx-python port to Python3 is finished | |
268 | |
269 | |
270 def convert_bed_coords_to_gff( interval ): | |
271 """ | |
272 Converts an interval object's coordinates from BED format to GFF format. | |
273 Accepted object types include GenomicInterval and list (where the first | |
274 element in the list is the interval's start, and the second element is | |
275 the interval's end). | |
276 """ | |
277 if isinstance( interval, GenomicInterval ): | |
278 interval.start += 1 | |
279 if isinstance( interval, GFFFeature ): | |
280 for subinterval in interval.intervals: | |
281 convert_bed_coords_to_gff( subinterval ) | |
282 elif type( interval ) is list: | |
283 interval[ 0 ] += 1 | |
284 return interval | |
285 | |
286 | |
287 def convert_gff_coords_to_bed( interval ): | |
288 """ | |
289 Converts an interval object's coordinates from GFF format to BED format. | |
290 Accepted object types include GFFFeature, GenomicInterval, and list (where | |
291 the first element in the list is the interval's start, and the second | |
292 element is the interval's end). | |
293 """ | |
294 if isinstance( interval, GenomicInterval ): | |
295 interval.start -= 1 | |
296 if isinstance( interval, GFFFeature ): | |
297 for subinterval in interval.intervals: | |
298 convert_gff_coords_to_bed( subinterval ) | |
299 elif type( interval ) is list: | |
300 interval[ 0 ] -= 1 | |
301 return interval | |
302 | |
303 | |
304 def parse_gff_attributes( attr_str ): | |
305 """ | |
306 Parses a GFF/GTF attribute string and returns a dictionary of name-value | |
307 pairs. The general format for a GFF3 attributes string is | |
308 | |
309 name1=value1;name2=value2 | |
310 | |
311 The general format for a GTF attribute string is | |
312 | |
313 name1 "value1" ; name2 "value2" | |
314 | |
315 The general format for a GFF attribute string is a single string that | |
316 denotes the interval's group; in this case, method returns a dictionary | |
317 with a single key-value pair, and key name is 'group' | |
318 """ | |
319 attributes_list = attr_str.split(";") | |
320 attributes = {} | |
321 for name_value_pair in attributes_list: | |
322 # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3 | |
323 # attribute; next, try double quotes for GTF. | |
324 pair = name_value_pair.strip().split("=") | |
325 if len( pair ) == 1: | |
326 pair = name_value_pair.strip().split("\"") | |
327 if len( pair ) == 1: | |
328 # Could not split for some reason -- raise exception? | |
329 continue | |
330 if pair == '': | |
331 continue | |
332 name = pair[0].strip() | |
333 if name == '': | |
334 continue | |
335 # Need to strip double quote from values | |
336 value = pair[1].strip(" \"") | |
337 attributes[ name ] = value | |
338 | |
339 if len( attributes ) == 0: | |
340 # Could not split attributes string, so entire string must be | |
341 # 'group' attribute. This is the case for strictly GFF files. | |
342 attributes['group'] = attr_str | |
343 return attributes | |
344 | |
345 | |
346 def gff_attributes_to_str( attrs, gff_format ): | |
347 """ | |
348 Convert GFF attributes to string. Supported formats are GFF3, GTF. | |
349 """ | |
350 if gff_format == 'GTF': | |
351 format_string = '%s "%s"' | |
352 # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id | |
353 id_attr = None | |
354 if 'group' in attrs: | |
355 id_attr = 'group' | |
356 elif 'ID' in attrs: | |
357 id_attr = 'ID' | |
358 elif 'Parent' in attrs: | |
359 id_attr = 'Parent' | |
360 if id_attr: | |
361 attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr] | |
362 elif gff_format == 'GFF3': | |
363 format_string = '%s=%s' | |
364 attrs_strs = [] | |
365 for name, value in attrs.items(): | |
366 attrs_strs.append( format_string % ( name, value ) ) | |
367 return " ; ".join( attrs_strs ) | |
368 | |
369 | |
370 def read_unordered_gtf( iterator, strict=False ): | |
371 """ | |
372 Returns GTF features found in an iterator. GTF lines need not be ordered | |
373 or clustered for reader to work. Reader returns GFFFeature objects sorted | |
374 by transcript_id, chrom, and start position. | |
375 """ | |
376 | |
377 # -- Get function that generates line/feature key. -- | |
378 | |
379 def get_transcript_id(fields): | |
380 return parse_gff_attributes( fields[8] )[ 'transcript_id' ] | |
381 | |
382 if strict: | |
383 # Strict GTF parsing uses transcript_id only to group lines into feature. | |
384 key_fn = get_transcript_id | |
385 else: | |
386 # Use lenient parsing where chromosome + transcript_id is the key. This allows | |
387 # transcripts with same ID on different chromosomes; this occurs in some popular | |
388 # datasources, such as RefGenes in UCSC. | |
389 def key_fn(fields): | |
390 return fields[0] + '_' + get_transcript_id( fields ) | |
391 | |
392 # Aggregate intervals by transcript_id and collect comments. | |
393 feature_intervals = odict() | |
394 comments = [] | |
395 for count, line in enumerate( iterator ): | |
396 if line.startswith( '#' ): | |
397 comments.append( Comment( line ) ) | |
398 continue | |
399 | |
400 line_key = key_fn( line.split('\t') ) | |
401 if line_key in feature_intervals: | |
402 feature = feature_intervals[ line_key ] | |
403 else: | |
404 feature = [] | |
405 feature_intervals[ line_key ] = feature | |
406 feature.append( GFFInterval( None, line.split( '\t' ) ) ) | |
407 | |
408 # Create features. | |
409 chroms_features = {} | |
410 for count, intervals in enumerate( feature_intervals.values() ): | |
411 # Sort intervals by start position. | |
412 intervals.sort( lambda a, b: cmp( a.start, b.start ) ) | |
413 feature = GFFFeature( None, intervals=intervals ) | |
414 if feature.chrom not in chroms_features: | |
415 chroms_features[ feature.chrom ] = [] | |
416 chroms_features[ feature.chrom ].append( feature ) | |
417 | |
418 # Sort features by chrom, start position. | |
419 chroms_features_sorted = [] | |
420 for chrom_features in chroms_features.values(): | |
421 chroms_features_sorted.append( chrom_features ) | |
422 chroms_features_sorted.sort( lambda a, b: cmp( a[0].chrom, b[0].chrom ) ) | |
423 for features in chroms_features_sorted: | |
424 features.sort( lambda a, b: cmp( a.start, b.start ) ) | |
425 | |
426 # Yield comments first, then features. | |
427 # FIXME: comments can appear anywhere in file, not just the beginning. | |
428 # Ideally, then comments would be associated with features and output | |
429 # just before feature/line. | |
430 for comment in comments: | |
431 yield comment | |
432 | |
433 for chrom_features in chroms_features_sorted: | |
434 for feature in chrom_features: | |
435 yield feature |