comparison utils/gff_util.py @ 5:60925436ca5f draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/intersect commit cae3e05d02e60f595bb8b6d77a84f030e9bd1689
author devteam
date Thu, 22 Jun 2017 18:50:20 -0400
parents
children
comparison
equal deleted inserted replaced
4:21717d77aee5 5:60925436ca5f
1 """
2 Provides utilities for working with GFF files.
3 """
4 import copy
5
6 from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper
7 from bx.tabular.io import Comment, Header, ParseError
8
9 from .odict import odict
10
11
12 class GFFInterval( GenomicInterval ):
13 """
14 A GFF interval, including attributes. If file is strictly a GFF file,
15 only attribute is 'group.'
16 """
17 def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4,
18 strand_col=6, score_col=5, default_strand='.', fix_strand=False ):
19 # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this,
20 # temporarily set strand and then unset after initing GenomicInterval.
21 unknown_strand = False
22 if not fix_strand and fields[ strand_col ] == '.':
23 unknown_strand = True
24 fields[ strand_col ] = '+'
25 GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col,
26 default_strand, fix_strand=fix_strand )
27 if unknown_strand:
28 self.strand = '.'
29 self.fields[ strand_col ] = '.'
30
31 # Handle feature, score column.
32 self.feature_col = feature_col
33 if self.feature_col >= self.nfields:
34 raise MissingFieldError( "No field for feature_col (%d)" % feature_col )
35 self.feature = self.fields[ self.feature_col ]
36 self.score_col = score_col
37 if self.score_col >= self.nfields:
38 raise MissingFieldError( "No field for score_col (%d)" % score_col )
39 self.score = self.fields[ self.score_col ]
40
41 # GFF attributes.
42 self.attributes = parse_gff_attributes( fields[8] )
43
44 def copy( self ):
45 return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col,
46 self.end_col, self.strand_col, self.score_col, self.strand)
47
48
49 class GFFFeature( GFFInterval ):
50 """
51 A GFF feature, which can include multiple intervals.
52 """
53 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4,
54 strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[],
55 raw_size=0 ):
56 # Use copy so that first interval and feature do not share fields.
57 GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col,
58 start_col, end_col, strand_col, score_col, default_strand,
59 fix_strand=fix_strand )
60 self.intervals = intervals
61 self.raw_size = raw_size
62 # Use intervals to set feature attributes.
63 for interval in self.intervals:
64 # Error checking. NOTE: intervals need not share the same strand.
65 if interval.chrom != self.chrom:
66 raise ValueError( "interval chrom does not match self chrom: %s != %s" %
67 ( interval.chrom, self.chrom ) )
68 # Set start, end of interval.
69 if interval.start < self.start:
70 self.start = interval.start
71 if interval.end > self.end:
72 self.end = interval.end
73
74 def name( self ):
75 """ Returns feature's name. """
76 name = None
77 # Preference for name: GTF, GFF3, GFF.
78 for attr_name in ['gene_id', 'transcript_id', # GTF
79 'ID', 'id', # GFF3
80 'group' ]: # GFF (TODO)
81 name = self.attributes.get( attr_name, None )
82 if name is not None:
83 break
84 return name
85
86 def copy( self ):
87 intervals_copy = []
88 for interval in self.intervals:
89 intervals_copy.append( interval.copy() )
90 return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col, self.strand_col,
91 self.score_col, self.strand, intervals=intervals_copy )
92
93 def lines( self ):
94 lines = []
95 for interval in self.intervals:
96 lines.append( '\t'.join( interval.fields ) )
97 return lines
98
99
100 class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ):
101 """
102 Reader wrapper that reads GFF intervals/lines and automatically converts
103 them to BED format.
104 """
105
106 def parse_row( self, line ):
107 # HACK: this should return a GFF interval, but bx-python operations
108 # require GenomicInterval objects and subclasses will not work.
109 interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col,
110 self.end_col, self.strand_col, self.default_strand,
111 fix_strand=self.fix_strand )
112 interval = convert_gff_coords_to_bed( interval )
113 return interval
114
115
116 class GFFReaderWrapper( NiceReaderWrapper ):
117 """
118 Reader wrapper for GFF files.
119
120 Wrapper has two major functions:
121
122 1. group entries for GFF file (via group column), GFF3 (via id attribute),
123 or GTF (via gene_id/transcript id);
124 2. convert coordinates from GFF format--starting and ending coordinates
125 are 1-based, closed--to the 'traditional'/BED interval format--0 based,
126 half-open. This is useful when using GFF files as inputs to tools that
127 expect traditional interval format.
128 """
129
130 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3,
131 end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ):
132 NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col,
133 strand_col=strand_col, fix_strand=fix_strand, **kwargs )
134 self.feature_col = feature_col
135 self.score_col = score_col
136 self.convert_to_bed_coord = convert_to_bed_coord
137 self.last_line = None
138 self.cur_offset = 0
139 self.seed_interval = None
140 self.seed_interval_line_len = 0
141
142 def parse_row( self, line ):
143 interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col,
144 self.start_col, self.end_col, self.strand_col, self.score_col,
145 self.default_strand, fix_strand=self.fix_strand )
146 return interval
147
148 def __next__( self ):
149 """ Returns next GFFFeature. """
150
151 #
152 # Helper function.
153 #
154
155 def handle_parse_error( parse_error ):
156 """ Actions to take when ParseError found. """
157 if self.outstream:
158 if self.print_delegate and hasattr(self.print_delegate, "__call__"):
159 self.print_delegate( self.outstream, e, self )
160 self.skipped += 1
161 # no reason to stuff an entire bad file into memmory
162 if self.skipped < 10:
163 self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
164
165 # For debugging, uncomment this to propogate parsing exceptions up.
166 # I.e. the underlying reason for an unexpected StopIteration exception
167 # can be found by uncommenting this.
168 # raise e
169
170 #
171 # Get next GFFFeature
172 #
173 raw_size = self.seed_interval_line_len
174
175 # If there is no seed interval, set one. Also, if there are no more
176 # intervals to read, this is where iterator dies.
177 if not self.seed_interval:
178 while not self.seed_interval:
179 try:
180 self.seed_interval = GenomicIntervalReader.next( self )
181 except ParseError as e:
182 handle_parse_error( e )
183 # TODO: When no longer supporting python 2.4 use finally:
184 # finally:
185 raw_size += len( self.current_line )
186
187 # If header or comment, clear seed interval and return it with its size.
188 if isinstance( self.seed_interval, ( Header, Comment ) ):
189 return_val = self.seed_interval
190 return_val.raw_size = len( self.current_line )
191 self.seed_interval = None
192 self.seed_interval_line_len = 0
193 return return_val
194
195 # Initialize feature identifier from seed.
196 feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF
197 # For GFF3
198 feature_id = self.seed_interval.attributes.get( 'ID', None )
199 # For GTF.
200 feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None )
201
202 # Read all intervals associated with seed.
203 feature_intervals = []
204 feature_intervals.append( self.seed_interval )
205 while True:
206 try:
207 interval = GenomicIntervalReader.next( self )
208 raw_size += len( self.current_line )
209 except StopIteration as e:
210 # No more intervals to read, but last feature needs to be
211 # returned.
212 interval = None
213 raw_size += len( self.current_line )
214 break
215 except ParseError as e:
216 handle_parse_error( e )
217 raw_size += len( self.current_line )
218 continue
219 # TODO: When no longer supporting python 2.4 use finally:
220 # finally:
221 # raw_size += len( self.current_line )
222
223 # Ignore comments.
224 if isinstance( interval, Comment ):
225 continue
226
227 # Determine if interval is part of feature.
228 part_of = False
229 group = interval.attributes.get( 'group', None )
230 # GFF test:
231 if group and feature_group == group:
232 part_of = True
233 # GFF3 test:
234 parent_id = interval.attributes.get( 'Parent', None )
235 cur_id = interval.attributes.get( 'ID', None )
236 if ( cur_id and cur_id == feature_id ) or ( parent_id and parent_id == feature_id ):
237 part_of = True
238 # GTF test:
239 transcript_id = interval.attributes.get( 'transcript_id', None )
240 if transcript_id and transcript_id == feature_transcript_id:
241 part_of = True
242
243 # If interval is not part of feature, clean up and break.
244 if not part_of:
245 # Adjust raw size because current line is not part of feature.
246 raw_size -= len( self.current_line )
247 break
248
249 # Interval associated with feature.
250 feature_intervals.append( interval )
251
252 # Last interval read is the seed for the next interval.
253 self.seed_interval = interval
254 self.seed_interval_line_len = len( self.current_line )
255
256 # Return feature.
257 feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col,
258 self.end_col, self.strand_col, self.score_col,
259 self.default_strand, fix_strand=self.fix_strand,
260 intervals=feature_intervals, raw_size=raw_size )
261
262 # Convert to BED coords?
263 if self.convert_to_bed_coord:
264 convert_gff_coords_to_bed( feature )
265
266 return feature
267 next = __next__ # This line should be removed once the bx-python port to Python3 is finished
268
269
270 def convert_bed_coords_to_gff( interval ):
271 """
272 Converts an interval object's coordinates from BED format to GFF format.
273 Accepted object types include GenomicInterval and list (where the first
274 element in the list is the interval's start, and the second element is
275 the interval's end).
276 """
277 if isinstance( interval, GenomicInterval ):
278 interval.start += 1
279 if isinstance( interval, GFFFeature ):
280 for subinterval in interval.intervals:
281 convert_bed_coords_to_gff( subinterval )
282 elif type( interval ) is list:
283 interval[ 0 ] += 1
284 return interval
285
286
287 def convert_gff_coords_to_bed( interval ):
288 """
289 Converts an interval object's coordinates from GFF format to BED format.
290 Accepted object types include GFFFeature, GenomicInterval, and list (where
291 the first element in the list is the interval's start, and the second
292 element is the interval's end).
293 """
294 if isinstance( interval, GenomicInterval ):
295 interval.start -= 1
296 if isinstance( interval, GFFFeature ):
297 for subinterval in interval.intervals:
298 convert_gff_coords_to_bed( subinterval )
299 elif type( interval ) is list:
300 interval[ 0 ] -= 1
301 return interval
302
303
304 def parse_gff_attributes( attr_str ):
305 """
306 Parses a GFF/GTF attribute string and returns a dictionary of name-value
307 pairs. The general format for a GFF3 attributes string is
308
309 name1=value1;name2=value2
310
311 The general format for a GTF attribute string is
312
313 name1 "value1" ; name2 "value2"
314
315 The general format for a GFF attribute string is a single string that
316 denotes the interval's group; in this case, method returns a dictionary
317 with a single key-value pair, and key name is 'group'
318 """
319 attributes_list = attr_str.split(";")
320 attributes = {}
321 for name_value_pair in attributes_list:
322 # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3
323 # attribute; next, try double quotes for GTF.
324 pair = name_value_pair.strip().split("=")
325 if len( pair ) == 1:
326 pair = name_value_pair.strip().split("\"")
327 if len( pair ) == 1:
328 # Could not split for some reason -- raise exception?
329 continue
330 if pair == '':
331 continue
332 name = pair[0].strip()
333 if name == '':
334 continue
335 # Need to strip double quote from values
336 value = pair[1].strip(" \"")
337 attributes[ name ] = value
338
339 if len( attributes ) == 0:
340 # Could not split attributes string, so entire string must be
341 # 'group' attribute. This is the case for strictly GFF files.
342 attributes['group'] = attr_str
343 return attributes
344
345
346 def gff_attributes_to_str( attrs, gff_format ):
347 """
348 Convert GFF attributes to string. Supported formats are GFF3, GTF.
349 """
350 if gff_format == 'GTF':
351 format_string = '%s "%s"'
352 # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id
353 id_attr = None
354 if 'group' in attrs:
355 id_attr = 'group'
356 elif 'ID' in attrs:
357 id_attr = 'ID'
358 elif 'Parent' in attrs:
359 id_attr = 'Parent'
360 if id_attr:
361 attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr]
362 elif gff_format == 'GFF3':
363 format_string = '%s=%s'
364 attrs_strs = []
365 for name, value in attrs.items():
366 attrs_strs.append( format_string % ( name, value ) )
367 return " ; ".join( attrs_strs )
368
369
370 def read_unordered_gtf( iterator, strict=False ):
371 """
372 Returns GTF features found in an iterator. GTF lines need not be ordered
373 or clustered for reader to work. Reader returns GFFFeature objects sorted
374 by transcript_id, chrom, and start position.
375 """
376
377 # -- Get function that generates line/feature key. --
378
379 def get_transcript_id(fields):
380 return parse_gff_attributes( fields[8] )[ 'transcript_id' ]
381
382 if strict:
383 # Strict GTF parsing uses transcript_id only to group lines into feature.
384 key_fn = get_transcript_id
385 else:
386 # Use lenient parsing where chromosome + transcript_id is the key. This allows
387 # transcripts with same ID on different chromosomes; this occurs in some popular
388 # datasources, such as RefGenes in UCSC.
389 def key_fn(fields):
390 return fields[0] + '_' + get_transcript_id( fields )
391
392 # Aggregate intervals by transcript_id and collect comments.
393 feature_intervals = odict()
394 comments = []
395 for count, line in enumerate( iterator ):
396 if line.startswith( '#' ):
397 comments.append( Comment( line ) )
398 continue
399
400 line_key = key_fn( line.split('\t') )
401 if line_key in feature_intervals:
402 feature = feature_intervals[ line_key ]
403 else:
404 feature = []
405 feature_intervals[ line_key ] = feature
406 feature.append( GFFInterval( None, line.split( '\t' ) ) )
407
408 # Create features.
409 chroms_features = {}
410 for count, intervals in enumerate( feature_intervals.values() ):
411 # Sort intervals by start position.
412 intervals.sort( lambda a, b: cmp( a.start, b.start ) )
413 feature = GFFFeature( None, intervals=intervals )
414 if feature.chrom not in chroms_features:
415 chroms_features[ feature.chrom ] = []
416 chroms_features[ feature.chrom ].append( feature )
417
418 # Sort features by chrom, start position.
419 chroms_features_sorted = []
420 for chrom_features in chroms_features.values():
421 chroms_features_sorted.append( chrom_features )
422 chroms_features_sorted.sort( lambda a, b: cmp( a[0].chrom, b[0].chrom ) )
423 for features in chroms_features_sorted:
424 features.sort( lambda a, b: cmp( a.start, b.start ) )
425
426 # Yield comments first, then features.
427 # FIXME: comments can appear anywhere in file, not just the beginning.
428 # Ideally, then comments would be associated with features and output
429 # just before feature/line.
430 for comment in comments:
431 yield comment
432
433 for chrom_features in chroms_features_sorted:
434 for feature in chrom_features:
435 yield feature