comparison utils/gff_util.py @ 0:e928e029f6eb draft

Imported from capsule None
author devteam
date Tue, 01 Apr 2014 09:13:13 -0400
parents
children 850c05b9af00
comparison
equal deleted inserted replaced
-1:000000000000 0:e928e029f6eb
1 """
2 Provides utilities for working with GFF files.
3 """
4
5 import copy
6 from bx.intervals.io import *
7 from bx.tabular.io import Header, Comment
8 from utils.odict import odict
9
10 class GFFInterval( GenomicInterval ):
11 """
12 A GFF interval, including attributes. If file is strictly a GFF file,
13 only attribute is 'group.'
14 """
15 def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
16 strand_col=6, score_col=5, default_strand='.', fix_strand=False ):
17 # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this,
18 # temporarily set strand and then unset after initing GenomicInterval.
19 unknown_strand = False
20 if not fix_strand and fields[ strand_col ] == '.':
21 unknown_strand = True
22 fields[ strand_col ] = '+'
23 GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \
24 default_strand, fix_strand=fix_strand )
25 if unknown_strand:
26 self.strand = '.'
27 self.fields[ strand_col ] = '.'
28
29 # Handle feature, score column.
30 self.feature_col = feature_col
31 if self.feature_col >= self.nfields:
32 raise MissingFieldError( "No field for feature_col (%d)" % feature_col )
33 self.feature = self.fields[ self.feature_col ]
34 self.score_col = score_col
35 if self.score_col >= self.nfields:
36 raise MissingFieldError( "No field for score_col (%d)" % score_col )
37 self.score = self.fields[ self.score_col ]
38
39 # GFF attributes.
40 self.attributes = parse_gff_attributes( fields[8] )
41
42 def copy( self ):
43 return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col,
44 self.end_col, self.strand_col, self.score_col, self.strand)
45
46 class GFFFeature( GFFInterval ):
47 """
48 A GFF feature, which can include multiple intervals.
49 """
50 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
51 strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], \
52 raw_size=0 ):
53 # Use copy so that first interval and feature do not share fields.
54 GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, \
55 start_col, end_col, strand_col, score_col, default_strand, \
56 fix_strand=fix_strand )
57 self.intervals = intervals
58 self.raw_size = raw_size
59 # Use intervals to set feature attributes.
60 for interval in self.intervals:
61 # Error checking. NOTE: intervals need not share the same strand.
62 if interval.chrom != self.chrom:
63 raise ValueError( "interval chrom does not match self chrom: %s != %s" % \
64 ( interval.chrom, self.chrom ) )
65 # Set start, end of interval.
66 if interval.start < self.start:
67 self.start = interval.start
68 if interval.end > self.end:
69 self.end = interval.end
70
71 def name( self ):
72 """ Returns feature's name. """
73 name = None
74 # Preference for name: GTF, GFF3, GFF.
75 for attr_name in [
76 # GTF:
77 'gene_id', 'transcript_id',
78 # GFF3:
79 'ID', 'id',
80 # GFF (TODO):
81 'group' ]:
82 name = self.attributes.get( attr_name, None )
83 if name is not None:
84 break
85 return name
86
87 def copy( self ):
88 intervals_copy = []
89 for interval in self.intervals:
90 intervals_copy.append( interval.copy() )
91 return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col, self.strand_col,
92 self.score_col, self.strand, intervals=intervals_copy )
93
94 def lines( self ):
95 lines = []
96 for interval in self.intervals:
97 lines.append( '\t'.join( interval.fields ) )
98 return lines
99
100
101 class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ):
102 """
103 Reader wrapper that reads GFF intervals/lines and automatically converts
104 them to BED format.
105 """
106
107 def parse_row( self, line ):
108 # HACK: this should return a GFF interval, but bx-python operations
109 # require GenomicInterval objects and subclasses will not work.
110 interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
111 self.end_col, self.strand_col, self.default_strand, \
112 fix_strand=self.fix_strand )
113 interval = convert_gff_coords_to_bed( interval )
114 return interval
115
116 class GFFReaderWrapper( NiceReaderWrapper ):
117 """
118 Reader wrapper for GFF files.
119
120 Wrapper has two major functions:
121
122 1. group entries for GFF file (via group column), GFF3 (via id attribute),
123 or GTF (via gene_id/transcript id);
124 2. convert coordinates from GFF format--starting and ending coordinates
125 are 1-based, closed--to the 'traditional'/BED interval format--0 based,
126 half-open. This is useful when using GFF files as inputs to tools that
127 expect traditional interval format.
128 """
129
130 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, \
131 end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ):
132 NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, \
133 strand_col=strand_col, fix_strand=fix_strand, **kwargs )
134 self.feature_col = feature_col
135 self.score_col = score_col
136 self.convert_to_bed_coord = convert_to_bed_coord
137 self.last_line = None
138 self.cur_offset = 0
139 self.seed_interval = None
140 self.seed_interval_line_len = 0
141
142 def parse_row( self, line ):
143 interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, \
144 self.start_col, self.end_col, self.strand_col, self.score_col, \
145 self.default_strand, fix_strand=self.fix_strand )
146 return interval
147
148 def next( self ):
149 """ Returns next GFFFeature. """
150
151 #
152 # Helper function.
153 #
154
155 def handle_parse_error( parse_error ):
156 """ Actions to take when ParseError found. """
157 if self.outstream:
158 if self.print_delegate and hasattr(self.print_delegate,"__call__"):
159 self.print_delegate( self.outstream, e, self )
160 self.skipped += 1
161 # no reason to stuff an entire bad file into memmory
162 if self.skipped < 10:
163 self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
164
165 # For debugging, uncomment this to propogate parsing exceptions up.
166 # I.e. the underlying reason for an unexpected StopIteration exception
167 # can be found by uncommenting this.
168 # raise e
169
170 #
171 # Get next GFFFeature
172 #
173 raw_size = self.seed_interval_line_len
174
175 # If there is no seed interval, set one. Also, if there are no more
176 # intervals to read, this is where iterator dies.
177 if not self.seed_interval:
178 while not self.seed_interval:
179 try:
180 self.seed_interval = GenomicIntervalReader.next( self )
181 except ParseError, e:
182 handle_parse_error( e )
183 # TODO: When no longer supporting python 2.4 use finally:
184 #finally:
185 raw_size += len( self.current_line )
186
187 # If header or comment, clear seed interval and return it with its size.
188 if isinstance( self.seed_interval, ( Header, Comment ) ):
189 return_val = self.seed_interval
190 return_val.raw_size = len( self.current_line )
191 self.seed_interval = None
192 self.seed_interval_line_len = 0
193 return return_val
194
195 # Initialize feature identifier from seed.
196 feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF
197 # For GFF3
198 feature_id = self.seed_interval.attributes.get( 'ID', None )
199 feature_parent_id = self.seed_interval.attributes.get( 'Parent', None )
200 # For GTF.
201 feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None )
202 feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None )
203
204 # Read all intervals associated with seed.
205 feature_intervals = []
206 feature_intervals.append( self.seed_interval )
207 while True:
208 try:
209 interval = GenomicIntervalReader.next( self )
210 raw_size += len( self.current_line )
211 except StopIteration, e:
212 # No more intervals to read, but last feature needs to be
213 # returned.
214 interval = None
215 raw_size += len( self.current_line )
216 break
217 except ParseError, e:
218 handle_parse_error( e )
219 raw_size += len( self.current_line )
220 continue
221 # TODO: When no longer supporting python 2.4 use finally:
222 #finally:
223 #raw_size += len( self.current_line )
224
225 # Ignore comments.
226 if isinstance( interval, Comment ):
227 continue
228
229 # Determine if interval is part of feature.
230 part_of = False
231 group = interval.attributes.get( 'group', None )
232 # GFF test:
233 if group and feature_group == group:
234 part_of = True
235 # GFF3 test:
236 parent_id = interval.attributes.get( 'Parent', None )
237 cur_id = interval.attributes.get( 'ID', None )
238 if ( cur_id and cur_id == feature_id ) or ( parent_id and parent_id == feature_id ):
239 part_of = True
240 # GTF test:
241 transcript_id = interval.attributes.get( 'transcript_id', None )
242 if transcript_id and transcript_id == feature_transcript_id:
243 part_of = True
244
245 # If interval is not part of feature, clean up and break.
246 if not part_of:
247 # Adjust raw size because current line is not part of feature.
248 raw_size -= len( self.current_line )
249 break
250
251 # Interval associated with feature.
252 feature_intervals.append( interval )
253
254 # Last interval read is the seed for the next interval.
255 self.seed_interval = interval
256 self.seed_interval_line_len = len( self.current_line )
257
258 # Return feature.
259 feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, \
260 self.end_col, self.strand_col, self.score_col, \
261 self.default_strand, fix_strand=self.fix_strand, \
262 intervals=feature_intervals, raw_size=raw_size )
263
264 # Convert to BED coords?
265 if self.convert_to_bed_coord:
266 convert_gff_coords_to_bed( feature )
267
268 return feature
269
270 def convert_bed_coords_to_gff( interval ):
271 """
272 Converts an interval object's coordinates from BED format to GFF format.
273 Accepted object types include GenomicInterval and list (where the first
274 element in the list is the interval's start, and the second element is
275 the interval's end).
276 """
277 if isinstance( interval, GenomicInterval ):
278 interval.start += 1
279 if isinstance( interval, GFFFeature ):
280 for subinterval in interval.intervals:
281 convert_bed_coords_to_gff( subinterval )
282 elif type ( interval ) is list:
283 interval[ 0 ] += 1
284 return interval
285
286 def convert_gff_coords_to_bed( interval ):
287 """
288 Converts an interval object's coordinates from GFF format to BED format.
289 Accepted object types include GFFFeature, GenomicInterval, and list (where
290 the first element in the list is the interval's start, and the second
291 element is the interval's end).
292 """
293 if isinstance( interval, GenomicInterval ):
294 interval.start -= 1
295 if isinstance( interval, GFFFeature ):
296 for subinterval in interval.intervals:
297 convert_gff_coords_to_bed( subinterval )
298 elif type ( interval ) is list:
299 interval[ 0 ] -= 1
300 return interval
301
302 def parse_gff_attributes( attr_str ):
303 """
304 Parses a GFF/GTF attribute string and returns a dictionary of name-value
305 pairs. The general format for a GFF3 attributes string is
306
307 name1=value1;name2=value2
308
309 The general format for a GTF attribute string is
310
311 name1 "value1" ; name2 "value2"
312
313 The general format for a GFF attribute string is a single string that
314 denotes the interval's group; in this case, method returns a dictionary
315 with a single key-value pair, and key name is 'group'
316 """
317 attributes_list = attr_str.split(";")
318 attributes = {}
319 for name_value_pair in attributes_list:
320 # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3
321 # attribute; next, try double quotes for GTF.
322 pair = name_value_pair.strip().split("=")
323 if len( pair ) == 1:
324 pair = name_value_pair.strip().split("\"")
325 if len( pair ) == 1:
326 # Could not split for some reason -- raise exception?
327 continue
328 if pair == '':
329 continue
330 name = pair[0].strip()
331 if name == '':
332 continue
333 # Need to strip double quote from values
334 value = pair[1].strip(" \"")
335 attributes[ name ] = value
336
337 if len( attributes ) == 0:
338 # Could not split attributes string, so entire string must be
339 # 'group' attribute. This is the case for strictly GFF files.
340 attributes['group'] = attr_str
341 return attributes
342
343 def gff_attributes_to_str( attrs, gff_format ):
344 """
345 Convert GFF attributes to string. Supported formats are GFF3, GTF.
346 """
347 if gff_format == 'GTF':
348 format_string = '%s "%s"'
349 # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id
350 id_attr = None
351 if 'group' in attrs:
352 id_attr = 'group'
353 elif 'ID' in attrs:
354 id_attr = 'ID'
355 elif 'Parent' in attrs:
356 id_attr = 'Parent'
357 if id_attr:
358 attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr]
359 elif gff_format == 'GFF3':
360 format_string = '%s=%s'
361 attrs_strs = []
362 for name, value in attrs.items():
363 attrs_strs.append( format_string % ( name, value ) )
364 return " ; ".join( attrs_strs )
365
366 def read_unordered_gtf( iterator, strict=False ):
367 """
368 Returns GTF features found in an iterator. GTF lines need not be ordered
369 or clustered for reader to work. Reader returns GFFFeature objects sorted
370 by transcript_id, chrom, and start position.
371 """
372
373 # -- Get function that generates line/feature key. --
374
375 get_transcript_id = lambda fields: parse_gff_attributes( fields[8] )[ 'transcript_id' ]
376 if strict:
377 # Strict GTF parsing uses transcript_id only to group lines into feature.
378 key_fn = get_transcript_id
379 else:
380 # Use lenient parsing where chromosome + transcript_id is the key. This allows
381 # transcripts with same ID on different chromosomes; this occurs in some popular
382 # datasources, such as RefGenes in UCSC.
383 key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields )
384
385
386 # Aggregate intervals by transcript_id and collect comments.
387 feature_intervals = odict()
388 comments = []
389 for count, line in enumerate( iterator ):
390 if line.startswith( '#' ):
391 comments.append( Comment( line ) )
392 continue
393
394 line_key = key_fn( line.split('\t') )
395 if line_key in feature_intervals:
396 feature = feature_intervals[ line_key ]
397 else:
398 feature = []
399 feature_intervals[ line_key ] = feature
400 feature.append( GFFInterval( None, line.split( '\t' ) ) )
401
402 # Create features.
403 chroms_features = {}
404 for count, intervals in enumerate( feature_intervals.values() ):
405 # Sort intervals by start position.
406 intervals.sort( lambda a,b: cmp( a.start, b.start ) )
407 feature = GFFFeature( None, intervals=intervals )
408 if feature.chrom not in chroms_features:
409 chroms_features[ feature.chrom ] = []
410 chroms_features[ feature.chrom ].append( feature )
411
412 # Sort features by chrom, start position.
413 chroms_features_sorted = []
414 for chrom_features in chroms_features.values():
415 chroms_features_sorted.append( chrom_features )
416 chroms_features_sorted.sort( lambda a,b: cmp( a[0].chrom, b[0].chrom ) )
417 for features in chroms_features_sorted:
418 features.sort( lambda a,b: cmp( a.start, b.start ) )
419
420 # Yield comments first, then features.
421 # FIXME: comments can appear anywhere in file, not just the beginning.
422 # Ideally, then comments would be associated with features and output
423 # just before feature/line.
424 for comment in comments:
425 yield comment
426
427 for chrom_features in chroms_features_sorted:
428 for feature in chrom_features:
429 yield feature
430