comparison utils/gff_util.py @ 1:850c05b9af00 draft

planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
author devteam
date Tue, 13 Oct 2015 12:50:14 -0400
parents e928e029f6eb
children 94248d5b9b8b
comparison
equal deleted inserted replaced
0:e928e029f6eb 1:850c05b9af00
1 """ 1 """
2 Provides utilities for working with GFF files. 2 Provides utilities for working with GFF files.
3 """ 3 """
4 4
5 import copy 5 import copy
6 from bx.intervals.io import * 6 from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper
7 from bx.tabular.io import Header, Comment 7 from bx.tabular.io import Header, Comment, ParseError
8 from utils.odict import odict 8 from utils.odict import odict
9
9 10
10 class GFFInterval( GenomicInterval ): 11 class GFFInterval( GenomicInterval ):
11 """ 12 """
12 A GFF interval, including attributes. If file is strictly a GFF file, 13 A GFF interval, including attributes. If file is strictly a GFF file,
13 only attribute is 'group.' 14 only attribute is 'group.'
14 """ 15 """
15 def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, \ 16 def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4,
16 strand_col=6, score_col=5, default_strand='.', fix_strand=False ): 17 strand_col=6, score_col=5, default_strand='.', fix_strand=False ):
17 # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this, 18 # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this,
18 # temporarily set strand and then unset after initing GenomicInterval. 19 # temporarily set strand and then unset after initing GenomicInterval.
19 unknown_strand = False 20 unknown_strand = False
20 if not fix_strand and fields[ strand_col ] == '.': 21 if not fix_strand and fields[ strand_col ] == '.':
21 unknown_strand = True 22 unknown_strand = True
22 fields[ strand_col ] = '+' 23 fields[ strand_col ] = '+'
23 GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \ 24 GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col,
24 default_strand, fix_strand=fix_strand ) 25 default_strand, fix_strand=fix_strand )
25 if unknown_strand: 26 if unknown_strand:
26 self.strand = '.' 27 self.strand = '.'
27 self.fields[ strand_col ] = '.' 28 self.fields[ strand_col ] = '.'
28 29
41 42
42 def copy( self ): 43 def copy( self ):
43 return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col, 44 return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col,
44 self.end_col, self.strand_col, self.score_col, self.strand) 45 self.end_col, self.strand_col, self.score_col, self.strand)
45 46
47
46 class GFFFeature( GFFInterval ): 48 class GFFFeature( GFFInterval ):
47 """ 49 """
48 A GFF feature, which can include multiple intervals. 50 A GFF feature, which can include multiple intervals.
49 """ 51 """
50 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, \ 52 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4,
51 strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], \ 53 strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[],
52 raw_size=0 ): 54 raw_size=0 ):
53 # Use copy so that first interval and feature do not share fields. 55 # Use copy so that first interval and feature do not share fields.
54 GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, \ 56 GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col,
55 start_col, end_col, strand_col, score_col, default_strand, \ 57 start_col, end_col, strand_col, score_col, default_strand,
56 fix_strand=fix_strand ) 58 fix_strand=fix_strand )
57 self.intervals = intervals 59 self.intervals = intervals
58 self.raw_size = raw_size 60 self.raw_size = raw_size
59 # Use intervals to set feature attributes. 61 # Use intervals to set feature attributes.
60 for interval in self.intervals: 62 for interval in self.intervals:
61 # Error checking. NOTE: intervals need not share the same strand. 63 # Error checking. NOTE: intervals need not share the same strand.
62 if interval.chrom != self.chrom: 64 if interval.chrom != self.chrom:
63 raise ValueError( "interval chrom does not match self chrom: %s != %s" % \ 65 raise ValueError( "interval chrom does not match self chrom: %s != %s" %
64 ( interval.chrom, self.chrom ) ) 66 ( interval.chrom, self.chrom ) )
65 # Set start, end of interval. 67 # Set start, end of interval.
66 if interval.start < self.start: 68 if interval.start < self.start:
67 self.start = interval.start 69 self.start = interval.start
68 if interval.end > self.end: 70 if interval.end > self.end:
70 72
71 def name( self ): 73 def name( self ):
72 """ Returns feature's name. """ 74 """ Returns feature's name. """
73 name = None 75 name = None
74 # Preference for name: GTF, GFF3, GFF. 76 # Preference for name: GTF, GFF3, GFF.
75 for attr_name in [ 77 for attr_name in ['gene_id', 'transcript_id', # GTF
76 # GTF: 78 'ID', 'id', # GFF3
77 'gene_id', 'transcript_id', 79 'group' ]: # GFF (TODO)
78 # GFF3:
79 'ID', 'id',
80 # GFF (TODO):
81 'group' ]:
82 name = self.attributes.get( attr_name, None ) 80 name = self.attributes.get( attr_name, None )
83 if name is not None: 81 if name is not None:
84 break 82 break
85 return name 83 return name
86 84
105 """ 103 """
106 104
107 def parse_row( self, line ): 105 def parse_row( self, line ):
108 # HACK: this should return a GFF interval, but bx-python operations 106 # HACK: this should return a GFF interval, but bx-python operations
109 # require GenomicInterval objects and subclasses will not work. 107 # require GenomicInterval objects and subclasses will not work.
110 interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \ 108 interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col,
111 self.end_col, self.strand_col, self.default_strand, \ 109 self.end_col, self.strand_col, self.default_strand,
112 fix_strand=self.fix_strand ) 110 fix_strand=self.fix_strand )
113 interval = convert_gff_coords_to_bed( interval ) 111 interval = convert_gff_coords_to_bed( interval )
114 return interval 112 return interval
113
115 114
116 class GFFReaderWrapper( NiceReaderWrapper ): 115 class GFFReaderWrapper( NiceReaderWrapper ):
117 """ 116 """
118 Reader wrapper for GFF files. 117 Reader wrapper for GFF files.
119 118
125 are 1-based, closed--to the 'traditional'/BED interval format--0 based, 124 are 1-based, closed--to the 'traditional'/BED interval format--0 based,
126 half-open. This is useful when using GFF files as inputs to tools that 125 half-open. This is useful when using GFF files as inputs to tools that
127 expect traditional interval format. 126 expect traditional interval format.
128 """ 127 """
129 128
130 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, \ 129 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3,
131 end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ): 130 end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ):
132 NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, \ 131 NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col,
133 strand_col=strand_col, fix_strand=fix_strand, **kwargs ) 132 strand_col=strand_col, fix_strand=fix_strand, **kwargs )
134 self.feature_col = feature_col 133 self.feature_col = feature_col
135 self.score_col = score_col 134 self.score_col = score_col
136 self.convert_to_bed_coord = convert_to_bed_coord 135 self.convert_to_bed_coord = convert_to_bed_coord
137 self.last_line = None 136 self.last_line = None
138 self.cur_offset = 0 137 self.cur_offset = 0
139 self.seed_interval = None 138 self.seed_interval = None
140 self.seed_interval_line_len = 0 139 self.seed_interval_line_len = 0
141 140
142 def parse_row( self, line ): 141 def parse_row( self, line ):
143 interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, \ 142 interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col,
144 self.start_col, self.end_col, self.strand_col, self.score_col, \ 143 self.start_col, self.end_col, self.strand_col, self.score_col,
145 self.default_strand, fix_strand=self.fix_strand ) 144 self.default_strand, fix_strand=self.fix_strand )
146 return interval 145 return interval
147 146
148 def next( self ): 147 def next( self ):
149 """ Returns next GFFFeature. """ 148 """ Returns next GFFFeature. """
153 # 152 #
154 153
155 def handle_parse_error( parse_error ): 154 def handle_parse_error( parse_error ):
156 """ Actions to take when ParseError found. """ 155 """ Actions to take when ParseError found. """
157 if self.outstream: 156 if self.outstream:
158 if self.print_delegate and hasattr(self.print_delegate,"__call__"): 157 if self.print_delegate and hasattr(self.print_delegate, "__call__"):
159 self.print_delegate( self.outstream, e, self ) 158 self.print_delegate( self.outstream, e, self )
160 self.skipped += 1 159 self.skipped += 1
161 # no reason to stuff an entire bad file into memmory 160 # no reason to stuff an entire bad file into memmory
162 if self.skipped < 10: 161 if self.skipped < 10:
163 self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) 162 self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
164 163
165 # For debugging, uncomment this to propogate parsing exceptions up. 164 # For debugging, uncomment this to propogate parsing exceptions up.
166 # I.e. the underlying reason for an unexpected StopIteration exception 165 # I.e. the underlying reason for an unexpected StopIteration exception
167 # can be found by uncommenting this. 166 # can be found by uncommenting this.
168 # raise e 167 # raise e
191 self.seed_interval = None 190 self.seed_interval = None
192 self.seed_interval_line_len = 0 191 self.seed_interval_line_len = 0
193 return return_val 192 return return_val
194 193
195 # Initialize feature identifier from seed. 194 # Initialize feature identifier from seed.
196 feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF 195 feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF
197 # For GFF3 196 # For GFF3
198 feature_id = self.seed_interval.attributes.get( 'ID', None ) 197 feature_id = self.seed_interval.attributes.get( 'ID', None )
199 feature_parent_id = self.seed_interval.attributes.get( 'Parent', None )
200 # For GTF. 198 # For GTF.
201 feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None )
202 feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) 199 feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None )
203 200
204 # Read all intervals associated with seed. 201 # Read all intervals associated with seed.
205 feature_intervals = [] 202 feature_intervals = []
206 feature_intervals.append( self.seed_interval ) 203 feature_intervals.append( self.seed_interval )
254 # Last interval read is the seed for the next interval. 251 # Last interval read is the seed for the next interval.
255 self.seed_interval = interval 252 self.seed_interval = interval
256 self.seed_interval_line_len = len( self.current_line ) 253 self.seed_interval_line_len = len( self.current_line )
257 254
258 # Return feature. 255 # Return feature.
259 feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, \ 256 feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col,
260 self.end_col, self.strand_col, self.score_col, \ 257 self.end_col, self.strand_col, self.score_col,
261 self.default_strand, fix_strand=self.fix_strand, \ 258 self.default_strand, fix_strand=self.fix_strand,
262 intervals=feature_intervals, raw_size=raw_size ) 259 intervals=feature_intervals, raw_size=raw_size )
263 260
264 # Convert to BED coords? 261 # Convert to BED coords?
265 if self.convert_to_bed_coord: 262 if self.convert_to_bed_coord:
266 convert_gff_coords_to_bed( feature ) 263 convert_gff_coords_to_bed( feature )
267 264
268 return feature 265 return feature
266
269 267
270 def convert_bed_coords_to_gff( interval ): 268 def convert_bed_coords_to_gff( interval ):
271 """ 269 """
272 Converts an interval object's coordinates from BED format to GFF format. 270 Converts an interval object's coordinates from BED format to GFF format.
273 Accepted object types include GenomicInterval and list (where the first 271 Accepted object types include GenomicInterval and list (where the first
277 if isinstance( interval, GenomicInterval ): 275 if isinstance( interval, GenomicInterval ):
278 interval.start += 1 276 interval.start += 1
279 if isinstance( interval, GFFFeature ): 277 if isinstance( interval, GFFFeature ):
280 for subinterval in interval.intervals: 278 for subinterval in interval.intervals:
281 convert_bed_coords_to_gff( subinterval ) 279 convert_bed_coords_to_gff( subinterval )
282 elif type ( interval ) is list: 280 elif type( interval ) is list:
283 interval[ 0 ] += 1 281 interval[ 0 ] += 1
284 return interval 282 return interval
283
285 284
286 def convert_gff_coords_to_bed( interval ): 285 def convert_gff_coords_to_bed( interval ):
287 """ 286 """
288 Converts an interval object's coordinates from GFF format to BED format. 287 Converts an interval object's coordinates from GFF format to BED format.
289 Accepted object types include GFFFeature, GenomicInterval, and list (where 288 Accepted object types include GFFFeature, GenomicInterval, and list (where
293 if isinstance( interval, GenomicInterval ): 292 if isinstance( interval, GenomicInterval ):
294 interval.start -= 1 293 interval.start -= 1
295 if isinstance( interval, GFFFeature ): 294 if isinstance( interval, GFFFeature ):
296 for subinterval in interval.intervals: 295 for subinterval in interval.intervals:
297 convert_gff_coords_to_bed( subinterval ) 296 convert_gff_coords_to_bed( subinterval )
298 elif type ( interval ) is list: 297 elif type( interval ) is list:
299 interval[ 0 ] -= 1 298 interval[ 0 ] -= 1
300 return interval 299 return interval
300
301 301
302 def parse_gff_attributes( attr_str ): 302 def parse_gff_attributes( attr_str ):
303 """ 303 """
304 Parses a GFF/GTF attribute string and returns a dictionary of name-value 304 Parses a GFF/GTF attribute string and returns a dictionary of name-value
305 pairs. The general format for a GFF3 attributes string is 305 pairs. The general format for a GFF3 attributes string is
338 # Could not split attributes string, so entire string must be 338 # Could not split attributes string, so entire string must be
339 # 'group' attribute. This is the case for strictly GFF files. 339 # 'group' attribute. This is the case for strictly GFF files.
340 attributes['group'] = attr_str 340 attributes['group'] = attr_str
341 return attributes 341 return attributes
342 342
343
343 def gff_attributes_to_str( attrs, gff_format ): 344 def gff_attributes_to_str( attrs, gff_format ):
344 """ 345 """
345 Convert GFF attributes to string. Supported formats are GFF3, GTF. 346 Convert GFF attributes to string. Supported formats are GFF3, GTF.
346 """ 347 """
347 if gff_format == 'GTF': 348 if gff_format == 'GTF':
361 attrs_strs = [] 362 attrs_strs = []
362 for name, value in attrs.items(): 363 for name, value in attrs.items():
363 attrs_strs.append( format_string % ( name, value ) ) 364 attrs_strs.append( format_string % ( name, value ) )
364 return " ; ".join( attrs_strs ) 365 return " ; ".join( attrs_strs )
365 366
367
366 def read_unordered_gtf( iterator, strict=False ): 368 def read_unordered_gtf( iterator, strict=False ):
367 """ 369 """
368 Returns GTF features found in an iterator. GTF lines need not be ordered 370 Returns GTF features found in an iterator. GTF lines need not be ordered
369 or clustered for reader to work. Reader returns GFFFeature objects sorted 371 or clustered for reader to work. Reader returns GFFFeature objects sorted
370 by transcript_id, chrom, and start position. 372 by transcript_id, chrom, and start position.
380 # Use lenient parsing where chromosome + transcript_id is the key. This allows 382 # Use lenient parsing where chromosome + transcript_id is the key. This allows
381 # transcripts with same ID on different chromosomes; this occurs in some popular 383 # transcripts with same ID on different chromosomes; this occurs in some popular
382 # datasources, such as RefGenes in UCSC. 384 # datasources, such as RefGenes in UCSC.
383 key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields ) 385 key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields )
384 386
385
386 # Aggregate intervals by transcript_id and collect comments. 387 # Aggregate intervals by transcript_id and collect comments.
387 feature_intervals = odict() 388 feature_intervals = odict()
388 comments = [] 389 comments = []
389 for count, line in enumerate( iterator ): 390 for count, line in enumerate( iterator ):
390 if line.startswith( '#' ): 391 if line.startswith( '#' ):
401 402
402 # Create features. 403 # Create features.
403 chroms_features = {} 404 chroms_features = {}
404 for count, intervals in enumerate( feature_intervals.values() ): 405 for count, intervals in enumerate( feature_intervals.values() ):
405 # Sort intervals by start position. 406 # Sort intervals by start position.
406 intervals.sort( lambda a,b: cmp( a.start, b.start ) ) 407 intervals.sort( lambda a, b: cmp( a.start, b.start ) )
407 feature = GFFFeature( None, intervals=intervals ) 408 feature = GFFFeature( None, intervals=intervals )
408 if feature.chrom not in chroms_features: 409 if feature.chrom not in chroms_features:
409 chroms_features[ feature.chrom ] = [] 410 chroms_features[ feature.chrom ] = []
410 chroms_features[ feature.chrom ].append( feature ) 411 chroms_features[ feature.chrom ].append( feature )
411 412
412 # Sort features by chrom, start position. 413 # Sort features by chrom, start position.
413 chroms_features_sorted = [] 414 chroms_features_sorted = []
414 for chrom_features in chroms_features.values(): 415 for chrom_features in chroms_features.values():
415 chroms_features_sorted.append( chrom_features ) 416 chroms_features_sorted.append( chrom_features )
416 chroms_features_sorted.sort( lambda a,b: cmp( a[0].chrom, b[0].chrom ) ) 417 chroms_features_sorted.sort( lambda a, b: cmp( a[0].chrom, b[0].chrom ) )
417 for features in chroms_features_sorted: 418 for features in chroms_features_sorted:
418 features.sort( lambda a,b: cmp( a.start, b.start ) ) 419 features.sort( lambda a, b: cmp( a.start, b.start ) )
419 420
420 # Yield comments first, then features. 421 # Yield comments first, then features.
421 # FIXME: comments can appear anywhere in file, not just the beginning. 422 # FIXME: comments can appear anywhere in file, not just the beginning.
422 # Ideally, then comments would be associated with features and output 423 # Ideally, then comments would be associated with features and output
423 # just before feature/line. 424 # just before feature/line.
425 yield comment 426 yield comment
426 427
427 for chrom_features in chroms_features_sorted: 428 for chrom_features in chroms_features_sorted:
428 for feature in chrom_features: 429 for feature in chrom_features:
429 yield feature 430 yield feature
430