comparison utils/gff_util.py @ 3:94248d5b9b8b draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit cae3e05d02e60f595bb8b6d77a84f030e9bd1689
author devteam
date Thu, 22 Jun 2017 18:39:31 -0400
parents 850c05b9af00
children
comparison
equal deleted inserted replaced
2:d94e778c3ad1 3:94248d5b9b8b
1 """ 1 """
2 Provides utilities for working with GFF files. 2 Provides utilities for working with GFF files.
3 """ 3 """
4
5 import copy 4 import copy
5
6 from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper 6 from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper
7 from bx.tabular.io import Header, Comment, ParseError 7 from bx.tabular.io import Comment, Header, ParseError
8 from utils.odict import odict 8
9 from .odict import odict
9 10
10 11
11 class GFFInterval( GenomicInterval ): 12 class GFFInterval( GenomicInterval ):
12 """ 13 """
13 A GFF interval, including attributes. If file is strictly a GFF file, 14 A GFF interval, including attributes. If file is strictly a GFF file,
142 interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, 143 interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col,
143 self.start_col, self.end_col, self.strand_col, self.score_col, 144 self.start_col, self.end_col, self.strand_col, self.score_col,
144 self.default_strand, fix_strand=self.fix_strand ) 145 self.default_strand, fix_strand=self.fix_strand )
145 return interval 146 return interval
146 147
147 def next( self ): 148 def __next__( self ):
148 """ Returns next GFFFeature. """ 149 """ Returns next GFFFeature. """
149 150
150 # 151 #
151 # Helper function. 152 # Helper function.
152 # 153 #
175 # intervals to read, this is where iterator dies. 176 # intervals to read, this is where iterator dies.
176 if not self.seed_interval: 177 if not self.seed_interval:
177 while not self.seed_interval: 178 while not self.seed_interval:
178 try: 179 try:
179 self.seed_interval = GenomicIntervalReader.next( self ) 180 self.seed_interval = GenomicIntervalReader.next( self )
180 except ParseError, e: 181 except ParseError as e:
181 handle_parse_error( e ) 182 handle_parse_error( e )
182 # TODO: When no longer supporting python 2.4 use finally: 183 # TODO: When no longer supporting python 2.4 use finally:
183 #finally: 184 # finally:
184 raw_size += len( self.current_line ) 185 raw_size += len( self.current_line )
185 186
186 # If header or comment, clear seed interval and return it with its size. 187 # If header or comment, clear seed interval and return it with its size.
187 if isinstance( self.seed_interval, ( Header, Comment ) ): 188 if isinstance( self.seed_interval, ( Header, Comment ) ):
188 return_val = self.seed_interval 189 return_val = self.seed_interval
203 feature_intervals.append( self.seed_interval ) 204 feature_intervals.append( self.seed_interval )
204 while True: 205 while True:
205 try: 206 try:
206 interval = GenomicIntervalReader.next( self ) 207 interval = GenomicIntervalReader.next( self )
207 raw_size += len( self.current_line ) 208 raw_size += len( self.current_line )
208 except StopIteration, e: 209 except StopIteration as e:
209 # No more intervals to read, but last feature needs to be 210 # No more intervals to read, but last feature needs to be
210 # returned. 211 # returned.
211 interval = None 212 interval = None
212 raw_size += len( self.current_line ) 213 raw_size += len( self.current_line )
213 break 214 break
214 except ParseError, e: 215 except ParseError as e:
215 handle_parse_error( e ) 216 handle_parse_error( e )
216 raw_size += len( self.current_line ) 217 raw_size += len( self.current_line )
217 continue 218 continue
218 # TODO: When no longer supporting python 2.4 use finally: 219 # TODO: When no longer supporting python 2.4 use finally:
219 #finally: 220 # finally:
220 #raw_size += len( self.current_line ) 221 # raw_size += len( self.current_line )
221 222
222 # Ignore comments. 223 # Ignore comments.
223 if isinstance( interval, Comment ): 224 if isinstance( interval, Comment ):
224 continue 225 continue
225 226
261 # Convert to BED coords? 262 # Convert to BED coords?
262 if self.convert_to_bed_coord: 263 if self.convert_to_bed_coord:
263 convert_gff_coords_to_bed( feature ) 264 convert_gff_coords_to_bed( feature )
264 265
265 return feature 266 return feature
267 next = __next__ # This line should be removed once the bx-python port to Python3 is finished
266 268
267 269
268 def convert_bed_coords_to_gff( interval ): 270 def convert_bed_coords_to_gff( interval ):
269 """ 271 """
270 Converts an interval object's coordinates from BED format to GFF format. 272 Converts an interval object's coordinates from BED format to GFF format.
372 by transcript_id, chrom, and start position. 374 by transcript_id, chrom, and start position.
373 """ 375 """
374 376
375 # -- Get function that generates line/feature key. -- 377 # -- Get function that generates line/feature key. --
376 378
377 get_transcript_id = lambda fields: parse_gff_attributes( fields[8] )[ 'transcript_id' ] 379 def get_transcript_id(fields):
380 return parse_gff_attributes( fields[8] )[ 'transcript_id' ]
381
378 if strict: 382 if strict:
379 # Strict GTF parsing uses transcript_id only to group lines into feature. 383 # Strict GTF parsing uses transcript_id only to group lines into feature.
380 key_fn = get_transcript_id 384 key_fn = get_transcript_id
381 else: 385 else:
382 # Use lenient parsing where chromosome + transcript_id is the key. This allows 386 # Use lenient parsing where chromosome + transcript_id is the key. This allows
383 # transcripts with same ID on different chromosomes; this occurs in some popular 387 # transcripts with same ID on different chromosomes; this occurs in some popular
384 # datasources, such as RefGenes in UCSC. 388 # datasources, such as RefGenes in UCSC.
385 key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields ) 389 def key_fn(fields):
390 return fields[0] + '_' + get_transcript_id( fields )
386 391
387 # Aggregate intervals by transcript_id and collect comments. 392 # Aggregate intervals by transcript_id and collect comments.
388 feature_intervals = odict() 393 feature_intervals = odict()
389 comments = [] 394 comments = []
390 for count, line in enumerate( iterator ): 395 for count, line in enumerate( iterator ):