flanking_features: utils/gff_util.py comparison

comparison utils/gff_util.py @ 1:850c05b9af00 draft

planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734

author	devteam
date	Tue, 13 Oct 2015 12:50:14 -0400
parents	e928e029f6eb
children	94248d5b9b8b

comparison

equal deleted inserted replaced

-:e928e029f6eb
+:850c05b9af00
 """
 Provides utilities for working with GFF files.
 """
 import copy
-from bx.intervals.io import *
+from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper
-from bx.tabular.io import Header, Comment
+from bx.tabular.io import Header, Comment, ParseError
 from utils.odict import odict
 class GFFInterval( GenomicInterval ):
 """
 A GFF interval, including attributes. If file is strictly a GFF file,
 only attribute is 'group.'
 """
-def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
+def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4,
 strand_col=6, score_col=5, default_strand='.', fix_strand=False ):
 # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this,
 # temporarily set strand and then unset after initing GenomicInterval.
 unknown_strand = False
 if not fix_strand and fields[ strand_col ] == '.':
 unknown_strand = True
 fields[ strand_col ] = '+'
-GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \
+GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col,
 default_strand, fix_strand=fix_strand )
 if unknown_strand:
 self.strand = '.'
 self.fields[ strand_col ] = '.'
 def copy( self ):
 return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col,
 self.end_col, self.strand_col, self.score_col, self.strand)
 class GFFFeature( GFFInterval ):
 """
 A GFF feature, which can include multiple intervals.
 """
-def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
+def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4,
-strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], \
+strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[],
 raw_size=0 ):
 # Use copy so that first interval and feature do not share fields.
-GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, \
+GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col,
-start_col, end_col, strand_col, score_col, default_strand, \
+start_col, end_col, strand_col, score_col, default_strand,
 fix_strand=fix_strand )
 self.intervals = intervals
 self.raw_size = raw_size
 # Use intervals to set feature attributes.
 for interval in self.intervals:
 # Error checking. NOTE: intervals need not share the same strand.
 if interval.chrom != self.chrom:
-raise ValueError( "interval chrom does not match self chrom: %s != %s" % \
+raise ValueError( "interval chrom does not match self chrom: %s != %s" %
 ( interval.chrom, self.chrom ) )
 # Set start, end of interval.
 if interval.start < self.start:
 self.start = interval.start
 if interval.end > self.end:
 def name( self ):
 """ Returns feature's name. """
 name = None
 # Preference for name: GTF, GFF3, GFF.
-for attr_name in [
+for attr_name in ['gene_id', 'transcript_id',  # GTF
-# GTF:
+'ID', 'id',  # GFF3
-'gene_id', 'transcript_id',
+'group' ]:  # GFF (TODO)
-# GFF3:
-'ID', 'id',
-# GFF (TODO):
-'group' ]:
 name = self.attributes.get( attr_name, None )
 if name is not None:
 break
 return name
 """
 def parse_row( self, line ):
 # HACK: this should return a GFF interval, but bx-python operations
 # require GenomicInterval objects and subclasses will not work.
-interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
+interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col,
-self.end_col, self.strand_col, self.default_strand, \
+self.end_col, self.strand_col, self.default_strand,
 fix_strand=self.fix_strand )
 interval = convert_gff_coords_to_bed( interval )
 return interval
 class GFFReaderWrapper( NiceReaderWrapper ):
 """
 Reader wrapper for GFF files.
 are 1-based, closed--to the 'traditional'/BED interval format--0 based,
 half-open. This is useful when using GFF files as inputs to tools that
 expect traditional interval format.
 """
-def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, \
+def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3,
 end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ):
-NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, \
+NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col,
 strand_col=strand_col, fix_strand=fix_strand, **kwargs )
 self.feature_col = feature_col
 self.score_col = score_col
 self.convert_to_bed_coord = convert_to_bed_coord
 self.last_line = None
 self.cur_offset = 0
 self.seed_interval = None
 self.seed_interval_line_len = 0
 def parse_row( self, line ):
-interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, \
+interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col,
-self.start_col, self.end_col, self.strand_col, self.score_col, \
+self.start_col, self.end_col, self.strand_col, self.score_col,
 self.default_strand, fix_strand=self.fix_strand )
 return interval
 def next( self ):
 """ Returns next GFFFeature. """
 #
 def handle_parse_error( parse_error ):
 """ Actions to take when ParseError found. """
 if self.outstream:
-if self.print_delegate and hasattr(self.print_delegate,"__call__"):
+if self.print_delegate and hasattr(self.print_delegate, "__call__"):
 self.print_delegate( self.outstream, e, self )
 self.skipped += 1
 # no reason to stuff an entire bad file into memory
 if self.skipped < 10:
 self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
 # For debugging, uncomment this to propagate parsing exceptions up.
 # I.e. the underlying reason for an unexpected StopIteration exception
 # can be found by uncommenting this.
 # raise e
 self.seed_interval = None
 self.seed_interval_line_len = 0
 return return_val
 # Initialize feature identifier from seed.
-feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF
+feature_group = self.seed_interval.attributes.get( 'group', None )  # For GFF
 # For GFF3
 feature_id = self.seed_interval.attributes.get( 'ID', None )
-feature_parent_id = self.seed_interval.attributes.get( 'Parent', None )
 # For GTF.
-feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None )
 feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None )
 # Read all intervals associated with seed.
 feature_intervals = []
 feature_intervals.append( self.seed_interval )
 # Last interval read is the seed for the next interval.
 self.seed_interval = interval
 self.seed_interval_line_len = len( self.current_line )
 # Return feature.
-feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, \
+feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col,
-self.end_col, self.strand_col, self.score_col, \
+self.end_col, self.strand_col, self.score_col,
-self.default_strand, fix_strand=self.fix_strand, \
+self.default_strand, fix_strand=self.fix_strand,
 intervals=feature_intervals, raw_size=raw_size )
 # Convert to BED coords?
 if self.convert_to_bed_coord:
 convert_gff_coords_to_bed( feature )
 return feature
 def convert_bed_coords_to_gff( interval ):
 """
 Converts an interval object's coordinates from BED format to GFF format.
 Accepted object types include GenomicInterval and list (where the first
 if isinstance( interval, GenomicInterval ):
 interval.start += 1
 if isinstance( interval, GFFFeature ):
 for subinterval in interval.intervals:
 convert_bed_coords_to_gff( subinterval )
-elif type ( interval ) is list:
+elif type( interval ) is list:
 interval[ 0 ] += 1
 return interval
 def convert_gff_coords_to_bed( interval ):
 """
 Converts an interval object's coordinates from GFF format to BED format.
 Accepted object types include GFFFeature, GenomicInterval, and list (where
 if isinstance( interval, GenomicInterval ):
 interval.start -= 1
 if isinstance( interval, GFFFeature ):
 for subinterval in interval.intervals:
 convert_gff_coords_to_bed( subinterval )
-elif type ( interval ) is list:
+elif type( interval ) is list:
 interval[ 0 ] -= 1
 return interval
 def parse_gff_attributes( attr_str ):
 """
 Parses a GFF/GTF attribute string and returns a dictionary of name-value
 pairs. The general format for a GFF3 attributes string is
 # Could not split attributes string, so entire string must be
 # 'group' attribute. This is the case for strictly GFF files.
 attributes['group'] = attr_str
 return attributes
 def gff_attributes_to_str( attrs, gff_format ):
 """
 Convert GFF attributes to string. Supported formats are GFF3, GTF.
 """
 if gff_format == 'GTF':
 attrs_strs = []
 for name, value in attrs.items():
 attrs_strs.append( format_string % ( name, value ) )
 return " ; ".join( attrs_strs )
 def read_unordered_gtf( iterator, strict=False ):
 """
 Returns GTF features found in an iterator. GTF lines need not be ordered
 or clustered for reader to work. Reader returns GFFFeature objects sorted
 by transcript_id, chrom, and start position.
 # Use lenient parsing where chromosome + transcript_id is the key. This allows
 # transcripts with same ID on different chromosomes; this occurs in some popular
 # datasources, such as RefGenes in UCSC.
 key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields )
 # Aggregate intervals by transcript_id and collect comments.
 feature_intervals = odict()
 comments = []
 for count, line in enumerate( iterator ):
 if line.startswith( '#' ):
 # Create features.
 chroms_features = {}
 for count, intervals in enumerate( feature_intervals.values() ):
 # Sort intervals by start position.
-intervals.sort( lambda a,b: cmp( a.start, b.start ) )
+intervals.sort( lambda a, b: cmp( a.start, b.start ) )
 feature = GFFFeature( None, intervals=intervals )
 if feature.chrom not in chroms_features:
 chroms_features[ feature.chrom ] = []
 chroms_features[ feature.chrom ].append( feature )
 # Sort features by chrom, start position.
 chroms_features_sorted = []
 for chrom_features in chroms_features.values():
 chroms_features_sorted.append( chrom_features )
-chroms_features_sorted.sort( lambda a,b: cmp( a[0].chrom, b[0].chrom ) )
+chroms_features_sorted.sort( lambda a, b: cmp( a[0].chrom, b[0].chrom ) )
 for features in chroms_features_sorted:
-features.sort( lambda a,b: cmp( a.start, b.start ) )
+features.sort( lambda a, b: cmp( a.start, b.start ) )
 # Yield comments first, then features.
 # FIXME: comments can appear anywhere in file, not just the beginning.
 # Ideally, then comments would be associated with features and output
 # just before feature/line.
 yield comment
 for chrom_features in chroms_features_sorted:
 for feature in chrom_features:
 yield feature

Mercurial > repos > devteam > flanking_features

comparison utils/gff_util.py @ 1:850c05b9af00 draft