diff utils/gff_util.py @ 1:850c05b9af00 draft

planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
author devteam
date Tue, 13 Oct 2015 12:50:14 -0400
parents e928e029f6eb
children 94248d5b9b8b
line wrap: on
line diff
--- a/utils/gff_util.py	Tue Apr 01 09:13:13 2014 -0400
+++ b/utils/gff_util.py	Tue Oct 13 12:50:14 2015 -0400
@@ -3,16 +3,17 @@
 """
 
 import copy
-from bx.intervals.io import *
-from bx.tabular.io import Header, Comment
+from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper
+from bx.tabular.io import Header, Comment, ParseError
 from utils.odict import odict
 
+
 class GFFInterval( GenomicInterval ):
     """
     A GFF interval, including attributes. If file is strictly a GFF file,
     only attribute is 'group.'
     """
-    def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
+    def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4,
                   strand_col=6, score_col=5, default_strand='.', fix_strand=False ):
         # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this,
         # temporarily set strand and then unset after initing GenomicInterval.
@@ -20,7 +21,7 @@
         if not fix_strand and fields[ strand_col ] == '.':
             unknown_strand = True
             fields[ strand_col ] = '+'
-        GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \
+        GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col,
                                   default_strand, fix_strand=fix_strand )
         if unknown_strand:
             self.strand = '.'
@@ -43,16 +44,17 @@
         return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col,
                            self.end_col, self.strand_col, self.score_col, self.strand)
 
+
 class GFFFeature( GFFInterval ):
     """
     A GFF feature, which can include multiple intervals.
     """
-    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
-                  strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], \
+    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4,
+                  strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[],
                   raw_size=0 ):
         # Use copy so that first interval and feature do not share fields.
-        GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, \
-                              start_col, end_col, strand_col, score_col, default_strand, \
+        GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col,
+                              start_col, end_col, strand_col, score_col, default_strand,
                               fix_strand=fix_strand )
         self.intervals = intervals
         self.raw_size = raw_size
@@ -60,7 +62,7 @@
         for interval in self.intervals:
             # Error checking. NOTE: intervals need not share the same strand.
             if interval.chrom != self.chrom:
-                raise ValueError( "interval chrom does not match self chrom: %s != %s" % \
+                raise ValueError( "interval chrom does not match self chrom: %s != %s" %
                                   ( interval.chrom, self.chrom ) )
             # Set start, end of interval.
             if interval.start < self.start:
@@ -72,13 +74,9 @@
         """ Returns feature's name. """
         name = None
         # Preference for name: GTF, GFF3, GFF.
-        for attr_name in [
-                           # GTF:
-                           'gene_id', 'transcript_id',
-                           # GFF3:
-                           'ID', 'id',
-                           # GFF (TODO):
-                           'group' ]:
+        for attr_name in ['gene_id', 'transcript_id',  # GTF
+                          'ID', 'id',  # GFF3
+                          'group' ]:  # GFF (TODO)
             name = self.attributes.get( attr_name, None )
             if name is not None:
                 break
@@ -107,12 +105,13 @@
     def parse_row( self, line ):
         # HACK: this should return a GFF interval, but bx-python operations
         # require GenomicInterval objects and subclasses will not work.
-        interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
-                                    self.end_col, self.strand_col, self.default_strand, \
+        interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col,
+                                    self.end_col, self.strand_col, self.default_strand,
                                     fix_strand=self.fix_strand )
         interval = convert_gff_coords_to_bed( interval )
         return interval
 
+
 class GFFReaderWrapper( NiceReaderWrapper ):
     """
     Reader wrapper for GFF files.
@@ -127,9 +126,9 @@
        expect traditional interval format.
     """
 
-    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, \
+    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3,
                   end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ):
-        NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, \
+        NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col,
                                     strand_col=strand_col, fix_strand=fix_strand, **kwargs )
         self.feature_col = feature_col
         self.score_col = score_col
@@ -140,8 +139,8 @@
         self.seed_interval_line_len = 0
 
     def parse_row( self, line ):
-        interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, \
-                                self.start_col, self.end_col, self.strand_col, self.score_col, \
+        interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col,
+                                self.start_col, self.end_col, self.strand_col, self.score_col,
                                 self.default_strand, fix_strand=self.fix_strand )
         return interval
 
@@ -155,12 +154,12 @@
         def handle_parse_error( parse_error ):
             """ Actions to take when ParseError found. """
             if self.outstream:
-               if self.print_delegate and hasattr(self.print_delegate,"__call__"):
-                   self.print_delegate( self.outstream, e, self )
+                if self.print_delegate and hasattr(self.print_delegate, "__call__"):
+                    self.print_delegate( self.outstream, e, self )
             self.skipped += 1
             # no reason to stuff an entire bad file into memmory
             if self.skipped < 10:
-               self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
+                self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
 
             # For debugging, uncomment this to propogate parsing exceptions up.
             # I.e. the underlying reason for an unexpected StopIteration exception
@@ -193,12 +192,10 @@
             return return_val
 
         # Initialize feature identifier from seed.
-        feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF
+        feature_group = self.seed_interval.attributes.get( 'group', None )  # For GFF
         # For GFF3
         feature_id = self.seed_interval.attributes.get( 'ID', None )
-        feature_parent_id = self.seed_interval.attributes.get( 'Parent', None )
         # For GTF.
-        feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None )
         feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None )
 
         # Read all intervals associated with seed.
@@ -256,9 +253,9 @@
         self.seed_interval_line_len = len( self.current_line )
 
         # Return feature.
-        feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, \
-                              self.end_col, self.strand_col, self.score_col, \
-                              self.default_strand, fix_strand=self.fix_strand, \
+        feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col,
+                              self.end_col, self.strand_col, self.score_col,
+                              self.default_strand, fix_strand=self.fix_strand,
                               intervals=feature_intervals, raw_size=raw_size )
 
         # Convert to BED coords?
@@ -267,6 +264,7 @@
 
         return feature
 
+
 def convert_bed_coords_to_gff( interval ):
     """
     Converts an interval object's coordinates from BED format to GFF format.
@@ -279,10 +277,11 @@
         if isinstance( interval, GFFFeature ):
             for subinterval in interval.intervals:
                 convert_bed_coords_to_gff( subinterval )
-    elif type ( interval ) is list:
+    elif type( interval ) is list:
         interval[ 0 ] += 1
     return interval
 
+
 def convert_gff_coords_to_bed( interval ):
     """
     Converts an interval object's coordinates from GFF format to BED format.
@@ -295,10 +294,11 @@
         if isinstance( interval, GFFFeature ):
             for subinterval in interval.intervals:
                 convert_gff_coords_to_bed( subinterval )
-    elif type ( interval ) is list:
+    elif type( interval ) is list:
         interval[ 0 ] -= 1
     return interval
 
+
 def parse_gff_attributes( attr_str ):
     """
     Parses a GFF/GTF attribute string and returns a dictionary of name-value
@@ -340,6 +340,7 @@
         attributes['group'] = attr_str
     return attributes
 
+
 def gff_attributes_to_str( attrs, gff_format ):
     """
     Convert GFF attributes to string. Supported formats are GFF3, GTF.
@@ -363,6 +364,7 @@
         attrs_strs.append( format_string % ( name, value ) )
     return " ; ".join( attrs_strs )
 
+
 def read_unordered_gtf( iterator, strict=False ):
     """
     Returns GTF features found in an iterator. GTF lines need not be ordered
@@ -382,7 +384,6 @@
         # datasources, such as RefGenes in UCSC.
         key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields )
 
-
     # Aggregate intervals by transcript_id and collect comments.
     feature_intervals = odict()
     comments = []
@@ -403,7 +404,7 @@
     chroms_features = {}
     for count, intervals in enumerate( feature_intervals.values() ):
         # Sort intervals by start position.
-        intervals.sort( lambda a,b: cmp( a.start, b.start ) )
+        intervals.sort( lambda a, b: cmp( a.start, b.start ) )
         feature = GFFFeature( None, intervals=intervals )
         if feature.chrom not in chroms_features:
             chroms_features[ feature.chrom ] = []
@@ -413,9 +414,9 @@
     chroms_features_sorted = []
     for chrom_features in chroms_features.values():
         chroms_features_sorted.append( chrom_features )
-    chroms_features_sorted.sort( lambda a,b: cmp( a[0].chrom, b[0].chrom ) )
+    chroms_features_sorted.sort( lambda a, b: cmp( a[0].chrom, b[0].chrom ) )
     for features in chroms_features_sorted:
-        features.sort( lambda a,b: cmp( a.start, b.start ) )
+        features.sort( lambda a, b: cmp( a.start, b.start ) )
 
     # Yield comments first, then features.
     # FIXME: comments can appear anywhere in file, not just the beginning.
@@ -427,4 +428,3 @@
     for chrom_features in chroms_features_sorted:
         for feature in chrom_features:
             yield feature
-