Mercurial > repos > devteam > flanking_features
diff utils/gff_util.py @ 1:850c05b9af00 draft
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
author | devteam |
---|---|
date | Tue, 13 Oct 2015 12:50:14 -0400 |
parents | e928e029f6eb |
children | 94248d5b9b8b |
line wrap: on
line diff
--- a/utils/gff_util.py Tue Apr 01 09:13:13 2014 -0400 +++ b/utils/gff_util.py Tue Oct 13 12:50:14 2015 -0400 @@ -3,16 +3,17 @@ """ import copy -from bx.intervals.io import * -from bx.tabular.io import Header, Comment +from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper +from bx.tabular.io import Header, Comment, ParseError from utils.odict import odict + class GFFInterval( GenomicInterval ): """ A GFF interval, including attributes. If file is strictly a GFF file, only attribute is 'group.' """ - def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, \ + def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, strand_col=6, score_col=5, default_strand='.', fix_strand=False ): # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this, # temporarily set strand and then unset after initing GenomicInterval. @@ -20,7 +21,7 @@ if not fix_strand and fields[ strand_col ] == '.': unknown_strand = True fields[ strand_col ] = '+' - GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \ + GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, default_strand, fix_strand=fix_strand ) if unknown_strand: self.strand = '.' @@ -43,16 +44,17 @@ return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col, self.end_col, self.strand_col, self.score_col, self.strand) + class GFFFeature( GFFInterval ): """ A GFF feature, which can include multiple intervals. """ - def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, \ - strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], \ + def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, + strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], raw_size=0 ): # Use copy so that first interval and feature do not share fields. - GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, \ - start_col, end_col, strand_col, score_col, default_strand, \ + GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, + start_col, end_col, strand_col, score_col, default_strand, fix_strand=fix_strand ) self.intervals = intervals self.raw_size = raw_size @@ -60,7 +62,7 @@ for interval in self.intervals: # Error checking. NOTE: intervals need not share the same strand. if interval.chrom != self.chrom: - raise ValueError( "interval chrom does not match self chrom: %s != %s" % \ + raise ValueError( "interval chrom does not match self chrom: %s != %s" % ( interval.chrom, self.chrom ) ) # Set start, end of interval. if interval.start < self.start: @@ -72,13 +74,9 @@ """ Returns feature's name. """ name = None # Preference for name: GTF, GFF3, GFF. - for attr_name in [ - # GTF: - 'gene_id', 'transcript_id', - # GFF3: - 'ID', 'id', - # GFF (TODO): - 'group' ]: + for attr_name in ['gene_id', 'transcript_id', # GTF + 'ID', 'id', # GFF3 + 'group' ]: # GFF (TODO) name = self.attributes.get( attr_name, None ) if name is not None: break @@ -107,12 +105,13 @@ def parse_row( self, line ): # HACK: this should return a GFF interval, but bx-python operations # require GenomicInterval objects and subclasses will not work. - interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \ - self.end_col, self.strand_col, self.default_strand, \ + interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, + self.end_col, self.strand_col, self.default_strand, fix_strand=self.fix_strand ) interval = convert_gff_coords_to_bed( interval ) return interval + class GFFReaderWrapper( NiceReaderWrapper ): """ Reader wrapper for GFF files. @@ -127,9 +126,9 @@ expect traditional interval format. """ - def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, \ + def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ): - NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, \ + NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, strand_col=strand_col, fix_strand=fix_strand, **kwargs ) self.feature_col = feature_col self.score_col = score_col @@ -140,8 +139,8 @@ self.seed_interval_line_len = 0 def parse_row( self, line ): - interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, \ - self.start_col, self.end_col, self.strand_col, self.score_col, \ + interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, + self.start_col, self.end_col, self.strand_col, self.score_col, self.default_strand, fix_strand=self.fix_strand ) return interval @@ -155,12 +154,12 @@ def handle_parse_error( parse_error ): """ Actions to take when ParseError found. """ if self.outstream: - if self.print_delegate and hasattr(self.print_delegate,"__call__"): - self.print_delegate( self.outstream, e, self ) + if self.print_delegate and hasattr(self.print_delegate, "__call__"): + self.print_delegate( self.outstream, e, self ) self.skipped += 1 # no reason to stuff an entire bad file into memmory if self.skipped < 10: - self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) + self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) # For debugging, uncomment this to propogate parsing exceptions up. # I.e. the underlying reason for an unexpected StopIteration exception @@ -193,12 +192,10 @@ return return_val # Initialize feature identifier from seed. - feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF + feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF # For GFF3 feature_id = self.seed_interval.attributes.get( 'ID', None ) - feature_parent_id = self.seed_interval.attributes.get( 'Parent', None ) # For GTF. - feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None ) feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) # Read all intervals associated with seed. @@ -256,9 +253,9 @@ self.seed_interval_line_len = len( self.current_line ) # Return feature. - feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, \ - self.end_col, self.strand_col, self.score_col, \ - self.default_strand, fix_strand=self.fix_strand, \ + feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, + self.end_col, self.strand_col, self.score_col, + self.default_strand, fix_strand=self.fix_strand, intervals=feature_intervals, raw_size=raw_size ) # Convert to BED coords? @@ -267,6 +264,7 @@ return feature + def convert_bed_coords_to_gff( interval ): """ Converts an interval object's coordinates from BED format to GFF format. @@ -279,10 +277,11 @@ if isinstance( interval, GFFFeature ): for subinterval in interval.intervals: convert_bed_coords_to_gff( subinterval ) - elif type ( interval ) is list: + elif type( interval ) is list: interval[ 0 ] += 1 return interval + def convert_gff_coords_to_bed( interval ): """ Converts an interval object's coordinates from GFF format to BED format. @@ -295,10 +294,11 @@ if isinstance( interval, GFFFeature ): for subinterval in interval.intervals: convert_gff_coords_to_bed( subinterval ) - elif type ( interval ) is list: + elif type( interval ) is list: interval[ 0 ] -= 1 return interval + def parse_gff_attributes( attr_str ): """ Parses a GFF/GTF attribute string and returns a dictionary of name-value @@ -340,6 +340,7 @@ attributes['group'] = attr_str return attributes + def gff_attributes_to_str( attrs, gff_format ): """ Convert GFF attributes to string. Supported formats are GFF3, GTF. @@ -363,6 +364,7 @@ attrs_strs.append( format_string % ( name, value ) ) return " ; ".join( attrs_strs ) + def read_unordered_gtf( iterator, strict=False ): """ Returns GTF features found in an iterator. GTF lines need not be ordered @@ -382,7 +384,6 @@ # datasources, such as RefGenes in UCSC. key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields ) - # Aggregate intervals by transcript_id and collect comments. feature_intervals = odict() comments = [] @@ -403,7 +404,7 @@ chroms_features = {} for count, intervals in enumerate( feature_intervals.values() ): # Sort intervals by start position. - intervals.sort( lambda a,b: cmp( a.start, b.start ) ) + intervals.sort( lambda a, b: cmp( a.start, b.start ) ) feature = GFFFeature( None, intervals=intervals ) if feature.chrom not in chroms_features: chroms_features[ feature.chrom ] = [] @@ -413,9 +414,9 @@ chroms_features_sorted = [] for chrom_features in chroms_features.values(): chroms_features_sorted.append( chrom_features ) - chroms_features_sorted.sort( lambda a,b: cmp( a[0].chrom, b[0].chrom ) ) + chroms_features_sorted.sort( lambda a, b: cmp( a[0].chrom, b[0].chrom ) ) for features in chroms_features_sorted: - features.sort( lambda a,b: cmp( a.start, b.start ) ) + features.sort( lambda a, b: cmp( a.start, b.start ) ) # Yield comments first, then features. # FIXME: comments can appear anywhere in file, not just the beginning. @@ -427,4 +428,3 @@ for chrom_features in chroms_features_sorted: for feature in chrom_features: yield feature -