Mercurial > repos > devteam > flanking_features
changeset 1:850c05b9af00 draft
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
| author | devteam |
| --- | --- |
| date | Tue, 13 Oct 2015 12:50:14 -0400 |
| parents | e928e029f6eb |
| children | d94e778c3ad1 |
| files | flanking_features.py tool_dependencies.xml utils/gff_util.py utils/odict.py |
| diffstat | 4 files changed, 87 insertions(+), 82 deletions(-) |
--- a/flanking_features.py Tue Apr 01 09:13:13 2014 -0400
+++ b/flanking_features.py Tue Oct 13 12:50:14 2015 -0400
@@ -10,21 +10,23 @@
 -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval
 """
-import sys, traceback, fileinput
-from warnings import warn
+import fileinput
+import sys
 from bx.cookbook import doc_optparse
-from galaxy.tools.util.galaxyops import *
-from bx.intervals.io import *
+from bx.intervals.io import Comment, GenomicInterval, Header, NiceReaderWrapper
 from bx.intervals.operations import quicksect
-from utils.gff_util import *
+from bx.tabular.io import ParseError
+from galaxy.tools.util.galaxyops import fail, parse_cols_arg, skipped
+from utils.gff_util import convert_bed_coords_to_gff, GFFIntervalToBEDReaderWrapper
 
 assert sys.version_info[:2] >= ( 2, 4 )
 
-def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down):
+
+def get_closest_feature(node, direction, threshold_up, threshold_down, report_func_up, report_func_down):
     #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases
     #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand
     #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand
-    if direction == 1: 
+    if direction == 1:
         if node.maxend <= threshold_up:
             if node.end == node.maxend:
                 report_func_up(node)
@@ -60,10 +62,11 @@
         if node.right:
             get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
 
+
 def proximal_region_finder(readers, region, comments=True):
     """
-    Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ]. 
-    Intervals are GenomicInterval objects. 
+    Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ].
+    Intervals are GenomicInterval objects.
     """
     primary = readers[0]
     features = readers[1]
@@ -76,13 +79,13 @@
         up, down = True, True
         if region == 'Either':
             either = True
-    
+
     # Read features into memory:
     rightTree = quicksect.IntervalTree()
    for item in features:
         if type( item ) is GenomicInterval:
             rightTree.insert( item, features.linenum, item )
-    
+
     for interval in primary:
         if type( interval ) is Header:
             yield interval
@@ -96,33 +99,33 @@
             if chrom not in rightTree.chroms:
                 continue
             else:
-                root = rightTree.chroms[chrom] #root node for the chrom tree
+                root = rightTree.chroms[chrom]  # root node for the chrom tree
                 result_up = []
                 result_down = []
-                if (strand == '+' and up) or (strand == '-' and down): 
+                if (strand == '+' and up) or (strand == '-' and down):
                     #upstream +ve strand and downstream -ve strand cases
-                    get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None)
-                    
+                    get_closest_feature(root, 1, start, None, lambda node: result_up.append( node ), None)
+
                 if (strand == '+' and down) or (strand == '-' and up):
                     #downstream +ve strand and upstream -ve strand case
-                    get_closest_feature (root, 0, None, end-1, None, lambda node: result_down.append( node ))
-                    
+                    get_closest_feature(root, 0, None, end - 1, None, lambda node: result_down.append( node ))
+
                 if result_up:
-                    if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval.
+                    if len(result_up) > 1:  # The results_up list has a list of intervals upstream to the given interval.
                         ends = []
                         for n in result_up:
                             ends.append(n.end)
-                        res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. the interval with the max end from the results_up list
+                        res_ind = ends.index(max(ends))  # fetch the index of the closest interval i.e. the interval with the max end from the results_up list
                     else:
                         res_ind = 0
                     if not(either):
                         yield [ interval, result_up[res_ind].other ]
-                        
-                if result_down: 
+
+                if result_down:
                     if not(either):
                         #The last element of result_down will be the closest element to the given interval
-                        yield [ interval, result_down[-1].other ] 
-                        
+                        yield [ interval, result_down[-1].other ]
+
                 if either and (result_up or result_down):
                     iter_val = []
                     if result_up and result_down:
@@ -137,7 +140,8 @@
                         #The last element of result_down will be the closest element to the given interval
                         iter_val = [ interval, result_down[-1].other ]
                 yield iter_val
-    
+
+
 def main():
     options, args = doc_optparse.parse( __doc__ )
     try:
@@ -148,7 +152,7 @@
         in_fname, in2_fname, out_fname, direction = args
     except:
         doc_optparse.exception()
-    
+
     # Set readers to handle either GFF or default format.
     if in1_gff_format:
         in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
@@ -160,22 +164,22 @@
         in2_reader_wrapper = NiceReaderWrapper
 
     g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
-                        chrom_col=chr_col_1,
-                        start_col=start_col_1,
-                        end_col=end_col_1,
-                        strand_col=strand_col_1,
-                        fix_strand=True )
+                             chrom_col=chr_col_1,
+                             start_col=start_col_1,
+                             end_col=end_col_1,
+                             strand_col=strand_col_1,
+                             fix_strand=True )
     g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
-                        chrom_col=chr_col_2,
-                        start_col=start_col_2,
-                        end_col=end_col_2,
-                        strand_col=strand_col_2,
-                        fix_strand=True )
+                             chrom_col=chr_col_2,
+                             start_col=start_col_2,
+                             end_col=end_col_2,
+                             strand_col=strand_col_2,
+                             fix_strand=True )
 
     # Find flanking features.
     out_file = open( out_fname, "w" )
     try:
-        for result in proximal_region_finder([g1,g2], direction):
+        for result in proximal_region_finder([g1, g2], direction):
             if type( result ) is list:
                 line, closest_feature = result
                 # Need to join outputs differently depending on file types.
@@ -185,12 +189,12 @@
                     # Invervals are in BED coordinates; need to convert to GFF.
                     line = convert_bed_coords_to_gff( line )
                     closest_feature = convert_bed_coords_to_gff( closest_feature )
-                
+
                     # Replace double quotes with single quotes in closest feature's attributes.
-                    out_file.write( "%s closest_feature \"%s\" \n" % 
-                                    ( "\t".join( line.fields ), \
+                    out_file.write( "%s closest_feature \"%s\" \n" %
+                                    ( "\t".join( line.fields ),
                                     "\t".join( closest_feature.fields ).replace( "\"", "\\\"" )
-                                    ) )
+                                      ) )
                 else:
                     # Output is BED + closest feature fields.
                     output_line_fields = []
@@ -202,7 +206,7 @@
     except ParseError, exc:
         fail( "Invalid file format: %s" % str( exc ) )
 
-    print "Direction: %s" %(direction)
+    print "Direction: %s" % (direction)
     if g1.skipped > 0:
         print skipped( g1, filedesc=" of 1st dataset" )
     if g2.skipped > 0:
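For orientation, here is a small, hedged sketch of how the refactored module can be driven directly from Python, mirroring the reader setup in `main()` above. The input file names, the BED/GFF column indices, and the choice of the `'Either'` region value are illustrative assumptions; only the function and class names and reader keyword arguments come from the changeset itself.

```python
# Hypothetical driver mirroring main(); file names and column indices are
# assumptions, not part of the changeset.
import fileinput

from bx.intervals.io import NiceReaderWrapper

from flanking_features import proximal_region_finder
from utils.gff_util import GFFIntervalToBEDReaderWrapper

# Query intervals in BED-like format (0-based, half-open coordinates).
queries = NiceReaderWrapper( fileinput.FileInput( "queries.bed" ),
                             chrom_col=0, start_col=1, end_col=2,
                             strand_col=5, fix_strand=True )

# Features in GFF format (1-based, closed); the wrapper converts to BED coords.
features = GFFIntervalToBEDReaderWrapper( fileinput.FileInput( "features.gff" ),
                                          chrom_col=0, start_col=3, end_col=4,
                                          strand_col=6, fix_strand=True )

# 'Either' is one of the region values handled inside proximal_region_finder;
# results come back as [ <original_interval>, <closest_feature> ] pairs.
for result in proximal_region_finder( [queries, features], 'Either' ):
    if type( result ) is list:
        interval, closest = result
        print "%s\t%s" % ( "\t".join( interval.fields ), "\t".join( closest.fields ) )
```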
--- a/tool_dependencies.xml Tue Apr 01 09:13:13 2014 -0400
+++ b/tool_dependencies.xml Tue Oct 13 12:50:14 2015 -0400
@@ -1,9 +1,9 @@
 <?xml version="1.0"?>
 <tool_dependency>
     <package name="bx-python" version="0.7.1">
-        <repository changeset_revision="cdb5991f9790" name="package_bx_python_0_7" owner="devteam" prior_installation_required="False" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="35e2457234ef" name="package_bx_python_0_7" owner="devteam" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
     <package name="galaxy-ops" version="1.0.0">
-        <repository changeset_revision="3287c55c02b8" name="package_galaxy_ops_1_0_0" owner="devteam" prior_installation_required="False" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="3287c55c02b8" name="package_galaxy_ops_1_0_0" owner="devteam" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>
--- a/utils/gff_util.py Tue Apr 01 09:13:13 2014 -0400
+++ b/utils/gff_util.py Tue Oct 13 12:50:14 2015 -0400
@@ -3,16 +3,17 @@
 """
 
 import copy
-from bx.intervals.io import *
-from bx.tabular.io import Header, Comment
+from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper
+from bx.tabular.io import Header, Comment, ParseError
 from utils.odict import odict
 
+
 class GFFInterval( GenomicInterval ):
     """
     A GFF interval, including attributes. If file is strictly a GFF file,
     only attribute is 'group.'
     """
-    def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
+    def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4,
                   strand_col=6, score_col=5, default_strand='.', fix_strand=False ):
         # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this,
         # temporarily set strand and then unset after initing GenomicInterval.
@@ -20,7 +21,7 @@
         if not fix_strand and fields[ strand_col ] == '.':
             unknown_strand = True
             fields[ strand_col ] = '+'
-        GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \
+        GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col,
                                   default_strand, fix_strand=fix_strand )
         if unknown_strand:
             self.strand = '.'
@@ -43,16 +44,17 @@
         return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col,
                            self.end_col, self.strand_col, self.score_col, self.strand)
 
+
 class GFFFeature( GFFInterval ):
     """
     A GFF feature, which can include multiple intervals.
     """
-    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
-                  strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], \
+    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4,
+                  strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[],
                   raw_size=0 ):
         # Use copy so that first interval and feature do not share fields.
-        GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, \
-                              start_col, end_col, strand_col, score_col, default_strand, \
+        GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col,
+                              start_col, end_col, strand_col, score_col, default_strand,
                               fix_strand=fix_strand )
         self.intervals = intervals
         self.raw_size = raw_size
@@ -60,7 +62,7 @@
         for interval in self.intervals:
             # Error checking. NOTE: intervals need not share the same strand.
             if interval.chrom != self.chrom:
-                raise ValueError( "interval chrom does not match self chrom: %s != %s" % \
+                raise ValueError( "interval chrom does not match self chrom: %s != %s" %
                                   ( interval.chrom, self.chrom ) )
             # Set start, end of interval.
             if interval.start < self.start:
@@ -72,13 +74,9 @@
         """ Returns feature's name. """
         name = None
         # Preference for name: GTF, GFF3, GFF.
-        for attr_name in [
-            # GTF:
-            'gene_id', 'transcript_id',
-            # GFF3:
-            'ID', 'id',
-            # GFF (TODO):
-            'group' ]:
+        for attr_name in ['gene_id', 'transcript_id',  # GTF
+                          'ID', 'id',  # GFF3
+                          'group' ]:  # GFF (TODO)
             name = self.attributes.get( attr_name, None )
             if name is not None:
                 break
@@ -107,12 +105,13 @@
     def parse_row( self, line ):
         # HACK: this should return a GFF interval, but bx-python operations
         # require GenomicInterval objects and subclasses will not work.
-        interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
-                                    self.end_col, self.strand_col, self.default_strand, \
+        interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col,
+                                    self.end_col, self.strand_col, self.default_strand,
                                     fix_strand=self.fix_strand )
         interval = convert_gff_coords_to_bed( interval )
         return interval
 
+
 class GFFReaderWrapper( NiceReaderWrapper ):
     """
     Reader wrapper for GFF files.
@@ -127,9 +126,9 @@
     expect traditional interval format.
     """
 
-    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, \
+    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3,
                   end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ):
-        NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, \
+        NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col,
                                     strand_col=strand_col, fix_strand=fix_strand, **kwargs )
         self.feature_col = feature_col
         self.score_col = score_col
@@ -140,8 +139,8 @@
         self.seed_interval_line_len = 0
 
     def parse_row( self, line ):
-        interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, \
-                                self.start_col, self.end_col, self.strand_col, self.score_col, \
+        interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col,
+                                self.start_col, self.end_col, self.strand_col, self.score_col,
                                 self.default_strand, fix_strand=self.fix_strand )
         return interval
@@ -155,12 +154,12 @@
         def handle_parse_error( parse_error ):
             """ Actions to take when ParseError found. """
             if self.outstream:
-                if self.print_delegate and hasattr(self.print_delegate,"__call__"):
-                    self.print_delegate( self.outstream, e, self ) 
+                if self.print_delegate and hasattr(self.print_delegate, "__call__"):
+                    self.print_delegate( self.outstream, e, self )
             self.skipped += 1
             # no reason to stuff an entire bad file into memmory
             if self.skipped < 10:
-                self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) 
+                self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
 
             # For debugging, uncomment this to propogate parsing exceptions up.
             # I.e. the underlying reason for an unexpected StopIteration exception
@@ -193,12 +192,10 @@
             return return_val
 
         # Initialize feature identifier from seed.
-        feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF
+        feature_group = self.seed_interval.attributes.get( 'group', None )  # For GFF
         # For GFF3
         feature_id = self.seed_interval.attributes.get( 'ID', None )
-        feature_parent_id = self.seed_interval.attributes.get( 'Parent', None )
         # For GTF.
-        feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None )
         feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None )
 
         # Read all intervals associated with seed.
@@ -256,9 +253,9 @@
         self.seed_interval_line_len = len( self.current_line )
 
         # Return feature.
-        feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, \
-                              self.end_col, self.strand_col, self.score_col, \
-                              self.default_strand, fix_strand=self.fix_strand, \
+        feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col,
+                              self.end_col, self.strand_col, self.score_col,
+                              self.default_strand, fix_strand=self.fix_strand,
                               intervals=feature_intervals, raw_size=raw_size )
 
         # Convert to BED coords?
@@ -267,6 +264,7 @@
 
         return feature
 
+
 def convert_bed_coords_to_gff( interval ):
     """
     Converts an interval object's coordinates from BED format to GFF format.
@@ -279,10 +277,11 @@
     if isinstance( interval, GFFFeature ):
         for subinterval in interval.intervals:
             convert_bed_coords_to_gff( subinterval )
-    elif type ( interval ) is list:
+    elif type( interval ) is list:
         interval[ 0 ] += 1
     return interval
 
+
 def convert_gff_coords_to_bed( interval ):
     """
     Converts an interval object's coordinates from GFF format to BED format.
@@ -295,10 +294,11 @@
     if isinstance( interval, GFFFeature ):
         for subinterval in interval.intervals:
             convert_gff_coords_to_bed( subinterval )
-    elif type ( interval ) is list:
+    elif type( interval ) is list:
         interval[ 0 ] -= 1
     return interval
 
+
 def parse_gff_attributes( attr_str ):
     """
     Parses a GFF/GTF attribute string and returns a dictionary of name-value
@@ -340,6 +340,7 @@
         attributes['group'] = attr_str
     return attributes
 
+
 def gff_attributes_to_str( attrs, gff_format ):
     """
     Convert GFF attributes to string. Supported formats are GFF3, GTF.
@@ -363,6 +364,7 @@
         attrs_strs.append( format_string % ( name, value ) )
     return " ; ".join( attrs_strs )
 
+
 def read_unordered_gtf( iterator, strict=False ):
     """
     Returns GTF features found in an iterator. GTF lines need not be ordered
@@ -382,7 +384,6 @@
     # datasources, such as RefGenes in UCSC.
     key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields )
 
-
     # Aggregate intervals by transcript_id and collect comments.
     feature_intervals = odict()
     comments = []
@@ -403,7 +404,7 @@
     chroms_features = {}
     for count, intervals in enumerate( feature_intervals.values() ):
         # Sort intervals by start position.
-        intervals.sort( lambda a,b: cmp( a.start, b.start ) )
+        intervals.sort( lambda a, b: cmp( a.start, b.start ) )
         feature = GFFFeature( None, intervals=intervals )
         if feature.chrom not in chroms_features:
             chroms_features[ feature.chrom ] = []
@@ -413,9 +414,9 @@
     chroms_features_sorted = []
     for chrom_features in chroms_features.values():
         chroms_features_sorted.append( chrom_features )
-    chroms_features_sorted.sort( lambda a,b: cmp( a[0].chrom, b[0].chrom ) )
+    chroms_features_sorted.sort( lambda a, b: cmp( a[0].chrom, b[0].chrom ) )
     for features in chroms_features_sorted:
-        features.sort( lambda a,b: cmp( a.start, b.start ) )
+        features.sort( lambda a, b: cmp( a.start, b.start ) )
 
     # Yield comments first, then features.
     # FIXME: comments can appear anywhere in file, not just the beginning.
@@ -427,4 +428,3 @@
     for chrom_features in chroms_features_sorted:
         for feature in chrom_features:
             yield feature
-
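As a quick illustration of the attribute helpers touched in this file, the sketch below round-trips a GTF attribute string through `parse_gff_attributes` and `gff_attributes_to_str`. The example attribute string and the expected printed values are assumptions based on the docstrings above, not output taken from the changeset.

```python
# Hypothetical round-trip through the attribute helpers; the input string is
# an assumption, only the function names come from the diff above.
from utils.gff_util import gff_attributes_to_str, parse_gff_attributes

attrs = parse_gff_attributes( 'gene_id "YAL069W"; transcript_id "YAL069W_mRNA"' )
print attrs.get( 'gene_id' )        # expected: YAL069W
print attrs.get( 'transcript_id' )  # expected: YAL069W_mRNA

# Per gff_attributes_to_str above, individual attributes are re-joined with
# " ; " in the requested dialect (GFF3 or GTF).
print gff_attributes_to_str( attrs, "GTF" )
```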
--- a/utils/odict.py Tue Apr 01 09:13:13 2014 -0400
+++ b/utils/odict.py Tue Oct 13 12:50:14 2015 -0400
@@ -4,6 +4,7 @@
 
 from UserDict import UserDict
 
+
 class odict(UserDict):
     """
     http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/107747
@@ -12,7 +13,7 @@
     added. Calling keys(), values(), items(), etc. will return results in
     this order.
     """
-    def __init__( self, dict = None ):
+    def __init__( self, dict=None ):
         self._keys = []
         UserDict.__init__( self, dict )
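The odict change above is purely cosmetic, but for completeness here is a minimal sketch of the ordered-dictionary behaviour its docstring describes (Python 2, since the class builds on UserDict); the keys and values are arbitrary examples.

```python
# Minimal illustration of insertion-order preservation; keys/values are arbitrary.
from utils.odict import odict

d = odict()
d['first'] = 1
d['second'] = 2
d['third'] = 3

# keys(), values() and items() follow insertion order rather than hash order.
print d.keys()   # expected: ['first', 'second', 'third']
print d.items()  # expected: [('first', 1), ('second', 2), ('third', 3)]
```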