Mercurial > repos > devteam > flanking_features
changeset 1:850c05b9af00 draft
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
| author | devteam |
| --- | --- |
| date | Tue, 13 Oct 2015 12:50:14 -0400 |
| parents | e928e029f6eb |
| children | d94e778c3ad1 |
| files | flanking_features.py tool_dependencies.xml utils/gff_util.py utils/odict.py |
| diffstat | 4 files changed, 87 insertions(+), 82 deletions(-) |
--- a/flanking_features.py Tue Apr 01 09:13:13 2014 -0400
+++ b/flanking_features.py Tue Oct 13 12:50:14 2015 -0400
@@ -10,21 +10,23 @@
 -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval
 """
-import sys, traceback, fileinput
-from warnings import warn
+import fileinput
+import sys
 from bx.cookbook import doc_optparse
-from galaxy.tools.util.galaxyops import *
-from bx.intervals.io import *
+from bx.intervals.io import Comment, GenomicInterval, Header, NiceReaderWrapper
 from bx.intervals.operations import quicksect
-from utils.gff_util import *
+from bx.tabular.io import ParseError
+from galaxy.tools.util.galaxyops import fail, parse_cols_arg, skipped
+from utils.gff_util import convert_bed_coords_to_gff, GFFIntervalToBEDReaderWrapper
 
 assert sys.version_info[:2] >= ( 2, 4 )
 
-def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down):
+
+def get_closest_feature(node, direction, threshold_up, threshold_down, report_func_up, report_func_down):
     #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases
     #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand
     #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand
-    if direction == 1: 
+    if direction == 1:
         if node.maxend <= threshold_up:
             if node.end == node.maxend:
                 report_func_up(node)
@@ -60,10 +62,11 @@
         if node.right:
             get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
 
+
 def proximal_region_finder(readers, region, comments=True):
     """
-    Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ]. 
-    Intervals are GenomicInterval objects. 
+    Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ].
+    Intervals are GenomicInterval objects.
     """
     primary = readers[0]
     features = readers[1]
@@ -76,13 +79,13 @@
         up, down = True, True
         if region == 'Either':
             either = True
-    
+
     # Read features into memory:
     rightTree = quicksect.IntervalTree()
    for item in features:
         if type( item ) is GenomicInterval:
             rightTree.insert( item, features.linenum, item )
-    
+
     for interval in primary:
         if type( interval ) is Header:
             yield interval
@@ -96,33 +99,33 @@
             if chrom not in rightTree.chroms:
                 continue
             else:
-                root = rightTree.chroms[chrom] #root node for the chrom tree
+                root = rightTree.chroms[chrom]  # root node for the chrom tree
                 result_up = []
                 result_down = []
-                if (strand == '+' and up) or (strand == '-' and down): 
+                if (strand == '+' and up) or (strand == '-' and down):
                     #upstream +ve strand and downstream -ve strand cases
-                    get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None)
-                    
+                    get_closest_feature(root, 1, start, None, lambda node: result_up.append( node ), None)
+
                 if (strand == '+' and down) or (strand == '-' and up):
                     #downstream +ve strand and upstream -ve strand case
-                    get_closest_feature (root, 0, None, end-1, None, lambda node: result_down.append( node ))
-                    
+                    get_closest_feature(root, 0, None, end - 1, None, lambda node: result_down.append( node ))
+
                 if result_up:
-                    if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval.
+                    if len(result_up) > 1:  # The results_up list has a list of intervals upstream to the given interval.
                         ends = []
                         for n in result_up:
                             ends.append(n.end)
-                        res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. the interval with the max end from the results_up list
+                        res_ind = ends.index(max(ends))  # fetch the index of the closest interval i.e. the interval with the max end from the results_up list
                     else:
                         res_ind = 0
                     if not(either):
                         yield [ interval, result_up[res_ind].other ]
-                        
-                if result_down: 
+
+                if result_down:
                     if not(either):
                         #The last element of result_down will be the closest element to the given interval
-                        yield [ interval, result_down[-1].other ] 
-                        
+                        yield [ interval, result_down[-1].other ]
+
                 if either and (result_up or result_down):
                     iter_val = []
                     if result_up and result_down:
@@ -137,7 +140,8 @@
                         #The last element of result_down will be the closest element to the given interval
                         iter_val = [ interval, result_down[-1].other ]
                 yield iter_val
-    
+
+
 def main():
     options, args = doc_optparse.parse( __doc__ )
     try:
@@ -148,7 +152,7 @@
         in_fname, in2_fname, out_fname, direction = args
     except:
         doc_optparse.exception()
-    
+
     # Set readers to handle either GFF or default format.
     if in1_gff_format:
         in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
@@ -160,22 +164,22 @@
         in2_reader_wrapper = NiceReaderWrapper
 
     g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
-                        chrom_col=chr_col_1,
-                        start_col=start_col_1,
-                        end_col=end_col_1,
-                        strand_col=strand_col_1,
-                        fix_strand=True )
+                             chrom_col=chr_col_1,
+                             start_col=start_col_1,
+                             end_col=end_col_1,
+                             strand_col=strand_col_1,
+                             fix_strand=True )
     g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
-                        chrom_col=chr_col_2,
-                        start_col=start_col_2,
-                        end_col=end_col_2,
-                        strand_col=strand_col_2,
-                        fix_strand=True )
+                             chrom_col=chr_col_2,
+                             start_col=start_col_2,
+                             end_col=end_col_2,
+                             strand_col=strand_col_2,
+                             fix_strand=True )
 
     # Find flanking features.
     out_file = open( out_fname, "w" )
     try:
-        for result in proximal_region_finder([g1,g2], direction):
+        for result in proximal_region_finder([g1, g2], direction):
             if type( result ) is list:
                 line, closest_feature = result
                 # Need to join outputs differently depending on file types.
@@ -185,12 +189,12 @@
                     # Invervals are in BED coordinates; need to convert to GFF.
                     line = convert_bed_coords_to_gff( line )
                     closest_feature = convert_bed_coords_to_gff( closest_feature )
-                
+
                     # Replace double quotes with single quotes in closest feature's attributes.
-                    out_file.write( "%s closest_feature \"%s\" \n" % 
-                                    ( "\t".join( line.fields ), \
+                    out_file.write( "%s closest_feature \"%s\" \n" %
+                                    ( "\t".join( line.fields ),
                                     "\t".join( closest_feature.fields ).replace( "\"", "\\\"" )
-                                    ) )
+                                      ) )
                 else:
                     # Output is BED + closest feature fields.
                     output_line_fields = []
@@ -202,7 +206,7 @@
     except ParseError, exc:
         fail( "Invalid file format: %s" % str( exc ) )
 
-    print "Direction: %s" %(direction)
+    print "Direction: %s" % (direction)
     if g1.skipped > 0:
         print skipped( g1, filedesc=" of 1st dataset" )
     if g2.skipped > 0:
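For orientation, here is a small, hedged sketch of how the refactored module can be driven directly from Python, mirroring the reader setup in `main()` above. The input file names, the BED/GFF column indices, and the choice of the `'Either'` region value are illustrative assumptions; only the function and class names and reader keyword arguments come from the changeset itself.

```python
# Hypothetical driver mirroring main(); file names and column indices are
# assumptions, not part of the changeset.
import fileinput

from bx.intervals.io import NiceReaderWrapper

from flanking_features import proximal_region_finder
from utils.gff_util import GFFIntervalToBEDReaderWrapper

# Query intervals in BED-like format (0-based, half-open coordinates).
queries = NiceReaderWrapper( fileinput.FileInput( "queries.bed" ),
                             chrom_col=0, start_col=1, end_col=2,
                             strand_col=5, fix_strand=True )

# Features in GFF format (1-based, closed); the wrapper converts to BED coords.
features = GFFIntervalToBEDReaderWrapper( fileinput.FileInput( "features.gff" ),
                                          chrom_col=0, start_col=3, end_col=4,
                                          strand_col=6, fix_strand=True )

# 'Either' is one of the region values handled inside proximal_region_finder;
# results come back as [ <original_interval>, <closest_feature> ] pairs.
for result in proximal_region_finder( [queries, features], 'Either' ):
    if type( result ) is list:
        interval, closest = result
        print "%s\t%s" % ( "\t".join( interval.fields ), "\t".join( closest.fields ) )
```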
--- a/tool_dependencies.xml Tue Apr 01 09:13:13 2014 -0400
+++ b/tool_dependencies.xml Tue Oct 13 12:50:14 2015 -0400
@@ -1,9 +1,9 @@
 <?xml version="1.0"?>
 <tool_dependency>
     <package name="bx-python" version="0.7.1">
-        <repository changeset_revision="cdb5991f9790" name="package_bx_python_0_7" owner="devteam" prior_installation_required="False" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="35e2457234ef" name="package_bx_python_0_7" owner="devteam" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
     <package name="galaxy-ops" version="1.0.0">
-        <repository changeset_revision="3287c55c02b8" name="package_galaxy_ops_1_0_0" owner="devteam" prior_installation_required="False" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="3287c55c02b8" name="package_galaxy_ops_1_0_0" owner="devteam" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>
--- a/utils/gff_util.py Tue Apr 01 09:13:13 2014 -0400
+++ b/utils/gff_util.py Tue Oct 13 12:50:14 2015 -0400
@@ -3,16 +3,17 @@
 """
 
 import copy
-from bx.intervals.io import *
-from bx.tabular.io import Header, Comment
+from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper
+from bx.tabular.io import Header, Comment, ParseError
 from utils.odict import odict
 
+
 class GFFInterval( GenomicInterval ):
     """
     A GFF interval, including attributes. If file is strictly a GFF file,
     only attribute is 'group.'
     """
-    def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
+    def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4,
                   strand_col=6, score_col=5, default_strand='.', fix_strand=False ):
         # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this,
         # temporarily set strand and then unset after initing GenomicInterval.
@@ -20,7 +21,7 @@
         if not fix_strand and fields[ strand_col ] == '.':
             unknown_strand = True
             fields[ strand_col ] = '+'
-        GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \
+        GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col,
                                   default_strand, fix_strand=fix_strand )
         if unknown_strand:
             self.strand = '.'
@@ -43,16 +44,17 @@
         return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col,
                            self.end_col, self.strand_col, self.score_col, self.strand)
 
+
 class GFFFeature( GFFInterval ):
     """
     A GFF feature, which can include multiple intervals.
     """
-    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
-                  strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], \
+    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4,
+                  strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[],
                   raw_size=0 ):
         # Use copy so that first interval and feature do not share fields.
-        GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, \
-                              start_col, end_col, strand_col, score_col, default_strand, \
+        GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col,
+                              start_col, end_col, strand_col, score_col, default_strand,
                               fix_strand=fix_strand )
         self.intervals = intervals
         self.raw_size = raw_size
@@ -60,7 +62,7 @@
         for interval in self.intervals:
             # Error checking. NOTE: intervals need not share the same strand.
             if interval.chrom != self.chrom:
-                raise ValueError( "interval chrom does not match self chrom: %s != %s" % \
+                raise ValueError( "interval chrom does not match self chrom: %s != %s" %
                                   ( interval.chrom, self.chrom ) )
             # Set start, end of interval.
             if interval.start < self.start:
@@ -72,13 +74,9 @@
         """ Returns feature's name. """
         name = None
         # Preference for name: GTF, GFF3, GFF.
-        for attr_name in [
-            # GTF:
-            'gene_id', 'transcript_id',
-            # GFF3:
-            'ID', 'id',
-            # GFF (TODO):
-            'group' ]:
+        for attr_name in ['gene_id', 'transcript_id',  # GTF
+                          'ID', 'id',  # GFF3
+                          'group' ]:  # GFF (TODO)
             name = self.attributes.get( attr_name, None )
             if name is not None:
                 break
@@ -107,12 +105,13 @@
     def parse_row( self, line ):
         # HACK: this should return a GFF interval, but bx-python operations
         # require GenomicInterval objects and subclasses will not work.
-        interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
-                                    self.end_col, self.strand_col, self.default_strand, \
+        interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col,
+                                    self.end_col, self.strand_col, self.default_strand,
                                     fix_strand=self.fix_strand )
         interval = convert_gff_coords_to_bed( interval )
         return interval
 
+
 class GFFReaderWrapper( NiceReaderWrapper ):
     """
     Reader wrapper for GFF files.
@@ -127,9 +126,9 @@
     expect traditional interval format.
     """
 
-    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, \
+    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3,
                   end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ):
-        NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, \
+        NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col,
                                     strand_col=strand_col, fix_strand=fix_strand, **kwargs )
         self.feature_col = feature_col
         self.score_col = score_col
@@ -140,8 +139,8 @@
         self.seed_interval_line_len = 0
 
     def parse_row( self, line ):
-        interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, \
-                                self.start_col, self.end_col, self.strand_col, self.score_col, \
+        interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col,
+                                self.start_col, self.end_col, self.strand_col, self.score_col,
                                 self.default_strand, fix_strand=self.fix_strand )
         return interval
@@ -155,12 +154,12 @@
         def handle_parse_error( parse_error ):
             """ Actions to take when ParseError found. """
             if self.outstream:
-                if self.print_delegate and hasattr(self.print_delegate,"__call__"):
-                    self.print_delegate( self.outstream, e, self ) 
+                if self.print_delegate and hasattr(self.print_delegate, "__call__"):
+                    self.print_delegate( self.outstream, e, self )
             self.skipped += 1
             # no reason to stuff an entire bad file into memmory
             if self.skipped < 10:
-                self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) 
+                self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
 
             # For debugging, uncomment this to propogate parsing exceptions up.
             # I.e. the underlying reason for an unexpected StopIteration exception
@@ -193,12 +192,10 @@
             return return_val
 
         # Initialize feature identifier from seed.
-        feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF
+        feature_group = self.seed_interval.attributes.get( 'group', None )  # For GFF
         # For GFF3
         feature_id = self.seed_interval.attributes.get( 'ID', None )
-        feature_parent_id = self.seed_interval.attributes.get( 'Parent', None )
         # For GTF.
-        feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None )
         feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None )
 
         # Read all intervals associated with seed.
@@ -256,9 +253,9 @@
         self.seed_interval_line_len = len( self.current_line )
 
         # Return feature.
-        feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, \
-                              self.end_col, self.strand_col, self.score_col, \
-                              self.default_strand, fix_strand=self.fix_strand, \
+        feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col,
+                              self.end_col, self.strand_col, self.score_col,
+                              self.default_strand, fix_strand=self.fix_strand,
                               intervals=feature_intervals, raw_size=raw_size )
 
         # Convert to BED coords?
@@ -267,6 +264,7 @@
 
         return feature
 
+
 def convert_bed_coords_to_gff( interval ):
     """
     Converts an interval object's coordinates from BED format to GFF format.
@@ -279,10 +277,11 @@
     if isinstance( interval, GFFFeature ):
         for subinterval in interval.intervals:
             convert_bed_coords_to_gff( subinterval )
-    elif type ( interval ) is list:
+    elif type( interval ) is list:
         interval[ 0 ] += 1
     return interval
 
+
 def convert_gff_coords_to_bed( interval ):
     """
     Converts an interval object's coordinates from GFF format to BED format.
@@ -295,10 +294,11 @@
     if isinstance( interval, GFFFeature ):
         for subinterval in interval.intervals:
             convert_gff_coords_to_bed( subinterval )
-    elif type ( interval ) is list:
+    elif type( interval ) is list:
         interval[ 0 ] -= 1
     return interval
 
+
 def parse_gff_attributes( attr_str ):
     """
     Parses a GFF/GTF attribute string and returns a dictionary of name-value
@@ -340,6 +340,7 @@
         attributes['group'] = attr_str
     return attributes
 
+
 def gff_attributes_to_str( attrs, gff_format ):
     """
     Convert GFF attributes to string. Supported formats are GFF3, GTF.
@@ -363,6 +364,7 @@
         attrs_strs.append( format_string % ( name, value ) )
     return " ; ".join( attrs_strs )
 
+
 def read_unordered_gtf( iterator, strict=False ):
     """
     Returns GTF features found in an iterator. GTF lines need not be ordered
@@ -382,7 +384,6 @@
     # datasources, such as RefGenes in UCSC.
     key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields )
 
-
     # Aggregate intervals by transcript_id and collect comments.
     feature_intervals = odict()
     comments = []
@@ -403,7 +404,7 @@
     chroms_features = {}
     for count, intervals in enumerate( feature_intervals.values() ):
         # Sort intervals by start position.
-        intervals.sort( lambda a,b: cmp( a.start, b.start ) )
+        intervals.sort( lambda a, b: cmp( a.start, b.start ) )
         feature = GFFFeature( None, intervals=intervals )
         if feature.chrom not in chroms_features:
             chroms_features[ feature.chrom ] = []
@@ -413,9 +414,9 @@
     chroms_features_sorted = []
     for chrom_features in chroms_features.values():
         chroms_features_sorted.append( chrom_features )
-    chroms_features_sorted.sort( lambda a,b: cmp( a[0].chrom, b[0].chrom ) )
+    chroms_features_sorted.sort( lambda a, b: cmp( a[0].chrom, b[0].chrom ) )
     for features in chroms_features_sorted:
-        features.sort( lambda a,b: cmp( a.start, b.start ) )
+        features.sort( lambda a, b: cmp( a.start, b.start ) )
 
     # Yield comments first, then features.
     # FIXME: comments can appear anywhere in file, not just the beginning.
@@ -427,4 +428,3 @@
     for chrom_features in chroms_features_sorted:
         for feature in chrom_features:
             yield feature
-
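As a quick illustration of the attribute helpers touched in this file, the sketch below round-trips a GTF attribute string through `parse_gff_attributes` and `gff_attributes_to_str`. The example attribute string and the expected printed values are assumptions based on the docstrings above, not output taken from the changeset.

```python
# Hypothetical round-trip through the attribute helpers; the input string is
# an assumption, only the function names come from the diff above.
from utils.gff_util import gff_attributes_to_str, parse_gff_attributes

attrs = parse_gff_attributes( 'gene_id "YAL069W"; transcript_id "YAL069W_mRNA"' )
print attrs.get( 'gene_id' )        # expected: YAL069W
print attrs.get( 'transcript_id' )  # expected: YAL069W_mRNA

# Per gff_attributes_to_str above, individual attributes are re-joined with
# " ; " in the requested dialect (GFF3 or GTF).
print gff_attributes_to_str( attrs, "GTF" )
```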
--- a/utils/odict.py Tue Apr 01 09:13:13 2014 -0400
+++ b/utils/odict.py Tue Oct 13 12:50:14 2015 -0400
@@ -4,6 +4,7 @@
 
 from UserDict import UserDict
 
+
 class odict(UserDict):
     """
     http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/107747
@@ -12,7 +13,7 @@
     added. Calling keys(), values(), items(), etc. will return results in
     this order.
     """
-    def __init__( self, dict = None ):
+    def __init__( self, dict=None ):
         self._keys = []
         UserDict.__init__( self, dict )
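The odict change above is purely cosmetic, but for completeness here is a minimal sketch of the ordered-dictionary behaviour its docstring describes (Python 2, since the class builds on UserDict); the keys and values are arbitrary examples.

```python
# Minimal illustration of insertion-order preservation; keys/values are arbitrary.
from utils.odict import odict

d = odict()
d['first'] = 1
d['second'] = 2
d['third'] = 3

# keys(), values() and items() follow insertion order rather than hash order.
print d.keys()   # expected: ['first', 'second', 'third']
print d.items()  # expected: [('first', 1), ('second', 2), ('third', 3)]
```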