Mercurial > repos > devteam > flanking_features

--- a/flanking_features.py	Fri Dec 18 19:37:56 2015 -0500
+++ b/flanking_features.py	Thu Jun 22 18:39:31 2017 -0400
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-#By: Guruprasad Ananda
+# By: Guruprasad Ananda
 """
 Fetch closest up/downstream interval from features corresponding to every interval in primary

@@ -9,23 +9,26 @@
     -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval
     -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval
 """
+from __future__ import print_function

 import fileinput
 import sys
+
 from bx.cookbook import doc_optparse
 from bx.intervals.io import Comment, GenomicInterval, Header, NiceReaderWrapper
 from bx.intervals.operations import quicksect
 from bx.tabular.io import ParseError
 from galaxy.tools.util.galaxyops import fail, parse_cols_arg, skipped
+
 from utils.gff_util import convert_bed_coords_to_gff, GFFIntervalToBEDReaderWrapper

 assert sys.version_info[:2] >= ( 2, 4 )


 def get_closest_feature(node, direction, threshold_up, threshold_down, report_func_up, report_func_down):
-    #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases
-    #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand
-    #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand
+    # direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases
+    # threshold_up is equal to the interval start for +ve strand, and interval end for -ve strand
+    # threshold_down is equal to the interval end for +ve strand, and interval start for -ve strand
     if direction == 1:
         if node.maxend <= threshold_up:
             if node.end == node.maxend:
@@ -103,11 +106,11 @@
                 result_up = []
                 result_down = []
                 if (strand == '+' and up) or (strand == '-' and down):
-                    #upstream +ve strand and downstream -ve strand cases
+                    # upstream +ve strand and downstream -ve strand cases
                     get_closest_feature(root, 1, start, None, lambda node: result_up.append( node ), None)

                 if (strand == '+' and down) or (strand == '-' and up):
-                    #downstream +ve strand and upstream -ve strand case
+                    # downstream +ve strand and upstream -ve strand case
                     get_closest_feature(root, 0, None, end - 1, None, lambda node: result_down.append( node ))

                 if result_up:
@@ -123,7 +126,7 @@

                 if result_down:
                     if not(either):
-                        #The last element of result_down will be the closest element to the given interval
+                        # The last element of result_down will be the closest element to the given interval
                         yield [ interval, result_down[-1].other ]

                 if either and (result_up or result_down):
@@ -132,12 +135,12 @@
                         if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)):
                             iter_val = [ interval, result_up[res_ind].other ]
                         else:
-                            #The last element of result_down will be the closest element to the given interval
+                            # The last element of result_down will be the closest element to the given interval
                             iter_val = [ interval, result_down[-1].other ]
                     elif result_up:
                         iter_val = [ interval, result_up[res_ind].other ]
                     elif result_down:
-                        #The last element of result_down will be the closest element to the given interval
+                        # The last element of result_down will be the closest element to the given interval
                         iter_val = [ interval, result_down[-1].other ]
                     yield iter_val

@@ -203,14 +206,15 @@
                     out_file.write( "%s\n" % ( "\t".join( output_line_fields ) ) )
             else:
                 out_file.write( "%s\n" % result )
-    except ParseError, exc:
+    except ParseError as exc:
         fail( "Invalid file format: %s" % str( exc ) )

-    print "Direction: %s" % (direction)
+    print("Direction: %s" % (direction))
     if g1.skipped > 0:
-        print skipped( g1, filedesc=" of 1st dataset" )
+        print(skipped( g1, filedesc=" of 1st dataset" ))
     if g2.skipped > 0:
-        print skipped( g2, filedesc=" of 2nd dataset" )
+        print(skipped( g2, filedesc=" of 2nd dataset" ))
+

 if __name__ == "__main__":
     main()
--- a/flanking_features.xml	Fri Dec 18 19:37:56 2015 -0500
+++ b/flanking_features.xml	Thu Jun 22 18:39:31 2017 -0400
@@ -1,86 +1,87 @@
 <tool id="flanking_features_1" name="Fetch closest non-overlapping feature" version="4.0.1">
-  <description>  for every interval</description>
-  <requirements>
-    <requirement type="package" version="0.7.1">bx-python</requirement>
-    <requirement type="package" version="1.0.0">galaxy-ops</requirement>
-  </requirements>
-  <command interpreter="python">
-      flanking_features.py $input1 $input2 $out_file1 $direction
-
-      #if isinstance( $input1.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
-          -1 1,4,5,7 --gff1
-      #else:
-          -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol}
-      #end if
-
-      #if isinstance( $input2.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
-          -2 1,4,5,7 --gff2
-      #else:
-          -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol}
-      #end if
-  </command>
-  <inputs>
-    <param format="interval,gff" name="input1" type="data" label="For every interval in"/>
-    <param format="interval,gff" name="input2" type="data" label="Fetch closest feature(s) from"/>
-    <param name="direction" type="select" label="Located">
-      <option value="Either">Either Upstream or Downstream</option>
-      <option value="Both">Both Upstream and Downstream</option>
-      <option value="Upstream">Upstream</option>
-      <option value="Downstream">Downstream</option>
-    </param>
-  </inputs>
-  <outputs>
-    <data format="input" name="out_file1" metadata_source="input1"/>
-  </outputs>
-  <tests>
-    <test>
-      <param name="input1" value="4_windows.bed"/>
-      <param name="input2" value="4_windows_2.bed"/>
-      <param name="direction" value="Either"/>
-      <output name="out_file1" file="closest_features_either.interval"/>
-    </test>
-    <test>
-      <param name="input1" value="4_windows.bed"/>
-      <param name="input2" value="4_windows_2.bed"/>
-      <param name="direction" value="Both"/>
-      <output name="out_file1" file="closest_features.interval"/>
-    </test>
-    <test>
-      <param name="input1" value="4_windows.bed"/>
-      <param name="input2" value="4_windows_2.bed"/>
-      <param name="direction" value="Upstream"/>
-      <output name="out_file1" file="closest_features_up.interval"/>
-    </test>
-    <test>
-      <param name="input1" value="4_windows.bed"/>
-      <param name="input2" value="4_windows_2.bed"/>
-      <param name="direction" value="Downstream"/>
-      <output name="out_file1" file="closest_features_down.interval"/>
-    </test>
-    <test>
-      <param name="input1" value="4_windows.bed"/>
-      <param name="input2" value="4_windows_3.bed"/>
-      <param name="direction" value="Both"/>
-      <output name="out_file1" file="closest_features_both.interval"/>
-    </test>
-    <!-- Tests for GFF functionality. -->
+    <description>for every interval</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <command><![CDATA[
+python '$__tool_directory__/flanking_features.py'
+'$input1'
+'$input2'
+'$out_file1'
+$direction
+
+#if $input1.is_of_type('gff')
+    -1 1,4,5,7 --gff1
+#else:
+    -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol}
+#end if

-    <test>
-      <param name="input1" value="4_windows.bed"/>
-      <param name="input2" value="4_windows_2.gff"/>
-      <param name="direction" value="Either"/>
-      <output name="out_file1" file="closest_features_both.gff"/>
-    </test>
-    <test>
-      <param name="input1" value="4_windows.gff"/>
-      <param name="input2" value="4_windows_2.gff"/>
-      <param name="direction" value="Either"/>
-      <output name="out_file1" file="closest_features_both2.gff"/>
-    </test>
-
-  </tests>
- <help>
-
+#if $input2.is_of_type('gff')
+    -2 1,4,5,7 --gff2
+#else:
+    -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol}
+#end if
+    ]]></command>
+    <inputs>
+        <param name="input1" type="data" format="interval,gff" label="For every interval in"/>
+        <param name="input2" type="data" format="interval,gff" label="Fetch closest feature(s) from"/>
+        <param name="direction" type="select" label="Located">
+            <option value="Either">Either Upstream or Downstream</option>
+            <option value="Both">Both Upstream and Downstream</option>
+            <option value="Upstream">Upstream</option>
+            <option value="Downstream">Downstream</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="out_file1" format_source="input1" metadata_source="input1"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input1" value="4_windows.bed"/>
+            <param name="input2" value="4_windows_2.bed"/>
+            <param name="direction" value="Either"/>
+            <output name="out_file1" file="closest_features_either.interval"/>
+        </test>
+        <test>
+            <param name="input1" value="4_windows.bed"/>
+            <param name="input2" value="4_windows_2.bed"/>
+            <param name="direction" value="Both"/>
+            <output name="out_file1" file="closest_features.interval"/>
+        </test>
+        <test>
+            <param name="input1" value="4_windows.bed"/>
+            <param name="input2" value="4_windows_2.bed"/>
+            <param name="direction" value="Upstream"/>
+            <output name="out_file1" file="closest_features_up.interval"/>
+        </test>
+        <test>
+            <param name="input1" value="4_windows.bed"/>
+            <param name="input2" value="4_windows_2.bed"/>
+            <param name="direction" value="Downstream"/>
+            <output name="out_file1" file="closest_features_down.interval"/>
+        </test>
+        <test>
+            <param name="input1" value="4_windows.bed"/>
+            <param name="input2" value="4_windows_3.bed"/>
+            <param name="direction" value="Both"/>
+            <output name="out_file1" file="closest_features_both.interval"/>
+        </test>
+        <!-- Tests for GFF functionality. -->
+        <test>
+            <param name="input1" value="4_windows.bed"/>
+            <param name="input2" value="4_windows_2.gff" ftype="gff" />
+            <param name="direction" value="Either"/>
+            <output name="out_file1" file="closest_features_both.gff"/>
+        </test>
+        <test>
+            <param name="input1" value="4_windows.gff" ftype="gff" />
+            <param name="input2" value="4_windows_2.gff" ftype="gff" />
+            <param name="direction" value="Either"/>
+            <output name="out_file1" file="closest_features_both2.gff"/>
+        </test>
+    </tests>
+    <help><![CDATA[
 .. class:: infomark

 **What it does**
@@ -91,7 +92,7 @@

 .. class:: warningmark

-**Note:**
+**Note:**

 Every line should contain at least 3 columns: chromosome number, start and stop coordinates. If any of these columns is missing or if start and stop coordinates are not numerical, the lines will be treated as invalid and skipped. The number of skipped lines is documented in the resulting history item as a "data issue".

@@ -124,8 +125,5 @@
    chr1 500  1000 Query1.2 chr1 2000 2204 Query2.4
    chr1 1100 1250 Query1.3 chr1 580  1050 Query2.3
    chr1 1100 1250 Query1.3 chr1 2000 2204 Query2.4
-
-</help>
-
-
-</tool>
\ No newline at end of file
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Thu Jun 22 18:39:31 2017 -0400
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="0.7.1">bx-python</requirement>
+            <requirement type="package" version="1.0.0">galaxy-ops</requirement>
+        </requirements>
+    </xml>
+    <token name="@SCREENCASTS@">
+-----
+
+**Screencasts!**
+
+See Galaxy Interval Operation Screencasts_ (right click to open this link in another window).
+
+.. _Screencasts: https://galaxyproject.org/learn/interval-operations/
+
+-----
+    </token>
+</macros>
--- a/tool_dependencies.xml	Fri Dec 18 19:37:56 2015 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,9 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-  <package name="bx-python" version="0.7.1">
-      <repository changeset_revision="35e2457234ef" name="package_bx_python_0_7" owner="devteam" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-    </package>
-    <package name="galaxy-ops" version="1.0.0">
-      <repository changeset_revision="60c9a7af1345" name="package_galaxy_ops_1_0_0" owner="devteam" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-    </package>
-</tool_dependency>
Binary file utils/__init__.pyc has changed
--- a/utils/gff_util.py	Fri Dec 18 19:37:56 2015 -0500
+++ b/utils/gff_util.py	Thu Jun 22 18:39:31 2017 -0400
@@ -1,11 +1,12 @@
 """
 Provides utilities for working with GFF files.
 """
+import copy

-import copy
 from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper
-from bx.tabular.io import Header, Comment, ParseError
-from utils.odict import odict
+from bx.tabular.io import Comment, Header, ParseError
+
+from .odict import odict


 class GFFInterval( GenomicInterval ):
@@ -144,7 +145,7 @@
                                 self.default_strand, fix_strand=self.fix_strand )
         return interval

-    def next( self ):
+    def __next__( self ):
         """ Returns next GFFFeature. """

         #
@@ -177,10 +178,10 @@
             while not self.seed_interval:
                 try:
                     self.seed_interval = GenomicIntervalReader.next( self )
-                except ParseError, e:
+                except ParseError as e:
                     handle_parse_error( e )
                 # TODO: When no longer supporting python 2.4 use finally:
-                #finally:
+                # finally:
                 raw_size += len( self.current_line )

         # If header or comment, clear seed interval and return it with its size.
@@ -205,19 +206,19 @@
             try:
                 interval = GenomicIntervalReader.next( self )
                 raw_size += len( self.current_line )
-            except StopIteration, e:
+            except StopIteration as e:
                 # No more intervals to read, but last feature needs to be
                 # returned.
                 interval = None
                 raw_size += len( self.current_line )
                 break
-            except ParseError, e:
+            except ParseError as e:
                 handle_parse_error( e )
                 raw_size += len( self.current_line )
                 continue
             # TODO: When no longer supporting python 2.4 use finally:
-            #finally:
-            #raw_size += len( self.current_line )
+            # finally:
+            # raw_size += len( self.current_line )

             # Ignore comments.
             if isinstance( interval, Comment ):
@@ -263,6 +264,7 @@
             convert_gff_coords_to_bed( feature )

         return feature
+    next = __next__  # This line should be removed once the bx-python port to Python3 is finished


 def convert_bed_coords_to_gff( interval ):
@@ -374,7 +376,9 @@

     # -- Get function that generates line/feature key. --

-    get_transcript_id = lambda fields: parse_gff_attributes( fields[8] )[ 'transcript_id' ]
+    def get_transcript_id(fields):
+        return parse_gff_attributes( fields[8] )[ 'transcript_id' ]
+
     if strict:
         # Strict GTF parsing uses transcript_id only to group lines into feature.
         key_fn = get_transcript_id
@@ -382,7 +386,8 @@
         # Use lenient parsing where chromosome + transcript_id is the key. This allows
         # transcripts with same ID on different chromosomes; this occurs in some popular
         # datasources, such as RefGenes in UCSC.
-        key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields )
+        def key_fn(fields):
+            return fields[0] + '_' + get_transcript_id( fields )

     # Aggregate intervals by transcript_id and collect comments.
     feature_intervals = odict()
Binary file utils/gff_util.pyc has changed
Binary file utils/odict.pyc has changed