Mercurial > repos > devteam > subtract

--- a/gops_subtract.py	Fri Dec 18 19:39:15 2015 -0500
+++ b/gops_subtract.py	Thu Jun 22 18:51:24 2017 -0400
@@ -11,14 +11,18 @@
     -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval
     -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval
 """
+from __future__ import print_function
+
 import fileinput
 import sys
+
+from bx.cookbook import doc_optparse
 from bx.intervals.io import GenomicInterval, NiceReaderWrapper
 from bx.intervals.operations.subtract import subtract
-from bx.cookbook import doc_optparse
 from bx.tabular.io import ParseError
 from galaxy.tools.util.galaxyops import fail, parse_cols_arg, skipped
-from utils.gff_util import GFFFeature, GFFReaderWrapper, convert_bed_coords_to_gff
+
+from utils.gff_util import convert_bed_coords_to_gff, GFFFeature, GFFReaderWrapper

 assert sys.version_info[:2] >= ( 2, 4 )

@@ -81,16 +85,17 @@
                 out_file.write( "%s\n" % "\t".join( feature.fields ) )
             else:
                 out_file.write( "%s\n" % feature )
-    except ParseError, exc:
+    except ParseError as exc:
         out_file.close()
         fail( "Invalid file format: %s" % str( exc ) )

     out_file.close()

     if g1.skipped > 0:
-        print skipped( g1, filedesc=" of 2nd dataset" )
+        print(skipped( g1, filedesc=" of 2nd dataset" ))
     if g2.skipped > 0:
-        print skipped( g2, filedesc=" of 1st dataset" )
+        print(skipped( g2, filedesc=" of 1st dataset" ))
+

 if __name__ == "__main__":
     main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Thu Jun 22 18:51:24 2017 -0400
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="0.7.1">bx-python</requirement>
+            <requirement type="package" version="1.0.0">galaxy-ops</requirement>
+        </requirements>
+    </xml>
+    <token name="@SCREENCASTS@">
+-----
+
+**Screencasts!**
+
+See Galaxy Interval Operation Screencasts_ (right click to open this link in another window).
+
+.. _Screencasts: https://galaxyproject.org/learn/interval-operations/
+
+-----
+    </token>
+</macros>
--- a/operation_filter.py	Fri Dec 18 19:39:15 2015 -0500
+++ b/operation_filter.py	Thu Jun 22 18:51:24 2017 -0400
@@ -1,8 +1,7 @@
 # runs after the job (and after the default post-filter)
+from galaxy.jobs.handler import JOB_ERROR
 from galaxy.tools.parameters import DataToolParameter

-from galaxy.jobs.handler import JOB_ERROR
-
 # Older py compatibility
 try:
     set()
@@ -14,7 +13,7 @@
     dbkeys = set()
     data_param_names = set()
     data_params = 0
-    for name, param in page_param_map.iteritems():
+    for name, param in page_param_map.items():
         if isinstance( param, DataToolParameter ):
             # for each dataset parameter
             if param_values.get(name, None) is not None:
@@ -53,7 +52,6 @@
         try:
             if stderr and len( stderr ) > 0:
                 raise Exception( stderr )
-
         except Exception:
             data.blurb = JOB_ERROR
             data.state = JOB_ERROR
--- a/subtract.xml	Fri Dec 18 19:39:15 2015 -0500
+++ b/subtract.xml	Thu Jun 22 18:51:24 2017 -0400
@@ -1,110 +1,95 @@
 <tool id="gops_subtract_1" name="Subtract" version="1.0.0">
-  <description>the intervals of two datasets</description>
-  <requirements>
-    <requirement type="package" version="0.7.1">bx-python</requirement>
-    <requirement type="package" version="1.0.0">galaxy-ops</requirement>
-  </requirements>
-  <command interpreter="python">gops_subtract.py
-      $input1 $input2 $output
-
-      #if isinstance( $input1.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
-        -1 1,4,5,7 --gff1
-      #else:
-        -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol}
-      #end if
+    <description>the intervals of two datasets</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <code file="operation_filter.py"/>
+    <command><![CDATA[
+python '$__tool_directory__/gops_subtract.py'
+'$input1'
+'$input2'
+'$output'

-      #if isinstance( $input2.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__):
-        -2 1,4,5,7 --gff2
-      #else:
-          -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol}
-      #end if
+#if $input1.is_of_type('gff')
+    -1 1,4,5,7 --gff1
+#else:
+    -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol}
+#end if

-      -m $min $returntype
-  </command>
-  <inputs>
-    <param format="interval,gff" name="input2" type="data" help="Second dataset">
-      <label>Subtract</label>
-    </param>
-
-    <param format="interval,gff" name="input1" type="data" help="First dataset">
-      <label>from</label>
-    </param>
+#if $input2.is_of_type('gff')
+    -2 1,4,5,7 --gff2
+#else:
+    -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol}
+#end if

-    <param name="returntype" type="select" label="Return" help="of the first dataset (see figure below)">
-      <option value="">Intervals with no overlap</option>
-      <option value="-p">Non-overlapping pieces of intervals</option>
-    </param>
-
-    <param name="min" type="integer" value="1" min="1" help="(bp)">
-      <label>where minimal overlap is</label>
-    </param>
-   </inputs>
-  <outputs>
-    <data format="input" name="output" metadata_source="input1"/>
-  </outputs>
-  <code file="operation_filter.py"/>
-  <trackster_conf/>
-  <tests>
-    <test>
-      <param name="input1" value="1.bed" />
-      <param name="input2" value="2.bed" />
-      <param name="min" value="1" />
-      <param name="returntype" value="" />
-      <output name="output" file="gops-subtract.dat" />
-    </test>
-    <test>
-      <param name="input1" value="1.bed" />
-      <param name="input2" value="2_mod.bed" ftype="interval"/>
-      <param name="min" value="1" />
-      <param name="returntype" value="" />
-      <output name="output" file="gops_subtract_diffCols.dat" />
-    </test>
-    <test>
-      <param name="input1" value="gops_subtract_bigint.bed" />
-      <param name="input2" value="2.bed" />
-      <param name="min" value="1" />
-      <param name="returntype" value="" />
-      <output name="output" file="gops-subtract.dat" />
-    </test>
-    <test>
-      <param name="input1" value="1.bed" />
-      <param name="input2" value="2.bed" />
-      <param name="min" value="10" />
-      <param name="returntype" value="Non-overlapping pieces of intervals" />
-      <output name="output" file="gops-subtract-p.dat" />
-    </test>
-    <!-- Subtract two GFF files. -->
-    <test>
-        <param name="input1" value="gops_subtract_in1.gff" />
-        <param name="input2" value="gops_subtract_in2.gff" />
-        <param name="min" value="1" />
-        <param name="returntype" value="" />
-        <output name="output" file="gops_subtract_out1.gff" />
-    </test>
-    <!-- Subtract BED file from GFF file. -->
-    <test>
-        <param name="input1" value="gops_subtract_in1.gff" />
-        <param name="input2" value="gops_subtract_in2.bed" />
-        <param name="min" value="1" />
-        <param name="returntype" value="" />
-        <output name="output" file="gops_subtract_out1.gff" />
-    </test>
-  </tests>
-  <help>
-
+-m $min
+$returntype
+    ]]></command>
+    <inputs>
+        <param name="input2" type="data" format="interval,gff" label="Subtract" help="Second dataset" />
+        <param name="input1" type="data" format="interval,gff" label="from" help="First dataset" />
+        <param name="returntype" type="select" label="Return" help="of the first dataset (see figure below)">
+            <option value="">Intervals with no overlap</option>
+            <option value="-p">Non-overlapping pieces of intervals</option>
+        </param>
+        <param name="min" type="integer" value="1" min="1" label="where minimal overlap is" help="(bp)" />
+    </inputs>
+    <outputs>
+        <data name="output" format_source="input1" metadata_source="input1"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input1" value="1.bed" />
+            <param name="input2" value="2.bed" />
+            <param name="min" value="1" />
+            <param name="returntype" value="" />
+            <output name="output" file="gops-subtract.dat" />
+        </test>
+        <test>
+            <param name="input1" value="1.bed" />
+            <param name="input2" value="2_mod.bed" ftype="interval"/>
+            <param name="min" value="1" />
+            <param name="returntype" value="" />
+            <output name="output" file="gops_subtract_diffCols.dat" />
+        </test>
+        <test>
+            <param name="input1" value="gops_subtract_bigint.bed" />
+            <param name="input2" value="2.bed" />
+            <param name="min" value="1" />
+            <param name="returntype" value="" />
+            <output name="output" file="gops-subtract.dat" />
+        </test>
+        <test>
+            <param name="input1" value="1.bed" />
+            <param name="input2" value="2.bed" />
+            <param name="min" value="10" />
+            <param name="returntype" value="Non-overlapping pieces of intervals" />
+            <output name="output" file="gops-subtract-p.dat" />
+        </test>
+        <!-- Subtract two GFF files. -->
+        <test>
+            <param name="input1" value="gops_subtract_in1.gff" />
+            <param name="input2" value="gops_subtract_in2.gff" />
+            <param name="min" value="1" />
+            <param name="returntype" value="" />
+            <output name="output" file="gops_subtract_out1.gff" />
+        </test>
+        <!-- Subtract BED file from GFF file. -->
+        <test>
+            <param name="input1" value="gops_subtract_in1.gff" />
+            <param name="input2" value="gops_subtract_in2.bed" />
+            <param name="min" value="1" />
+            <param name="returntype" value="" />
+            <output name="output" file="gops_subtract_out1.gff" />
+        </test>
+    </tests>
+    <help><![CDATA[
 .. class:: infomark

 **TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns.

------
-
-**Screencasts!**
-
-See Galaxy Interval Operation Screencasts_ (right click to open this link in another window).
-
-.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations
-
------
+@SCREENCASTS@

 **Syntax**

@@ -123,6 +108,5 @@
 Non-overlapping pieces of intervals:

 .. image:: gops_subtractOverlappingPieces.gif
-
-</help>
+    ]]></help>
 </tool>
--- a/tool_dependencies.xml	Fri Dec 18 19:39:15 2015 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,9 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-  <package name="bx-python" version="0.7.1">
-      <repository changeset_revision="35e2457234ef" name="package_bx_python_0_7" owner="devteam" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-    </package>
-    <package name="galaxy-ops" version="1.0.0">
-      <repository changeset_revision="60c9a7af1345" name="package_galaxy_ops_1_0_0" owner="devteam" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-    </package>
-</tool_dependency>
Binary file utils/__init__.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/gff_util.py	Thu Jun 22 18:51:24 2017 -0400
@@ -0,0 +1,435 @@
+"""
+Provides utilities for working with GFF files.
+"""
+import copy
+
+from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper
+from bx.tabular.io import Comment, Header, ParseError
+
+from .odict import odict
+
+
+class GFFInterval( GenomicInterval ):
+    """
+    A GFF interval, including attributes. If file is strictly a GFF file,
+    only attribute is 'group.'
+    """
+    def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4,
+                  strand_col=6, score_col=5, default_strand='.', fix_strand=False ):
+        # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this,
+        # temporarily set strand and then unset after initing GenomicInterval.
+        unknown_strand = False
+        if not fix_strand and fields[ strand_col ] == '.':
+            unknown_strand = True
+            fields[ strand_col ] = '+'
+        GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col,
+                                  default_strand, fix_strand=fix_strand )
+        if unknown_strand:
+            self.strand = '.'
+            self.fields[ strand_col ] = '.'
+
+        # Handle feature, score column.
+        self.feature_col = feature_col
+        if self.feature_col >= self.nfields:
+            raise MissingFieldError( "No field for feature_col (%d)" % feature_col )
+        self.feature = self.fields[ self.feature_col ]
+        self.score_col = score_col
+        if self.score_col >= self.nfields:
+            raise MissingFieldError( "No field for score_col (%d)" % score_col )
+        self.score = self.fields[ self.score_col ]
+
+        # GFF attributes.
+        self.attributes = parse_gff_attributes( fields[8] )
+
+    def copy( self ):
+        return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col,
+                           self.end_col, self.strand_col, self.score_col, self.strand)
+
+
+class GFFFeature( GFFInterval ):
+    """
+    A GFF feature, which can include multiple intervals.
+    """
+    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4,
+                  strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[],
+                  raw_size=0 ):
+        # Use copy so that first interval and feature do not share fields.
+        GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col,
+                              start_col, end_col, strand_col, score_col, default_strand,
+                              fix_strand=fix_strand )
+        self.intervals = intervals
+        self.raw_size = raw_size
+        # Use intervals to set feature attributes.
+        for interval in self.intervals:
+            # Error checking. NOTE: intervals need not share the same strand.
+            if interval.chrom != self.chrom:
+                raise ValueError( "interval chrom does not match self chrom: %s != %s" %
+                                  ( interval.chrom, self.chrom ) )
+            # Set start, end of interval.
+            if interval.start < self.start:
+                self.start = interval.start
+            if interval.end > self.end:
+                self.end = interval.end
+
+    def name( self ):
+        """ Returns feature's name. """
+        name = None
+        # Preference for name: GTF, GFF3, GFF.
+        for attr_name in ['gene_id', 'transcript_id',  # GTF
+                          'ID', 'id',  # GFF3
+                          'group' ]:  # GFF (TODO)
+            name = self.attributes.get( attr_name, None )
+            if name is not None:
+                break
+        return name
+
+    def copy( self ):
+        intervals_copy = []
+        for interval in self.intervals:
+            intervals_copy.append( interval.copy() )
+        return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col, self.strand_col,
+                          self.score_col, self.strand, intervals=intervals_copy )
+
+    def lines( self ):
+        lines = []
+        for interval in self.intervals:
+            lines.append( '\t'.join( interval.fields ) )
+        return lines
+
+
+class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ):
+    """
+    Reader wrapper that reads GFF intervals/lines and automatically converts
+    them to BED format.
+    """
+
+    def parse_row( self, line ):
+        # HACK: this should return a GFF interval, but bx-python operations
+        # require GenomicInterval objects and subclasses will not work.
+        interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col,
+                                    self.end_col, self.strand_col, self.default_strand,
+                                    fix_strand=self.fix_strand )
+        interval = convert_gff_coords_to_bed( interval )
+        return interval
+
+
+class GFFReaderWrapper( NiceReaderWrapper ):
+    """
+    Reader wrapper for GFF files.
+
+    Wrapper has two major functions:
+
+    1. group entries for GFF file (via group column), GFF3 (via id attribute),
+       or GTF (via gene_id/transcript id);
+    2. convert coordinates from GFF format--starting and ending coordinates
+       are 1-based, closed--to the 'traditional'/BED interval format--0 based,
+       half-open. This is useful when using GFF files as inputs to tools that
+       expect traditional interval format.
+    """
+
+    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3,
+                  end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ):
+        NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col,
+                                    strand_col=strand_col, fix_strand=fix_strand, **kwargs )
+        self.feature_col = feature_col
+        self.score_col = score_col
+        self.convert_to_bed_coord = convert_to_bed_coord
+        self.last_line = None
+        self.cur_offset = 0
+        self.seed_interval = None
+        self.seed_interval_line_len = 0
+
+    def parse_row( self, line ):
+        interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col,
+                                self.start_col, self.end_col, self.strand_col, self.score_col,
+                                self.default_strand, fix_strand=self.fix_strand )
+        return interval
+
+    def __next__( self ):
+        """ Returns next GFFFeature. """
+
+        #
+        # Helper function.
+        #
+
+        def handle_parse_error( parse_error ):
+            """ Actions to take when ParseError found. """
+            if self.outstream:
+                if self.print_delegate and hasattr(self.print_delegate, "__call__"):
+                    self.print_delegate( self.outstream, e, self )
+            self.skipped += 1
+            # no reason to stuff an entire bad file into memmory
+            if self.skipped < 10:
+                self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
+
+            # For debugging, uncomment this to propogate parsing exceptions up.
+            # I.e. the underlying reason for an unexpected StopIteration exception
+            # can be found by uncommenting this.
+            # raise e
+
+        #
+        # Get next GFFFeature
+        #
+        raw_size = self.seed_interval_line_len
+
+        # If there is no seed interval, set one. Also, if there are no more
+        # intervals to read, this is where iterator dies.
+        if not self.seed_interval:
+            while not self.seed_interval:
+                try:
+                    self.seed_interval = GenomicIntervalReader.next( self )
+                except ParseError as e:
+                    handle_parse_error( e )
+                # TODO: When no longer supporting python 2.4 use finally:
+                # finally:
+                raw_size += len( self.current_line )
+
+        # If header or comment, clear seed interval and return it with its size.
+        if isinstance( self.seed_interval, ( Header, Comment ) ):
+            return_val = self.seed_interval
+            return_val.raw_size = len( self.current_line )
+            self.seed_interval = None
+            self.seed_interval_line_len = 0
+            return return_val
+
+        # Initialize feature identifier from seed.
+        feature_group = self.seed_interval.attributes.get( 'group', None )  # For GFF
+        # For GFF3
+        feature_id = self.seed_interval.attributes.get( 'ID', None )
+        # For GTF.
+        feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None )
+
+        # Read all intervals associated with seed.
+        feature_intervals = []
+        feature_intervals.append( self.seed_interval )
+        while True:
+            try:
+                interval = GenomicIntervalReader.next( self )
+                raw_size += len( self.current_line )
+            except StopIteration as e:
+                # No more intervals to read, but last feature needs to be
+                # returned.
+                interval = None
+                raw_size += len( self.current_line )
+                break
+            except ParseError as e:
+                handle_parse_error( e )
+                raw_size += len( self.current_line )
+                continue
+            # TODO: When no longer supporting python 2.4 use finally:
+            # finally:
+            # raw_size += len( self.current_line )
+
+            # Ignore comments.
+            if isinstance( interval, Comment ):
+                continue
+
+            # Determine if interval is part of feature.
+            part_of = False
+            group = interval.attributes.get( 'group', None )
+            # GFF test:
+            if group and feature_group == group:
+                part_of = True
+            # GFF3 test:
+            parent_id = interval.attributes.get( 'Parent', None )
+            cur_id = interval.attributes.get( 'ID', None )
+            if ( cur_id and cur_id == feature_id ) or ( parent_id and parent_id == feature_id ):
+                part_of = True
+            # GTF test:
+            transcript_id = interval.attributes.get( 'transcript_id', None )
+            if transcript_id and transcript_id == feature_transcript_id:
+                part_of = True
+
+            # If interval is not part of feature, clean up and break.
+            if not part_of:
+                # Adjust raw size because current line is not part of feature.
+                raw_size -= len( self.current_line )
+                break
+
+            # Interval associated with feature.
+            feature_intervals.append( interval )
+
+        # Last interval read is the seed for the next interval.
+        self.seed_interval = interval
+        self.seed_interval_line_len = len( self.current_line )
+
+        # Return feature.
+        feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col,
+                              self.end_col, self.strand_col, self.score_col,
+                              self.default_strand, fix_strand=self.fix_strand,
+                              intervals=feature_intervals, raw_size=raw_size )
+
+        # Convert to BED coords?
+        if self.convert_to_bed_coord:
+            convert_gff_coords_to_bed( feature )
+
+        return feature
+    next = __next__  # This line should be removed once the bx-python port to Python3 is finished
+
+
+def convert_bed_coords_to_gff( interval ):
+    """
+    Converts an interval object's coordinates from BED format to GFF format.
+    Accepted object types include GenomicInterval and list (where the first
+    element in the list is the interval's start, and the second element is
+    the interval's end).
+    """
+    if isinstance( interval, GenomicInterval ):
+        interval.start += 1
+        if isinstance( interval, GFFFeature ):
+            for subinterval in interval.intervals:
+                convert_bed_coords_to_gff( subinterval )
+    elif type( interval ) is list:
+        interval[ 0 ] += 1
+    return interval
+
+
+def convert_gff_coords_to_bed( interval ):
+    """
+    Converts an interval object's coordinates from GFF format to BED format.
+    Accepted object types include GFFFeature, GenomicInterval, and list (where
+    the first element in the list is the interval's start, and the second
+    element is the interval's end).
+    """
+    if isinstance( interval, GenomicInterval ):
+        interval.start -= 1
+        if isinstance( interval, GFFFeature ):
+            for subinterval in interval.intervals:
+                convert_gff_coords_to_bed( subinterval )
+    elif type( interval ) is list:
+        interval[ 0 ] -= 1
+    return interval
+
+
+def parse_gff_attributes( attr_str ):
+    """
+    Parses a GFF/GTF attribute string and returns a dictionary of name-value
+    pairs. The general format for a GFF3 attributes string is
+
+        name1=value1;name2=value2
+
+    The general format for a GTF attribute string is
+
+        name1 "value1" ; name2 "value2"
+
+    The general format for a GFF attribute string is a single string that
+    denotes the interval's group; in this case, method returns a dictionary
+    with a single key-value pair, and key name is 'group'
+    """
+    attributes_list = attr_str.split(";")
+    attributes = {}
+    for name_value_pair in attributes_list:
+        # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3
+        # attribute; next, try double quotes for GTF.
+        pair = name_value_pair.strip().split("=")
+        if len( pair ) == 1:
+            pair = name_value_pair.strip().split("\"")
+        if len( pair ) == 1:
+            # Could not split for some reason -- raise exception?
+            continue
+        if pair == '':
+            continue
+        name = pair[0].strip()
+        if name == '':
+            continue
+        # Need to strip double quote from values
+        value = pair[1].strip(" \"")
+        attributes[ name ] = value
+
+    if len( attributes ) == 0:
+        # Could not split attributes string, so entire string must be
+        # 'group' attribute. This is the case for strictly GFF files.
+        attributes['group'] = attr_str
+    return attributes
+
+
+def gff_attributes_to_str( attrs, gff_format ):
+    """
+    Convert GFF attributes to string. Supported formats are GFF3, GTF.
+    """
+    if gff_format == 'GTF':
+        format_string = '%s "%s"'
+        # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id
+        id_attr = None
+        if 'group' in attrs:
+            id_attr = 'group'
+        elif 'ID' in attrs:
+            id_attr = 'ID'
+        elif 'Parent' in attrs:
+            id_attr = 'Parent'
+        if id_attr:
+            attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr]
+    elif gff_format == 'GFF3':
+        format_string = '%s=%s'
+    attrs_strs = []
+    for name, value in attrs.items():
+        attrs_strs.append( format_string % ( name, value ) )
+    return " ; ".join( attrs_strs )
+
+
+def read_unordered_gtf( iterator, strict=False ):
+    """
+    Returns GTF features found in an iterator. GTF lines need not be ordered
+    or clustered for reader to work. Reader returns GFFFeature objects sorted
+    by transcript_id, chrom, and start position.
+    """
+
+    # -- Get function that generates line/feature key. --
+
+    def get_transcript_id(fields):
+        return parse_gff_attributes( fields[8] )[ 'transcript_id' ]
+
+    if strict:
+        # Strict GTF parsing uses transcript_id only to group lines into feature.
+        key_fn = get_transcript_id
+    else:
+        # Use lenient parsing where chromosome + transcript_id is the key. This allows
+        # transcripts with same ID on different chromosomes; this occurs in some popular
+        # datasources, such as RefGenes in UCSC.
+        def key_fn(fields):
+            return fields[0] + '_' + get_transcript_id( fields )
+
+    # Aggregate intervals by transcript_id and collect comments.
+    feature_intervals = odict()
+    comments = []
+    for count, line in enumerate( iterator ):
+        if line.startswith( '#' ):
+            comments.append( Comment( line ) )
+            continue
+
+        line_key = key_fn( line.split('\t') )
+        if line_key in feature_intervals:
+            feature = feature_intervals[ line_key ]
+        else:
+            feature = []
+            feature_intervals[ line_key ] = feature
+        feature.append( GFFInterval( None, line.split( '\t' ) ) )
+
+    # Create features.
+    chroms_features = {}
+    for count, intervals in enumerate( feature_intervals.values() ):
+        # Sort intervals by start position.
+        intervals.sort( lambda a, b: cmp( a.start, b.start ) )
+        feature = GFFFeature( None, intervals=intervals )
+        if feature.chrom not in chroms_features:
+            chroms_features[ feature.chrom ] = []
+        chroms_features[ feature.chrom ].append( feature )
+
+    # Sort features by chrom, start position.
+    chroms_features_sorted = []
+    for chrom_features in chroms_features.values():
+        chroms_features_sorted.append( chrom_features )
+    chroms_features_sorted.sort( lambda a, b: cmp( a[0].chrom, b[0].chrom ) )
+    for features in chroms_features_sorted:
+        features.sort( lambda a, b: cmp( a.start, b.start ) )
+
+    # Yield comments first, then features.
+    # FIXME: comments can appear anywhere in file, not just the beginning.
+    # Ideally, then comments would be associated with features and output
+    # just before feature/line.
+    for comment in comments:
+        yield comment
+
+    for chrom_features in chroms_features_sorted:
+        for feature in chrom_features:
+            yield feature
Binary file utils/gff_util.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/odict.py	Thu Jun 22 18:51:24 2017 -0400
@@ -0,0 +1,86 @@
+"""
+Ordered dictionary implementation.
+"""
+
+from UserDict import UserDict
+
+
+class odict(UserDict):
+    """
+    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/107747
+
+    This dictionary class extends UserDict to record the order in which items are
+    added. Calling keys(), values(), items(), etc. will return results in this
+    order.
+    """
+    def __init__( self, dict=None ):
+        self._keys = []
+        UserDict.__init__( self, dict )
+
+    def __delitem__( self, key ):
+        UserDict.__delitem__( self, key )
+        self._keys.remove( key )
+
+    def __setitem__( self, key, item ):
+        UserDict.__setitem__( self, key, item )
+        if key not in self._keys:
+            self._keys.append( key )
+
+    def clear( self ):
+        UserDict.clear( self )
+        self._keys = []
+
+    def copy(self):
+        new = odict()
+        new.update( self )
+        return new
+
+    def items( self ):
+        return zip( self._keys, self.values() )
+
+    def keys( self ):
+        return self._keys[:]
+
+    def popitem( self ):
+        try:
+            key = self._keys[-1]
+        except IndexError:
+            raise KeyError( 'dictionary is empty' )
+        val = self[ key ]
+        del self[ key ]
+        return ( key, val )
+
+    def setdefault( self, key, failobj=None ):
+        if key not in self._keys:
+            self._keys.append( key )
+        return UserDict.setdefault( self, key, failobj )
+
+    def update( self, dict ):
+        for ( key, val ) in dict.items():
+            self.__setitem__( key, val )
+
+    def values( self ):
+        return map( self.get, self._keys )
+
+    def iterkeys( self ):
+        return iter( self._keys )
+
+    def itervalues( self ):
+        for key in self._keys:
+            yield self.get( key )
+
+    def iteritems( self ):
+        for key in self._keys:
+            yield key, self.get( key )
+
+    def __iter__( self ):
+        for key in self._keys:
+            yield key
+
+    def reverse( self ):
+        self._keys.reverse()
+
+    def insert( self, index, key, item ):
+        if key not in self._keys:
+            self._keys.insert( index, key )
+            UserDict.__setitem__( self, key, item )
Binary file utils/odict.pyc has changed