# HG changeset patch # User devteam # Date 1498171820 14400 # Node ID 60925436ca5f2270196855b1fbd8709b9f4c40f8 # Parent 21717d77aee56e60b193acbff9d106f1cd9b7013 planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/intersect commit cae3e05d02e60f595bb8b6d77a84f030e9bd1689 diff -r 21717d77aee5 -r 60925436ca5f gops_intersect.py --- a/gops_intersect.py Fri Dec 18 19:38:28 2015 -0500 +++ b/gops_intersect.py Thu Jun 22 18:50:20 2017 -0400 @@ -11,14 +11,18 @@ -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval """ +from __future__ import print_function + import fileinput import sys + +from bx.cookbook import doc_optparse from bx.intervals.io import GenomicInterval, NiceReaderWrapper from bx.intervals.operations.intersect import intersect -from bx.cookbook import doc_optparse from bx.tabular.io import ParseError from galaxy.tools.util.galaxyops import fail, parse_cols_arg, skipped -from utils.gff_util import GFFFeature, GFFReaderWrapper, convert_bed_coords_to_gff + +from utils.gff_util import convert_bed_coords_to_gff, GFFFeature, GFFReaderWrapper assert sys.version_info[:2] >= ( 2, 4 ) @@ -80,16 +84,17 @@ out_file.write( "%s\n" % "\t".join( feature.fields ) ) else: out_file.write( "%s\n" % feature ) - except ParseError, e: + except ParseError as e: out_file.close() fail( "Invalid file format: %s" % str( e ) ) out_file.close() if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) + print(skipped( g1, filedesc=" of 1st dataset" )) if g2.skipped > 0: - print skipped( g2, filedesc=" of 2nd dataset" ) + print(skipped( g2, filedesc=" of 2nd dataset" )) + if __name__ == "__main__": main() diff -r 21717d77aee5 -r 60925436ca5f intersect.xml --- a/intersect.xml Fri Dec 18 19:38:28 2015 -0500 +++ b/intersect.xml Thu Jun 22 18:50:20 2017 -0400 @@ -1,147 +1,132 @@ -<tool id="gops_intersect_1" name="Intersect" version="1.0.0"> - <description>the intervals of two datasets</description> - <requirements> - <requirement type="package" version="0.7.1">bx-python</requirement> - <requirement type="package" version="1.0.0">galaxy-ops</requirement> - </requirements> - <command interpreter="python">gops_intersect.py - $input1 $input2 $output - - #if isinstance( $input1.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -1 1,4,5,7 --gff1 - #else: - -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} - #end if - - #if isinstance( $input2.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -2 1,4,5,7 --gff2 - #else: - -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} - #end if - - -m $min $returntype - </command> - <inputs> - <param name="returntype" type="select" label="Return" help="(see figure below)"> - <option value="">Overlapping Intervals</option> - <option value="-p">Overlapping pieces of Intervals</option> - </param> - <param format="interval,gff" name="input1" type="data" help="First dataset"> - <label>of</label> - </param> - <param format="interval,gff" name="input2" type="data" help="Second dataset"> - <label>that intersect</label> - </param> - <param name="min" type="integer" value="1" min="1" help="(bp)"> - <label>for at least</label> - </param> - </inputs> - <outputs> - <data format="input" name="output" metadata_source="input1"/> - </outputs> - <code file="operation_filter.py"/> - <trackster_conf/> - <tests> - <test> - <param name="input1" value="1.bed" /> - <param name="input2" value="2.bed" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_intersect_out.bed" /> - </test> - <test> - <param name="input1" value="1.bed" /> - <param name="input2" value="2_mod.bed" ftype="interval"/> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_intersect_diffCols.bed" /> - </test> - <test> - <param name="input1" value="1.bed" /> - <param name="input2" value="2_mod.bed" ftype="interval"/> - <param name="min" value="1" /> - <param name="returntype" value="Overlapping pieces of Intervals" /> - <output name="output" file="gops_intersect_p_diffCols.bed" /> - </test> - <test> - <param name="input1" value="1.bed" /> - <param name="input2" value="2.bed" /> - <param name="min" value="10" /> - <param name="returntype" value="Overlapping pieces of Intervals" /> - <output name="output" file="gops_intersect_p_out.bed" /> - </test> - <test> - <param name="input1" value="gops_bigint.interval" ftype="interval" /> - <param name="input2" value="gops_bigint2.interval" ftype="interval" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_intersect_bigint_out.interval" /> - </test> - <test> - <param name="input1" value="gops_bigint2.interval" ftype="interval" /> - <param name="input2" value="gops_bigint.interval" ftype="interval" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_intersect_bigint_out.interval" /> - </test> - <test> - <param name="input1" value="12.bed" ftype="bed" /> - <param name="input2" value="1.bed" ftype="bed" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_intersect_no_strand_out.bed" /> - </test> - <!-- Intersect two GFF files. --> - <test> - <param name="input1" value="gops_subtract_in1.gff" /> - <param name="input2" value="gops_subtract_in2.gff" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_intersect_out2.gff" /> - </test> - <!-- Intersect GFF file and bed file. --> - <test> - <param name="input1" value="gops_subtract_in1.gff" /> - <param name="input2" value="gops_subtract_in2.bed" /> - <param name="min" value="1" /> - <param name="returntype" value="" /> - <output name="output" file="gops_intersect_out2.gff" /> - </test> - - </tests> - <help> - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -**Syntax** - -- **Where overlap is at least** sets the minimum length (in base pairs) of overlap between elements of the two datasets -- **Overlapping Intervals** returns entire intervals from the first dataset that overlap the second dataset. The returned intervals are completely unchanged, and this option only filters out intervals that do not overlap with the second dataset. -- **Overlapping pieces of Intervals** returns intervals that indicate the exact base pair overlap between the first dataset and the second dataset. The intervals returned are from the first dataset, and all fields besides start and end are guaranteed to remain unchanged. - ------ - -**Examples** - -Overlapping Intervals: - -.. image:: gops_intersectOverlappingIntervals.gif - -Overlapping Pieces of Intervals: - -.. image:: gops_intersectOverlappingPieces.gif - -</help> -</tool> +<tool id="gops_intersect_1" name="Intersect" version="1.0.0"> + <description>the intervals of two datasets</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <code file="operation_filter.py"/> + <command><![CDATA[ +python '$__tool_directory__/gops_intersect.py' +'$input1' +'$input2' +'$output' + +#if $input1.is_of_type('gff') + -1 1,4,5,7 --gff1 +#else: + -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} +#end if + +#if $input2.is_of_type('gff') + -2 1,4,5,7 --gff2 +#else: + -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} +#end if + +-m $min $returntype + ]]></command> + <inputs> + <param name="returntype" type="select" label="Return" help="(see figure below)"> + <option value="">Overlapping Intervals</option> + <option value="-p">Overlapping pieces of Intervals</option> + </param> + <param name="input1" type="data" format="interval,gff" label="of" help="First dataset" /> + <param name="input2" type="data" format="interval,gff" label="that intersect" help="Second dataset" /> + <param name="min" type="integer" value="1" min="1" label="for at least" help="(bp)" /> + </inputs> + <outputs> + <data name="output" format_source="input1" metadata_source="input1"/> + </outputs> + <tests> + <test> + <param name="input1" value="1.bed" /> + <param name="input2" value="2.bed" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_intersect_out.bed" /> + </test> + <test> + <param name="input1" value="1.bed" /> + <param name="input2" value="2_mod.bed" ftype="interval"/> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_intersect_diffCols.bed" /> + </test> + <test> + <param name="input1" value="1.bed" /> + <param name="input2" value="2_mod.bed" ftype="interval"/> + <param name="min" value="1" /> + <param name="returntype" value="Overlapping pieces of Intervals" /> + <output name="output" file="gops_intersect_p_diffCols.bed" /> + </test> + <test> + <param name="input1" value="1.bed" /> + <param name="input2" value="2.bed" /> + <param name="min" value="10" /> + <param name="returntype" value="Overlapping pieces of Intervals" /> + <output name="output" file="gops_intersect_p_out.bed" /> + </test> + <test> + <param name="input1" value="gops_bigint.interval" ftype="interval" /> + <param name="input2" value="gops_bigint2.interval" ftype="interval" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_intersect_bigint_out.interval" /> + </test> + <test> + <param name="input1" value="gops_bigint2.interval" ftype="interval" /> + <param name="input2" value="gops_bigint.interval" ftype="interval" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_intersect_bigint_out.interval" /> + </test> + <test> + <param name="input1" value="12.bed" ftype="bed" /> + <param name="input2" value="1.bed" ftype="bed" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_intersect_no_strand_out.bed" /> + </test> + <!-- Intersect two GFF files. --> + <test> + <param name="input1" value="gops_subtract_in1.gff" /> + <param name="input2" value="gops_subtract_in2.gff" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_intersect_out2.gff" /> + </test> + <!-- Intersect GFF file and bed file. --> + <test> + <param name="input1" value="gops_subtract_in1.gff" /> + <param name="input2" value="gops_subtract_in2.bed" /> + <param name="min" value="1" /> + <param name="returntype" value="" /> + <output name="output" file="gops_intersect_out2.gff" /> + </test> + </tests> + <help><![CDATA[ +.. class:: infomark + +**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. + +@SCREENCASTS@ + +**Syntax** + +- **Where overlap is at least** sets the minimum length (in base pairs) of overlap between elements of the two datasets +- **Overlapping Intervals** returns entire intervals from the first dataset that overlap the second dataset. The returned intervals are completely unchanged, and this option only filters out intervals that do not overlap with the second dataset. +- **Overlapping pieces of Intervals** returns intervals that indicate the exact base pair overlap between the first dataset and the second dataset. The intervals returned are from the first dataset, and all fields besides start and end are guaranteed to remain unchanged. + +----- + +**Examples** + +Overlapping Intervals: + +.. image:: gops_intersectOverlappingIntervals.gif + +Overlapping Pieces of Intervals: + +.. image:: gops_intersectOverlappingPieces.gif + ]]></help> +</tool> diff -r 21717d77aee5 -r 60925436ca5f macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Thu Jun 22 18:50:20 2017 -0400 @@ -0,0 +1,20 @@ +<?xml version="1.0"?> +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="0.7.1">bx-python</requirement> + <requirement type="package" version="1.0.0">galaxy-ops</requirement> + </requirements> + </xml> + <token name="@SCREENCASTS@"> +----- + +**Screencasts!** + +See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). + +.. _Screencasts: https://galaxyproject.org/learn/interval-operations/ + +----- + </token> +</macros> diff -r 21717d77aee5 -r 60925436ca5f operation_filter.py --- a/operation_filter.py Fri Dec 18 19:38:28 2015 -0500 +++ b/operation_filter.py Thu Jun 22 18:50:20 2017 -0400 @@ -1,8 +1,7 @@ # runs after the job (and after the default post-filter) +from galaxy.jobs.handler import JOB_ERROR from galaxy.tools.parameters import DataToolParameter -from galaxy.jobs.handler import JOB_ERROR - # Older py compatibility try: set() @@ -14,7 +13,7 @@ dbkeys = set() data_param_names = set() data_params = 0 - for name, param in page_param_map.iteritems(): + for name, param in page_param_map.items(): if isinstance( param, DataToolParameter ): # for each dataset parameter if param_values.get(name, None) is not None: @@ -53,7 +52,6 @@ try: if stderr and len( stderr ) > 0: raise Exception( stderr ) - except Exception: data.blurb = JOB_ERROR data.state = JOB_ERROR diff -r 21717d77aee5 -r 60925436ca5f tool_dependencies.xml --- a/tool_dependencies.xml Fri Dec 18 19:38:28 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="bx-python" version="0.7.1"> - <repository changeset_revision="35e2457234ef" name="package_bx_python_0_7" owner="devteam" toolshed="https://testtoolshed.g2.bx.psu.edu" /> - </package> - <package name="galaxy-ops" version="1.0.0"> - <repository changeset_revision="60c9a7af1345" name="package_galaxy_ops_1_0_0" owner="devteam" toolshed="https://testtoolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency> diff -r 21717d77aee5 -r 60925436ca5f utils/__init__.py diff -r 21717d77aee5 -r 60925436ca5f utils/__init__.pyc Binary file utils/__init__.pyc has changed diff -r 21717d77aee5 -r 60925436ca5f utils/gff_util.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/gff_util.py Thu Jun 22 18:50:20 2017 -0400 @@ -0,0 +1,435 @@ +""" +Provides utilities for working with GFF files. +""" +import copy + +from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper +from bx.tabular.io import Comment, Header, ParseError + +from .odict import odict + + +class GFFInterval( GenomicInterval ): + """ + A GFF interval, including attributes. If file is strictly a GFF file, + only attribute is 'group.' + """ + def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, + strand_col=6, score_col=5, default_strand='.', fix_strand=False ): + # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this, + # temporarily set strand and then unset after initing GenomicInterval. + unknown_strand = False + if not fix_strand and fields[ strand_col ] == '.': + unknown_strand = True + fields[ strand_col ] = '+' + GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, + default_strand, fix_strand=fix_strand ) + if unknown_strand: + self.strand = '.' + self.fields[ strand_col ] = '.' + + # Handle feature, score column. + self.feature_col = feature_col + if self.feature_col >= self.nfields: + raise MissingFieldError( "No field for feature_col (%d)" % feature_col ) + self.feature = self.fields[ self.feature_col ] + self.score_col = score_col + if self.score_col >= self.nfields: + raise MissingFieldError( "No field for score_col (%d)" % score_col ) + self.score = self.fields[ self.score_col ] + + # GFF attributes. + self.attributes = parse_gff_attributes( fields[8] ) + + def copy( self ): + return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col, + self.end_col, self.strand_col, self.score_col, self.strand) + + +class GFFFeature( GFFInterval ): + """ + A GFF feature, which can include multiple intervals. + """ + def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, + strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], + raw_size=0 ): + # Use copy so that first interval and feature do not share fields. + GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, + start_col, end_col, strand_col, score_col, default_strand, + fix_strand=fix_strand ) + self.intervals = intervals + self.raw_size = raw_size + # Use intervals to set feature attributes. + for interval in self.intervals: + # Error checking. NOTE: intervals need not share the same strand. + if interval.chrom != self.chrom: + raise ValueError( "interval chrom does not match self chrom: %s != %s" % + ( interval.chrom, self.chrom ) ) + # Set start, end of interval. + if interval.start < self.start: + self.start = interval.start + if interval.end > self.end: + self.end = interval.end + + def name( self ): + """ Returns feature's name. """ + name = None + # Preference for name: GTF, GFF3, GFF. + for attr_name in ['gene_id', 'transcript_id', # GTF + 'ID', 'id', # GFF3 + 'group' ]: # GFF (TODO) + name = self.attributes.get( attr_name, None ) + if name is not None: + break + return name + + def copy( self ): + intervals_copy = [] + for interval in self.intervals: + intervals_copy.append( interval.copy() ) + return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col, self.strand_col, + self.score_col, self.strand, intervals=intervals_copy ) + + def lines( self ): + lines = [] + for interval in self.intervals: + lines.append( '\t'.join( interval.fields ) ) + return lines + + +class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ): + """ + Reader wrapper that reads GFF intervals/lines and automatically converts + them to BED format. + """ + + def parse_row( self, line ): + # HACK: this should return a GFF interval, but bx-python operations + # require GenomicInterval objects and subclasses will not work. + interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, + self.end_col, self.strand_col, self.default_strand, + fix_strand=self.fix_strand ) + interval = convert_gff_coords_to_bed( interval ) + return interval + + +class GFFReaderWrapper( NiceReaderWrapper ): + """ + Reader wrapper for GFF files. + + Wrapper has two major functions: + + 1. group entries for GFF file (via group column), GFF3 (via id attribute), + or GTF (via gene_id/transcript id); + 2. convert coordinates from GFF format--starting and ending coordinates + are 1-based, closed--to the 'traditional'/BED interval format--0 based, + half-open. This is useful when using GFF files as inputs to tools that + expect traditional interval format. + """ + + def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, + end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ): + NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, + strand_col=strand_col, fix_strand=fix_strand, **kwargs ) + self.feature_col = feature_col + self.score_col = score_col + self.convert_to_bed_coord = convert_to_bed_coord + self.last_line = None + self.cur_offset = 0 + self.seed_interval = None + self.seed_interval_line_len = 0 + + def parse_row( self, line ): + interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, + self.start_col, self.end_col, self.strand_col, self.score_col, + self.default_strand, fix_strand=self.fix_strand ) + return interval + + def __next__( self ): + """ Returns next GFFFeature. """ + + # + # Helper function. + # + + def handle_parse_error( parse_error ): + """ Actions to take when ParseError found. """ + if self.outstream: + if self.print_delegate and hasattr(self.print_delegate, "__call__"): + self.print_delegate( self.outstream, e, self ) + self.skipped += 1 + # no reason to stuff an entire bad file into memmory + if self.skipped < 10: + self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) + + # For debugging, uncomment this to propogate parsing exceptions up. + # I.e. the underlying reason for an unexpected StopIteration exception + # can be found by uncommenting this. + # raise e + + # + # Get next GFFFeature + # + raw_size = self.seed_interval_line_len + + # If there is no seed interval, set one. Also, if there are no more + # intervals to read, this is where iterator dies. + if not self.seed_interval: + while not self.seed_interval: + try: + self.seed_interval = GenomicIntervalReader.next( self ) + except ParseError as e: + handle_parse_error( e ) + # TODO: When no longer supporting python 2.4 use finally: + # finally: + raw_size += len( self.current_line ) + + # If header or comment, clear seed interval and return it with its size. + if isinstance( self.seed_interval, ( Header, Comment ) ): + return_val = self.seed_interval + return_val.raw_size = len( self.current_line ) + self.seed_interval = None + self.seed_interval_line_len = 0 + return return_val + + # Initialize feature identifier from seed. + feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF + # For GFF3 + feature_id = self.seed_interval.attributes.get( 'ID', None ) + # For GTF. + feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) + + # Read all intervals associated with seed. + feature_intervals = [] + feature_intervals.append( self.seed_interval ) + while True: + try: + interval = GenomicIntervalReader.next( self ) + raw_size += len( self.current_line ) + except StopIteration as e: + # No more intervals to read, but last feature needs to be + # returned. + interval = None + raw_size += len( self.current_line ) + break + except ParseError as e: + handle_parse_error( e ) + raw_size += len( self.current_line ) + continue + # TODO: When no longer supporting python 2.4 use finally: + # finally: + # raw_size += len( self.current_line ) + + # Ignore comments. + if isinstance( interval, Comment ): + continue + + # Determine if interval is part of feature. + part_of = False + group = interval.attributes.get( 'group', None ) + # GFF test: + if group and feature_group == group: + part_of = True + # GFF3 test: + parent_id = interval.attributes.get( 'Parent', None ) + cur_id = interval.attributes.get( 'ID', None ) + if ( cur_id and cur_id == feature_id ) or ( parent_id and parent_id == feature_id ): + part_of = True + # GTF test: + transcript_id = interval.attributes.get( 'transcript_id', None ) + if transcript_id and transcript_id == feature_transcript_id: + part_of = True + + # If interval is not part of feature, clean up and break. + if not part_of: + # Adjust raw size because current line is not part of feature. + raw_size -= len( self.current_line ) + break + + # Interval associated with feature. + feature_intervals.append( interval ) + + # Last interval read is the seed for the next interval. + self.seed_interval = interval + self.seed_interval_line_len = len( self.current_line ) + + # Return feature. + feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, + self.end_col, self.strand_col, self.score_col, + self.default_strand, fix_strand=self.fix_strand, + intervals=feature_intervals, raw_size=raw_size ) + + # Convert to BED coords? + if self.convert_to_bed_coord: + convert_gff_coords_to_bed( feature ) + + return feature + next = __next__ # This line should be removed once the bx-python port to Python3 is finished + + +def convert_bed_coords_to_gff( interval ): + """ + Converts an interval object's coordinates from BED format to GFF format. + Accepted object types include GenomicInterval and list (where the first + element in the list is the interval's start, and the second element is + the interval's end). + """ + if isinstance( interval, GenomicInterval ): + interval.start += 1 + if isinstance( interval, GFFFeature ): + for subinterval in interval.intervals: + convert_bed_coords_to_gff( subinterval ) + elif type( interval ) is list: + interval[ 0 ] += 1 + return interval + + +def convert_gff_coords_to_bed( interval ): + """ + Converts an interval object's coordinates from GFF format to BED format. + Accepted object types include GFFFeature, GenomicInterval, and list (where + the first element in the list is the interval's start, and the second + element is the interval's end). + """ + if isinstance( interval, GenomicInterval ): + interval.start -= 1 + if isinstance( interval, GFFFeature ): + for subinterval in interval.intervals: + convert_gff_coords_to_bed( subinterval ) + elif type( interval ) is list: + interval[ 0 ] -= 1 + return interval + + +def parse_gff_attributes( attr_str ): + """ + Parses a GFF/GTF attribute string and returns a dictionary of name-value + pairs. The general format for a GFF3 attributes string is + + name1=value1;name2=value2 + + The general format for a GTF attribute string is + + name1 "value1" ; name2 "value2" + + The general format for a GFF attribute string is a single string that + denotes the interval's group; in this case, method returns a dictionary + with a single key-value pair, and key name is 'group' + """ + attributes_list = attr_str.split(";") + attributes = {} + for name_value_pair in attributes_list: + # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3 + # attribute; next, try double quotes for GTF. + pair = name_value_pair.strip().split("=") + if len( pair ) == 1: + pair = name_value_pair.strip().split("\"") + if len( pair ) == 1: + # Could not split for some reason -- raise exception? + continue + if pair == '': + continue + name = pair[0].strip() + if name == '': + continue + # Need to strip double quote from values + value = pair[1].strip(" \"") + attributes[ name ] = value + + if len( attributes ) == 0: + # Could not split attributes string, so entire string must be + # 'group' attribute. This is the case for strictly GFF files. + attributes['group'] = attr_str + return attributes + + +def gff_attributes_to_str( attrs, gff_format ): + """ + Convert GFF attributes to string. Supported formats are GFF3, GTF. + """ + if gff_format == 'GTF': + format_string = '%s "%s"' + # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id + id_attr = None + if 'group' in attrs: + id_attr = 'group' + elif 'ID' in attrs: + id_attr = 'ID' + elif 'Parent' in attrs: + id_attr = 'Parent' + if id_attr: + attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr] + elif gff_format == 'GFF3': + format_string = '%s=%s' + attrs_strs = [] + for name, value in attrs.items(): + attrs_strs.append( format_string % ( name, value ) ) + return " ; ".join( attrs_strs ) + + +def read_unordered_gtf( iterator, strict=False ): + """ + Returns GTF features found in an iterator. GTF lines need not be ordered + or clustered for reader to work. Reader returns GFFFeature objects sorted + by transcript_id, chrom, and start position. + """ + + # -- Get function that generates line/feature key. -- + + def get_transcript_id(fields): + return parse_gff_attributes( fields[8] )[ 'transcript_id' ] + + if strict: + # Strict GTF parsing uses transcript_id only to group lines into feature. + key_fn = get_transcript_id + else: + # Use lenient parsing where chromosome + transcript_id is the key. This allows + # transcripts with same ID on different chromosomes; this occurs in some popular + # datasources, such as RefGenes in UCSC. + def key_fn(fields): + return fields[0] + '_' + get_transcript_id( fields ) + + # Aggregate intervals by transcript_id and collect comments. + feature_intervals = odict() + comments = [] + for count, line in enumerate( iterator ): + if line.startswith( '#' ): + comments.append( Comment( line ) ) + continue + + line_key = key_fn( line.split('\t') ) + if line_key in feature_intervals: + feature = feature_intervals[ line_key ] + else: + feature = [] + feature_intervals[ line_key ] = feature + feature.append( GFFInterval( None, line.split( '\t' ) ) ) + + # Create features. + chroms_features = {} + for count, intervals in enumerate( feature_intervals.values() ): + # Sort intervals by start position. + intervals.sort( lambda a, b: cmp( a.start, b.start ) ) + feature = GFFFeature( None, intervals=intervals ) + if feature.chrom not in chroms_features: + chroms_features[ feature.chrom ] = [] + chroms_features[ feature.chrom ].append( feature ) + + # Sort features by chrom, start position. + chroms_features_sorted = [] + for chrom_features in chroms_features.values(): + chroms_features_sorted.append( chrom_features ) + chroms_features_sorted.sort( lambda a, b: cmp( a[0].chrom, b[0].chrom ) ) + for features in chroms_features_sorted: + features.sort( lambda a, b: cmp( a.start, b.start ) ) + + # Yield comments first, then features. + # FIXME: comments can appear anywhere in file, not just the beginning. + # Ideally, then comments would be associated with features and output + # just before feature/line. + for comment in comments: + yield comment + + for chrom_features in chroms_features_sorted: + for feature in chrom_features: + yield feature diff -r 21717d77aee5 -r 60925436ca5f utils/gff_util.pyc Binary file utils/gff_util.pyc has changed diff -r 21717d77aee5 -r 60925436ca5f utils/odict.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/odict.py Thu Jun 22 18:50:20 2017 -0400 @@ -0,0 +1,86 @@ +""" +Ordered dictionary implementation. +""" + +from UserDict import UserDict + + +class odict(UserDict): + """ + http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/107747 + + This dictionary class extends UserDict to record the order in which items are + added. Calling keys(), values(), items(), etc. will return results in this + order. + """ + def __init__( self, dict=None ): + self._keys = [] + UserDict.__init__( self, dict ) + + def __delitem__( self, key ): + UserDict.__delitem__( self, key ) + self._keys.remove( key ) + + def __setitem__( self, key, item ): + UserDict.__setitem__( self, key, item ) + if key not in self._keys: + self._keys.append( key ) + + def clear( self ): + UserDict.clear( self ) + self._keys = [] + + def copy(self): + new = odict() + new.update( self ) + return new + + def items( self ): + return zip( self._keys, self.values() ) + + def keys( self ): + return self._keys[:] + + def popitem( self ): + try: + key = self._keys[-1] + except IndexError: + raise KeyError( 'dictionary is empty' ) + val = self[ key ] + del self[ key ] + return ( key, val ) + + def setdefault( self, key, failobj=None ): + if key not in self._keys: + self._keys.append( key ) + return UserDict.setdefault( self, key, failobj ) + + def update( self, dict ): + for ( key, val ) in dict.items(): + self.__setitem__( key, val ) + + def values( self ): + return map( self.get, self._keys ) + + def iterkeys( self ): + return iter( self._keys ) + + def itervalues( self ): + for key in self._keys: + yield self.get( key ) + + def iteritems( self ): + for key in self._keys: + yield key, self.get( key ) + + def __iter__( self ): + for key in self._keys: + yield key + + def reverse( self ): + self._keys.reverse() + + def insert( self, index, key, item ): + if key not in self._keys: + self._keys.insert( index, key ) + UserDict.__setitem__( self, key, item ) diff -r 21717d77aee5 -r 60925436ca5f utils/odict.pyc Binary file utils/odict.pyc has changed