view gff2coverage.py @ 1:924fec18933e draft

Uploaded
author sebastian
date Fri, 21 Feb 2014 10:28:59 -0500
parents 0d66ec694153
children
line wrap: on
line source

'''
gff2coverage.py - compute genomic coverage of gff intervals
===========================================================

:Author: Andreas Heger
:Release: $Id$
:Date: |today|
:Tags: Genomics Intervals Summary GFF

Purpose
-------

This script computes the genomic coverage of intervals 
in a :term:`gff` formatted file. The coverage is computed
per feature.

Usage
-----

Example::

   python <script_name>.py --help

Type::

   python <script_name>.py --help

for command line help.

Command line options
--------------------

'''

import sys
import string
import re
import optparse
import time
import os
import shutil
import tempfile
import math
import collections

import CGAT.Experiment as E
import CGAT.IndexedFasta as IndexedFasta
import CGAT.IOTools as IOTools
import CGAT.GTF as GTF

def printValues( contig, max_size, window_size, values, options ):
    """output values."""

    outfile = E.openOutputFile( contig, "w" )

    outfile.write( "abs_pos\trel_pos" )

    for feature in options.features:
        outfile.write("\tabs_%s\trel_%s" % (feature,feature) )
    outfile.write("\n")

    max_vv = []

    for f in range(len(options.features)):
        max_vv.append( float(max(map(lambda x: x[f], values ) )))

    bin = 0
    for vv in values:
        outfile.write( "%i\t" % bin )
        outfile.write( options.value_format % (float(bin) / max_size) )
        
        for x in range(len(options.features)):
            outfile.write( "\t%i\t%s" % (vv[x] ,
                                         options.value_format % (vv[x] / max_vv[x]) ) )
        outfile.write("\n" )            
        bin += window_size

    outfile.close()
    
def processChunk( contig, chunk, options, fasta = None ):
    """
    This function requires segments to be non-overlapping.
    """
    
    if len(chunk) == 0: return

    # check whether there are overlapping features or not
    checked = []
    for feature in chunk:
        checked.append(feature)
	others = [x for x in chunk if x not in checked]
	for otherFeature in others:
	   if GTF.Overlap(feature, otherFeature):
		raise ValueError( " Histogram could not be created since the file contains overlapping features! \n%s\n%s  " % (feature, otherFeature) )
    # clear auxiliary list
    del checked[:]

    # compute min_coordinate, max_coordinate for the histogram
    min_coordinate = 1
    max_coordinate = max( map( lambda x: x.end, chunk) )
    ## compute window size
    if options.window_size:
        window_size = options.window_size
        num_bins = int(math.ceil((float(max_coordinate) / window_size)))
    elif options.num_bins and fasta:
        contig_length = fasta.getLength( contig )
        assert max_coordinate <= contig_length, "maximum coordinate (%i) larger than contig size (%i) for contig %s" % (max_coordinate, contig_length, contig )
        max_coordinate = contig_length
        window_size = int(math.floor( float(contig_length) / options.num_bins))
        num_bins = options.num_bins
    else:
        raise ValueError("please specify a window size of provide genomic sequence with number of bins.")

    values = [ [] for x in range(num_bins) ]

    ## do several parses for each feature, slow, but easier to code
    ## alternatively: sort by feature and location.
    for feature in options.features:
        total = 0
        bin = 0
        end = window_size
        for entry in chunk:
            if entry.feature != feature: continue

            while end < entry.start:
                values[bin].append( total )
                bin += 1
                end += window_size

            while entry.end > end:
                seg_start = max(entry.start, end - window_size)
                seg_end = min(entry.end, end)
                total += seg_end - seg_start
                values[bin].append( total )                
                end += window_size
                bin += 1
            else:
                seg_start = max(entry.start, end - window_size)
                seg_end = min(entry.end, end)
                total += seg_end - seg_start

        while bin < num_bins:
            values[bin].append(total)
            bin += 1

    printValues( contig, max_coordinate, window_size, values, options )
    
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gff2coverage.py 2781 2009-09-10 11:33:14Z andreas $", usage = globals()["__doc__"])

    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genome [default=%default]"  )

    parser.add_option( "-f", "--features", dest="features", type="string", action="append",
                       help="features to collect " 
                       "[default=%default]" )

    parser.add_option( "-w", "--window-size", dest="window_size", type="int", 
                       help="window size in bp for histogram computation. "
                       "Determines the bin size.  "
                       "[default=%default]" )

    parser.add_option( "-b", "--num-bins", dest="num_bins", type="int", 
                       help="number of bins for histogram computation "
                       "if window size is not given. "
                       "[default=%default]" )

    parser.add_option( "-m", "--method", dest="method", type="choice",
                       choices = ("genomic", "histogram", ),
                       help="methods to apply. "
                       "[default=%default]"  )

    parser.set_defaults(
        genome_file = None,
        window_size = None,
        num_bins = 1000,
        value_format = "%6.4f",
        features = [],
        method = "genomic",
        )

    (options, args) = E.Start( parser, add_output_options = True )

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
    else:
        fasta = None
        
    if options.method == "histogram":

        gff = GTF.readFromFile( sys.stdin )

        gff.sort( lambda x,y: cmp( (x.contig, x.start), (y.contig, y.start) ) )

        chunk = []
        last_contig = None

        for entry in gff:

            if last_contig != entry.contig:
                processChunk( last_contig, chunk, options, fasta )
                last_contig = entry.contig
                chunk = []

            chunk.append( entry )

        processChunk( last_contig, chunk, options, fasta )

    elif options.method == "genomic":
        intervals = collections.defaultdict( int )
        bases = collections.defaultdict( int )
        total = 0
        for entry in GTF.iterator( sys.stdin ):
            intervals[ (entry.contig, entry.source, entry.feature) ] += 1
            bases[ (entry.contig, entry.source, entry.feature) ] += entry.end - entry.start
            total += entry.end - entry.start

        options.stdout.write( "contig\tsource\tfeature\tintervals\tbases" )
        if fasta:
            options.stdout.write( "\tpercent_coverage\ttotal_percent_coverage\n" )
        else:
            options.stdout.write( "\n" )

        total_genome_size = sum( fasta.getContigSizes( with_synonyms = False ).values() )

        for key in sorted (intervals.keys() ):
            nbases = bases[key]
            nintervals = intervals[key]
            contig, source, feature = key
            options.stdout.write( "\t".join( ( "\t".join(key),
                                     str(nintervals),
                                     str(nbases) ) ) )
            if fasta: 
                options.stdout.write( "\t%f" % ( 100.0 * float(nbases) / fasta.getLength( contig )))
                options.stdout.write( "\t%f\n" % ( 100.0 * float(nbases) / total_genome_size ))
            else: options.stdout.write( "\n" )
                                     
            
    E.Stop()

if __name__ == "__main__":
    sys.exit( main( sys.argv) )