Mercurial > repos > sebastian > my_first_test
changeset 2:6dea9465b116 draft
Removing gff2coverage.py
author | sebastian |
---|---|
date | Fri, 21 Feb 2014 10:29:38 -0500 |
parents | 924fec18933e |
children | 5b5de012bc2e |
files | gff2coverage.py |
diffstat | 1 files changed, 0 insertions(+), 251 deletions(-) [+] |
line wrap: on
line diff
--- a/gff2coverage.py Fri Feb 21 10:28:59 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,251 +0,0 @@ -''' -gff2coverage.py - compute genomic coverage of gff intervals -=========================================================== - -:Author: Andreas Heger -:Release: $Id$ -:Date: |today| -:Tags: Genomics Intervals Summary GFF - -Purpose -------- - -This script computes the genomic coverage of intervals -in a :term:`gff` formatted file. The coverage is computed -per feature. - -Usage ------ - -Example:: - - python <script_name>.py --help - -Type:: - - python <script_name>.py --help - -for command line help. - -Command line options --------------------- - -''' - -import sys -import string -import re -import optparse -import time -import os -import shutil -import tempfile -import math -import collections - -import CGAT.Experiment as E -import CGAT.IndexedFasta as IndexedFasta -import CGAT.IOTools as IOTools -import CGAT.GTF as GTF - -def printValues( contig, max_size, window_size, values, options ): - """output values.""" - - outfile = E.openOutputFile( contig, "w" ) - - outfile.write( "abs_pos\trel_pos" ) - - for feature in options.features: - outfile.write("\tabs_%s\trel_%s" % (feature,feature) ) - outfile.write("\n") - - max_vv = [] - - for f in range(len(options.features)): - max_vv.append( float(max(map(lambda x: x[f], values ) ))) - - bin = 0 - for vv in values: - outfile.write( "%i\t" % bin ) - outfile.write( options.value_format % (float(bin) / max_size) ) - - for x in range(len(options.features)): - outfile.write( "\t%i\t%s" % (vv[x] , - options.value_format % (vv[x] / max_vv[x]) ) ) - outfile.write("\n" ) - bin += window_size - - outfile.close() - -def processChunk( contig, chunk, options, fasta = None ): - """ - This function requires segments to be non-overlapping. - """ - - if len(chunk) == 0: return - - # check whether there are overlapping features or not - checked = [] - for feature in chunk: - checked.append(feature) - others = [x for x in chunk if x not in checked] - for otherFeature in others: - if GTF.Overlap(feature, otherFeature): - raise ValueError( " Histogram could not be created since the file contains overlapping features! \n%s\n%s " % (feature, otherFeature) ) - # clear auxiliary list - del checked[:] - - # compute min_coordinate, max_coordinate for the histogram - min_coordinate = 1 - max_coordinate = max( map( lambda x: x.end, chunk) ) - ## compute window size - if options.window_size: - window_size = options.window_size - num_bins = int(math.ceil((float(max_coordinate) / window_size))) - elif options.num_bins and fasta: - contig_length = fasta.getLength( contig ) - assert max_coordinate <= contig_length, "maximum coordinate (%i) larger than contig size (%i) for contig %s" % (max_coordinate, contig_length, contig ) - max_coordinate = contig_length - window_size = int(math.floor( float(contig_length) / options.num_bins)) - num_bins = options.num_bins - else: - raise ValueError("please specify a window size of provide genomic sequence with number of bins.") - - values = [ [] for x in range(num_bins) ] - - ## do several parses for each feature, slow, but easier to code - ## alternatively: sort by feature and location. - for feature in options.features: - total = 0 - bin = 0 - end = window_size - for entry in chunk: - if entry.feature != feature: continue - - while end < entry.start: - values[bin].append( total ) - bin += 1 - end += window_size - - while entry.end > end: - seg_start = max(entry.start, end - window_size) - seg_end = min(entry.end, end) - total += seg_end - seg_start - values[bin].append( total ) - end += window_size - bin += 1 - else: - seg_start = max(entry.start, end - window_size) - seg_end = min(entry.end, end) - total += seg_end - seg_start - - while bin < num_bins: - values[bin].append(total) - bin += 1 - - printValues( contig, max_coordinate, window_size, values, options ) - -def main( argv = None ): - """script main. - - parses command line options in sys.argv, unless *argv* is given. - """ - - if argv == None: argv = sys.argv - - parser = E.OptionParser( version = "%prog version: $Id: gff2coverage.py 2781 2009-09-10 11:33:14Z andreas $", usage = globals()["__doc__"]) - - parser.add_option( "-g", "--genome-file", dest="genome_file", type="string", - help="filename with genome [default=%default]" ) - - parser.add_option( "-f", "--features", dest="features", type="string", action="append", - help="features to collect " - "[default=%default]" ) - - parser.add_option( "-w", "--window-size", dest="window_size", type="int", - help="window size in bp for histogram computation. " - "Determines the bin size. " - "[default=%default]" ) - - parser.add_option( "-b", "--num-bins", dest="num_bins", type="int", - help="number of bins for histogram computation " - "if window size is not given. " - "[default=%default]" ) - - parser.add_option( "-m", "--method", dest="method", type="choice", - choices = ("genomic", "histogram", ), - help="methods to apply. " - "[default=%default]" ) - - parser.set_defaults( - genome_file = None, - window_size = None, - num_bins = 1000, - value_format = "%6.4f", - features = [], - method = "genomic", - ) - - (options, args) = E.Start( parser, add_output_options = True ) - - if options.genome_file: - fasta = IndexedFasta.IndexedFasta( options.genome_file ) - else: - fasta = None - - if options.method == "histogram": - - gff = GTF.readFromFile( sys.stdin ) - - gff.sort( lambda x,y: cmp( (x.contig, x.start), (y.contig, y.start) ) ) - - chunk = [] - last_contig = None - - for entry in gff: - - if last_contig != entry.contig: - processChunk( last_contig, chunk, options, fasta ) - last_contig = entry.contig - chunk = [] - - chunk.append( entry ) - - processChunk( last_contig, chunk, options, fasta ) - - elif options.method == "genomic": - intervals = collections.defaultdict( int ) - bases = collections.defaultdict( int ) - total = 0 - for entry in GTF.iterator( sys.stdin ): - intervals[ (entry.contig, entry.source, entry.feature) ] += 1 - bases[ (entry.contig, entry.source, entry.feature) ] += entry.end - entry.start - total += entry.end - entry.start - - options.stdout.write( "contig\tsource\tfeature\tintervals\tbases" ) - if fasta: - options.stdout.write( "\tpercent_coverage\ttotal_percent_coverage\n" ) - else: - options.stdout.write( "\n" ) - - total_genome_size = sum( fasta.getContigSizes( with_synonyms = False ).values() ) - - for key in sorted (intervals.keys() ): - nbases = bases[key] - nintervals = intervals[key] - contig, source, feature = key - options.stdout.write( "\t".join( ( "\t".join(key), - str(nintervals), - str(nbases) ) ) ) - if fasta: - options.stdout.write( "\t%f" % ( 100.0 * float(nbases) / fasta.getLength( contig ))) - options.stdout.write( "\t%f\n" % ( 100.0 * float(nbases) / total_genome_size )) - else: options.stdout.write( "\n" ) - - - E.Stop() - -if __name__ == "__main__": - sys.exit( main( sys.argv) ) - -