Mercurial > repos > sebastian > my_first_test
changeset 0:0d66ec694153 draft
Uploaded test first test script
author | sebastian |
---|---|
date | Fri, 21 Feb 2014 08:04:58 -0500 |
parents | |
children | 924fec18933e |
files | gff2coverage.py |
diffstat | 1 files changed, 251 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff2coverage.py Fri Feb 21 08:04:58 2014 -0500 @@ -0,0 +1,251 @@ +''' +gff2coverage.py - compute genomic coverage of gff intervals +=========================================================== + +:Author: Andreas Heger +:Release: $Id$ +:Date: |today| +:Tags: Genomics Intervals Summary GFF + +Purpose +------- + +This script computes the genomic coverage of intervals +in a :term:`gff` formatted file. The coverage is computed +per feature. + +Usage +----- + +Example:: + + python <script_name>.py --help + +Type:: + + python <script_name>.py --help + +for command line help. + +Command line options +-------------------- + +''' + +import sys +import string +import re +import optparse +import time +import os +import shutil +import tempfile +import math +import collections + +import CGAT.Experiment as E +import CGAT.IndexedFasta as IndexedFasta +import CGAT.IOTools as IOTools +import CGAT.GTF as GTF + +def printValues( contig, max_size, window_size, values, options ): + """output values.""" + + outfile = E.openOutputFile( contig, "w" ) + + outfile.write( "abs_pos\trel_pos" ) + + for feature in options.features: + outfile.write("\tabs_%s\trel_%s" % (feature,feature) ) + outfile.write("\n") + + max_vv = [] + + for f in range(len(options.features)): + max_vv.append( float(max(map(lambda x: x[f], values ) ))) + + bin = 0 + for vv in values: + outfile.write( "%i\t" % bin ) + outfile.write( options.value_format % (float(bin) / max_size) ) + + for x in range(len(options.features)): + outfile.write( "\t%i\t%s" % (vv[x] , + options.value_format % (vv[x] / max_vv[x]) ) ) + outfile.write("\n" ) + bin += window_size + + outfile.close() + +def processChunk( contig, chunk, options, fasta = None ): + """ + This function requires segments to be non-overlapping. + """ + + if len(chunk) == 0: return + + # check whether there are overlapping features or not + checked = [] + for feature in chunk: + checked.append(feature) + others = [x for x in chunk if x not in checked] + for otherFeature in others: + if GTF.Overlap(feature, otherFeature): + raise ValueError( " Histogram could not be created since the file contains overlapping features! \n%s\n%s " % (feature, otherFeature) ) + # clear auxiliary list + del checked[:] + + # compute min_coordinate, max_coordinate for the histogram + min_coordinate = 1 + max_coordinate = max( map( lambda x: x.end, chunk) ) + ## compute window size + if options.window_size: + window_size = options.window_size + num_bins = int(math.ceil((float(max_coordinate) / window_size))) + elif options.num_bins and fasta: + contig_length = fasta.getLength( contig ) + assert max_coordinate <= contig_length, "maximum coordinate (%i) larger than contig size (%i) for contig %s" % (max_coordinate, contig_length, contig ) + max_coordinate = contig_length + window_size = int(math.floor( float(contig_length) / options.num_bins)) + num_bins = options.num_bins + else: + raise ValueError("please specify a window size of provide genomic sequence with number of bins.") + + values = [ [] for x in range(num_bins) ] + + ## do several parses for each feature, slow, but easier to code + ## alternatively: sort by feature and location. + for feature in options.features: + total = 0 + bin = 0 + end = window_size + for entry in chunk: + if entry.feature != feature: continue + + while end < entry.start: + values[bin].append( total ) + bin += 1 + end += window_size + + while entry.end > end: + seg_start = max(entry.start, end - window_size) + seg_end = min(entry.end, end) + total += seg_end - seg_start + values[bin].append( total ) + end += window_size + bin += 1 + else: + seg_start = max(entry.start, end - window_size) + seg_end = min(entry.end, end) + total += seg_end - seg_start + + while bin < num_bins: + values[bin].append(total) + bin += 1 + + printValues( contig, max_coordinate, window_size, values, options ) + +def main( argv = None ): + """script main. + + parses command line options in sys.argv, unless *argv* is given. + """ + + if argv == None: argv = sys.argv + + parser = E.OptionParser( version = "%prog version: $Id: gff2coverage.py 2781 2009-09-10 11:33:14Z andreas $", usage = globals()["__doc__"]) + + parser.add_option( "-g", "--genome-file", dest="genome_file", type="string", + help="filename with genome [default=%default]" ) + + parser.add_option( "-f", "--features", dest="features", type="string", action="append", + help="features to collect " + "[default=%default]" ) + + parser.add_option( "-w", "--window-size", dest="window_size", type="int", + help="window size in bp for histogram computation. " + "Determines the bin size. " + "[default=%default]" ) + + parser.add_option( "-b", "--num-bins", dest="num_bins", type="int", + help="number of bins for histogram computation " + "if window size is not given. " + "[default=%default]" ) + + parser.add_option( "-m", "--method", dest="method", type="choice", + choices = ("genomic", "histogram", ), + help="methods to apply. " + "[default=%default]" ) + + parser.set_defaults( + genome_file = None, + window_size = None, + num_bins = 1000, + value_format = "%6.4f", + features = [], + method = "genomic", + ) + + (options, args) = E.Start( parser, add_output_options = True ) + + if options.genome_file: + fasta = IndexedFasta.IndexedFasta( options.genome_file ) + else: + fasta = None + + if options.method == "histogram": + + gff = GTF.readFromFile( sys.stdin ) + + gff.sort( lambda x,y: cmp( (x.contig, x.start), (y.contig, y.start) ) ) + + chunk = [] + last_contig = None + + for entry in gff: + + if last_contig != entry.contig: + processChunk( last_contig, chunk, options, fasta ) + last_contig = entry.contig + chunk = [] + + chunk.append( entry ) + + processChunk( last_contig, chunk, options, fasta ) + + elif options.method == "genomic": + intervals = collections.defaultdict( int ) + bases = collections.defaultdict( int ) + total = 0 + for entry in GTF.iterator( sys.stdin ): + intervals[ (entry.contig, entry.source, entry.feature) ] += 1 + bases[ (entry.contig, entry.source, entry.feature) ] += entry.end - entry.start + total += entry.end - entry.start + + options.stdout.write( "contig\tsource\tfeature\tintervals\tbases" ) + if fasta: + options.stdout.write( "\tpercent_coverage\ttotal_percent_coverage\n" ) + else: + options.stdout.write( "\n" ) + + total_genome_size = sum( fasta.getContigSizes( with_synonyms = False ).values() ) + + for key in sorted (intervals.keys() ): + nbases = bases[key] + nintervals = intervals[key] + contig, source, feature = key + options.stdout.write( "\t".join( ( "\t".join(key), + str(nintervals), + str(nbases) ) ) ) + if fasta: + options.stdout.write( "\t%f" % ( 100.0 * float(nbases) / fasta.getLength( contig ))) + options.stdout.write( "\t%f\n" % ( 100.0 * float(nbases) / total_genome_size )) + else: options.stdout.write( "\n" ) + + + E.Stop() + +if __name__ == "__main__": + sys.exit( main( sys.argv) ) + +