Mercurial > repos > morinlab > fetch_interval
view fetch_interval.py @ 1:588cdb7e20b2 draft
Uploaded
author | morinlab |
---|---|
date | Sun, 04 Dec 2016 18:24:23 -0500 |
parents | |
children |
line wrap: on
line source
import argparse import math import subprocess if __name__ == "__main__": desc = "Create BAM interval file for easily parallelism in Galaxy" parser = argparse.ArgumentParser(description=desc) parser.add_argument( "--mode", "-m", required=True, choices=['by_rname', 'by_chunk'] ) parser.add_argument( "--input", "-i", type=str, required=True, help="Input BAM file" ) parser.add_argument( "--output", "-o", type=str, required=True, help="Output BED file" ) parser.add_argument( "--order_file", "-of", type=str, required=False, help="Generate a file with the original contig order" ) parser.add_argument( "--chunk_size", "-s", type=int, required=False, default=50000000, help="The number of bases to include in a chunk interval, required when 'by_chunks' specified" ) parser.add_argument( "--chromosome", action="store_true", required=False, help="Add the Chromosome to the Interval File" ) parser.add_argument( "--start_position", action="store_true", required=False, help="Add the Start Position to the Interval File" ) parser.add_argument( "--end_position", action="store_true", required=False, help="Add the End Position to the Interval File" ) parser.add_argument( "--interval", action="store_true", required=False, help="Add the entire interval to the interval file" ) parser.add_argument( "--prefixes_to_ignore", nargs="+", required=False, help="Ignore Intervals that contains the following prefix" ) parser.add_argument( "--group_according_to_largest_chromosome", action="store_true", required=False, help="Output a set of files whose length do not sum larger then the largest chromosome" ) parser.add_argument( "--output_dir", type=str, default="outputs", help="Output directory for collection output" ) args = parser.parse_args() order_file = None if not args.order_file == None: order_file = open(args.order_file, 'w') rnames = [] data = {} commands = [ "samtools", "view", "-H", args.input ] p = subprocess.Popen(commands, stdout=subprocess.PIPE) out, err = p.communicate() seqs = [] lengths = [] for line in out.split("\n"): if not line.find("@SQ") == -1: tmp = line.split("\t") seqs.append(tmp[1][3:]) lengths.append(tmp[2][3:]) chunk_size = args.chunk_size for i in range(len(seqs)): rname = seqs[i] length = lengths[i] if not rname in data: rnames.append(rname) data[rname] = [] if args.mode == 'by_rname': chunk_size = int(length) for i in range(int(math.ceil(float(length) / float(chunk_size)))): if (i+1) * chunk_size > length: data[rname].append([rname,(i * chunk_size) + 1, length]) else: data[rname].append([rname,(i * chunk_size) + 1, (i+1) * chunk_size]) if args.group_according_to_largest_chromosome : ids = [] id_to_length = [] bins_to_length = [] bins_to_id = [] for rname in rnames: ids.append(rname) for i in range(len(ids)): id_to_length.append(data[ids[i]][-1][2]) max_length = 0 for i in range(len(id_to_length)): if max_length < id_to_length[i]: max_length = id_to_length[i] for i in range(len(ids)): rname = ids[i] if any([not rname.find(x) == -1 for x in args.prefixes_to_ignore]): continue if not order_file == None: order_file.write(''.join([rname, '\n'])) if bins_to_id == []: bins_to_id.append([rname]) bins_to_length.append(id_to_length[i]) else: added = False for bin_id in range(len(bins_to_length)): if bins_to_length[bin_id] + id_to_length[i] < max_length: bins_to_id[bin_id].append(rname) bins_to_length[bin_id] += id_to_length[i] added = True break if added == False: bins_to_id.append([rname]) bins_to_length.append(id_to_length[i]) if not order_file == None: order_file.close() for i in range(len(bins_to_id)): output = open(''.join(["./outputs/samp", str(i+1),".bed"]), 'w') for rname in bins_to_id[i]: line = ''.join(['%s\n' % rname]) output.write(line) output.close() else: output = open(args.output,'w') for i in range(len(seqs)): rname = seqs[i] for interval in data[rname]: if any([not interval[0].find(x) == -1 for x in args.prefixes_to_ignore]): continue line = '' if args.interval: line = ''.join([line, '%s:%s-%s\t' % (interval[0], interval[1], interval[2])]) if args.chromosome: line = ''.join([line,'%s\t' % interval[0]]) if args.start_position: line = ''.join([line,'%s\t' % interval[1]]) if args.end_position: line = ''.join([line,'%s\t' % interval[2]]) output.write(''.join([line[:-1], '\n']))