Mercurial > repos > morinlab > fetch_interval
diff fetch_interval.py @ 1:588cdb7e20b2 draft
Uploaded
| author | morinlab | 
|---|---|
| date | Sun, 04 Dec 2016 18:24:23 -0500 | 
| parents | |
| children | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fetch_interval.py Sun Dec 04 18:24:23 2016 -0500 @@ -0,0 +1,187 @@ +import argparse +import math +import subprocess + +if __name__ == "__main__": + desc = "Create BAM interval file for easily parallelism in Galaxy" + parser = argparse.ArgumentParser(description=desc) + parser.add_argument( + "--mode", "-m", + required=True, + choices=['by_rname', 'by_chunk'] + ) + parser.add_argument( + "--input", "-i", + type=str, + required=True, + help="Input BAM file" + ) + parser.add_argument( + "--output", "-o", + type=str, + required=True, + help="Output BED file" + ) + parser.add_argument( + "--order_file", "-of", + type=str, + required=False, + help="Generate a file with the original contig order" + ) + parser.add_argument( + "--chunk_size", "-s", + type=int, + required=False, + default=50000000, + help="The number of bases to include in a chunk interval, required when 'by_chunks' specified" + ) + parser.add_argument( + "--chromosome", + action="store_true", + required=False, + help="Add the Chromosome to the Interval File" + ) + parser.add_argument( + "--start_position", + action="store_true", + required=False, + help="Add the Start Position to the Interval File" + ) + parser.add_argument( + "--end_position", + action="store_true", + required=False, + help="Add the End Position to the Interval File" + ) + parser.add_argument( + "--interval", + action="store_true", + required=False, + help="Add the entire interval to the interval file" + ) + parser.add_argument( + "--prefixes_to_ignore", + nargs="+", + required=False, + help="Ignore Intervals that contains the following prefix" + ) + parser.add_argument( + "--group_according_to_largest_chromosome", + action="store_true", + required=False, + help="Output a set of files whose length do not sum larger then the largest chromosome" + ) + parser.add_argument( + "--output_dir", + type=str, + default="outputs", + help="Output directory for collection output" + ) + args = parser.parse_args() + + + order_file = None + if not args.order_file == None: + order_file = open(args.order_file, 'w') + + rnames = [] + data = {} + commands = [ + "samtools", "view", + "-H", args.input + ] + p = subprocess.Popen(commands, stdout=subprocess.PIPE) + out, err = p.communicate() + seqs = [] + lengths = [] + for line in out.split("\n"): + if not line.find("@SQ") == -1: + tmp = line.split("\t") + seqs.append(tmp[1][3:]) + lengths.append(tmp[2][3:]) + + + chunk_size = args.chunk_size + + for i in range(len(seqs)): + rname = seqs[i] + length = lengths[i] + + if not rname in data: + rnames.append(rname) + data[rname] = [] + + if args.mode == 'by_rname': + chunk_size = int(length) + for i in range(int(math.ceil(float(length) / float(chunk_size)))): + if (i+1) * chunk_size > length: + data[rname].append([rname,(i * chunk_size) + 1, length]) + else: + data[rname].append([rname,(i * chunk_size) + 1, (i+1) * chunk_size]) + + if args.group_according_to_largest_chromosome : + ids = [] + id_to_length = [] + bins_to_length = [] + bins_to_id = [] + for rname in rnames: + ids.append(rname) + + for i in range(len(ids)): + id_to_length.append(data[ids[i]][-1][2]) + + max_length = 0 + for i in range(len(id_to_length)): + if max_length < id_to_length[i]: + max_length = id_to_length[i] + + for i in range(len(ids)): + rname = ids[i] + if any([not rname.find(x) == -1 for x in args.prefixes_to_ignore]): + continue + if not order_file == None: + order_file.write(''.join([rname, '\n'])) + + if bins_to_id == []: + bins_to_id.append([rname]) + bins_to_length.append(id_to_length[i]) + else: + added = False + for bin_id in range(len(bins_to_length)): + if bins_to_length[bin_id] + id_to_length[i] < max_length: + bins_to_id[bin_id].append(rname) + bins_to_length[bin_id] += id_to_length[i] + added = True + break + if added == False: + bins_to_id.append([rname]) + bins_to_length.append(id_to_length[i]) + + if not order_file == None: + order_file.close() + + for i in range(len(bins_to_id)): + output = open(''.join(["./outputs/samp", str(i+1),".bed"]), 'w') + for rname in bins_to_id[i]: + line = ''.join(['%s\n' % rname]) + output.write(line) + output.close() + + + else: + output = open(args.output,'w') + for i in range(len(seqs)): + rname = seqs[i] + for interval in data[rname]: + if any([not interval[0].find(x) == -1 for x in args.prefixes_to_ignore]): + continue + line = '' + if args.interval: + line = ''.join([line, '%s:%s-%s\t' % (interval[0], interval[1], interval[2])]) + if args.chromosome: + line = ''.join([line,'%s\t' % interval[0]]) + if args.start_position: + line = ''.join([line,'%s\t' % interval[1]]) + if args.end_position: + line = ''.join([line,'%s\t' % interval[2]]) + output.write(''.join([line[:-1], '\n']))
