| 1 | 1 import argparse | 
|  | 2 import math | 
|  | 3 import subprocess | 
|  | 4 | 
|  | 5 if __name__ == "__main__": | 
|  | 6     desc = "Create BAM interval file for easily parallelism in Galaxy" | 
|  | 7     parser = argparse.ArgumentParser(description=desc) | 
|  | 8     parser.add_argument( | 
|  | 9     	"--mode", "-m", | 
|  | 10     	required=True, | 
|  | 11     	choices=['by_rname', 'by_chunk'] | 
|  | 12     	) | 
|  | 13     parser.add_argument( | 
|  | 14     	"--input", "-i", | 
|  | 15     	type=str, | 
|  | 16     	required=True, | 
|  | 17     	help="Input BAM file" | 
|  | 18     	) | 
|  | 19     parser.add_argument( | 
|  | 20     	"--output", "-o", | 
|  | 21     	type=str, | 
|  | 22     	required=True, | 
|  | 23     	help="Output BED file" | 
|  | 24     	) | 
|  | 25     parser.add_argument( | 
|  | 26       "--order_file", "-of", | 
|  | 27       type=str, | 
|  | 28       required=False, | 
|  | 29       help="Generate a file with the original contig order" | 
|  | 30       ) | 
|  | 31     parser.add_argument( | 
|  | 32     	  "--chunk_size", "-s", | 
|  | 33     	  type=int, | 
|  | 34     	  required=False, | 
|  | 35     	  default=50000000, | 
|  | 36     	  help="The number of bases to include in a chunk interval, required when 'by_chunks' specified" | 
|  | 37     	  ) | 
|  | 38     parser.add_argument( | 
|  | 39         "--chromosome", | 
|  | 40         action="store_true", | 
|  | 41         required=False, | 
|  | 42         help="Add the Chromosome to the Interval File" | 
|  | 43         ) | 
|  | 44     parser.add_argument( | 
|  | 45         "--start_position", | 
|  | 46         action="store_true", | 
|  | 47         required=False, | 
|  | 48         help="Add the Start Position to the Interval File" | 
|  | 49         ) | 
|  | 50     parser.add_argument( | 
|  | 51         "--end_position", | 
|  | 52         action="store_true", | 
|  | 53         required=False, | 
|  | 54         help="Add the End Position to the Interval File" | 
|  | 55         ) | 
|  | 56     parser.add_argument( | 
|  | 57         "--interval", | 
|  | 58         action="store_true", | 
|  | 59         required=False, | 
|  | 60         help="Add the entire interval to the interval file" | 
|  | 61         ) | 
|  | 62     parser.add_argument( | 
|  | 63         "--prefixes_to_ignore", | 
|  | 64         nargs="+", | 
|  | 65         required=False, | 
|  | 66         help="Ignore Intervals that contains the following prefix" | 
|  | 67         ) | 
|  | 68     parser.add_argument( | 
|  | 69         "--group_according_to_largest_chromosome", | 
|  | 70         action="store_true", | 
|  | 71         required=False, | 
|  | 72         help="Output a set of files whose length do not sum larger then the largest chromosome" | 
|  | 73         ) | 
|  | 74     parser.add_argument( | 
|  | 75         "--output_dir", | 
|  | 76         type=str, | 
|  | 77         default="outputs", | 
|  | 78         help="Output directory for collection output" | 
|  | 79         ) | 
|  | 80     args = parser.parse_args() | 
|  | 81 | 
|  | 82 | 
|  | 83     order_file = None | 
|  | 84     if not args.order_file == None: | 
|  | 85         order_file = open(args.order_file, 'w') | 
|  | 86 | 
|  | 87     rnames = [] | 
|  | 88     data = {} | 
|  | 89     commands = [ | 
|  | 90       "samtools", "view", | 
|  | 91       "-H", args.input | 
|  | 92       ] | 
|  | 93     p = subprocess.Popen(commands, stdout=subprocess.PIPE) | 
|  | 94     out, err = p.communicate() | 
|  | 95     seqs = [] | 
|  | 96     lengths = [] | 
|  | 97     for line in out.split("\n"): | 
|  | 98         if not line.find("@SQ") == -1: | 
|  | 99             tmp = line.split("\t") | 
|  | 100             seqs.append(tmp[1][3:]) | 
|  | 101             lengths.append(tmp[2][3:]) | 
|  | 102 | 
|  | 103 | 
|  | 104     chunk_size = args.chunk_size | 
|  | 105 | 
|  | 106     for i in range(len(seqs)): | 
|  | 107         rname = seqs[i] | 
|  | 108         length = lengths[i] | 
|  | 109 | 
|  | 110         if not rname in data: | 
|  | 111             rnames.append(rname) | 
|  | 112             data[rname] = [] | 
|  | 113 | 
|  | 114         if args.mode == 'by_rname': | 
|  | 115             chunk_size = int(length) | 
|  | 116         for i in range(int(math.ceil(float(length) / float(chunk_size)))): | 
|  | 117             if (i+1) * chunk_size > length: | 
|  | 118                 data[rname].append([rname,(i * chunk_size) + 1, length]) | 
|  | 119             else: | 
|  | 120                 data[rname].append([rname,(i * chunk_size) + 1, (i+1) * chunk_size]) | 
|  | 121 | 
|  | 122     if args.group_according_to_largest_chromosome : | 
|  | 123         ids = [] | 
|  | 124         id_to_length = [] | 
|  | 125         bins_to_length = [] | 
|  | 126         bins_to_id = [] | 
|  | 127         for rname in rnames: | 
|  | 128             ids.append(rname) | 
|  | 129 | 
|  | 130         for i in range(len(ids)): | 
|  | 131             id_to_length.append(data[ids[i]][-1][2]) | 
|  | 132 | 
|  | 133         max_length = 0 | 
|  | 134         for i in range(len(id_to_length)): | 
|  | 135             if max_length < id_to_length[i]: | 
|  | 136                 max_length = id_to_length[i] | 
|  | 137 | 
|  | 138         for i in range(len(ids)): | 
|  | 139             rname = ids[i] | 
|  | 140             if any([not rname.find(x) == -1 for x in args.prefixes_to_ignore]): | 
|  | 141                 continue | 
|  | 142             if not order_file == None: | 
|  | 143                 order_file.write(''.join([rname, '\n'])) | 
|  | 144 | 
|  | 145             if bins_to_id == []: | 
|  | 146                 bins_to_id.append([rname]) | 
|  | 147                 bins_to_length.append(id_to_length[i]) | 
|  | 148             else: | 
|  | 149                 added = False | 
|  | 150                 for bin_id in range(len(bins_to_length)): | 
|  | 151                     if bins_to_length[bin_id] + id_to_length[i] < max_length: | 
|  | 152                         bins_to_id[bin_id].append(rname) | 
|  | 153                         bins_to_length[bin_id] += id_to_length[i] | 
|  | 154                         added = True | 
|  | 155                         break | 
|  | 156                 if added == False: | 
|  | 157                     bins_to_id.append([rname]) | 
|  | 158                     bins_to_length.append(id_to_length[i]) | 
|  | 159 | 
|  | 160         if not order_file == None: | 
|  | 161             order_file.close() | 
|  | 162 | 
|  | 163         for i in range(len(bins_to_id)): | 
|  | 164             output = open(''.join(["./outputs/samp", str(i+1),".bed"]), 'w') | 
|  | 165             for rname in bins_to_id[i]: | 
|  | 166                 line = ''.join(['%s\n' % rname]) | 
|  | 167                 output.write(line) | 
|  | 168             output.close() | 
|  | 169 | 
|  | 170 | 
|  | 171     else: | 
|  | 172         output = open(args.output,'w') | 
|  | 173         for i in range(len(seqs)): | 
|  | 174             rname = seqs[i] | 
|  | 175             for interval in data[rname]: | 
|  | 176                 if any([not interval[0].find(x) == -1 for x in args.prefixes_to_ignore]): | 
|  | 177                     continue | 
|  | 178                 line = '' | 
|  | 179                 if args.interval: | 
|  | 180                     line = ''.join([line, '%s:%s-%s\t' % (interval[0], interval[1], interval[2])]) | 
|  | 181                 if args.chromosome: | 
|  | 182                     line = ''.join([line,'%s\t' % interval[0]]) | 
|  | 183                 if args.start_position: | 
|  | 184                     line = ''.join([line,'%s\t' % interval[1]]) | 
|  | 185                 if args.end_position: | 
|  | 186                     line = ''.join([line,'%s\t' % interval[2]]) | 
|  | 187                 output.write(''.join([line[:-1], '\n'])) |