comparison small_rna_maps.py @ 0:0a06985c0894 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_maps commit 7b2ceb05489c27ddb769c38fdec56274108a6fa1
author artbio
date Tue, 22 Aug 2017 12:06:58 -0400
parents
children 615fa2171a34
comparison
equal deleted inserted replaced
-1:000000000000 0:0a06985c0894
1 import argparse
2 from collections import defaultdict
3
4 import numpy
5
6 import pysam
7
8
9 def Parser():
10 the_parser = argparse.ArgumentParser()
11 the_parser.add_argument('--inputs', dest='inputs', required=True,
12 nargs='+', help='list of input BAM files')
13 the_parser.add_argument('--sample_names', dest='sample_names',
14 required=True, nargs='+',
15 help='list of sample names')
16 the_parser.add_argument('--outputs', nargs='+', action='store',
17 help='list of two output paths (only two)')
18 the_parser.add_argument('-M', '--plot_methods', nargs='+', action='store',
19 help='list of 2 plot methods (only two) among:\
20 Counts, Max, Mean, Median, Coverage and Size')
21 args = the_parser.parse_args()
22 return args
23
24
25 class Map:
26
27 def __init__(self, bam_file, sample):
28 self.sample_name = sample
29 self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
30 self.chromosomes = dict(zip(self.bam_object.references,
31 self.bam_object.lengths))
32 self.map_dict = self.create_map(self.bam_object)
33
34 def create_map(self, bam_object):
35 '''
36 Returns a map_dictionary {(chromosome,read_position,polarity):
37 [read_length, ...]}
38 '''
39 map_dictionary = defaultdict(list)
40 # get empty value for start and end of each chromosome
41 for chrom in self.chromosomes:
42 map_dictionary[(chrom, 1, 'F')] = []
43 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = []
44 for chrom in self.chromosomes:
45 for read in bam_object.fetch(chrom):
46 positions = read.positions # a list of covered positions
47 for pos in positions:
48 if not map_dictionary[(chrom, pos+1, 'F')]:
49 map_dictionary[(chrom, pos+1, 'F')] = []
50 if read.is_reverse:
51 map_dictionary[(chrom, positions[-1]+1,
52 'R')].append(read.query_alignment_length)
53 else:
54 map_dictionary[(chrom, positions[0]+1,
55 'F')].append(read.query_alignment_length)
56 return map_dictionary
57
58 def compute_readcount(self, map_dictionary, out):
59 '''
60 takes a map_dictionary as input and writes
61 a readmap_dictionary {(chromosome,read_position,polarity):
62 number_of_reads}
63 in an open file handler out
64 '''
65 readmap_dictionary = dict()
66 for key in map_dictionary:
67 readmap_dictionary[key] = len(map_dictionary[key])
68 self.write_table(readmap_dictionary, out)
69
70 def compute_max(self, map_dictionary, out):
71 '''
72 takes a map_dictionary as input and writes
73 a max_dictionary {(chromosome,read_position,polarity):
74 max_of_number_of_read_at_any_position}
75 Not clear this function is still required
76 '''
77 merge_keylist = [(i[0], 0) for i in map_dictionary.keys()]
78 max_dictionary = dict(merge_keylist)
79 for key in map_dictionary:
80 if len(map_dictionary[key]) > max_dictionary[key[0]]:
81 max_dictionary[key[0]] = len(map_dictionary[key])
82 self.write_table(max_dictionary, out)
83
84 def compute_mean(self, map_dictionary, out):
85 '''
86 takes a map_dictionary as input and returns
87 a mean_dictionary {(chromosome,read_position,polarity):
88 mean_value_of_reads}
89 '''
90 mean_dictionary = dict()
91 for key in map_dictionary:
92 if len(map_dictionary[key]) == 0:
93 mean_dictionary[key] = 0
94 else:
95 mean_dictionary[key] = round(numpy.mean(map_dictionary[key]),
96 1)
97 self.write_table(mean_dictionary, out)
98
99 def compute_median(self, map_dictionary, out):
100 '''
101 takes a map_dictionary as input and returns
102 a mean_dictionary {(chromosome,read_position,polarity):
103 mean_value_of_reads}
104 '''
105 median_dictionary = dict()
106 for key in map_dictionary:
107 if len(map_dictionary[key]) == 0:
108 median_dictionary[key] = 0
109 else:
110 median_dictionary[key] = numpy.median(map_dictionary[key])
111 self.write_table(median_dictionary, out)
112
113 def compute_coverage(self, map_dictionary, out, quality=10):
114 '''
115 takes a map_dictionary as input and returns
116 a coverage_dictionary {(chromosome,read_position,polarity):
117 coverage}
118 '''
119 coverage_dictionary = dict()
120 for chrom in self.chromosomes:
121 coverage_dictionary[(chrom, 1, 'F')] = 0
122 coverage_dictionary[(chrom, self.chromosomes[chrom], 'F')] = 0
123 for key in map_dictionary:
124 coverage = self.bam_object.count_coverage(
125 reference=key[0],
126 start=key[1]-1,
127 end=key[1],
128 quality_threshold=quality)
129 """ Add the 4 coverage values """
130 coverage = [sum(x) for x in zip(*coverage)]
131 coverage_dictionary[key] = coverage[0]
132 self.write_table(coverage_dictionary, out)
133
134 def compute_size(self, map_dictionary, out):
135 '''
136 Takes a map_dictionary and returns a dictionary of sizes:
137 {chrom: {polarity: {size: nbre of reads}}}
138 '''
139 size_dictionary = defaultdict(lambda: defaultdict(
140 lambda: defaultdict(int)))
141 # to track empty chromosomes
142 for chrom in self.chromosomes:
143 if self.bam_object.count(chrom) == 0:
144 size_dictionary[chrom]['F'][10] = 0
145 for key in map_dictionary:
146 for size in map_dictionary[key]:
147 size_dictionary[key[0]][key[2]][size] += 1
148 self.write_size_table(size_dictionary, out)
149
150 def write_table(self, mapdict, out):
151 '''
152 Generic writer
153 Dataset, Chromosome, Chrom_length, Coordinate, Polarity,
154 <some mapped value>
155 out is an *open* file handler
156 '''
157 for key in sorted(mapdict):
158 line = [self.sample_name, key[0], self.chromosomes[key[0]],
159 key[1], key[2], mapdict[key]]
160 line = [str(i) for i in line]
161 out.write('\t'.join(line) + '\n')
162
163 def write_size_table(self, sizedic, out):
164 '''
165 Generic writer of summary values
166 Dataset, Chromosome, Chrom_length, <some category>, <some value>
167 out is an *open* file handler
168 '''
169 for chrom in sorted(sizedic):
170 sizes = sizedic[chrom]['F'].keys()
171 sizes.extend(sizedic[chrom]['R'].keys())
172 for polarity in sorted(sizedic[chrom]):
173 for size in range(min(sizes), max(sizes)+1):
174 try:
175 line = [self.sample_name, chrom, polarity, size,
176 sizedic[chrom][polarity][size]]
177 except KeyError:
178 line = [self.sample_name, chrom, polarity, size, 0]
179 line = [str(i) for i in line]
180 out.write('\t'.join(line) + '\n')
181
182
183 def main(inputs, samples, methods, outputs):
184 for method, output in zip(methods, outputs):
185 F = open(output, 'w')
186 if method == 'Size':
187 header = ["Dataset", "Chromosome", "Polarity", method, "Count"]
188 else:
189 header = ["Dataset", "Chromosome", "Chrom_length", "Coordinate",
190 "Polarity", method]
191 F.write('\t'.join(header) + '\n')
192 for input, sample in zip(inputs, samples):
193 mapobj = Map(input, sample)
194 token = {"Counts": mapobj.compute_readcount,
195 "Max": mapobj.compute_max,
196 "Mean": mapobj.compute_mean,
197 "Median": mapobj.compute_median,
198 "Coverage": mapobj.compute_coverage,
199 "Size": mapobj.compute_size}
200 token[method](mapobj.map_dict, F)
201 F.close()
202
203
204 if __name__ == "__main__":
205 args = Parser()
206 # if identical sample names
207 if len(set(args.sample_names)) != len(args.sample_names):
208 args.sample_names = [name + '_' + str(i) for
209 i, name in enumerate(args.sample_names)]
210 main(args.inputs, args.sample_names, args.plot_methods, args.outputs)