Mercurial > repos > artbio > small_rna_map
comparison small_rna_map.py @ 5:d65045e976e6 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit b673d39fbe79f5164ba6489b33cfa78ac238ee09
author | artbio |
---|---|
date | Sat, 22 Jul 2017 11:45:52 -0400 |
parents | 2e0dc6032a98 |
children | f924a33e1eef |
comparison
equal
deleted
inserted
replaced
4:6ff925458e05 | 5:d65045e976e6 |
---|---|
10 the_parser = argparse.ArgumentParser() | 10 the_parser = argparse.ArgumentParser() |
11 the_parser.add_argument('--input', dest='input', required=True, | 11 the_parser.add_argument('--input', dest='input', required=True, |
12 nargs='+', help='input BAM files') | 12 nargs='+', help='input BAM files') |
13 the_parser.add_argument('--sample_name', dest='sample_name', | 13 the_parser.add_argument('--sample_name', dest='sample_name', |
14 required=True, nargs='+', help='sample name') | 14 required=True, nargs='+', help='sample name') |
15 the_parser.add_argument('--output', action="store", | 15 the_parser.add_argument('--output', action='store', |
16 type=str, help='output tabular file') | 16 type=str, help='output tabular file') |
17 the_parser.add_argument('-S', '--sizes', action='store', | |
18 help='use to output read sizes dataframe') | |
17 args = the_parser.parse_args() | 19 args = the_parser.parse_args() |
18 return args | 20 return args |
19 | 21 |
20 | 22 |
21 def compute_map_dictionary(pysam_object): | 23 class Map: |
22 ''' | 24 |
23 returns | 25 def __init__(self, bam_file, sample): |
24 - map_dictionary {(read_chromosome,read_position,polarity): | 26 self.sample_name = sample |
25 [list_of_read_length]} | 27 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') |
26 - max_dictionary {chromosome_name: | 28 self.chromosomes = dict(zip(self.bam_object.references, |
27 max_of_number_of_read_at_any_position} | 29 self.bam_object.lengths)) |
28 ''' | 30 self.map_dict = self.create_map(self.bam_object) |
29 unmatched_chromosomes = list(pysam_object.references) | 31 self.max = self.compute_max(self.map_dict) |
30 map_dictionary = defaultdict(list) | 32 self.mean = self.compute_mean(self.map_dict) |
31 max_dictionary = dict.fromkeys(pysam_object.references, 0) | 33 self.median = self.compute_median(self.map_dict) |
32 for read in pysam_object: | 34 self.coverage = self.compute_coverage(self.map_dict) |
33 if not read.is_unmapped: | 35 self.size = self.compute_size(self.map_dict) |
34 read_chromosome = read.reference_name | 36 |
35 # to shift from 0-based to 1-based coordinate | 37 def create_map(self, bam_object): |
36 read_position = read.pos + 1 | 38 ''' |
37 polarity = "F" if (not read.is_reverse) else "R" | 39 Returns a map_dictionary {(chromosome,read_position,polarity): |
38 map_dictionary[(read_chromosome, read_position, | 40 [read_length, ...]} |
39 polarity)].append(read.qlen) | 41 ''' |
40 if read_chromosome in unmatched_chromosomes: | 42 map_dictionary = dict() |
41 unmatched_chromosomes.remove(read_chromosome) | 43 for chrom in self.chromosomes: |
42 for name in unmatched_chromosomes: | 44 for pos in range(self.chromosomes[chrom]): |
43 # to put at least a null value by chromosome | 45 map_dictionary[(chrom, pos+1, 'F')] = [] # get all chromosomes |
44 map_dictionary[(name, 1, 'F')] = [0] | 46 map_dictionary[(chrom, pos+1, 'R')] = [] # get all chromosomes |
45 # make a dictionary with max value of reads by chromosome | 47 for chrom in self.chromosomes: |
46 max_dictionary = defaultdict(int) | 48 for read in bam_object.fetch(chrom): |
47 for name in pysam_object.references: | 49 read_positions = read.positions # a list of covered positions |
48 for key in map_dictionary.keys(): | 50 if read.is_reverse: |
49 if name in key and len(map_dictionary[key]) > max_dictionary[name]: | 51 map_dictionary[(chrom, read_positions[-1]+1, |
50 max_dictionary[name] = len(map_dictionary[key]) | 52 'R')].append(read.query_alignment_length) |
51 return map_dictionary, max_dictionary | 53 else: |
54 map_dictionary[(chrom, read_positions[0]+1, | |
55 'F')].append(read.query_alignment_length) | |
56 return map_dictionary | |
57 | |
58 def compute_max(self, map_dictionary): | |
59 ''' | |
60 takes a map_dictionary as input and returns | |
61 a max_dictionary {(chromosome,read_position,polarity): | |
62 max_of_number_of_read_at_any_position} | |
63 ''' | |
64 merge_keylist = [(i[0], 0) for i in map_dictionary.keys()] | |
65 max_dictionary = dict(merge_keylist) | |
66 for key in map_dictionary: | |
67 if len(map_dictionary[key]) > max_dictionary[key[0]]: | |
68 max_dictionary[key[0]] = len(map_dictionary[key]) | |
69 return max_dictionary | |
70 | |
71 def compute_mean(self, map_dictionary): | |
72 ''' | |
73 takes a map_dictionary as input and returns | |
74 a mean_dictionary {(chromosome,read_position,polarity): | |
75 mean_value_of_reads} | |
76 ''' | |
77 mean_dictionary = dict() | |
78 for key in map_dictionary: | |
79 if len(map_dictionary[key]) == 0: | |
80 mean_dictionary[key] = 0 | |
81 else: | |
82 mean_dictionary[key] = round(numpy.mean(map_dictionary[key]), | |
83 1) | |
84 return mean_dictionary | |
85 | |
86 def compute_median(self, map_dictionary): | |
87 ''' | |
88 takes a map_dictionary as input and returns | |
89 a mean_dictionary {(chromosome,read_position,polarity): | |
90 mean_value_of_reads} | |
91 ''' | |
92 median_dictionary = dict() | |
93 for key in map_dictionary: | |
94 if len(map_dictionary[key]) == 0: | |
95 median_dictionary[key] = 0 | |
96 else: | |
97 median_dictionary[key] = numpy.median(map_dictionary[key]) | |
98 return median_dictionary | |
99 | |
100 def compute_coverage(self, map_dictionary, quality=10): | |
101 ''' | |
102 Takes a map_dictionary and returns a dictionary of lists | |
103 of coverage along the coordinates of pre_mirs (as keys) | |
104 ''' | |
105 coverage_dictionary = dict() | |
106 for chrom in self.chromosomes: | |
107 coverage = self.bam_object.count_coverage( | |
108 reference=chrom, | |
109 start=0, | |
110 end=self.chromosomes[chrom], | |
111 quality_threshold=quality) | |
112 """ Add the 4 coverage values """ | |
113 coverage = [sum(x) for x in zip(*coverage)] | |
114 for pos, cov in enumerate(coverage): | |
115 coverage_dictionary[(chrom, pos+1, "F")] = cov | |
116 coverage_dictionary[(chrom, pos+1, "R")] = cov | |
117 return coverage_dictionary | |
118 | |
119 def compute_size(self, map_dictionary): | |
120 ''' | |
121 Takes a map_dictionary and returns a dictionary of sizes: | |
122 {chrom: {size: {polarity: nbre of reads}}} | |
123 ''' | |
124 size_dictionary = defaultdict(lambda: defaultdict(lambda: defaultdict( int ))) | |
125 for key in map_dictionary: | |
126 for size in map_dictionary[key]: | |
127 try: | |
128 size_dictionary[key[0]][key[2]][size] += 1 | |
129 except KeyError: | |
130 size_dictionary[key[0]][key[2]][size] = 1 | |
131 return size_dictionary | |
132 | |
133 def write_size_table(self, out): | |
134 ''' | |
135 Dataset, Chromosome, Polarity, Size, Nbr_reads | |
136 out is an *open* file handler | |
137 ''' | |
138 for chrom in sorted(self.size): | |
139 sizes = self.size[chrom]['F'].keys() | |
140 sizes.extend(self.size[chrom]['R'].keys()) | |
141 for polarity in sorted(self.size[chrom]): | |
142 for size in range(min(sizes), max(sizes)+1): | |
143 try: | |
144 line = [self.sample_name, chrom, polarity, size, | |
145 self.size[chrom][polarity][size]] | |
146 except KeyError: | |
147 line = [self.sample_name, chrom, polarity, size, 0] | |
148 line = [str(i) for i in line] | |
149 out.write('\t'.join(line) + '\n') | |
150 | |
151 def write_table(self, out): | |
152 ''' | |
153 Dataset, Chromosome, Chrom_length, Coordinate, Nbr_reads | |
154 Polarity, Max, Mean, Median, Coverage | |
155 out is an *open* file handler | |
156 ''' | |
157 for key in sorted(self.map_dict): | |
158 line = [self.sample_name, key[0], self.chromosomes[key[0]], | |
159 key[1], len(self.map_dict[key]), key[2], self.max[key[0]], | |
160 self.mean[key], self.median[key], self.coverage[key]] | |
161 line = [str(i) for i in line] | |
162 out.write('\t'.join(line) + '\n') | |
52 | 163 |
53 | 164 |
54 def main(inputs, sample_names, output): | 165 def main(inputs, samples, file_out, size_file_out=''): |
55 with open(output, 'w') as outfile: | 166 F = open(file_out, 'w') |
56 outfile.write('\t'.join(['Dataset', 'Chromosome', 'Chrom_length', | 167 header = ["Dataset", "Chromosome", "Chrom_length", "Coordinate", |
57 'Coordinate', 'Nbr_reads', 'Polarity', | 168 "Nbr_reads", "Polarity", "Max", "Mean", "Median", "Coverage"] |
58 'Max', 'Mean', 'Median\n'])) | 169 F.write('\t'.join(header) + '\n') |
59 chr_lengths = dict(zip(pysam.AlignmentFile(inputs[0], 'rb').references, | 170 if size_file_out: |
60 pysam.AlignmentFile(inputs[0], 'rb').lengths)) | 171 Fs = open(size_file_out, 'w') |
61 for file, sample in zip(inputs, sample_names): | 172 header = ["Dataset", "Chromosome", "Polarity", "Size", "Nbr_reads"] |
62 with pysam.AlignmentFile(file, 'rb') as bamfile: | 173 Fs.write('\t'.join(header) + '\n') |
63 map, maximum = compute_map_dictionary(bamfile) | 174 for file, sample in zip(inputs, samples): |
64 for record in sorted(map): | 175 mapobj = Map(file, sample) |
65 line = [sample, record[0], | 176 mapobj.write_table(F) |
66 str(chr_lengths[record[0]]), | 177 if size_file_out: |
67 str(record[1]), str(len(map[record])), | 178 mapobj.write_size_table(Fs) |
68 record[2], str(maximum[record[0]]), | 179 F.close() |
69 str(round(numpy.mean(map[record]), 1)), | 180 if size_file_out: |
70 str(numpy.median(map[record]))] | 181 Fs.close() |
71 outfile.write("\t".join(line) + "\n") | |
72 | 182 |
73 | 183 |
74 if __name__ == "__main__": | 184 if __name__ == "__main__": |
75 args = Parser() | 185 args = Parser() |
76 # if identical sample names # to be tested | 186 # if identical sample names # to be tested |
77 if len(set(args.sample_name)) != len(args.sample_name): | 187 if len(set(args.sample_name)) != len(args.sample_name): |
78 args.sample_name = [name + '_' + str(i) for | 188 args.sample_name = [name + '_' + str(i) for |
79 i, name in enumerate(args.sample_name)] | 189 i, name in enumerate(args.sample_name)] |
80 main(args.input, args.sample_name, args.output) | 190 main(args.input, args.sample_name, args.output, args.sizes) |