comparison small_rna_map.py @ 5:d65045e976e6 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit b673d39fbe79f5164ba6489b33cfa78ac238ee09
author artbio
date Sat, 22 Jul 2017 11:45:52 -0400
parents 2e0dc6032a98
children f924a33e1eef
comparison: 4:6ff925458e05 vs 5:d65045e976e6
@@ -10,71 +10,181 @@
     the_parser = argparse.ArgumentParser()
     the_parser.add_argument('--input', dest='input', required=True,
                             nargs='+', help='input BAM files')
     the_parser.add_argument('--sample_name', dest='sample_name',
                             required=True, nargs='+', help='sample name')
-    the_parser.add_argument('--output', action="store",
+    the_parser.add_argument('--output', action='store',
                             type=str, help='output tabular file')
+    the_parser.add_argument('-S', '--sizes', action='store',
+                            help='use to output read sizes dataframe')
     args = the_parser.parse_args()
     return args
 
 
-def compute_map_dictionary(pysam_object):
-    '''
-    returns
-    - map_dictionary {(read_chromosome,read_position,polarity):
-                      [list_of_read_length]}
-    - max_dictionary {chromosome_name:
-                      max_of_number_of_read_at_any_position}
-    '''
-    unmatched_chromosomes = list(pysam_object.references)
-    map_dictionary = defaultdict(list)
-    max_dictionary = dict.fromkeys(pysam_object.references, 0)
-    for read in pysam_object:
-        if not read.is_unmapped:
-            read_chromosome = read.reference_name
-            # to shift from 0-based to 1-based coordinate
-            read_position = read.pos + 1
-            polarity = "F" if (not read.is_reverse) else "R"
-            map_dictionary[(read_chromosome, read_position,
-                            polarity)].append(read.qlen)
-            if read_chromosome in unmatched_chromosomes:
-                unmatched_chromosomes.remove(read_chromosome)
-    for name in unmatched_chromosomes:
-        # to put at least a null value by chromosome
-        map_dictionary[(name, 1, 'F')] = [0]
-    # make a dictionary with max value of reads by chromosome
-    max_dictionary = defaultdict(int)
-    for name in pysam_object.references:
-        for key in map_dictionary.keys():
-            if name in key and len(map_dictionary[key]) > max_dictionary[name]:
-                max_dictionary[name] = len(map_dictionary[key])
-    return map_dictionary, max_dictionary
+class Map:
+
+    def __init__(self, bam_file, sample):
+        self.sample_name = sample
+        self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
+        self.chromosomes = dict(zip(self.bam_object.references,
+                                    self.bam_object.lengths))
+        self.map_dict = self.create_map(self.bam_object)
+        self.max = self.compute_max(self.map_dict)
+        self.mean = self.compute_mean(self.map_dict)
+        self.median = self.compute_median(self.map_dict)
+        self.coverage = self.compute_coverage(self.map_dict)
+        self.size = self.compute_size(self.map_dict)
+
+    def create_map(self, bam_object):
+        '''
+        Returns a map_dictionary {(chromosome,read_position,polarity):
+                                  [read_length, ...]}
+        '''
+        map_dictionary = dict()
+        for chrom in self.chromosomes:
+            for pos in range(self.chromosomes[chrom]):
+                map_dictionary[(chrom, pos+1, 'F')] = []  # get all chromosomes
+                map_dictionary[(chrom, pos+1, 'R')] = []  # get all chromosomes
+        for chrom in self.chromosomes:
+            for read in bam_object.fetch(chrom):
+                read_positions = read.positions  # a list of covered positions
+                if read.is_reverse:
+                    map_dictionary[(chrom, read_positions[-1]+1,
+                                    'R')].append(read.query_alignment_length)
+                else:
+                    map_dictionary[(chrom, read_positions[0]+1,
+                                    'F')].append(read.query_alignment_length)
+        return map_dictionary
+
+    def compute_max(self, map_dictionary):
+        '''
+        takes a map_dictionary as input and returns
+        a max_dictionary {(chromosome,read_position,polarity):
+                          max_of_number_of_read_at_any_position}
+        '''
+        merge_keylist = [(i[0], 0) for i in map_dictionary.keys()]
+        max_dictionary = dict(merge_keylist)
+        for key in map_dictionary:
+            if len(map_dictionary[key]) > max_dictionary[key[0]]:
+                max_dictionary[key[0]] = len(map_dictionary[key])
+        return max_dictionary
+
+    def compute_mean(self, map_dictionary):
+        '''
+        takes a map_dictionary as input and returns
+        a mean_dictionary {(chromosome,read_position,polarity):
+                           mean_value_of_reads}
+        '''
+        mean_dictionary = dict()
+        for key in map_dictionary:
+            if len(map_dictionary[key]) == 0:
+                mean_dictionary[key] = 0
+            else:
+                mean_dictionary[key] = round(numpy.mean(map_dictionary[key]),
+                                             1)
+        return mean_dictionary
+
+    def compute_median(self, map_dictionary):
+        '''
+        takes a map_dictionary as input and returns
+        a median_dictionary {(chromosome,read_position,polarity):
+                             median_value_of_reads}
+        '''
+        median_dictionary = dict()
+        for key in map_dictionary:
+            if len(map_dictionary[key]) == 0:
+                median_dictionary[key] = 0
+            else:
+                median_dictionary[key] = numpy.median(map_dictionary[key])
+        return median_dictionary
+
+    def compute_coverage(self, map_dictionary, quality=10):
+        '''
+        Takes a map_dictionary and returns a dictionary of lists
+        of coverage along the coordinates of pre_mirs (as keys)
+        '''
+        coverage_dictionary = dict()
+        for chrom in self.chromosomes:
+            coverage = self.bam_object.count_coverage(
+                reference=chrom,
+                start=0,
+                end=self.chromosomes[chrom],
+                quality_threshold=quality)
+            # sum the 4 per-nucleotide coverage arrays into a single track
+            coverage = [sum(x) for x in zip(*coverage)]
+            for pos, cov in enumerate(coverage):
+                coverage_dictionary[(chrom, pos+1, "F")] = cov
+                coverage_dictionary[(chrom, pos+1, "R")] = cov
+        return coverage_dictionary
+
+    def compute_size(self, map_dictionary):
+        '''
+        Takes a map_dictionary and returns a dictionary of sizes:
+        {chrom: {size: {polarity: nbre of reads}}}
+        '''
+        size_dictionary = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
+        for key in map_dictionary:
+            for size in map_dictionary[key]:
+                try:
+                    size_dictionary[key[0]][key[2]][size] += 1
+                except KeyError:
+                    size_dictionary[key[0]][key[2]][size] = 1
+        return size_dictionary
+
+    def write_size_table(self, out):
+        '''
+        Dataset, Chromosome, Polarity, Size, Nbr_reads
+        out is an *open* file handler
+        '''
+        for chrom in sorted(self.size):
+            sizes = self.size[chrom]['F'].keys()
+            sizes.extend(self.size[chrom]['R'].keys())
+            for polarity in sorted(self.size[chrom]):
+                for size in range(min(sizes), max(sizes)+1):
+                    try:
+                        line = [self.sample_name, chrom, polarity, size,
+                                self.size[chrom][polarity][size]]
+                    except KeyError:
+                        line = [self.sample_name, chrom, polarity, size, 0]
+                    line = [str(i) for i in line]
+                    out.write('\t'.join(line) + '\n')
+
+    def write_table(self, out):
+        '''
+        Dataset, Chromosome, Chrom_length, Coordinate, Nbr_reads
+        Polarity, Max, Mean, Median, Coverage
+        out is an *open* file handler
+        '''
+        for key in sorted(self.map_dict):
+            line = [self.sample_name, key[0], self.chromosomes[key[0]],
+                    key[1], len(self.map_dict[key]), key[2], self.max[key[0]],
+                    self.mean[key], self.median[key], self.coverage[key]]
+            line = [str(i) for i in line]
+            out.write('\t'.join(line) + '\n')
 
 
-def main(inputs, sample_names, output):
-    with open(output, 'w') as outfile:
-        outfile.write('\t'.join(['Dataset', 'Chromosome', 'Chrom_length',
-                                 'Coordinate', 'Nbr_reads', 'Polarity',
-                                 'Max', 'Mean', 'Median\n']))
-        chr_lengths = dict(zip(pysam.AlignmentFile(inputs[0], 'rb').references,
-                               pysam.AlignmentFile(inputs[0], 'rb').lengths))
-        for file, sample in zip(inputs, sample_names):
-            with pysam.AlignmentFile(file, 'rb') as bamfile:
-                map, maximum = compute_map_dictionary(bamfile)
-                for record in sorted(map):
-                    line = [sample, record[0],
-                            str(chr_lengths[record[0]]),
-                            str(record[1]), str(len(map[record])),
-                            record[2], str(maximum[record[0]]),
-                            str(round(numpy.mean(map[record]), 1)),
-                            str(numpy.median(map[record]))]
-                    outfile.write("\t".join(line) + "\n")
+def main(inputs, samples, file_out, size_file_out=''):
+    F = open(file_out, 'w')
+    header = ["Dataset", "Chromosome", "Chrom_length", "Coordinate",
+              "Nbr_reads", "Polarity", "Max", "Mean", "Median", "Coverage"]
+    F.write('\t'.join(header) + '\n')
+    if size_file_out:
+        Fs = open(size_file_out, 'w')
+        header = ["Dataset", "Chromosome", "Polarity", "Size", "Nbr_reads"]
+        Fs.write('\t'.join(header) + '\n')
+    for file, sample in zip(inputs, samples):
+        mapobj = Map(file, sample)
+        mapobj.write_table(F)
+        if size_file_out:
+            mapobj.write_size_table(Fs)
+    F.close()
+    if size_file_out:
+        Fs.close()
 
 
 if __name__ == "__main__":
     args = Parser()
     # if identical sample names # to be tested
     if len(set(args.sample_name)) != len(args.sample_name):
         args.sample_name = [name + '_' + str(i) for
                             i, name in enumerate(args.sample_name)]
-    main(args.input, args.sample_name, args.output)
+    main(args.input, args.sample_name, args.output, args.sizes)
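
For orientation, the refactored Map class introduced by this changeset can also be exercised directly from Python, outside of main(). The snippet below is a minimal sketch, not part of the commit: it assumes the script is importable as small_rna_map, and the names 'sample1.bam' (a coordinate-sorted, indexed BAM), 'sample1' and 'sample1_map.tab' are placeholders.

import small_rna_map  # the module shown in this changeset

# Build the per-position map for one BAM file; create_map() uses
# bam_object.fetch(), so the BAM must have its .bai index available.
mapobj = small_rna_map.Map('sample1.bam', 'sample1')

# write_table() emits one row per (chromosome, coordinate, polarity);
# the column header (Dataset ... Coverage) is only written by main().
with open('sample1_map.tab', 'w') as out:
    mapobj.write_table(out)

When the new -S/--sizes option is passed on the command line, main() additionally opens a second output file and calls write_size_table() on it for each sample in the same loop.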