annotate small_rna_map.py @ 4:6ff925458e05 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit dcd643f146afe371d3e418aa91ddbaff6211a09a
author artbio
date Tue, 18 Jul 2017 17:35:52 -0400
parents 2e0dc6032a98
children d65045e976e6
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
1 import argparse
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
2 from collections import defaultdict
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
3
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
4 import numpy
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
5
0
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
6 import pysam
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
7
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
8
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
9 def Parser():
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
10 the_parser = argparse.ArgumentParser()
3
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
11 the_parser.add_argument('--input', dest='input', required=True,
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
12 nargs='+', help='input BAM files')
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
13 the_parser.add_argument('--sample_name', dest='sample_name',
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
14 required=True, nargs='+', help='sample name')
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
15 the_parser.add_argument('--output', action="store",
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
16 type=str, help='output tabular file')
0
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
17 args = the_parser.parse_args()
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
18 return args
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
19
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
20
3
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
21 def compute_map_dictionary(pysam_object):
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
22 '''
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
23 returns
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
24 - map_dictionary {(read_chromosome,read_position,polarity):
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
25 [list_of_read_length]}
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
26 - max_dictionary {chromosome_name:
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
27 max_of_number_of_read_at_any_position}
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
28 '''
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
29 unmatched_chromosomes = list(pysam_object.references)
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
30 map_dictionary = defaultdict(list)
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
31 max_dictionary = dict.fromkeys(pysam_object.references, 0)
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
32 for read in pysam_object:
0
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
33 if not read.is_unmapped:
3
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
34 read_chromosome = read.reference_name
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
35 # to shift from 0-based to 1-based coordinate
0
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
36 read_position = read.pos + 1
3
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
37 polarity = "F" if (not read.is_reverse) else "R"
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
38 map_dictionary[(read_chromosome, read_position,
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
39 polarity)].append(read.qlen)
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
40 if read_chromosome in unmatched_chromosomes:
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
41 unmatched_chromosomes.remove(read_chromosome)
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
42 for name in unmatched_chromosomes:
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
43 # to put at least a null value by chromosome
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
44 map_dictionary[(name, 1, 'F')] = [0]
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
45 # make a dictionary with max value of reads by chromosome
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
46 max_dictionary = defaultdict(int)
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
47 for name in pysam_object.references:
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
48 for key in map_dictionary.keys():
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
49 if name in key and len(map_dictionary[key]) > max_dictionary[name]:
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
50 max_dictionary[name] = len(map_dictionary[key])
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
51 return map_dictionary, max_dictionary
0
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
52
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
53
3
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
54 def main(inputs, sample_names, output):
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
55 with open(output, 'w') as outfile:
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
56 outfile.write('\t'.join(['Dataset', 'Chromosome', 'Chrom_length',
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
57 'Coordinate', 'Nbr_reads', 'Polarity',
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
58 'Max', 'Mean', 'Median\n']))
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
59 chr_lengths = dict(zip(pysam.AlignmentFile(inputs[0], 'rb').references,
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
60 pysam.AlignmentFile(inputs[0], 'rb').lengths))
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
61 for file, sample in zip(inputs, sample_names):
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
62 with pysam.AlignmentFile(file, 'rb') as bamfile:
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
63 map, maximum = compute_map_dictionary(bamfile)
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
64 for record in sorted(map):
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
65 line = [sample, record[0],
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
66 str(chr_lengths[record[0]]),
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
67 str(record[1]), str(len(map[record])),
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
68 record[2], str(maximum[record[0]]),
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
69 str(round(numpy.mean(map[record]), 1)),
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
70 str(numpy.median(map[record]))]
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
71 outfile.write("\t".join(line) + "\n")
0
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
72
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
73
1ad5d040f85f planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit fa452d860cf550c7524df59e77b36fd39e3e2a45
artbio
parents:
diff changeset
74 if __name__ == "__main__":
3
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
75 args = Parser()
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
76 # if identical sample names # to be tested
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
77 if len(set(args.sample_name)) != len(args.sample_name):
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
78 args.sample_name = [name + '_' + str(i) for
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
79 i, name in enumerate(args.sample_name)]
2e0dc6032a98 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_map commit 93f212712d9846c7aaa389de60babb332d38363e
artbio
parents: 0
diff changeset
80 main(args.input, args.sample_name, args.output)