annotate merge_cut_up_clustering.py @ 2:bd409bbd287f draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 77dea70f156d56c83b6851b61dc997d2b344bdc9
author iuc
date Fri, 01 Jul 2022 14:10:59 +0000
parents 1f4286d836a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
1f4286d836a3 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff changeset
1 #!/usr/bin/env python
1f4286d836a3 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff changeset
2
1f4286d836a3 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff changeset
3 import argparse
1f4286d836a3 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff changeset
4 import re
1f4286d836a3 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff changeset
5 import sys
1f4286d836a3 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff changeset
6 from collections import Counter
1f4286d836a3 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff changeset
7 from collections import defaultdict
1f4286d836a3 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff changeset
8
1f4286d836a3 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff changeset
9
1f4286d836a3 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff changeset
# Cut-up contig ids have the form "<original_id>.concoct_part_<index>".
CONTIG_PART_EXPR = re.compile(r'(.*)\.concoct_part_([0-9]*)')


def original_contig_name_special(contig_id):
    """Split a cut-up contig id into the original contig name and part index.

    Returns a ``(original_id, part_index)`` tuple. When ``contig_id`` matches
    ``CONTIG_PART_EXPR`` the part index is the captured digit string (it may
    be empty, since the pattern allows zero digits). When it does not match,
    the id is returned unchanged together with the integer ``0``.
    """
    hit = CONTIG_PART_EXPR.match(contig_id)
    if hit is None:
        # No matches for concoct_part regex.
        return contig_id, 0
    return hit.group(1), hit.group(2)
1f4286d836a3 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff changeset
20
1f4286d836a3 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff changeset
21
1f4286d836a3 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
iuc
parents:
diff changeset
def majority_cluster(part_ids_d):
    """Return the cluster id shared by the most parts of one original contig.

    ``part_ids_d`` maps part id -> cluster id and must be non-empty. When
    several cluster ids are equally common, the one encountered first wins
    (``Counter.most_common`` keeps first-seen order for equal counts).
    """
    return Counter(part_ids_d.values()).most_common(1)[0][0]


def merge_clusters(all_originals):
    """Collapse per-part cluster assignments into one cluster per contig.

    ``all_originals`` maps original contig id -> {part id: cluster id}.
    Returns a dict mapping original contig id -> chosen cluster id, in the
    same order as ``all_originals``.
    """
    return {
        original_contig_id: majority_cluster(part_ids_d)
        for original_contig_id, part_ids_d in all_originals.items()
    }


def read_cut_up_clusters(path):
    """Read the tabular cut-up clustering file at ``path``.

    The first line is a header and must contain ``contig_id``; otherwise an
    error is written to stderr and the process exits with status -1. Each
    remaining non-blank line must hold exactly two tab-separated fields
    (contig id, cluster id); any other field count raises ``ValueError``.

    Returns a mapping of original contig id -> {part id: cluster id}. An
    empty file yields an empty mapping.
    """
    all_originals = defaultdict(dict)
    with open(path, 'r') as ifh:
        header = next(ifh, None)
        if header is None:
            return all_originals
        if 'contig_id' not in header:
            sys.stderr.write("ERROR: invalid clustering file, 'contig_id' is not found in the header.\n")
            sys.exit(-1)
        for line in ifh:
            line = line.rstrip('\r\n')
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            contig_id, cluster_id = line.split('\t')
            original_contig_name, part_id = original_contig_name_special(contig_id)
            all_originals[original_contig_name][part_id] = cluster_id
    return all_originals


def main(argv=None):
    """Merge a cut-up clustering back onto the original contigs.

    ``argv`` is the argument list to parse; ``None`` means ``sys.argv[1:]``.
    The input is read completely before the output file is opened, so an
    invalid input does not leave a partial output behind.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", action="store", dest="input", required=True, help="Tabular file with cut up clusters")
    parser.add_argument("--output", action="store", dest="output", required=True, help="Output file with merged clusters")
    args = parser.parse_args(argv)

    # Get cut up clusters, then merge them by majority vote.
    merged = merge_clusters(read_cut_up_clusters(args.input))

    with open(args.output, 'w') as ofh:
        ofh.write("contig_id\tcluster_id\n")
        for original_contig_id, cluster_id in merged.items():
            ofh.write(f"{original_contig_id}\t{cluster_id}\n")


if __name__ == "__main__":
    main()