Mercurial > repos > bebatut > format_cd_hit_output
comparison format_cd_hit_output.py @ 0:bbd903996900 draft
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
author | bebatut |
---|---|
date | Wed, 27 Jan 2016 03:28:42 -0500 |
parents | |
children | 4ba41bcee051 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:bbd903996900 |
---|---|
1 #!/usr/bin/env python | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 import sys | |
5 import os | |
6 import argparse | |
7 import copy | |
8 import operator | |
9 from sets import Set | |
10 | |
11 def extract_mapping_info(input_mapping_filepath): | |
12 mapping_info = {} | |
13 categories = Set([]) | |
14 | |
15 with open(input_mapping_filepath,'r') as mapping_file: | |
16 for line in mapping_file.readlines(): | |
17 split_line = line[:-1].split('\t') | |
18 mapping_info.setdefault(split_line[0],split_line[1]) | |
19 categories.add(split_line[1]) | |
20 | |
21 return mapping_info, categories | |
22 | |
23 def init_category_distribution(categories = None): | |
24 cluster_category_distribution = {} | |
25 if categories != None: | |
26 for category in categories: | |
27 cluster_category_distribution[category] = 0 | |
28 return cluster_category_distribution | |
29 | |
30 def flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, | |
31 cluster_category_distribution, categories, output_category_distribution_file, | |
32 cluster_seq_number): | |
33 if cluster_name != '': | |
34 if categories != None: | |
35 output_category_distribution_file.write(cluster_name) | |
36 output_category_distribution_file.write('\t' + str(cluster_seq_number)) | |
37 for category in categories: | |
38 output_category_distribution_file.write('\t') | |
39 output_category_distribution_file.write(str(cluster_category_distribution[category])) | |
40 output_category_distribution_file.write('\n') | |
41 | |
42 if cluster_ref_seq == '': | |
43 string = "No reference sequence found for " | |
44 string += cluster_name | |
45 raise ValueError(string) | |
46 | |
47 ref_seq_cluster.setdefault(cluster_ref_seq, cluster_name) | |
48 | |
49 def extract_cluster_info(args, mapping_info = None, categories = None): | |
50 ref_seq_cluster = {} | |
51 | |
52 if args.output_category_distribution != None: | |
53 if mapping_info == None or categories == None: | |
54 string = "A file with category distribution is expected but " | |
55 string += "no mapping information are available" | |
56 raise ValueError(string) | |
57 output_category_distribution_file = open( | |
58 args.output_category_distribution, 'w') | |
59 output_category_distribution_file.write('Cluster\tSequence_number') | |
60 for category in categories: | |
61 output_category_distribution_file.write('\t' + category) | |
62 | |
63 output_category_distribution_file.write('\n') | |
64 | |
65 with open(args.input_cluster_info,'r') as cluster_info_file: | |
66 cluster_name = '' | |
67 cluster_category_distribution = init_category_distribution(categories) | |
68 cluster_ref_seq = '' | |
69 cluster_seq_number = 0 | |
70 for line in cluster_info_file.readlines(): | |
71 if line[0] == '>': | |
72 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, | |
73 cluster_category_distribution, categories, | |
74 output_category_distribution_file, cluster_seq_number) | |
75 cluster_name = line[1:-1] | |
76 cluster_name = cluster_name.replace(' ','_') | |
77 cluster_category_distribution = init_category_distribution(categories) | |
78 cluster_ref_seq = '' | |
79 cluster_seq_number = 0 | |
80 else: | |
81 seq_info = line[:-1].split('\t')[1].split(' ') | |
82 seq_name = seq_info[1][1:-3] | |
83 cluster_seq_number += 1 | |
84 | |
85 if categories != None: | |
86 seq_count = 1 | |
87 if args.number_sum == 'false': | |
88 if seq_name.find('size') != -1: | |
89 substring = seq_name[seq_name.find('size'):-1] | |
90 seq_count = int(substring.split('=')[1]) | |
91 if not mapping_info.has_key(seq_name): | |
92 string = seq_name + " not found in mapping" | |
93 raise ValueError(string) | |
94 category = mapping_info[seq_name] | |
95 cluster_category_distribution[category] += seq_count | |
96 | |
97 | |
98 if seq_info[-1] == '*': | |
99 if cluster_ref_seq != '': | |
100 string = "A reference sequence (" + cluster_ref_seq | |
101 string += ") already found for cluster " + cluster_name | |
102 string += " (" + seq_name + ")" | |
103 raise ValueError(string) | |
104 cluster_ref_seq = seq_name | |
105 | |
106 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, | |
107 cluster_category_distribution, categories, | |
108 output_category_distribution_file, cluster_seq_number) | |
109 | |
110 if args.output_category_distribution != None: | |
111 output_category_distribution_file.close() | |
112 | |
113 return ref_seq_cluster | |
114 | |
115 def rename_representative_sequences(args, ref_seq_cluster): | |
116 with open(args.input_representative_sequences,'r') as input_sequences: | |
117 with open(args.output_representative_sequences,'w') as output_sequences: | |
118 for line in input_sequences.readlines(): | |
119 if line[0] == '>': | |
120 seq_name = line[1:-1] | |
121 if not ref_seq_cluster.has_key(seq_name): | |
122 string = seq_name + " not found as reference sequence" | |
123 raise ValueError(string) | |
124 output_sequences.write('>' + ref_seq_cluster[seq_name] + '\n') | |
125 else: | |
126 output_sequences.write(line) | |
127 | |
128 def format_cd_hit_outputs(args): | |
129 if args.input_mapping != None: | |
130 mapping_info, categories = extract_mapping_info(args.input_mapping) | |
131 else: | |
132 mapping_info = None | |
133 categories = None | |
134 | |
135 ref_seq_cluster = extract_cluster_info(args, mapping_info, categories) | |
136 | |
137 if args.input_representative_sequences != None: | |
138 rename_representative_sequences(args, ref_seq_cluster) | |
139 | |
140 if __name__ == "__main__": | |
141 parser = argparse.ArgumentParser() | |
142 parser.add_argument('--input_cluster_info', required=True) | |
143 parser.add_argument('--input_representative_sequences') | |
144 parser.add_argument('--output_representative_sequences') | |
145 parser.add_argument('--input_mapping') | |
146 parser.add_argument('--output_category_distribution') | |
147 parser.add_argument('--number_sum') | |
148 args = parser.parse_args() | |
149 | |
150 format_cd_hit_outputs(args) |