comparison format_cd_hit_output.py @ 0:bbd903996900 draft

planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit ffb68b2ddd94854a34a2533105f7bc08884c6e38-dirty
author bebatut
date Wed, 27 Jan 2016 03:28:42 -0500
parents
children 4ba41bcee051
comparison
equal deleted inserted replaced
-1:000000000000 0:bbd903996900
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os
6 import argparse
7 import copy
8 import operator
9 from sets import Set
10
11 def extract_mapping_info(input_mapping_filepath):
12 mapping_info = {}
13 categories = Set([])
14
15 with open(input_mapping_filepath,'r') as mapping_file:
16 for line in mapping_file.readlines():
17 split_line = line[:-1].split('\t')
18 mapping_info.setdefault(split_line[0],split_line[1])
19 categories.add(split_line[1])
20
21 return mapping_info, categories
22
23 def init_category_distribution(categories = None):
24 cluster_category_distribution = {}
25 if categories != None:
26 for category in categories:
27 cluster_category_distribution[category] = 0
28 return cluster_category_distribution
29
30 def flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster,
31 cluster_category_distribution, categories, output_category_distribution_file,
32 cluster_seq_number):
33 if cluster_name != '':
34 if categories != None:
35 output_category_distribution_file.write(cluster_name)
36 output_category_distribution_file.write('\t' + str(cluster_seq_number))
37 for category in categories:
38 output_category_distribution_file.write('\t')
39 output_category_distribution_file.write(str(cluster_category_distribution[category]))
40 output_category_distribution_file.write('\n')
41
42 if cluster_ref_seq == '':
43 string = "No reference sequence found for "
44 string += cluster_name
45 raise ValueError(string)
46
47 ref_seq_cluster.setdefault(cluster_ref_seq, cluster_name)
48
49 def extract_cluster_info(args, mapping_info = None, categories = None):
50 ref_seq_cluster = {}
51
52 if args.output_category_distribution != None:
53 if mapping_info == None or categories == None:
54 string = "A file with category distribution is expected but "
55 string += "no mapping information are available"
56 raise ValueError(string)
57 output_category_distribution_file = open(
58 args.output_category_distribution, 'w')
59 output_category_distribution_file.write('Cluster\tSequence_number')
60 for category in categories:
61 output_category_distribution_file.write('\t' + category)
62
63 output_category_distribution_file.write('\n')
64
65 with open(args.input_cluster_info,'r') as cluster_info_file:
66 cluster_name = ''
67 cluster_category_distribution = init_category_distribution(categories)
68 cluster_ref_seq = ''
69 cluster_seq_number = 0
70 for line in cluster_info_file.readlines():
71 if line[0] == '>':
72 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster,
73 cluster_category_distribution, categories,
74 output_category_distribution_file, cluster_seq_number)
75 cluster_name = line[1:-1]
76 cluster_name = cluster_name.replace(' ','_')
77 cluster_category_distribution = init_category_distribution(categories)
78 cluster_ref_seq = ''
79 cluster_seq_number = 0
80 else:
81 seq_info = line[:-1].split('\t')[1].split(' ')
82 seq_name = seq_info[1][1:-3]
83 cluster_seq_number += 1
84
85 if categories != None:
86 seq_count = 1
87 if args.number_sum == 'false':
88 if seq_name.find('size') != -1:
89 substring = seq_name[seq_name.find('size'):-1]
90 seq_count = int(substring.split('=')[1])
91 if not mapping_info.has_key(seq_name):
92 string = seq_name + " not found in mapping"
93 raise ValueError(string)
94 category = mapping_info[seq_name]
95 cluster_category_distribution[category] += seq_count
96
97
98 if seq_info[-1] == '*':
99 if cluster_ref_seq != '':
100 string = "A reference sequence (" + cluster_ref_seq
101 string += ") already found for cluster " + cluster_name
102 string += " (" + seq_name + ")"
103 raise ValueError(string)
104 cluster_ref_seq = seq_name
105
106 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster,
107 cluster_category_distribution, categories,
108 output_category_distribution_file, cluster_seq_number)
109
110 if args.output_category_distribution != None:
111 output_category_distribution_file.close()
112
113 return ref_seq_cluster
114
115 def rename_representative_sequences(args, ref_seq_cluster):
116 with open(args.input_representative_sequences,'r') as input_sequences:
117 with open(args.output_representative_sequences,'w') as output_sequences:
118 for line in input_sequences.readlines():
119 if line[0] == '>':
120 seq_name = line[1:-1]
121 if not ref_seq_cluster.has_key(seq_name):
122 string = seq_name + " not found as reference sequence"
123 raise ValueError(string)
124 output_sequences.write('>' + ref_seq_cluster[seq_name] + '\n')
125 else:
126 output_sequences.write(line)
127
128 def format_cd_hit_outputs(args):
129 if args.input_mapping != None:
130 mapping_info, categories = extract_mapping_info(args.input_mapping)
131 else:
132 mapping_info = None
133 categories = None
134
135 ref_seq_cluster = extract_cluster_info(args, mapping_info, categories)
136
137 if args.input_representative_sequences != None:
138 rename_representative_sequences(args, ref_seq_cluster)
139
140 if __name__ == "__main__":
141 parser = argparse.ArgumentParser()
142 parser.add_argument('--input_cluster_info', required=True)
143 parser.add_argument('--input_representative_sequences')
144 parser.add_argument('--output_representative_sequences')
145 parser.add_argument('--input_mapping')
146 parser.add_argument('--output_category_distribution')
147 parser.add_argument('--number_sum')
148 args = parser.parse_args()
149
150 format_cd_hit_outputs(args)