comparison format_cd_hit_output.py @ 1:4ba41bcee051 draft default tip

planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
author bebatut
date Tue, 26 Apr 2016 08:54:26 -0400
parents bbd903996900
children
comparison
equal deleted inserted replaced
0:bbd903996900 1:4ba41bcee051
52 if args.output_category_distribution != None: 52 if args.output_category_distribution != None:
53 if mapping_info == None or categories == None: 53 if mapping_info == None or categories == None:
54 string = "A file with category distribution is expected but " 54 string = "A file with category distribution is expected but "
55 string += "no mapping information are available" 55 string += "no mapping information are available"
56 raise ValueError(string) 56 raise ValueError(string)
57 output_category_distribution_file = open( 57 output_cat_distri_file = open(args.output_category_distribution, 'w')
58 args.output_category_distribution, 'w') 58 output_cat_distri_file.write('Cluster\tSequence_number')
59 output_category_distribution_file.write('Cluster\tSequence_number')
60 for category in categories: 59 for category in categories:
61 output_category_distribution_file.write('\t' + category) 60 output_cat_distri_file.write('\t' + category)
62 61
63 output_category_distribution_file.write('\n') 62 output_cat_distri_file.write('\n')
63 else:
64 output_cat_distri_file = None
64 65
65 with open(args.input_cluster_info,'r') as cluster_info_file: 66 with open(args.input_cluster_info,'r') as cluster_info_file:
66 cluster_name = '' 67 cluster_name = ''
67 cluster_category_distribution = init_category_distribution(categories) 68 cluster_category_distribution = init_category_distribution(categories)
68 cluster_ref_seq = '' 69 cluster_ref_seq = ''
69 cluster_seq_number = 0 70 cluster_seq_number = 0
70 for line in cluster_info_file.readlines(): 71 for line in cluster_info_file.readlines():
71 if line[0] == '>': 72 if line[0] == '>':
72 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, 73 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster,
73 cluster_category_distribution, categories, 74 cluster_category_distribution, categories,
74 output_category_distribution_file, cluster_seq_number) 75 output_cat_distri_file, cluster_seq_number)
75 cluster_name = line[1:-1] 76 cluster_name = line[1:-1]
76 cluster_name = cluster_name.replace(' ','_') 77 cluster_name = cluster_name.replace(' ','_')
77 cluster_category_distribution = init_category_distribution(categories) 78 cluster_category_distribution = init_category_distribution(categories)
78 cluster_ref_seq = '' 79 cluster_ref_seq = ''
79 cluster_seq_number = 0 80 cluster_seq_number = 0
82 seq_name = seq_info[1][1:-3] 83 seq_name = seq_info[1][1:-3]
83 cluster_seq_number += 1 84 cluster_seq_number += 1
84 85
85 if categories != None: 86 if categories != None:
86 seq_count = 1 87 seq_count = 1
87 if args.number_sum == 'false': 88 if args.number_sum != None:
88 if seq_name.find('size') != -1: 89 if seq_name.find('size') != -1:
89 substring = seq_name[seq_name.find('size'):-1] 90 substring = seq_name[seq_name.find('size'):-1]
90 seq_count = int(substring.split('=')[1]) 91 seq_count = int(substring.split('=')[1])
91 if not mapping_info.has_key(seq_name): 92 if not mapping_info.has_key(seq_name):
92 string = seq_name + " not found in mapping" 93 string = seq_name + " not found in mapping"
93 raise ValueError(string) 94 raise ValueError(string)
94 category = mapping_info[seq_name] 95 category = mapping_info[seq_name]
95 cluster_category_distribution[category] += seq_count 96 cluster_category_distribution[category] += seq_count
96
97 97
98 if seq_info[-1] == '*': 98 if seq_info[-1] == '*':
99 if cluster_ref_seq != '': 99 if cluster_ref_seq != '':
100 string = "A reference sequence (" + cluster_ref_seq 100 string = "A reference sequence (" + cluster_ref_seq
101 string += ") already found for cluster " + cluster_name 101 string += ") already found for cluster " + cluster_name
102 string += " (" + seq_name + ")" 102 string += " (" + seq_name + ")"
103 raise ValueError(string) 103 raise ValueError(string)
104 cluster_ref_seq = seq_name 104 cluster_ref_seq = seq_name
105 105
106 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, 106 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster,
107 cluster_category_distribution, categories, 107 cluster_category_distribution, categories, output_cat_distri_file,
108 output_category_distribution_file, cluster_seq_number) 108 cluster_seq_number)
109 109
110 if args.output_category_distribution != None: 110 if args.output_category_distribution != None:
111 output_category_distribution_file.close() 111 output_cat_distri_file.close()
112 112
113 return ref_seq_cluster 113 return ref_seq_cluster
114 114
115 def rename_representative_sequences(args, ref_seq_cluster): 115 def rename_representative_sequences(args, ref_seq_cluster):
116 with open(args.input_representative_sequences,'r') as input_sequences: 116 with open(args.input_representative_sequences,'r') as input_sequences: