Mercurial > repos > bebatut > format_cd_hit_output
comparison format_cd_hit_output.py @ 1:4ba41bcee051 draft default tip
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/format_cd_hit_output/ commit 975a480d80c774a1de58c8fc80b71ea44c5c702b-dirty
author | bebatut |
---|---|
date | Tue, 26 Apr 2016 08:54:26 -0400 |
parents | bbd903996900 |
children |
comparison
equal
deleted
inserted
replaced
0:bbd903996900 | 1:4ba41bcee051 |
---|---|
52 if args.output_category_distribution != None: | 52 if args.output_category_distribution != None: |
53 if mapping_info == None or categories == None: | 53 if mapping_info == None or categories == None: |
54 string = "A file with category distribution is expected but " | 54 string = "A file with category distribution is expected but " |
55 string += "no mapping information are available" | 55 string += "no mapping information are available" |
56 raise ValueError(string) | 56 raise ValueError(string) |
57 output_category_distribution_file = open( | 57 output_cat_distri_file = open(args.output_category_distribution, 'w') |
58 args.output_category_distribution, 'w') | 58 output_cat_distri_file.write('Cluster\tSequence_number') |
59 output_category_distribution_file.write('Cluster\tSequence_number') | |
60 for category in categories: | 59 for category in categories: |
61 output_category_distribution_file.write('\t' + category) | 60 output_cat_distri_file.write('\t' + category) |
62 | 61 |
63 output_category_distribution_file.write('\n') | 62 output_cat_distri_file.write('\n') |
63 else: | |
64 output_cat_distri_file = None | |
64 | 65 |
65 with open(args.input_cluster_info,'r') as cluster_info_file: | 66 with open(args.input_cluster_info,'r') as cluster_info_file: |
66 cluster_name = '' | 67 cluster_name = '' |
67 cluster_category_distribution = init_category_distribution(categories) | 68 cluster_category_distribution = init_category_distribution(categories) |
68 cluster_ref_seq = '' | 69 cluster_ref_seq = '' |
69 cluster_seq_number = 0 | 70 cluster_seq_number = 0 |
70 for line in cluster_info_file.readlines(): | 71 for line in cluster_info_file.readlines(): |
71 if line[0] == '>': | 72 if line[0] == '>': |
72 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, | 73 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, |
73 cluster_category_distribution, categories, | 74 cluster_category_distribution, categories, |
74 output_category_distribution_file, cluster_seq_number) | 75 output_cat_distri_file, cluster_seq_number) |
75 cluster_name = line[1:-1] | 76 cluster_name = line[1:-1] |
76 cluster_name = cluster_name.replace(' ','_') | 77 cluster_name = cluster_name.replace(' ','_') |
77 cluster_category_distribution = init_category_distribution(categories) | 78 cluster_category_distribution = init_category_distribution(categories) |
78 cluster_ref_seq = '' | 79 cluster_ref_seq = '' |
79 cluster_seq_number = 0 | 80 cluster_seq_number = 0 |
82 seq_name = seq_info[1][1:-3] | 83 seq_name = seq_info[1][1:-3] |
83 cluster_seq_number += 1 | 84 cluster_seq_number += 1 |
84 | 85 |
85 if categories != None: | 86 if categories != None: |
86 seq_count = 1 | 87 seq_count = 1 |
87 if args.number_sum == 'false': | 88 if args.number_sum != None: |
88 if seq_name.find('size') != -1: | 89 if seq_name.find('size') != -1: |
89 substring = seq_name[seq_name.find('size'):-1] | 90 substring = seq_name[seq_name.find('size'):-1] |
90 seq_count = int(substring.split('=')[1]) | 91 seq_count = int(substring.split('=')[1]) |
91 if not mapping_info.has_key(seq_name): | 92 if not mapping_info.has_key(seq_name): |
92 string = seq_name + " not found in mapping" | 93 string = seq_name + " not found in mapping" |
93 raise ValueError(string) | 94 raise ValueError(string) |
94 category = mapping_info[seq_name] | 95 category = mapping_info[seq_name] |
95 cluster_category_distribution[category] += seq_count | 96 cluster_category_distribution[category] += seq_count |
96 | |
97 | 97 |
98 if seq_info[-1] == '*': | 98 if seq_info[-1] == '*': |
99 if cluster_ref_seq != '': | 99 if cluster_ref_seq != '': |
100 string = "A reference sequence (" + cluster_ref_seq | 100 string = "A reference sequence (" + cluster_ref_seq |
101 string += ") already found for cluster " + cluster_name | 101 string += ") already found for cluster " + cluster_name |
102 string += " (" + seq_name + ")" | 102 string += " (" + seq_name + ")" |
103 raise ValueError(string) | 103 raise ValueError(string) |
104 cluster_ref_seq = seq_name | 104 cluster_ref_seq = seq_name |
105 | 105 |
106 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, | 106 flush_cluster_info(cluster_name, cluster_ref_seq, ref_seq_cluster, |
107 cluster_category_distribution, categories, | 107 cluster_category_distribution, categories, output_cat_distri_file, |
108 output_category_distribution_file, cluster_seq_number) | 108 cluster_seq_number) |
109 | 109 |
110 if args.output_category_distribution != None: | 110 if args.output_category_distribution != None: |
111 output_category_distribution_file.close() | 111 output_cat_distri_file.close() |
112 | 112 |
113 return ref_seq_cluster | 113 return ref_seq_cluster |
114 | 114 |
115 def rename_representative_sequences(args, ref_seq_cluster): | 115 def rename_representative_sequences(args, ref_seq_cluster): |
116 with open(args.input_representative_sequences,'r') as input_sequences: | 116 with open(args.input_representative_sequences,'r') as input_sequences: |