Mercurial > repos > pieterlukasse > prims_metabolomics
comparison export_to_metexp_tabular.py @ 1:071a185c2ced
new tools
| author | pieter.lukasse@wur.nl | 
|---|---|
| date | Fri, 24 Oct 2014 12:52:56 +0200 | 
| parents | 4b94bb2d381c | 
| children | 
Diff legend: equal · deleted · inserted · replaced
| 0:4b94bb2d381c | 1:071a185c2ced | 
|---|---|
| 3 ''' | 3 ''' | 
| 4 Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust | 4 Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust | 
| 5 into a tabular file that can be uploaded to the MetExp database. | 5 into a tabular file that can be uploaded to the MetExp database. | 
| 6 | 6 | 
| 7 RankFilter, CasLookup are already combined by combine_output.py so here we will use | 7 RankFilter, CasLookup are already combined by combine_output.py so here we will use | 
| 8 this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust | 8 this result. Furthermore here one of the MsClust | 
| 9 quantification files are to be combined with combine_output.py result as well. | 9 quantification files containing the respective spectra details are to be combined as well. | 
| 10 | 10 | 
| 11 Extra calculations performed: | 11 Extra calculations performed: | 
| 12 - The column MW is also added here and is derived from the column FORMULA found | 12 - The column MW is also added here and is derived from the column FORMULA found | 
| 13 in combine_output.py result. | 13 in RankFilter, CasLookup combined result. | 
| 14 | 14 | 
| 15 So in total here we merge 3 files and calculate one new column. | 15 So in total here we merge 2 files and calculate one new column. | 
| 16 ''' | 16 ''' | 
| 17 | 17 from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 | 
| 18 import csv | 18 import csv | 
| 19 import re | |
| 19 import sys | 20 import sys | 
| 20 from collections import OrderedDict | 21 from collections import OrderedDict | 
| 21 | 22 | 
| 22 __author__ = "Pieter Lukasse" | 23 __author__ = "Pieter Lukasse" | 
| 23 __contact__ = "pieter.lukasse@wur.nl" | 24 __contact__ = "pieter.lukasse@wur.nl" | 
| 38 return output | 39 return output | 
| 39 | 40 | 
| 40 ONE_TO_ONE = 'one_to_one' | 41 ONE_TO_ONE = 'one_to_one' | 
| 41 N_TO_ONE = 'n_to_one' | 42 N_TO_ONE = 'n_to_one' | 
| 42 | 43 | 
| 43 def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE): | 44 def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE): | 
| 44 ''' | 45 ''' | 
| 45 Merges data from both input dictionaries based on the link fields. This method will | 46 Merges data from both input dictionaries based on the link fields. This method will | 
| 46 build up a new list containing the merged hits as the items. | 47 build up a new list containing the merged hits as the items. | 
| 47 @param set1: dictionary holding set1 in the form of N lists (one list per attribute name) | 48 @param set1: dictionary holding set1 in the form of N lists (one list per attribute name) | 
| 48 @param set2: dictionary holding set2 in the form of N lists (one list per attribute name) | 49 @param set2: dictionary holding set2 in the form of N lists (one list per attribute name) | 
| 49 ''' | 50 ''' | 
| 50 # TODO test for correct input files -> same link_field values should be there (test at least number of unique link_field values): | 51 # TODO test for correct input files -> same link_field values should be there | 
| 52 # (test at least number of unique link_field values): | |
| 51 # | 53 # | 
| 52 # if (len(set1[link_field_set1]) != len(set2[link_field_set2])): | 54 # if (len(set1[link_field_set1]) != len(set2[link_field_set2])): | 
| 53 # raise Exception('input files should have the same nr of key values ') | 55 # raise Exception('input files should have the same nr of key values ') | 
| 54 | 56 | 
| 55 | 57 | 
| 62 processed[link_field_set1_value] = link_field_set1_value | 64 processed[link_field_set1_value] = link_field_set1_value | 
| 63 | 65 | 
| 64 # Get the indices for current link_field_set1_value in both data-structures for proper matching | 66 # Get the indices for current link_field_set1_value in both data-structures for proper matching | 
| 65 set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value] | 67 set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value] | 
| 66 set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ] | 68 set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ] | 
| 67 | 69 # Validation : | 
| 68 | 70 if len(set2index) == 0: | 
| 71 # means that corresponding data could not be found in set2, then throw error | |
| 72 raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" + | |
| 73 link_field_set1_value + " only found in first dataset. ") | |
| 69 | 74 | 
| 70 merged_hits = [] | 75 merged_hits = [] | 
| 71 # Combine hits | 76 # Combine hits | 
| 72 for hit in xrange(len(set1index)): | 77 for hit in xrange(len(set1index)): | 
| 73 # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do | 78 # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do | 
| 74 # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its | 79 # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its | 
| 75 # corresponding value in the rankfilter or caslookup tables; i.e. | 80 # corresponding value in the sets; i.e. | 
| 76 # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute | 81 # set1[key] => returns the list/array with size = nrrows, with the values for the attribute | 
| 77 # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index) | 82 # represented by "key". | 
| 83 # set1index[hit] => points to the row nr=hit (hit is a rownr/index) | |
| 84 # So set1[x][set1index[n]] = set1.attributeX.instanceN | |
| 85 # | |
| 78 # It just ensures the entry is made available as a plain named array for easy access. | 86 # It just ensures the entry is made available as a plain named array for easy access. | 
| 79 rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()])) | 87 rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()])) | 
| 80 if relation_type == ONE_TO_ONE : | 88 if relation_type == ONE_TO_ONE : | 
| 81 cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()])) | 89 cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()])) | 
| 82 else: | 90 else: | 
| 83 # is N to 1: | 91 # is N to 1: | 
| 84 cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()])) | 92 cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()])) | 
| 85 | 93 | 
| 86 merged_hit = merge_function(rf_record, cl_record) | 94 merged_hit = merge_function(rf_record, cl_record, metadata) | 
| 87 merged_hits.append(merged_hit) | 95 merged_hits.append(merged_hit) | 
| 88 | 96 | 
| 89 merged.append(merged_hits) | 97 merged.append(merged_hits) | 
| 90 | 98 | 
| 91 return merged, len(set1index) | 99 return merged, len(set1index) | 
| 101 else: | 109 else: | 
| 102 return False | 110 return False | 
| 103 | 111 | 
| 104 | 112 | 
| 105 | 113 | 
| 106 def _merge_records(rank_caslookup_combi, msclust_quant_record): | 114 def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata): | 
| 107 ''' | 115 ''' | 
| 108 Combines single records from both the RankFilter+CasLookup combi file and from MsClust file | 116 Combines single records from both the RankFilter+CasLookup combi file and from MsClust file | 
| 109 | 117 | 
| 110 @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py) | 118 @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py) | 
| 111 @param msclust_quant_record: msclust quantification + spectrum record | 119 @param msclust_quant_record: msclust quantification + spectrum record | 
| 112 ''' | 120 ''' | 
| 113 i = 0 | |
| 114 record = [] | 121 record = [] | 
| 115 for column in rank_caslookup_combi: | 122 for column in rank_caslookup_combi: | 
| 116 record.append(rank_caslookup_combi[column]) | 123 record.append(rank_caslookup_combi[column]) | 
| 117 i += 1 | |
| 118 | 124 | 
| 119 for column in msclust_quant_record: | 125 for column in msclust_quant_record: | 
| 120 record.append(msclust_quant_record[column]) | 126 record.append(msclust_quant_record[column]) | 
| 121 i += 1 | 127 | 
| 128 for column in metadata: | |
| 129 record.append(metadata[column]) | |
| 130 | |
| 131 # add MOLECULAR MASS (MM) | |
| 132 molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA']) | |
| 133 # limit to two decimals: | |
| 134 record.append("{0:.2f}".format(molecular_mass)) | |
| 135 | |
| 136 # add MOLECULAR WEIGHT (MW) - TODO - calculate this | |
| 137 record.append('0.0') | |
| 138 | |
| 139 # level of identification and Location of reference standard | |
| 140 record.append('0') | |
| 141 record.append('') | |
| 122 | 142 | 
| 123 return record | 143 return record | 
| 124 | 144 | 
| 125 | 145 | 
| 126 | 146 def get_molecular_mass(formula): | 
| 127 | 147 ''' | 
| 128 def _save_data(data, headers, nhits, out_csv): | 148 Calculates the molecular mass (MM). | 
| 149 E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O | |
| 150 ''' | |
| 151 | |
| 152 # Each element is represented by a capital letter, followed optionally by | |
| 153 # lower case, with one or more digits as for how many elements: | |
| 154 element_pattern = re.compile("([A-Z][a-z]?)(\d*)") | |
| 155 | |
| 156 total_mass = 0 | |
| 157 for (element_name, count) in element_pattern.findall(formula): | |
| 158 if count == "": | |
| 159 count = 1 | |
| 160 else: | |
| 161 count = int(count) | |
| 162 element_mass = float(elements_and_masses_map[element_name]) # "found: Python's built-in float type has double precision " (? check if really correct ?) | |
| 163 total_mass += element_mass * count | |
| 164 | |
| 165 return total_mass | |
| 166 | |
| 167 | |
| 168 | |
| 169 def _save_data(data, headers, out_csv): | |
| 129 ''' | 170 ''' | 
| 130 Writes tab-separated data to file | 171 Writes tab-separated data to file | 
| 131 @param data: dictionary containing merged dataset | 172 @param data: dictionary containing merged dataset | 
| 132 @param out_csv: output csv file | 173 @param out_csv: output csv file | 
| 133 ''' | 174 ''' | 
| 137 output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") | 178 output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") | 
| 138 | 179 | 
| 139 # Write headers | 180 # Write headers | 
| 140 output_single_handle.writerow(headers) | 181 output_single_handle.writerow(headers) | 
| 141 | 182 | 
| 142 # Write one line for each centrotype | 183 # Write | 
| 143 for centrotype_idx in xrange(len(data)): | 184 for item_idx in xrange(len(data)): | 
| 144 for hit in data[centrotype_idx]: | 185 for hit in data[item_idx]: | 
| 145 output_single_handle.writerow(hit) | 186 output_single_handle.writerow(hit) | 
| 146 | 187 | 
| 188 | |
| 189 def _get_map_for_elements_and_masses(elements_and_masses): | |
| 190 ''' | |
| 191 This method will read out the column 'Chemical symbol' and make a map | |
| 192 of this, storing the column 'Relative atomic mass' as its value | |
| 193 ''' | |
| 194 resultMap = {} | |
| 195 index = 0 | |
| 196 for entry in elements_and_masses['Chemical symbol']: | |
| 197 resultMap[entry] = elements_and_masses['Relative atomic mass'][index] | |
| 198 index += 1 | |
| 199 | |
| 200 return resultMap | |
| 201 | |
| 202 | |
| 203 def init_elements_and_masses_map(): | |
| 204 ''' | |
| 205 Initializes the lookup map containing the elements and their respective masses | |
| 206 ''' | |
| 207 elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab")) | |
| 208 global elements_and_masses_map | |
| 209 elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses) | |
| 210 | |
| 147 | 211 | 
| 148 def main(): | 212 def main(): | 
| 149 ''' | 213 ''' | 
| 150 Combine Output main function | 214 Combine Output main function | 
| 151 | 215 | 
| 154 quantification files are to be combined with combine_output.py result as well. | 218 quantification files are to be combined with combine_output.py result as well. | 
| 155 ''' | 219 ''' | 
| 156 rankfilter_and_caslookup_combined_file = sys.argv[1] | 220 rankfilter_and_caslookup_combined_file = sys.argv[1] | 
| 157 msclust_quantification_and_spectra_file = sys.argv[2] | 221 msclust_quantification_and_spectra_file = sys.argv[2] | 
| 158 output_csv = sys.argv[3] | 222 output_csv = sys.argv[3] | 
| 223 # metadata | |
| 224 metadata = OrderedDict() | |
| 225 metadata['organism'] = sys.argv[4] | |
| 226 metadata['tissue'] = sys.argv[5] | |
| 227 metadata['experiment_name'] = sys.argv[6] | |
| 228 metadata['user_name'] = sys.argv[7] | |
| 229 metadata['column_type'] = sys.argv[8] | |
| 159 | 230 | 
| 160 # Read RankFilter and CasLookup output files | 231 # Read RankFilter and CasLookup output files | 
| 161 rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file) | 232 rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file) | 
| 162 msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',') | 233 msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',') | 
| 163 | 234 | 
| 235 # Read elements and masses to use for the MW/MM calculation : | |
| 236 init_elements_and_masses_map() | |
| 237 | |
| 164 merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype', | 238 merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype', | 
| 165 msclust_quantification_and_spectra, 'centrotype', _compare_records, _merge_records, N_TO_ONE) | 239 msclust_quantification_and_spectra, 'centrotype', | 
| 166 headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() | 240 _compare_records, _merge_records, metadata, | 
| 167 _save_data(merged, headers, nhits, output_csv) | 241 N_TO_ONE) | 
| 242 headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard'] | |
| 243 _save_data(merged, headers, output_csv) | |
| 168 | 244 | 
| 169 | 245 | 
| 170 if __name__ == '__main__': | 246 if __name__ == '__main__': | 
| 171 main() | 247 main() | 
