| 0 | 1 #!/usr/bin/env python | 
|  | 2 # encoding: utf-8 | 
|  | 3 ''' | 
|  | 4 Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust | 
|  | 5 into a tabular file that can be uploaded to the MetExp database. | 
|  | 6 | 
|  | 7 RankFilter, CasLookup are already combined by combine_output.py so here we will use | 
| 1 | 8 this result. Furthermore here one of the MsClust | 
|  | 9 quantification files containing the respective spectra details are to be combined as well. | 
| 0 | 10 | 
|  | 11 Extra calculations performed: | 
|  | 12 - The column MW is also added here and is derived from the column FORMULA found | 
| 1 | 13   in RankFilter, CasLookup combined result. | 
| 0 | 14 | 
| 1 | 15 So in total here we merge 2 files and calculate one new column. | 
| 0 | 16 ''' | 
| 1 | 17 from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611 | 
| 0 | 18 import csv | 
| 1 | 19 import re | 
| 0 | 20 import sys | 
|  | 21 from collections import OrderedDict | 
|  | 22 | 
|  | 23 __author__ = "Pieter Lukasse" | 
|  | 24 __contact__ = "pieter.lukasse@wur.nl" | 
|  | 25 __copyright__ = "Copyright, 2013, Plant Research International, WUR" | 
|  | 26 __license__ = "Apache v2" | 
|  | 27 | 
|  | 28 def _process_data(in_csv, delim='\t'): | 
|  | 29     ''' | 
|  | 30     Generic method to parse a tab-separated file returning a dictionary with named columns | 
|  | 31     @param in_csv: input filename to be parsed | 
|  | 32     ''' | 
|  | 33     data = list(csv.reader(open(in_csv, 'rU'), delimiter=delim)) | 
|  | 34     header = data.pop(0) | 
|  | 35     # Create dictionary with column name as key | 
|  | 36     output = OrderedDict() | 
|  | 37     for index in xrange(len(header)): | 
|  | 38         output[header[index]] = [row[index] for row in data] | 
|  | 39     return output | 
|  | 40 | 
|  | 41 ONE_TO_ONE = 'one_to_one' | 
|  | 42 N_TO_ONE = 'n_to_one' | 
|  | 43 | 
| 1 | 44 def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE): | 
| 0 | 45     ''' | 
|  | 46     Merges data from both input dictionaries based on the link fields. This method will | 
|  | 47     build up a new list containing the merged hits as the items. | 
|  | 48     @param set1: dictionary holding set1 in the form of N lists (one list per attribute name) | 
|  | 49     @param set2: dictionary holding set2 in the form of N lists (one list per attribute name) | 
|  | 50     ''' | 
| 1 | 51     # TODO test for correct input files -> same link_field values should be there | 
|  | 52     # (test at least number of unique link_field values): | 
| 0 | 53     # | 
|  | 54     # if (len(set1[link_field_set1]) != len(set2[link_field_set2])): | 
|  | 55     #    raise Exception('input files should have the same nr of key values  ') | 
|  | 56 | 
|  | 57 | 
|  | 58     merged = [] | 
|  | 59     processed = {} | 
|  | 60     for link_field_set1_idx in xrange(len(set1[link_field_set1])): | 
|  | 61         link_field_set1_value = set1[link_field_set1][link_field_set1_idx] | 
|  | 62         if not link_field_set1_value in processed : | 
|  | 63             # keep track of processed items to not repeat them | 
|  | 64             processed[link_field_set1_value] = link_field_set1_value | 
|  | 65 | 
|  | 66             # Get the indices for current link_field_set1_value in both data-structures for proper matching | 
|  | 67             set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value] | 
|  | 68             set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ] | 
| 1 | 69             # Validation : | 
|  | 70             if len(set2index) == 0: | 
|  | 71                 # means that corresponding data could not be found in set2, then throw error | 
|  | 72                 raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" + | 
|  | 73                                 link_field_set1_value + " only found in first dataset. ") | 
| 0 | 74 | 
|  | 75             merged_hits = [] | 
|  | 76             # Combine hits | 
|  | 77             for hit in xrange(len(set1index)): | 
|  | 78                 # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do | 
|  | 79                 # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its | 
| 1 | 80                 # corresponding value in the sets; i.e. | 
|  | 81                 # set1[key] => returns the list/array with size = nrrows, with the values for the attribute | 
|  | 82                 #                    represented by "key". | 
|  | 83                 # set1index[hit] => points to the row nr=hit (hit is a rownr/index) | 
|  | 84                 # So set1[x][set1index[n]] = set1.attributeX.instanceN | 
|  | 85                 # | 
| 0 | 86                 # It just ensures the entry is made available as a plain named array for easy access. | 
|  | 87                 rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()])) | 
|  | 88                 if relation_type == ONE_TO_ONE : | 
|  | 89                     cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()])) | 
|  | 90                 else: | 
|  | 91                     # is N to 1: | 
|  | 92                     cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()])) | 
|  | 93 | 
| 1 | 94                 merged_hit = merge_function(rf_record, cl_record, metadata) | 
| 0 | 95                 merged_hits.append(merged_hit) | 
|  | 96 | 
|  | 97             merged.append(merged_hits) | 
|  | 98 | 
|  | 99     return merged, len(set1index) | 
|  | 100 | 
|  | 101 | 
|  | 102 def _compare_records(key1, key2): | 
|  | 103     ''' | 
|  | 104     in this case the compare method is really simple as both keys are expected to contain | 
|  | 105     same value when records are the same | 
|  | 106     ''' | 
|  | 107     if key1 == key2: | 
|  | 108         return True | 
|  | 109     else: | 
|  | 110         return False | 
|  | 111 | 
|  | 112 | 
|  | 113 | 
| 1 | 114 def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata): | 
| 0 | 115     ''' | 
|  | 116     Combines single records from both the RankFilter+CasLookup combi file and from MsClust file | 
|  | 117 | 
|  | 118     @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py) | 
|  | 119     @param msclust_quant_record: msclust quantification + spectrum record | 
|  | 120     ''' | 
|  | 121     record = [] | 
|  | 122     for column in rank_caslookup_combi: | 
|  | 123         record.append(rank_caslookup_combi[column]) | 
|  | 124 | 
|  | 125     for column in msclust_quant_record: | 
|  | 126         record.append(msclust_quant_record[column]) | 
| 1 | 127 | 
|  | 128     for column in metadata: | 
|  | 129         record.append(metadata[column]) | 
|  | 130 | 
|  | 131     # add MOLECULAR MASS (MM) | 
|  | 132     molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA']) | 
|  | 133     # limit to two decimals: | 
|  | 134     record.append("{0:.2f}".format(molecular_mass)) | 
|  | 135 | 
|  | 136     # add MOLECULAR WEIGHT (MW) - TODO - calculate this | 
|  | 137     record.append('0.0') | 
|  | 138 | 
|  | 139     # level of identification and Location of reference standard | 
|  | 140     record.append('0') | 
|  | 141     record.append('') | 
| 0 | 142 | 
|  | 143     return record | 
|  | 144 | 
|  | 145 | 
| 1 | 146 def get_molecular_mass(formula): | 
|  | 147     ''' | 
|  | 148     Calculates the molecular mass (MM). | 
|  | 149     E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O | 
|  | 150     ''' | 
|  | 151 | 
|  | 152     # Each element is represented by a capital letter, followed optionally by | 
|  | 153     # lower case, with one or more digits as for how many elements: | 
|  | 154     element_pattern = re.compile("([A-Z][a-z]?)(\d*)") | 
| 0 | 155 | 
| 1 | 156     total_mass = 0 | 
|  | 157     for (element_name, count) in element_pattern.findall(formula): | 
|  | 158         if count == "": | 
|  | 159             count = 1 | 
|  | 160         else: | 
|  | 161             count = int(count) | 
|  | 162         element_mass = float(elements_and_masses_map[element_name])  # "found: Python's built-in float type has double precision " (? check if really correct ?) | 
|  | 163         total_mass += element_mass * count | 
|  | 164 | 
|  | 165     return total_mass | 
|  | 166 | 
|  | 167 | 
|  | 168 | 
|  | 169 def _save_data(data, headers, out_csv): | 
| 0 | 170     ''' | 
|  | 171     Writes tab-separated data to file | 
|  | 172     @param data: dictionary containing merged dataset | 
|  | 173     @param out_csv: output csv file | 
|  | 174     ''' | 
|  | 175 | 
|  | 176     # Open output file for writing | 
|  | 177     outfile_single_handle = open(out_csv, 'wb') | 
|  | 178     output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") | 
|  | 179 | 
|  | 180     # Write headers | 
|  | 181     output_single_handle.writerow(headers) | 
|  | 182 | 
| 1 | 183     # Write | 
|  | 184     for item_idx in xrange(len(data)): | 
|  | 185         for hit in data[item_idx]: | 
| 0 | 186             output_single_handle.writerow(hit) | 
|  | 187 | 
|  | 188 | 
| 1 | 189 def _get_map_for_elements_and_masses(elements_and_masses): | 
|  | 190     ''' | 
|  | 191     This method will read out the column 'Chemical symbol' and make a map | 
|  | 192     of this, storing the column 'Relative atomic mass' as its value | 
|  | 193     ''' | 
|  | 194     resultMap = {} | 
|  | 195     index = 0 | 
|  | 196     for entry in elements_and_masses['Chemical symbol']: | 
|  | 197         resultMap[entry] = elements_and_masses['Relative atomic mass'][index] | 
|  | 198         index += 1 | 
|  | 199 | 
|  | 200     return resultMap | 
|  | 201 | 
|  | 202 | 
|  | 203 def init_elements_and_masses_map(): | 
|  | 204     ''' | 
|  | 205     Initializes the lookup map containing the elements and their respective masses | 
|  | 206     ''' | 
|  | 207     elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab")) | 
|  | 208     global elements_and_masses_map | 
|  | 209     elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses) | 
|  | 210 | 
|  | 211 | 
| 0 | 212 def main(): | 
|  | 213     ''' | 
|  | 214     Combine Output main function | 
|  | 215 | 
|  | 216     RankFilter, CasLookup are already combined by combine_output.py so here we will use | 
|  | 217     this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust | 
|  | 218     quantification files are to be combined with combine_output.py result as well. | 
|  | 219     ''' | 
|  | 220     rankfilter_and_caslookup_combined_file = sys.argv[1] | 
|  | 221     msclust_quantification_and_spectra_file = sys.argv[2] | 
|  | 222     output_csv = sys.argv[3] | 
| 1 | 223     # metadata | 
|  | 224     metadata = OrderedDict() | 
|  | 225     metadata['organism'] = sys.argv[4] | 
|  | 226     metadata['tissue'] = sys.argv[5] | 
|  | 227     metadata['experiment_name'] = sys.argv[6] | 
|  | 228     metadata['user_name'] = sys.argv[7] | 
|  | 229     metadata['column_type'] = sys.argv[8] | 
| 0 | 230 | 
|  | 231     # Read RankFilter and CasLookup output files | 
|  | 232     rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file) | 
|  | 233     msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',') | 
|  | 234 | 
| 1 | 235     # Read elements and masses to use for the MW/MM calculation : | 
|  | 236     init_elements_and_masses_map() | 
|  | 237 | 
| 0 | 238     merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype', | 
| 1 | 239                                 msclust_quantification_and_spectra, 'centrotype', | 
|  | 240                                 _compare_records, _merge_records, metadata, | 
|  | 241                                 N_TO_ONE) | 
|  | 242     headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard'] | 
|  | 243     _save_data(merged, headers, output_csv) | 
| 0 | 244 | 
|  | 245 | 
|  | 246 if __name__ == '__main__': | 
|  | 247     main() |