Mercurial > repos > pieterlukasse > prims_metabolomics
comparison export_to_metexp_tabular.py @ 0:4b94bb2d381c
Initial commit to toolshed
| author | pieter.lukasse@wur.nl |
|---|---|
| date | Thu, 16 Jan 2014 13:22:38 +0100 |
| parents | |
| children | 071a185c2ced |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4b94bb2d381c |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # encoding: utf-8 | |
| 3 ''' | |
| 4 Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust | |
| 5 into a tabular file that can be uploaded to the MetExp database. | |
| 6 | |
| 7 RankFilter, CasLookup are already combined by combine_output.py so here we will use | |
| 8 this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust | |
| 9 quantification files are to be combined with combine_output.py result as well. | |
| 10 | |
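| 11 Extra calculations performed: | |
| 12 - None in this revision. The column MW (to be derived from the column FORMULA found | |
| 13 in combine_output.py result) is not computed by the code in this module. | |
| 14 | |
| 15 So in total here we merge 2 input files (the combine_output.py result and the combined MsClust quantification + spectra file) and calculate no new columns. | |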
| 16 ''' | |
| 17 | |
| 18 import csv | |
| 19 import sys | |
| 20 from collections import OrderedDict | |
| 21 | |
| 22 __author__ = "Pieter Lukasse" | |
| 23 __contact__ = "pieter.lukasse@wur.nl" | |
| 24 __copyright__ = "Copyright, 2013, Plant Research International, WUR" | |
| 25 __license__ = "Apache v2" | |
| 26 | |
def _process_data(in_csv, delim='\t'):
    '''
    Generic method to parse a delimited text file returning a dictionary with named columns.

    The first row of the file is used as the header; every following row is a data row.

    @param in_csv: input filename to be parsed
    @param delim: field delimiter used in the file (tab by default)
    @return: OrderedDict with the column names (in file order) as keys and, for each
             column, the list of its (string) values, one item per data row
    @raise IndexError: when the file has no header row at all, or when a data row has
                       fewer fields than the header
    '''
    # The csv module needs a different open mode depending on the major Python version:
    # universal-newline text mode on Python 2, text mode with newline='' on Python 3
    # (where the 'U' flag is deprecated and eventually rejected).
    if sys.version_info[0] >= 3:
        infile = open(in_csv, 'r', newline='')
    else:
        infile = open(in_csv, 'rU')

    # The context manager guarantees the file handle is closed, also when parsing fails
    with infile:
        data = list(csv.reader(infile, delimiter=delim))

    header = data.pop(0)
    # Create dictionary with column name as key
    output = OrderedDict()
    for index, column_name in enumerate(header):
        output[column_name] = [row[index] for row in data]
    return output
| 39 | |
# Relation types accepted by _merge_data:
# ONE_TO_ONE: the i-th row of set1 with a given link value is paired with the i-th matching row of set2
ONE_TO_ONE = 'one_to_one'
# N_TO_ONE: every row of set1 with a given link value is paired with the first matching row of set2
N_TO_ONE = 'n_to_one'


def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE):
    '''
    Merges data from both input dictionaries based on the link fields. This method will
    build up a new list containing the merged hits as the items.

    The result holds one item per distinct link value of set1 (in order of first appearance).
    Each item is itself a list with one merged hit per set1 row carrying that link value,
    where a merged hit is whatever merge_function returns for the pair of records.

    @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
    @param link_field_set1: name of the attribute in set1 holding the link values
    @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
    @param link_field_set2: name of the attribute in set2 holding the link values
    @param compare_function: called as compare_function(set2_link_value, set1_link_value); a
                             truthy result means the set2 row matches the set1 link value
    @param merge_function: called as merge_function(set1_record, set2_record), where both records
                           are OrderedDicts (attribute name => value) for a single row
    @param relation_type: ONE_TO_ONE (default) or any other value (e.g. N_TO_ONE) for N to 1
    @return: tuple (merged, nhits) where nhits is the number of set1 rows in the last processed
             link value group (0 when set1 has no rows)
    @raise IndexError: when set2 has too few rows matching a link value of set1
    '''
    # TODO test for correct input files -> same link_field values should be there
    # (test at least number of unique link_field values)

    link_values_set1 = set1[link_field_set1]
    link_values_set2 = set2[link_field_set2]

    # Group the row indices of set1 per link value in a single pass (instead of rescanning
    # set1 for every value), keeping the order in which the link values first appear.
    rows_per_link_value = OrderedDict()
    for row_idx, link_value in enumerate(link_values_set1):
        rows_per_link_value.setdefault(link_value, []).append(row_idx)

    merged = []
    # Stays 0 when set1 has no rows, so the return statement below is always well defined
    nhits = 0
    for link_value, set1index in rows_per_link_value.items():
        # Indices of the rows in set2 that match the current link value
        set2index = [index for index, value in enumerate(link_values_set2)
                     if compare_function(value, link_value)]

        # Fail with a clear message rather than an anonymous "list index out of range"
        required_matches = len(set1index) if relation_type == ONE_TO_ONE else 1
        if len(set2index) < required_matches:
            raise IndexError("link value %r of field '%s' has %d matching row(s) in field '%s', "
                             "but at least %d needed" % (link_value, link_field_set1, len(set2index),
                                                         link_field_set2, required_matches))

        merged_hits = []
        # Combine hits
        for hit, set1_row in enumerate(set1index):
            # ONE_TO_ONE pairs the rows up by position, otherwise (N to 1) the first match is reused
            set2_row = set2index[hit] if relation_type == ONE_TO_ONE else set2index[0]

            # Turn the column-oriented data into plain single-row records (attribute name => value)
            set1_record = OrderedDict((key, values[set1_row]) for key, values in set1.items())
            set2_record = OrderedDict((key, values[set2_row]) for key, values in set2.items())

            merged_hits.append(merge_function(set1_record, set2_record))

        merged.append(merged_hits)
        nhits = len(set1index)

    return merged, nhits
| 92 | |
| 93 | |
def _compare_records(key1, key2):
    '''
    In this case the compare method is really simple, as both keys are expected to contain
    the same value when the records are the same.

    @param key1: link value of the first record
    @param key2: link value of the second record
    @return: True when both keys are equal, False otherwise
    '''
    return key1 == key2
| 103 | |
| 104 | |
| 105 | |
def _merge_records(rank_caslookup_combi, msclust_quant_record):
    '''
    Combines single records from both the RankFilter+CasLookup combi file and from MsClust file

    @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
    @param msclust_quant_record: msclust quantification + spectrum record
    @return: new list holding all values of rank_caslookup_combi followed by all values of
             msclust_quant_record, each in the iteration order of its record
    '''
    return list(rank_caslookup_combi.values()) + list(msclust_quant_record.values())
| 124 | |
| 125 | |
| 126 | |
| 127 | |
def _save_data(data, headers, nhits, out_csv):
    '''
    Writes tab-separated data to file

    @param data: merged dataset: a list with one item per centrotype, each item being
                 a list of hits, where each hit is the list of values for one output row
    @param headers: list of column names, written as the first row
    @param nhits: not used by this method (kept so existing callers keep working)
    @param out_csv: output csv file
    '''
    # The csv module needs a different open mode depending on the major Python version:
    # binary mode on Python 2, text mode with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        outfile_single_handle = open(out_csv, 'w', newline='')
    else:
        outfile_single_handle = open(out_csv, 'wb')

    # The context manager flushes and closes the output file, also when writing fails
    with outfile_single_handle:
        output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")

        # Write headers
        output_single_handle.writerow(headers)

        # Write one line for each hit of each centrotype
        for centrotype_hits in data:
            output_single_handle.writerows(centrotype_hits)
| 146 | |
| 147 | |
def main():
    '''
    Combine Output main function

    Merges the combine_output.py result (RankFilter and CasLookup already combined) with the
    MsClust quantification + spectra file and writes the result as a tab-separated file.

    Expects three command line arguments:
      1. the tab-separated RankFilter + CasLookup combined file
      2. the comma-separated MsClust quantification + spectra file
      3. the name of the output file to write
    '''
    if len(sys.argv) < 4:
        # Exit with a usage message rather than an IndexError traceback
        sys.exit('Usage: %s <rankfilter_and_caslookup_combined_file> '
                 '<msclust_quantification_and_spectra_file> <output_csv>' % sys.argv[0])

    rankfilter_and_caslookup_combined_file = sys.argv[1]
    msclust_quantification_and_spectra_file = sys.argv[2]
    output_csv = sys.argv[3]

    # Read the RankFilter+CasLookup combined file (tab-separated) and the MsClust file (comma-separated)
    rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
    msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')

    # Each RankFilter+CasLookup row is paired with the first MsClust row of the same centrotype
    merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype',
                                msclust_quantification_and_spectra, 'centrotype',
                                _compare_records, _merge_records, N_TO_ONE)

    # list(...) is needed because on Python 3 keys() returns a view that does not support "+"
    headers = list(rankfilter_and_caslookup_combined.keys()) + list(msclust_quantification_and_spectra.keys())
    _save_data(merged, headers, nhits, output_csv)


if __name__ == '__main__':
    main()
