Mercurial > repos > recetox > aplcms_to_ramclustr_converter
comparison aplcms_to_ramclustr_converter.py @ 4:9ea34e24474f draft
"planemo upload for repository https://github.com/RECETOX/galaxytools/tools/aplcms_to_ramclustr_converter/ commit 2dd20229f0c7f43dacc0d201ea50fef3c993d30e"
| author | recetox |
|---|---|
| date | Mon, 09 Aug 2021 15:29:08 +0000 |
| parents | 07667688735e |
| children | |
comparison
equal
deleted
inserted
replaced
| 3:07667688735e | 4:9ea34e24474f |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | 2 |
| 3 import argparse | 3 import argparse |
| 4 import sys | 4 import sys |
| 5 import warnings | |
| 6 | 5 |
| 7 import pandas as pd | 6 import pandas as pd |
| 8 | 7 |
| 9 | 8 |
| 10 warnings.simplefilter('ignore') | |
| 11 | |
| 12 parser = argparse.ArgumentParser() | 9 parser = argparse.ArgumentParser() |
| 13 parser.add_argument("--dataframe", help="Name of hdf dataframe") | 10 parser.add_argument("--dataframe", help="Parquet dataframe") |
| 14 parser.add_argument("--table", help="Name of a table in the dataframe") | |
| 15 parser.add_argument('output') | 11 parser.add_argument('output') |
| 16 args = parser.parse_args() | 12 args = parser.parse_args() |
| 17 | 13 |
| 18 | 14 |
| 19 def extract_data(table): | 15 def main(): |
| 20 num_samples = int((len(table.columns.tolist()) - 4) / 2) | 16 featureTable = pd.read_parquet(args.dataframe) |
| 21 mz_rt = table['mz'].map(str) + "_" + table['rt'].map(str) | |
| 22 | 17 |
| 23 intensities = table.iloc[:, 4:(4 + num_samples)] | 18 # Concatenate "mz" and "rt" columns; select relevant columns; pivot the table |
| 24 sample_labels = [label.split('.')[1] for label in intensities.columns.tolist()] | 19 featureTable["mz_rt"] = featureTable["mz"].astype(str) + "_" + featureTable["rt"].astype(str) |
| 25 ramclustr_data = pd.DataFrame({'mz_rt': mz_rt}) | 20 featureTable = featureTable[["sample", "mz_rt", "sample_intensity"]] |
| | 21 featureTable = pd.pivot_table(featureTable, columns="mz_rt", index="sample", values="sample_intensity") |
| 26 | 22 |
| 27 for idx in range(num_samples): | |
| 28 label = sample_labels[idx] | |
| 29 ramclustr_data[label] = intensities.iloc[:, idx] | |
| 30 | |
| 31 return ramclustr_data | |
| 32 | |
| 33 | |
| 34 def format_table(ramclustr_data): | |
| 35 ramclustr_data.set_index('mz_rt', inplace=True) | |
| 36 ramclustr_data = ramclustr_data.transpose() | |
| 37 ramclustr_data.index.rename('sample', inplace=True) | |
| 38 return ramclustr_data | |
| 39 | |
| 40 | |
| 41 def main(): | |
| 42 try: | 23 try: |
| 43 aplcms_table = pd.read_hdf(args.dataframe, args.table, errors='None') | 24 featureTable.to_csv(args.output, sep=',') |
| 44 except KeyError: | 25 msg = f"Dataset of {len(featureTable)} samples is converted to a feature-by-sample table" |
| 45 msg = "Selected table does not exist in HDF dataframe" | 26 print(msg, file=sys.stdout) |
| 46 print(msg, file=sys.stderr) | 27 return 0 |
| 47 sys.exit(1) | 28 except Exception: |
| 48 | 29 print("Could not write the data", file=sys.stdout) |
| 49 ramclustr_data = extract_data(aplcms_table) | 30 return 1 |
| 50 ramclustr_table = format_table(ramclustr_data) | |
| 51 | |
| 52 ramclustr_table.to_csv(args.output, sep=',') | |
| 53 msg = "Table '{}' of HDF dataset is converted to csv for RamClutsR".format(args.table) | |
| 54 print(msg, file=sys.stdout) | |
| 55 | 31 |
| 56 | 32 |
| 57 if __name__ == "__main__": | 33 if __name__ == "__main__": |
| 58 main() | 34 main() |
