Mercurial > repos > bgruening > flexynesis
annotate fetch_cbioportal_data.py @ 1:b353dad17ab7 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
author | bgruening |
---|---|
date | Mon, 14 Apr 2025 09:56:16 +0000 |
parents | |
children |
rev | line source |
---|---|
1
b353dad17ab7
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff
changeset
|
1 #!/usr/bin/env python |
b353dad17ab7
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff
changeset
|
2 |
b353dad17ab7
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff
changeset
|
3 import argparse |
b353dad17ab7
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff
changeset
|
4 import os |
b353dad17ab7
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff
changeset
|
5 |
b353dad17ab7
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff
changeset
|
6 from flexynesis.utils import CBioPortalData |
b353dad17ab7
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff
changeset
|
7 |
b353dad17ab7
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff
changeset
|
8 |
b353dad17ab7
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 973836fb40ecb9c0ac26f675d12b20fc8e5f51f4
bgruening
parents:
diff
changeset
|
# Default cBioPortal file name for each supported data type.  "other" has no
# default, so it can only be used together with an explicit --mapped_files entry.
DEFAULT_FILE_MAPPING = {
    "clin": "data_clinical_patient.txt",  # can be any with 'clinical' in file name
    "mut": "data_mutations.txt",  # any with 'mutations' in file name
    "omics": "data_cna.txt",
    "other": None,
}


def _split_csv(value):
    """Split a comma-separated CLI value, stripping whitespace around items."""
    return [item.strip() for item in value.split(",")]


def _validate_split_ratio(ratio):
    """Raise ValueError unless *ratio* lies within [0.0, 1.0]."""
    # Written as a negated chained comparison so that NaN is rejected as well.
    if not 0.0 <= ratio <= 1.0:
        raise ValueError(f"--split_ratio must be between 0.0 and 1.0, got {ratio}.")


def _resolve_files(data_types, mapped_files=None):
    """Return the ``{data_type: file_name}`` mapping to fetch.

    Args:
        data_types: Requested data type names.
        mapped_files: Optional file names, positionally aligned with
            *data_types*.  When ``None`` the defaults are used.

    Raises:
        ValueError: If 'clin' is missing, a type is unsupported or repeated,
            the mapped files do not line up with the data types, a mapped
            file does not end with '.txt', or a type without a default file
            is requested without an explicit mapping.
    """
    if "clin" not in data_types:
        raise ValueError("Clinical data ('clin') is required for splitting the dataset.")

    invalid_types = set(data_types) - set(DEFAULT_FILE_MAPPING)
    if invalid_types:
        raise ValueError(
            f"Invalid data types: {invalid_types}. Supported types: {list(DEFAULT_FILE_MAPPING)}"
        )

    # A repeated type would silently overwrite its earlier entry in the
    # mapping, dropping one of the user's files without notice.
    duplicates = sorted({dt for dt in data_types if data_types.count(dt) > 1})
    if duplicates:
        raise ValueError(f"Duplicate data types are not allowed: {duplicates}.")

    if mapped_files is not None:
        if len(mapped_files) != len(data_types):
            raise ValueError(
                f"Number of mapped files ({len(mapped_files)}) must match number of data types ({len(data_types)})."
            )
        for mapped_file in mapped_files:
            if not mapped_file.endswith(".txt"):
                raise ValueError(f"Mapped file '{mapped_file}' must end with '.txt'.")
        return dict(zip(data_types, mapped_files))

    without_default = [dt for dt in data_types if DEFAULT_FILE_MAPPING[dt] is None]
    if without_default:
        raise ValueError(
            f"No default file is defined for data type(s) {without_default}; "
            "provide --mapped_files to specify which file to fetch."
        )
    return {dt: DEFAULT_FILE_MAPPING[dt] for dt in data_types}


def _write_splits(dataset, data_types, output_dir):
    """Write ``<data_type>_<split>.csv`` for every requested type present in a split."""
    os.makedirs(output_dir, exist_ok=True)
    for data_type in data_types:
        for split in ("train", "test"):
            # Types that are absent from a split are skipped, as before.
            if data_type in dataset[split]:
                out_path = os.path.join(output_dir, f"{data_type}_{split}.csv")
                dataset[split][data_type].to_csv(out_path, index=True)


def main(argv=None):
    """Fetch cBioPortal data for a study, split it and write CSV files.

    Args:
        argv: Optional list of command-line arguments.  Defaults to
            ``sys.argv[1:]`` (argparse's behaviour when given ``None``).

    Raises:
        ValueError: If the data types, mapped files or split ratio are invalid.
    """
    parser = argparse.ArgumentParser(description="Fetch and prepare cBioPortal data for Flexynesis.")
    parser.add_argument("--study_id", required=True, help="cBioPortal study ID (e.g., 'brca_tcga')")
    parser.add_argument("--data_types", required=True, help="Comma-separated list of data types (e.g., 'clin,mut,omics')")
    parser.add_argument("--mapped_files", default=None, help="Comma-separated list of .txt files to map to data_types (optional)")
    parser.add_argument("--split_ratio", type=float, default=0.7, help="Training/test split ratio (0.0 to 1.0)")
    parser.add_argument("--output_dir", required=True, help="Output directory for datasets")

    args = parser.parse_args(argv)

    # Validate everything locally before any data is fetched.
    _validate_split_ratio(args.split_ratio)
    data_types = _split_csv(args.data_types)
    mapped_files = _split_csv(args.mapped_files) if args.mapped_files else None
    files_to_fetch = _resolve_files(data_types, mapped_files)

    cbioportal = CBioPortalData(study_id=args.study_id)
    cbioportal.get_cbioportal_data(study_id=args.study_id, files=files_to_fetch)
    dataset = cbioportal.split_data(ratio=args.split_ratio)

    _write_splits(dataset, data_types, args.output_dir)


if __name__ == "__main__":
    main()