Mercurial > repos > ric > test1
comparison galaxy-tools/biobank/utils/split_by_study.py @ 3:43be74e62bfe draft
Uploaded
author | ric |
---|---|
date | Thu, 22 Sep 2016 08:57:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:47bf0086e082 | 3:43be74e62bfe |
---|---|
1 """ | |
2 Split a file like:: | |
3 | |
4 individual gender father mother | |
5 ASTUDY:2141 MALE ASTUDY:12 ASTUDY:12341 | |
6 ASTUDY:415 MALE ASTUDY:3562 ASTUDY:13612 | |
7 BSTUDY:12515 FEMALE BSTUDY:3512 BSTUDY:124 | |
8 | |
9 into multiple files based on the STUDY value of the label stored in the "individual" column. | |
10 Each label in the "individual" column must have a STUDY:ENROLLMENT_CODE format, otherwise the line | |
11 will be skipped. | |
12 """ | |
13 | |
14 import sys, argparse, csv, os | |
15 from bl.vl.utils import LOG_LEVELS, get_logger | |
16 | |
17 | |
def get_parser():
    """Build and return the command-line parser for this tool."""
    ap = argparse.ArgumentParser('Split a file containing pedigree informations in multiple files using the study as split criteria')
    ap.add_argument('--logfile', type=str, help='log file (default=stderr)')
    ap.add_argument('--loglevel', type=str, choices=LOG_LEVELS, default='INFO',
                    help='logging level')
    ap.add_argument('--in-file', type=str, required=True,
                    help='input TSV file')
    ap.add_argument('--record-id', type=int,
                    help='Output ID record given by Galaxy')
    ap.add_argument('--out-path', type=str, default='.',
                    help='Output directory')
    return ap
30 | |
31 | |
def split_element(element, logger):
    """Split a STUDY:ENROLLMENT_CODE label into its (study, code) pair.

    Returns (None, None) — after logging an error — when *element* does
    not contain exactly one ':' separator.
    """
    try:
        study, code = element.split(':')
    except ValueError:
        # Unpacking fails when there are zero or more than one ':'.
        logger.error('Label %s is not a label with format STUDY:ENROLLMENT_CODE', element)
        return None, None
    else:
        return study, code
39 | |
40 | |
def map_by_study(records, logger):
    """Group pedigree *records* by the STUDY part of their 'individual' label.

    :param records: iterable of dicts, each with an 'individual' key whose
        value should have the STUDY:ENROLLMENT_CODE format
    :param logger: logger used to report skipped records
    :return: dict mapping each study label to the list of its records,
        in input order; malformed records are skipped
    """
    records_map = {}
    for rec in records:
        study, code = split_element(rec['individual'], logger)
        # split_element yields (None, None) for malformed labels; the
        # truthiness test also skips the degenerate ':' label ('' , '').
        if not study and not code:
            logger.debug('Skipping record %r', rec)
            continue
        records_map.setdefault(study, []).append(rec)
    # len() works directly on the dict; no need to build the key list.
    logger.info('Records splitted between %d studies', len(records_map))
    return records_map
51 | |
52 | |
def dump_records(study_label, records, header, output_path, logger, galaxy_record_id=None):
    """Write *records* for one study as a TSV file under *output_path*.

    The file is named '<study>_individuals.tsv', unless *galaxy_record_id*
    is given, in which case Galaxy's dataset naming scheme
    'primary_<id>_<study>_visible_tabular' is used (underscores in the
    study label become dashes).
    """
    if galaxy_record_id:
        base_name = 'primary_%d_%s_visible_tabular' % (
            galaxy_record_id, study_label.replace('_', '-'))
    else:
        base_name = '%s_individuals.tsv' % study_label
    out_file_name = os.path.join(output_path, base_name)
    with open(out_file_name, 'w') as out_stream:
        logger.info('Dumping %d records to file %s', len(records), out_file_name)
        tsv_writer = csv.DictWriter(out_stream, header, delimiter='\t')
        tsv_writer.writeheader()
        tsv_writer.writerows(records)
68 | |
69 | |
def main(argv):
    """Entry point: read the input TSV, split its records by study and
    dump one output file per study.

    :param argv: command-line arguments (without the program name)
    """
    parser = get_parser()
    args = parser.parse_args(argv)

    logger = get_logger('split_by_study', level=args.loglevel, filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)
    with open(args.in_file) as in_file:
        reader = csv.DictReader(in_file, delimiter='\t')
        records = list(reader)

    records_map = map_by_study(records, logger)
    # Force the header of the output files in order to prevent problems when running the workflow later
    header = ['individual', 'gender', 'father', 'mother']
    # items() instead of the Python 2-only iteritems(): works unchanged on
    # Python 3, and on Python 2 it merely materializes a list.
    for study, study_records in records_map.items():
        dump_records(study, study_records, header, args.out_path, logger, args.record_id)
    logger.info('Job completed')
87 | |
88 | |
# Script entry point: forward the CLI arguments (sans program name) to main().
if __name__ == '__main__':
    main(sys.argv[1:])