comparison galaxy-tools/biobank/utils/prepare_enrollments_import.py @ 0:ba6cf6ede027 draft default tip

Uploaded
author ric
date Wed, 28 Sep 2016 06:03:30 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ba6cf6ede027
1 """
2 Split a file like::
3
4 source enrollment
5 V044DE795E7F9F42FEB9855288CF577A77 ASTUDY:2141
6 V06C59B915C0FD47DABE6AE02C731780AF BSTUDY:390
7
8 into a new TSV file with separate study and label columns
9
10 source study label
11 V044DE795E7F9F42FEB9855288CF577A77 ASTUDY 2141
12 V06C59B915C0FD47DABE6AE02C731780AF BSTUDY 390
13
14 """
15
16 import sys, argparse, csv
17 from bl.vl.utils import LOG_LEVELS, get_logger
18
19
def get_parser():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: parser accepting the optional ``--logfile``
        and ``--loglevel`` options and the required ``--in-file`` and
        ``--out-enrollments`` options.
    """
    # The sentence is passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, so passing it positionally
    # would print it in the usage line in place of the program name.
    parser = argparse.ArgumentParser(
        description='Prepare input files for enrollments import workflow')
    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    # This is the path the script writes its result to.
    parser.add_argument('--out-enrollments', type=str, required=True,
                        help='input file with Enrollments definitions')
    return parser
30
31
def get_enrollments_definitions(records, logger):
    """Turn raw input records into enrollment definitions.

    Every record must provide a ``source`` key and an ``enrollment`` key
    whose value has the form ``STUDY:LABEL``. A record is logged as an error
    and skipped when one of the keys is missing or when the ``enrollment``
    value does not split into exactly two parts on ``:``.

    Args:
        records: iterable of dict-like rows.
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        list of dicts with the keys ``source``, ``study`` and ``label``,
        in the same order as the accepted input records.
    """
    logger.info('Creating enrollment definitions')
    enr_defs = []
    for rec in records:
        try:
            source = rec['source']
            enrollment = rec['enrollment']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        try:
            study, label = enrollment.split(':')
        except (ValueError, AttributeError):
            # ValueError: the value has no ':' or more than one.
            # AttributeError: the value is not a string, e.g. the None that
            # csv.DictReader stores for fields missing from a short row.
            logger.error('Skipped record %r, wrong label format for %s', rec, enrollment)
            continue
        enr_defs.append({'source': source, 'study': study, 'label': label})
    logger.info('Retrieved %d enrollment definitions', len(enr_defs))
    return enr_defs
50
51
def get_parents_definitions(records, logger):
    """Extract parent definitions from raw input records.

    Every record must provide the keys ``individual``, ``father`` and
    ``mother``; a record missing any of them is logged as an error and
    skipped. Records whose ``father`` and ``mother`` values are both the
    literal string ``'None'`` are dropped without logging.

    Args:
        records: iterable of dict-like rows.
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        list of dicts with the keys ``individual``, ``father`` and
        ``mother``, in the same order as the accepted input records.
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        try:
            individual = rec['individual']
            father = rec['father']
            mother = rec['mother']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        # Keep the record as soon as at least one parent is given.
        if father == 'None' and mother == 'None':
            continue
        parents_defs.append({
            'individual': individual,
            'father': father,
            'mother': mother,
        })
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs
70
71
def main(argv):
    """Convert the input TSV file into the enrollments TSV file.

    Parses ``argv``, loads every row of ``--in-file`` as a dict, converts
    the rows with ``get_enrollments_definitions`` and writes the result,
    preceded by a header line, to ``--out-enrollments`` as tab-separated
    ``source``, ``study`` and ``label`` columns.

    Args:
        argv: list of command-line arguments, without the program name.
    """
    args = get_parser().parse_args(argv)

    logger = get_logger('prepare_enrollments_import', level=args.loglevel,
                        filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)

    # Load all rows up front so the count can be logged before conversion.
    with open(args.in_file) as tsv_in:
        records = list(csv.DictReader(tsv_in, delimiter='\t'))
    logger.info('Loaded %d records', len(records))

    enrollment_defs = get_enrollments_definitions(records, logger)

    out_columns = ['source', 'study', 'label']
    with open(args.out_enrollments, 'w') as tsv_out:
        writer = csv.DictWriter(tsv_out, out_columns, delimiter='\t')
        writer.writeheader()
        writer.writerows(enrollment_defs)

    logger.info('Job completed')
95
# Run the conversion only when the file is executed as a script; the
# program name (sys.argv[0]) is not forwarded to main().
if __name__ == '__main__':
    main(argv=sys.argv[1:])