3
|
1 """
|
|
Split a file like::

  source                              enrollment
  V044DE795E7F9F42FEB9855288CF577A77  ASTUDY:2141
  V06C59B915C0FD47DABE6AE02C731780AF  BSTUDY:390

into a new TSV file with separate study and label columns::

  source                              study   label
  V044DE795E7F9F42FEB9855288CF577A77  ASTUDY  2141
  V06C59B915C0FD47DABE6AE02C731780AF  BSTUDY  390
|
|
13
|
|
14 """
|
|
15
|
|
16 import sys, argparse, csv
|
|
17 from bl.vl.utils import LOG_LEVELS, get_logger
|
|
18
|
|
19
|
|
def get_parser():
    """Build the command-line parser for this script.

    Recognized options:

    * ``--logfile``: log file path (optional).
    * ``--loglevel``: one of ``LOG_LEVELS``, default ``'INFO'``.
    * ``--in-file``: input TSV file (required).
    * ``--out-enrollments``: path of the enrollments definitions file
      (required).

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    # The text must be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, so passing it positionally
    # would show the whole sentence as the program name in the usage line.
    parser = argparse.ArgumentParser(
        description='Prepare input files for enrollments import workflow')
    parser.add_argument('--logfile', type=str,
                        help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--out-enrollments', type=str, required=True,
                        help='input file with Enrollments definitions')
    return parser
|
|
30
|
|
31
|
|
def get_enrollments_definitions(records, logger):
    """Convert input rows into enrollment definitions.

    Every record must provide a ``source`` key and an ``enrollment`` key
    whose value has the form ``STUDY:LABEL``. A record is logged as an
    error and skipped when either key is missing, or when the
    ``enrollment`` value does not split into exactly two parts on ``:``
    (including a ``None`` value, which ``csv.DictReader`` produces for rows
    shorter than the header).

    Args:
        records: iterable of dict-like rows.
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        list: one dict per valid record, with the keys ``source``,
        ``study`` and ``label``.
    """
    logger.info('Creating enrollment definitions')
    enr_defs = []
    for rec in records:
        try:
            source = rec['source']
            enrollment = rec['enrollment']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        try:
            # Unpacking raises ValueError unless there is exactly one ':';
            # a None value raises AttributeError on .split().
            study, label = enrollment.split(':')
        except (ValueError, AttributeError):
            logger.error('Skipped record %r, wrong label format for %s',
                         rec, enrollment)
            continue
        enr_defs.append({'source': source, 'study': study, 'label': label})
    logger.info('Retrieved %d enrollment definitions', len(enr_defs))
    return enr_defs
|
|
50
|
|
51
|
|
def get_parents_definitions(records, logger):
    """Collect parent definitions for individuals with a known parent.

    Every record must provide the keys ``individual``, ``father`` and
    ``mother``. A record is kept when at least one of ``father`` and
    ``mother`` differs from the literal string ``'None'``. A record
    missing any of the three keys is logged as an error and skipped.

    Args:
        records: iterable of dict-like rows.
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        list: one dict per kept record, with the keys ``individual``,
        ``father`` and ``mother``.
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        try:
            individual = rec['individual']
            father = rec['father']
            mother = rec['mother']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        # The input marks an unknown parent with the string 'None'.
        if father != 'None' or mother != 'None':
            parents_defs.append({
                'individual': individual,
                'father': father,
                'mother': mother,
            })
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs
|
|
70
|
|
71
|
|
def main(argv):
    """Run the script: read the input TSV and write the enrollments TSV.

    Parses ``argv``, loads every row of ``--in-file`` as a dict, converts
    the rows with ``get_enrollments_definitions`` and writes the result to
    ``--out-enrollments`` as a tab-separated file with the header
    ``source``, ``study``, ``label``.

    Args:
        argv: list of command-line arguments, without the program name.
    """
    args = get_parser().parse_args(argv)
    logger = get_logger('prepare_enrollments_import',
                        level=args.loglevel, filename=args.logfile)
    logger.info('Start processing file %s', args.in_file)

    with open(args.in_file) as tsv_in:
        records = list(csv.DictReader(tsv_in, delimiter='\t'))
        logger.info('Loaded %d records', len(records))

    enrollment_defs = get_enrollments_definitions(records, logger)

    columns = ['source', 'study', 'label']
    with open(args.out_enrollments, 'w') as tsv_out:
        writer = csv.DictWriter(tsv_out, columns, delimiter='\t')
        writer.writeheader()
        writer.writerows(enrollment_defs)

    logger.info('Job completed')
|
|
95
|
|
# Script entry point: pass the command-line arguments (program name
# stripped) to main().
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    main(cli_args)
|