annotate galaxy-tools/biobank/utils/prepare_enrollments_import.py @ 3:43be74e62bfe draft

Uploaded
author ric
date Thu, 22 Sep 2016 08:57:04 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
43be74e62bfe Uploaded
ric
parents:
diff changeset
1 """
43be74e62bfe Uploaded
ric
parents:
diff changeset
2 Split a file like::
43be74e62bfe Uploaded
ric
parents:
diff changeset
3
43be74e62bfe Uploaded
ric
parents:
diff changeset
4 source enrollment
43be74e62bfe Uploaded
ric
parents:
diff changeset
5 V044DE795E7F9F42FEB9855288CF577A77 ASTUDY:2141
43be74e62bfe Uploaded
ric
parents:
diff changeset
6 V06C59B915C0FD47DABE6AE02C731780AF BSTUDY:390
43be74e62bfe Uploaded
ric
parents:
diff changeset
7
43be74e62bfe Uploaded
ric
parents:
diff changeset
8 into a new TSV file with separate study and label columns::
43be74e62bfe Uploaded
ric
parents:
diff changeset
9
43be74e62bfe Uploaded
ric
parents:
diff changeset
10 source study label
43be74e62bfe Uploaded
ric
parents:
diff changeset
11 V044DE795E7F9F42FEB9855288CF577A77 ASTUDY 2141
43be74e62bfe Uploaded
ric
parents:
diff changeset
12 V06C59B915C0FD47DABE6AE02C731780AF BSTUDY 390
43be74e62bfe Uploaded
ric
parents:
diff changeset
13
43be74e62bfe Uploaded
ric
parents:
diff changeset
14 """
43be74e62bfe Uploaded
ric
parents:
diff changeset
15
43be74e62bfe Uploaded
ric
parents:
diff changeset
16 import sys, argparse, csv
43be74e62bfe Uploaded
ric
parents:
diff changeset
17 from bl.vl.utils import LOG_LEVELS, get_logger
43be74e62bfe Uploaded
ric
parents:
diff changeset
18
43be74e62bfe Uploaded
ric
parents:
diff changeset
19
43be74e62bfe Uploaded
ric
parents:
diff changeset
def get_parser():
    """Build the command-line argument parser for this script.

    Returns:
        argparse.ArgumentParser: parser accepting ``--logfile``,
        ``--loglevel``, ``--in-file`` (required) and
        ``--out-enrollments`` (required).
    """
    # The summary text must go in ``description``: the first positional
    # argument of ArgumentParser is ``prog``, which would replace the
    # script name in the usage line with this sentence.
    parser = argparse.ArgumentParser(
        description='Prepare input files for enrollments import workflow')
    parser.add_argument('--logfile', type=str,
                        help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    # This path is opened for writing by main(), so it is an output file.
    parser.add_argument('--out-enrollments', type=str, required=True,
                        help='output file with Enrollments definitions')
    return parser
43be74e62bfe Uploaded
ric
parents:
diff changeset
30
43be74e62bfe Uploaded
ric
parents:
diff changeset
31
43be74e62bfe Uploaded
ric
parents:
diff changeset
def get_enrollments_definitions(records, logger):
    """Turn input rows into enrollment definitions.

    Each record must provide a ``source`` key and an ``enrollment`` key
    whose value has the form ``STUDY:LABEL`` (exactly one ``:``).

    Args:
        records: iterable of dict-like rows.
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        list of dicts with the keys ``source``, ``study`` and ``label``,
        one per valid record, in input order. Records with a missing key
        or a malformed enrollment value are logged as errors and skipped.
    """
    logger.info('Creating enrollment definitions')
    enr_defs = []
    for rec in records:
        try:
            source = rec['source']
            enrollment = rec['enrollment']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        try:
            # Unpacking fails with ValueError unless there are exactly
            # two ':'-separated parts.
            study, label = enrollment.split(':')
        except (ValueError, AttributeError):
            # AttributeError covers a None value, which csv.DictReader
            # yields for rows shorter than the header; skip instead of
            # aborting the whole run.
            logger.error('Skipped record %r, wrong label format for %s',
                         rec, enrollment)
            continue
        enr_defs.append({'source': source, 'study': study, 'label': label})
    logger.info('Retrieved %d enrollment definitions', len(enr_defs))
    return enr_defs
43be74e62bfe Uploaded
ric
parents:
diff changeset
50
43be74e62bfe Uploaded
ric
parents:
diff changeset
51
43be74e62bfe Uploaded
ric
parents:
diff changeset
def get_parents_definitions(records, logger):
    """Extract parent definitions from input rows.

    A record is kept when at least one of its ``father`` / ``mother``
    values differs from the literal string ``'None'``.

    Args:
        records: iterable of dict-like rows with the keys ``individual``,
            ``father`` and ``mother``.
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        list of dicts with the keys ``individual``, ``father`` and
        ``mother``, in input order. Records missing any of the three keys
        are logged as errors and skipped.
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        try:
            individual = rec['individual']
            father = rec['father']
            mother = rec['mother']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        # Nothing to record when both parents are the 'None' placeholder.
        if father == 'None' and mother == 'None':
            continue
        parents_defs.append({'individual': individual,
                             'father': father,
                             'mother': mother})
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs
43be74e62bfe Uploaded
ric
parents:
diff changeset
70
43be74e62bfe Uploaded
ric
parents:
diff changeset
71
43be74e62bfe Uploaded
ric
parents:
diff changeset
def main(argv):
    """Run the conversion: read the input TSV, write the enrollments TSV.

    Args:
        argv: list of command-line arguments (without the program name),
            parsed with the parser returned by ``get_parser``.
    """
    args = get_parser().parse_args(argv)
    logger = get_logger('prepare_enrollments_import',
                        level=args.loglevel, filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)

    # Load every row up front; the header line provides the dict keys.
    with open(args.in_file) as tsv_in:
        records = list(csv.DictReader(tsv_in, delimiter='\t'))
    logger.info('Loaded %d records', len(records))

    enrollment_defs = get_enrollments_definitions(records, logger)

    out_columns = ['source', 'study', 'label']
    with open(args.out_enrollments, 'w') as tsv_out:
        writer = csv.DictWriter(tsv_out, out_columns, delimiter='\t')
        writer.writeheader()
        writer.writerows(enrollment_defs)

    logger.info('Job completed')
43be74e62bfe Uploaded
ric
parents:
diff changeset
95
43be74e62bfe Uploaded
ric
parents:
diff changeset
# Script entry point: forward the command-line arguments (minus the
# program name) to main().
if __name__ == '__main__':
    main(argv=sys.argv[1:])