annotate galaxy-tools/biobank/utils/prepare_enrollments_import.py @ 0:ba6cf6ede027 draft default tip

Uploaded
author ric
date Wed, 28 Sep 2016 06:03:30 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
1 """
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
2 Split a file like::
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
3
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
4 source enrollment
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
5 V044DE795E7F9F42FEB9855288CF577A77 ASTUDY:2141
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
6 V06C59B915C0FD47DABE6AE02C731780AF BSTUDY:390
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
7
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
8 into a new TSV file like::
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
9
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
10 source study label
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
11 V044DE795E7F9F42FEB9855288CF577A77 ASTUDY 2141
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
12 V06C59B915C0FD47DABE6AE02C731780AF BSTUDY 390
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
13
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
14 """
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
15
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
16 import sys, argparse, csv
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
17 from bl.vl.utils import LOG_LEVELS, get_logger
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
18
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
19
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
def get_parser():
    """Build the command-line parser for this tool.

    Returns:
        An ``argparse.ArgumentParser`` accepting ``--logfile``,
        ``--loglevel`` (one of ``LOG_LEVELS``, default ``INFO``),
        ``--in-file`` (required) and ``--out-enrollments`` (required).
    """
    # The text has to be passed as ``description``: the first positional
    # argument of ArgumentParser is ``prog``, which would replace the program
    # name shown in the usage line.
    parser = argparse.ArgumentParser(
        description='Prepare input files for enrollments import workflow')
    parser.add_argument('--logfile', type=str,
                        help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    # This file is written by main(), so describe it as an output.
    parser.add_argument('--out-enrollments', type=str, required=True,
                        help='output file with Enrollments definitions '
                             '(input for the import workflow)')
    return parser
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
30
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
31
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
def get_enrollments_definitions(records, logger):
    """Build enrollment definitions from the input records.

    Every record must provide a ``source`` key and an ``enrollment`` key
    whose value has the form ``STUDY:LABEL``.  A record that lacks one of
    these keys, or whose enrollment value does not split into exactly two
    parts on ``:``, is reported through ``logger.error`` and skipped.

    Args:
        records: iterable of dict-like rows.
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        A list of dicts with the keys ``source``, ``study`` and ``label``,
        in the same order as the accepted input records.
    """
    logger.info('Creating enrollment definitions')
    enr_defs = []
    for rec in records:
        try:
            source = rec['source']
            enrollment = rec['enrollment']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        try:
            study, label = enrollment.split(':')
        except (ValueError, AttributeError):
            # ValueError: zero or more than one ':' in the value.
            # AttributeError: the value is not a string, e.g. the None that
            # csv.DictReader fills in for rows shorter than the header.
            logger.error('Skipped record %r, wrong label format for %s',
                         rec, enrollment)
            continue
        enr_defs.append({'source': source, 'study': study, 'label': label})
    logger.info('Retrieved %d enrollment definitions', len(enr_defs))
    return enr_defs
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
50
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
51
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
def get_parents_definitions(records, logger):
    """Build parents definitions from the input records.

    Every record must provide the keys ``individual``, ``father`` and
    ``mother``.  A record missing one of them is reported through
    ``logger.error`` and skipped.  A record whose ``father`` and ``mother``
    are both the literal string ``'None'`` is skipped without logging.

    Args:
        records: iterable of dict-like rows.
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        A list of dicts with the keys ``individual``, ``father`` and
        ``mother``, one per record that has at least one parent set.
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        try:
            individual = rec['individual']
            father = rec['father']
            mother = rec['mother']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        # Keep the record as soon as one of the two parents is known.
        if father != 'None' or mother != 'None':
            parents_defs.append({'individual': individual,
                                 'father': father,
                                 'mother': mother})
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
70
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
71
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
def main(argv):
    """Read the input TSV and write the enrollments TSV.

    Parses ``argv`` with the parser from ``get_parser()``, loads every row
    of ``--in-file`` as a dict, converts the rows with
    ``get_enrollments_definitions`` and writes the result, preceded by a
    header row, to ``--out-enrollments`` as tab-separated values.

    Args:
        argv: list of command-line arguments, without the program name.
    """
    args = get_parser().parse_args(argv)

    logger = get_logger('prepare_enrollments_import', level=args.loglevel,
                        filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)

    # Load the whole input in memory; rows are keyed by the header names.
    with open(args.in_file) as in_file:
        records = list(csv.DictReader(in_file, delimiter='\t'))
    logger.info('Loaded %d records', len(records))

    enrollment_defs = get_enrollments_definitions(records, logger)

    out_columns = ['source', 'study', 'label']
    with open(args.out_enrollments, 'w') as enr_out:
        enr_writer = csv.DictWriter(enr_out, out_columns, delimiter='\t')
        enr_writer.writeheader()
        enr_writer.writerows(enrollment_defs)

    logger.info('Job completed')
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
95
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
# Script entry point: hand the command-line arguments (program name
# excluded) over to main().
if __name__ == "__main__":
    main(sys.argv[1:])