annotate galaxy-tools/biobank/utils/prepare_individuals_import.py @ 0:ba6cf6ede027 draft default tip

Uploaded
author ric
date Wed, 28 Sep 2016 06:03:30 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
1 """
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
2 Split a file like::
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
3
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
4 individual gender father mother
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
5 ASTUDY:2141 MALE ASTUDY:12 ASTUDY:12341
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
6 ASTUDY:415 MALE ASTUDY:3562 ASTUDY:13612
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
7
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
8 into two separate TSV files: the first one will be used to import new individuals and enrollments,
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
9 the second one will be used to update father and mother information for the individuals in the first
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
10 file.
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
11 """
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
12
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
13 import sys, argparse, csv
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
14 from bl.vl.utils import LOG_LEVELS, get_logger
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
15
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
16
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
def get_parser():
    """Build the command-line parser for this tool.

    Returns:
        argparse.ArgumentParser: parser accepting ``--logfile``,
        ``--loglevel``, ``--in-file``, ``--out-individuals`` and
        ``--out-parents`` (the last three are required).
    """
    # The text must be passed as ``description``: the first positional
    # argument of ArgumentParser is ``prog``, so passing it positionally made
    # the whole sentence show up as the program name in usage/error messages.
    parser = argparse.ArgumentParser(
        description='Prepare input files for individuals import workflow')
    parser.add_argument('--logfile', type=str,
                        help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--out-individuals', type=str, required=True,
                        help='input file with Individuals definitions')
    parser.add_argument('--out-parents', type=str, required=True,
                        help='input file with parents definitions')
    return parser
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
29
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
30
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
def get_individual_definitions(records, logger):
    """Build the rows of the individuals import file.

    For every record the ``individual`` value is split on ':' into a study
    and a label. Father and mother are always set to the string 'None' in
    the returned rows, whatever the record contains.

    Args:
        records: iterable of dict-like rows providing the 'gender' and
            'individual' keys.
        logger: logger used for progress and skipped-record messages.

    Returns:
        list of dicts with the keys 'father', 'mother', 'gender', 'study'
        and 'label'. Records with a missing key or a malformed
        ``individual`` value are logged as errors and left out.
    """
    logger.info('Creating individual definitions')
    ind_defs = []
    for rec in records:
        try:
            gender = rec['gender']
            individual = rec['individual']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        try:
            # Exactly one ':' is expected; anything else raises ValueError.
            # AttributeError covers a value of None, which csv.DictReader
            # produces for the missing fields of a short row.
            study, label = individual.split(':')
        except (ValueError, AttributeError):
            logger.error('Skipped record %r, wrong label format for %s',
                         rec, individual)
            continue
        ind_defs.append({
            'father': 'None',
            'mother': 'None',
            'gender': gender,
            'study': study,
            'label': label,
        })
    logger.info('Retrieved %d individual definitions', len(ind_defs))
    return ind_defs
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
49
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
50
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
def get_parents_definitions(records, logger):
    """Build the rows of the parents update file.

    A record yields a row only when at least one of its 'father' and
    'mother' values differs from the literal string 'None'.

    Args:
        records: iterable of dict-like rows providing the 'individual',
            'father' and 'mother' keys.
        logger: logger used for progress and skipped-record messages.

    Returns:
        list of dicts with the keys 'individual', 'father' and 'mother'.
        Records missing any of the three keys are logged as errors and
        left out.
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        try:
            individual = rec['individual']
            father = rec['father']
            mother = rec['mother']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        # Nothing to update when neither parent is known.
        if father == 'None' and mother == 'None':
            continue
        parents_defs.append({
            'individual': individual,
            'father': father,
            'mother': mother,
        })
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
69
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
70
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
def main(argv):
    """Split the input TSV into the individuals file and the parents file.

    Args:
        argv: command-line arguments, without the program name.
    """
    args = get_parser().parse_args(argv)
    logger = get_logger('prepare_individuals_import',
                        level=args.loglevel, filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)

    # Load every row up front: the records are scanned twice below.
    with open(args.in_file) as source:
        records = list(csv.DictReader(source, delimiter='\t'))
        logger.info('Loaded %d records', len(records))

    individual_defs = get_individual_definitions(records, logger)
    individual_columns = ['study', 'label', 'gender', 'father', 'mother']
    with open(args.out_individuals, 'w') as target:
        writer = csv.DictWriter(target, individual_columns, delimiter='\t')
        writer.writeheader()
        writer.writerows(individual_defs)

    parents_defs = get_parents_definitions(records, logger)
    parents_columns = ['individual', 'father', 'mother']
    with open(args.out_parents, 'w') as target:
        writer = csv.DictWriter(target, parents_columns, delimiter='\t')
        writer.writeheader()
        writer.writerows(parents_defs)

    logger.info('Job completed')
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
101
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
# Script entry point: hand the command-line arguments (program name stripped)
# to main().
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    main(cli_args)