|
0
|
1 """
|
|
|
Split a file like::

  individual gender father mother
  ASTUDY:2141 MALE ASTUDY:12 ASTUDY:12341
  ASTUDY:415 MALE ASTUDY:3562 ASTUDY:13612

into two separate TSV files: the first one is used to import new individuals and enrollments,
the second one is used to update father and mother information for the individuals in the first
file.
|
|
|
11 """
|
|
|
12
|
|
|
13 import sys, argparse, csv
|
|
|
14 from bl.vl.utils import LOG_LEVELS, get_logger
|
|
|
15
|
|
|
16
|
|
|
def get_parser():
    """Build the command-line parser for this script.

    The parser accepts ``--logfile`` and ``--loglevel`` (optional, the latter
    restricted to ``LOG_LEVELS`` and defaulting to ``'INFO'``) plus the three
    required options ``--in-file``, ``--out-individuals`` and
    ``--out-parents``.

    Returns:
        The configured ``argparse.ArgumentParser`` instance.
    """
    # The text is passed by keyword: the first positional parameter of
    # ArgumentParser is ``prog``, so a positional string would replace the
    # program name in usage and error messages instead of describing the tool.
    parser = argparse.ArgumentParser(
        description='Prepare input files for individuals import workflow')
    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--out-individuals', type=str, required=True,
                        help='input file with Individuals definitions')
    parser.add_argument('--out-parents', type=str, required=True,
                        help='input file with parents definitions')
    return parser
|
|
|
29
|
|
|
30
|
|
|
def get_individual_definitions(records, logger):
    """Build the rows describing the individuals found in ``records``.

    Every record must provide a ``gender`` value and an ``individual`` value
    that splits on ``':'`` into exactly two parts (study and label). Records
    that miss one of those keys, or whose ``individual`` value does not split
    into exactly two parts, are reported through ``logger.error`` and skipped.

    Args:
        records: iterable of dict-like rows (``main`` passes the rows read by
            ``csv.DictReader``).
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        A list of dicts with the keys ``study``, ``label``, ``gender``,
        ``father`` and ``mother``. ``father`` and ``mother`` are always the
        string ``'None'``.
    """
    logger.info('Creating individual definitions')
    ind_defs = []
    for rec in records:
        # Only the key lookups live in the try body, so that a KeyError can
        # only mean "missing column" for this record.
        try:
            gender = rec['gender']
            individual = rec['individual']
        except KeyError as ke:  # "as" form is valid on Python 2.6+ and 3
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        try:
            # Tuple unpacking raises ValueError unless the split yields
            # exactly two items.
            study, label = individual.split(':')
        except ValueError:
            logger.error('Skipped record %r, wrong label format for %s', rec, individual)
            continue
        ind_defs.append({
            'study': study,
            'label': label,
            'gender': gender,
            'father': 'None',
            'mother': 'None',
        })
    logger.info('Retrieved %d individual definitions', len(ind_defs))
    return ind_defs
|
|
|
49
|
|
|
50
|
|
|
def get_parents_definitions(records, logger):
    """Build the rows linking individuals to their father and mother.

    A record produces a row only when at least one of its ``father`` and
    ``mother`` values differs from the string ``'None'``. Records missing the
    ``individual``, ``father`` or ``mother`` key are reported through
    ``logger.error`` and skipped.

    Args:
        records: iterable of dict-like rows (``main`` passes the rows read by
            ``csv.DictReader``).
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        A list of dicts with the keys ``individual``, ``father`` and
        ``mother``, holding the values copied unchanged from the record.
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        # Look the three keys up front, in this order, so that any missing
        # column is reported and the record is skipped.
        try:
            individual = rec['individual']
            father = rec['father']
            mother = rec['mother']
        except KeyError as ke:  # "as" form is valid on Python 2.6+ and 3
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        # Nothing to record when neither parent is set.
        if father == 'None' and mother == 'None':
            continue
        parents_defs.append({
            'individual': individual,
            'father': father,
            'mother': mother,
        })
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs
|
|
|
69
|
|
|
70
|
|
|
def _write_tsv(path, fieldnames, rows):
    """Write ``rows`` (dicts) to ``path`` as a tab-separated file with a header line."""
    with open(path, 'w') as out_file:
        writer = csv.DictWriter(out_file, fieldnames, delimiter='\t')
        writer.writeheader()
        writer.writerows(rows)


def main(argv):
    """Run the tool: read the input TSV and write the two output TSV files.

    Args:
        argv: list of command-line arguments, without the program name.

    The rows of ``--in-file`` are loaded in memory, then the individual
    definitions are written to ``--out-individuals`` and the parents
    definitions to ``--out-parents``.
    """
    args = get_parser().parse_args(argv)
    logger = get_logger('prepare_individuals_import',
                        level=args.loglevel,
                        filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)

    with open(args.in_file) as source:
        records = list(csv.DictReader(source, delimiter='\t'))
        logger.info('Loaded %d records', len(records))

    # Each set of definitions is computed before its output file is opened.
    _write_tsv(args.out_individuals,
               ['study', 'label', 'gender', 'father', 'mother'],
               get_individual_definitions(records, logger))
    _write_tsv(args.out_parents,
               ['individual', 'father', 'mother'],
               get_parents_definitions(records, logger))

    logger.info('Job completed')
|
|
|
101
|
|
|
# Script entry point: hand the command-line arguments (program name
# excluded) over to main().
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    main(cli_args)