comparison galaxy-tools/biobank/utils/prepare_individuals_import.py @ 0:ba6cf6ede027 draft default tip

Uploaded
author ric
date Wed, 28 Sep 2016 06:03:30 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ba6cf6ede027
1 """
2 Split a file like::
3
4 individual gender father mother
5 ASTUDY:2141 MALE ASTUDY:12 ASTUDY:12341
6 ASTUDY:415 MALE ASTUDY:3562 ASTUDY:13612
7
8 into two separate TSV files: the first one will be used to import new individuals and enrollments,
9 the second one will be used to update the father and mother information for the individuals in the first
10 file.
11 """
12
13 import sys, argparse, csv
14 from bl.vl.utils import LOG_LEVELS, get_logger
15
16
def get_parser():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: parser accepting ``--logfile``,
        ``--loglevel`` (one of ``LOG_LEVELS``, default ``INFO``) and the
        required ``--in-file``, ``--out-individuals`` and ``--out-parents``
        path options.
    """
    # The text must be passed as ``description``: the first positional
    # argument of ArgumentParser is ``prog``, so passing it positionally
    # replaces the program name in the usage line with this sentence.
    parser = argparse.ArgumentParser(
        description='Prepare input files for individuals import workflow')
    parser.add_argument('--logfile', type=str,
                        help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--out-individuals', type=str, required=True,
                        help='input file with Individuals definitions')
    parser.add_argument('--out-parents', type=str, required=True,
                        help='input file with parents definitions')
    return parser
29
30
def get_individual_definitions(records, logger):
    """Build the rows used to import new individuals.

    Each record must provide a ``gender`` value and an ``individual`` value
    formatted as ``STUDY:LABEL``. Parents are always written as the literal
    string ``'None'`` in these rows.

    Args:
        records: iterable of dict-like rows with ``individual`` and
            ``gender`` keys.
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        list of dicts with keys ``father``, ``mother``, ``gender``,
        ``study`` and ``label``. Records with a missing key or a malformed
        ``individual`` value are logged as errors and skipped.
    """
    logger.info('Creating individual definitions')
    ind_defs = []
    for rec in records:
        # ``except X as e`` is valid on Python 2.6+ and Python 3, unlike the
        # Python-2-only ``except X, e`` form.
        try:
            gender = rec['gender']
            individual = rec['individual']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        try:
            # Unpacking raises ValueError unless there is exactly one ':'.
            study, label = individual.split(':')
        except ValueError:
            logger.error('Skipped record %r, wrong label format for %s',
                         rec, individual)
            continue
        ind_defs.append({
            'father': 'None',
            'mother': 'None',
            'gender': gender,
            'study': study,
            'label': label,
        })
    logger.info('Retrieved %d individual definitions', len(ind_defs))
    return ind_defs
49
50
def get_parents_definitions(records, logger):
    """Build the rows used to update father/mother references.

    A row is produced only when at least one of ``father`` and ``mother``
    differs from the literal string ``'None'``.

    Args:
        records: iterable of dict-like rows with ``individual``, ``father``
            and ``mother`` keys.
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        list of dicts with keys ``individual``, ``father`` and ``mother``.
        Records missing any of the three keys are logged as errors and
        skipped.
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        # ``except X as e`` is valid on Python 2.6+ and Python 3, unlike the
        # Python-2-only ``except X, e`` form.
        try:
            individual = rec['individual']
            father = rec['father']
            mother = rec['mother']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        if father == 'None' and mother == 'None':
            # Nothing to update for individuals without known parents.
            continue
        parents_defs.append({
            'individual': individual,
            'father': father,
            'mother': mother,
        })
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs
69
70
def main(argv):
    """Split the input TSV into the individuals and parents TSV files.

    Args:
        argv: list of command-line arguments, without the program name.
    """
    args = get_parser().parse_args(argv)
    logger = get_logger('prepare_individuals_import', level=args.loglevel,
                        filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)

    with open(args.in_file) as tsv_in:
        records = list(csv.DictReader(tsv_in, delimiter='\t'))
    logger.info('Loaded %d records', len(records))

    # One entry per output file: (row builder, destination path, columns).
    # Each output is built and written before the next one is started.
    outputs = (
        (get_individual_definitions, args.out_individuals,
         ['study', 'label', 'gender', 'father', 'mother']),
        (get_parents_definitions, args.out_parents,
         ['individual', 'father', 'mother']),
    )
    for build_rows, out_path, columns in outputs:
        rows = build_rows(records, logger)
        with open(out_path, 'w') as tsv_out:
            writer = csv.DictWriter(tsv_out, columns, delimiter='\t')
            writer.writeheader()
            writer.writerows(rows)

    logger.info('Job completed')
101
if __name__ == '__main__':
    # Run as a script: hand main() the arguments without the program name.
    main(sys.argv[1:])