annotate galaxy-tools/biobank/utils/prepare_individuals_import.py @ 4:f833f23d38a3 draft

Deleted selected files
author ric
date Thu, 22 Sep 2016 09:28:03 -0400
parents 43be74e62bfe
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
43be74e62bfe Uploaded
ric
parents:
diff changeset
1 """
43be74e62bfe Uploaded
ric
parents:
diff changeset
2 Split a file like::
43be74e62bfe Uploaded
ric
parents:
diff changeset
3
43be74e62bfe Uploaded
ric
parents:
diff changeset
4 individual gender father mother
43be74e62bfe Uploaded
ric
parents:
diff changeset
5 ASTUDY:2141 MALE ASTUDY:12 ASTUDY:12341
43be74e62bfe Uploaded
ric
parents:
diff changeset
6 ASTUDY:415 MALE ASTUDY:3562 ASTUDY:13612
43be74e62bfe Uploaded
ric
parents:
diff changeset
7
43be74e62bfe Uploaded
ric
parents:
diff changeset
8 into two separate TSV files, the first one will be used to import new individuals and enrollments,
43be74e62bfe Uploaded
ric
parents:
diff changeset
9 the second one will be used to update father and mother information for the individuals in the first
43be74e62bfe Uploaded
ric
parents:
diff changeset
10 file.
43be74e62bfe Uploaded
ric
parents:
diff changeset
11 """
43be74e62bfe Uploaded
ric
parents:
diff changeset
12
43be74e62bfe Uploaded
ric
parents:
diff changeset
13 import sys, argparse, csv
43be74e62bfe Uploaded
ric
parents:
diff changeset
14 from bl.vl.utils import LOG_LEVELS, get_logger
43be74e62bfe Uploaded
ric
parents:
diff changeset
15
43be74e62bfe Uploaded
ric
parents:
diff changeset
16
43be74e62bfe Uploaded
ric
parents:
diff changeset
def get_parser():
    """Build the command-line parser for this script.

    The descriptive sentence is passed as ``description`` rather than as
    the first positional argument: ``ArgumentParser``'s first positional
    parameter is ``prog``, so passing the sentence there made it show up
    as the program name in the usage line.

    Returns:
        An ``argparse.ArgumentParser`` accepting ``--logfile``,
        ``--loglevel``, ``--in-file``, ``--out-individuals`` and
        ``--out-parents`` (the last three are required).
    """
    parser = argparse.ArgumentParser(
        description='Prepare input files for individuals import workflow')
    parser.add_argument('--logfile', type=str,
                        help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--out-individuals', type=str, required=True,
                        help='input file with Individuals definitions')
    parser.add_argument('--out-parents', type=str, required=True,
                        help='input file with parents definitions')
    return parser
43be74e62bfe Uploaded
ric
parents:
diff changeset
29
43be74e62bfe Uploaded
ric
parents:
diff changeset
30
43be74e62bfe Uploaded
ric
parents:
diff changeset
def get_individual_definitions(records, logger):
    """Build the rows for the individuals import file.

    Each record must provide a ``gender`` key and an ``individual`` key
    whose value has the form ``STUDY:LABEL`` (exactly one ``:``).
    Records missing either key, or whose ``individual`` value does not
    split into exactly two parts, are logged as errors and skipped.

    Args:
        records: iterable of mappings (e.g. rows from ``csv.DictReader``).
        logger: logger used for progress and error messages.

    Returns:
        A list of dicts with keys ``study``, ``label``, ``gender``,
        ``father`` and ``mother``. ``father`` and ``mother`` are always
        the string ``'None'`` here.
    """
    logger.info('Creating individual definitions')
    ind_defs = []
    for rec in records:
        # Look up 'gender' before 'individual' so the reported missing key
        # is the same one the lookups would hit first.
        try:
            gender = rec['gender']
            individual = rec['individual']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        try:
            # Unpacking raises ValueError for zero or multiple ':' separators.
            study, label = individual.split(':')
        except ValueError:
            logger.error('Skipped record %r, wrong label format for %s',
                         rec, individual)
            continue
        ind_defs.append({
            'study': study,
            'label': label,
            'gender': gender,
            'father': 'None',
            'mother': 'None',
        })
    logger.info('Retrieved %d individual definitions', len(ind_defs))
    return ind_defs
43be74e62bfe Uploaded
ric
parents:
diff changeset
49
43be74e62bfe Uploaded
ric
parents:
diff changeset
50
43be74e62bfe Uploaded
ric
parents:
diff changeset
def get_parents_definitions(records, logger):
    """Build the rows for the parents update file.

    A record is included only when at least one of its ``father`` /
    ``mother`` values differs from the string ``'None'``. Records
    missing any of the ``individual``, ``father`` or ``mother`` keys are
    logged as errors and skipped.

    Args:
        records: iterable of mappings (e.g. rows from ``csv.DictReader``).
        logger: logger used for progress and error messages.

    Returns:
        A list of dicts with keys ``individual``, ``father`` and
        ``mother``, copied verbatim from the accepted records.
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        try:
            individual = rec['individual']
            father = rec['father']
            mother = rec['mother']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        # Individuals with no known parent need no update row.
        if father == 'None' and mother == 'None':
            continue
        parents_defs.append({
            'individual': individual,
            'father': father,
            'mother': mother,
        })
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs
43be74e62bfe Uploaded
ric
parents:
diff changeset
69
43be74e62bfe Uploaded
ric
parents:
diff changeset
70
43be74e62bfe Uploaded
ric
parents:
diff changeset
def main(argv):
    """Split the input TSV into an individuals file and a parents file.

    Parses ``argv``, reads every row of ``--in-file`` as a tab-separated
    table with a header line, then writes the individual definitions to
    ``--out-individuals`` and the parents definitions to
    ``--out-parents``, both as tab-separated files with a header line.

    Args:
        argv: list of command-line arguments (without the program name).
    """
    args = get_parser().parse_args(argv)

    logger = get_logger('prepare_individuals_import', level=args.loglevel,
                        filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)

    # Load all rows up front: both output passes iterate over the records.
    with open(args.in_file) as in_file:
        records = list(csv.DictReader(in_file, delimiter='\t'))
    logger.info('Loaded %d records', len(records))

    individual_defs = get_individual_definitions(records, logger)
    individuals_header = ['study', 'label', 'gender', 'father', 'mother']
    with open(args.out_individuals, 'w') as inds_out:
        inds_writer = csv.DictWriter(inds_out, individuals_header,
                                     delimiter='\t')
        inds_writer.writeheader()
        inds_writer.writerows(individual_defs)

    parents_defs = get_parents_definitions(records, logger)
    parents_header = ['individual', 'father', 'mother']
    with open(args.out_parents, 'w') as parents_out:
        parents_writer = csv.DictWriter(parents_out, parents_header,
                                        delimiter='\t')
        parents_writer.writeheader()
        parents_writer.writerows(parents_defs)

    logger.info('Job completed')
43be74e62bfe Uploaded
ric
parents:
diff changeset
101
43be74e62bfe Uploaded
ric
parents:
diff changeset
# Script entry point: forward the command-line arguments (minus the
# program name) to main().
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    main(cli_args)
43be74e62bfe Uploaded
ric
parents:
diff changeset
104
43be74e62bfe Uploaded
ric
parents:
diff changeset
105