comparison galaxy-tools/biobank/utils/build_enrollments_import.py @ 0:ba6cf6ede027 draft default tip

Uploaded
author ric
date Wed, 28 Sep 2016 06:03:30 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ba6cf6ede027
1 """
2 Prepare a tsv to be imported, with a study code for each individual not
3 present in a specified study.
4
5 A second study can also be specified: the individuals enrolled in it will
6 be ignored.
7
8 The report file contains the enrollment codes in the other studies.
9
10 Codes are short hashes from numbers generated using Hashids.org with
11 study label as salt parameter
12
13 ex:
14 source study label
15 V03CB1DB357B274B17B139EA56A2FFA19E AUTOIMMUNITY ORVL5KMK5
16 V0BA695C2E326F4C13AD7F6052BB20539B AUTOIMMUNITY 9R0M2E12N
17 V067C445E35DA04ECCA21FA3E2DF3BBCF6 AUTOIMMUNITY QGZLQJ1RV
18 ...
19
20 """
21
22 import argparse
23 import csv
24 import string
25 import sys
26
27 from hashids import Hashids
28 from bl.vl.kb import KnowledgeBase as KB
29 from bl.vl.utils import LOG_LEVELS, get_logger
30 import bl.vl.utils.ome_utils as vlu
31
32
def make_parser():
    """Build the command line parser of the script.

    The parser accepts the logging options, the Omero connection
    parameters, the (required) label of the study, the optional label of
    a study to be ignored, the (required) output file path and the report
    file path (default 'report.tsv').
    """
    # (flag, add_argument keyword arguments), in the order they are shown
    # in the usage message
    options = (
        ('--logfile', dict(type=str, help='log file (default=stderr)')),
        ('--loglevel', dict(type=str, choices=LOG_LEVELS,
                            help='logger level', default='INFO')),
        ('--study', dict(type=str, help='Study label', required=True)),
        ('--study_to_be_ignored', dict(type=str,
                                       help='Study label to be ignored')),
        ('--host', dict(type=str, help='Omero hostname')),
        ('--user', dict(type=str, help='Omero user')),
        ('--passwd', dict(type=str, help='Omero password')),
        ('--ofile', dict(type=str, help='output file path',
                         required=True)),
        ('--reportfile', dict(type=str, help='report file',
                              default='report.tsv')),
    )

    parser = argparse.ArgumentParser(
        description='Retrieve all individuals not enrolled in the specified project')
    for flag, settings in options:
        parser.add_argument(flag, **settings)
    return parser
49
50
def init_hashids(study):
    """Return the Hashids object used to handle the codes of a study.

    The study label is passed as salt, together with min_length=9 and an
    alphabet made of the upper case ASCII letters and the digits.
    """
    code_alphabet = string.ascii_uppercase + string.digits
    return Hashids(salt=study, min_length=9, alphabet=code_alphabet)
55
56
def write_csv_to_be_enrolled(logger, hashids, path, inds_map,
                             highest_id=0):
    """Write the tab separated file with the enrollments to be imported.

    One row is written for each value of inds_map. Each row receives a
    'label' obtained by encoding a progressive number; the first number
    used is highest_id + 1.

    :param logger: logger used for the debug messages
    :param hashids: object whose encrypt() method turns a number into a
      study code
    :param path: path of the output file
    :param inds_map: dictionary whose values are the rows to be written
      (dictionaries with the 'source' and 'study' keys); every row is
      updated in place with its 'label'
    :param highest_id: number that precedes the first one to be encoded
    """
    csv_header = ['source', 'study', 'label']

    logger.debug('Writing CSV file %s' % path)
    with open(path, 'w') as f:
        writer = csv.DictWriter(f, csv_header,
                                delimiter='\t', quotechar='"',
                                restval='None')
        writer.writeheader()
        # values() is available both in Python 2 and in Python 3, while
        # iteritems() exists only in Python 2; the keys are not needed
        for study_id, row in enumerate(inds_map.values(), highest_id + 1):
            row['label'] = hashids.encrypt(study_id)
            writer.writerow(row)
74
75
def write_csv_enrollment_codes(logger, filename, csv_header, enrolls_map):
    """Write the tab separated report with the enrollment codes.

    One row is written for each value of enrolls_map; the columns listed
    in csv_header that are missing from a row are filled with 'None'.

    :param logger: logger used for the debug messages
    :param filename: path of the report file
    :param csv_header: list with the names of the columns
    :param enrolls_map: dictionary whose values are the rows to be written
      (dictionaries keyed by column name)
    """
    logger.debug('Writing CSV file %s' % filename)
    with open(filename, 'w') as f:
        writer = csv.DictWriter(f, csv_header,
                                delimiter='\t', quotechar='"',
                                restval='None')
        writer.writeheader()
        # values() is available both in Python 2 and in Python 3, while
        # iteritems() exists only in Python 2; the keys are not needed
        writer.writerows(enrolls_map.values())
86
87
def get_enrollments_codes(logger, kb, inds_map):
    """Collect, study by study, the enrollment codes of the individuals
    listed in inds_map.

    Returns a tuple (csv_header, enrolls_map): csv_header is the list of
    the report columns ('individual_uuid' followed by the label of each
    study with at least one enrollment), enrolls_map maps the omero_id of
    an individual to a dictionary with its 'individual_uuid' and, for each
    study where it is enrolled, the study code keyed by study label.
    """
    all_studies = kb.get_objects(kb.Study)
    logger.info('Retrieved %d studies from database' % len(all_studies))

    header = ['individual_uuid']
    codes_by_individual = {}
    for study in all_studies:
        logger.info('Retrieving enrollments for study %s' % study.label)
        enrollments = kb.get_enrolled(study)
        logger.info('%s enrollments retrieved' % len(enrollments))
        if len(enrollments) == 0:
            logger.debug('No enrollments found, skip study %s' % study.label)
            continue

        logger.debug('Building lookup dictionary....')
        # A study gets its own column as soon as it has enrollments, even
        # when none of them belongs to an individual of inds_map
        header.append(study.label)
        for enrollment in enrollments:
            individual = enrollment.individual
            if individual.id not in inds_map:
                continue
            row = codes_by_individual.setdefault(individual.omero_id, {})
            row['individual_uuid'] = individual.id
            row[study.label] = enrollment.studyCode

    return header, codes_by_individual
114
115
def main(argv):
    """Build the enrollments import file and the enrollment codes report.

    All the individuals are retrieved from the database, then the ones
    already enrolled in the specified study, or enrolled in the study to
    be ignored (if any), are discarded. The remaining individuals are
    written to the output file with new study codes, and the codes they
    have in the studies of the database are written to the report file.

    The process exits if the Omero connection parameters cannot be
    resolved, and exits with status 1 if the specified study is not
    present in the database.

    :param argv: list of the command line arguments, program name excluded
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    logger = get_logger('inds_not_enrolled', level=args.loglevel,
                        filename=args.logfile)
    try:
        host = args.host or vlu.ome_host()
        user = args.user or vlu.ome_user()
        passwd = args.passwd or vlu.ome_passwd()
    except ValueError as ve:
        logger.critical(ve)
        sys.exit(ve)

    out_file_path = args.ofile

    kb = KB(driver='omero')(host, user, passwd)

    inds = kb.get_objects(kb.Individual)
    logger.info('Retrieved {} individuals'.format(len(inds)))

    # Every individual is a candidate until an enrollment rules it out
    inds_map = {}
    for i in inds:
        inds_map[i.id] = {'source': i.id, 'study': args.study}

    study = kb.get_by_label(kb.Study, args.study)
    if not study:
        logger.critical('{} not present in the database'.format(args.study))
        # A missing study is an error: do not report success to the caller
        sys.exit(1)
    logger.info('{} present in the database'.format(study.label))

    hashids = init_hashids(study.label)
    enrolls = kb.get_enrolled(study)
    logger.info("{} enrollments founded in {}".format(len(enrolls),
                                                      study.label))

    # New codes are generated from the numbers that follow the highest one
    # already used by the study codes of the existing enrollments
    highest_id = 0
    for e in enrolls:
        inds_map.pop(e.individual.id, None)
        decoded = hashids.decrypt(e.studyCode)
        # The decoded value is a sequence of numbers: compare its first
        # item (not the sequence itself) and skip the codes that do not
        # decode to any number
        if decoded and decoded[0] > highest_id:
            highest_id = decoded[0]

    if args.study_to_be_ignored:
        ignored_study = kb.get_by_label(kb.Study, args.study_to_be_ignored)
        if ignored_study:
            enr = kb.get_enrolled(ignored_study)
            logger.info('Retrieved {} enrollments from {}'.format(
                len(enr), args.study_to_be_ignored))
            for e in enr:
                inds_map.pop(e.individual.id, None)
        else:
            logger.info('{} not present in the database, no individuals '
                        'ignored'.format(args.study_to_be_ignored))

    logger.info('{} individuals to be enrolled'.format(len(inds_map)))

    write_csv_to_be_enrolled(logger, hashids, out_file_path, inds_map,
                             highest_id)

    csv_header, enrolls_map = get_enrollments_codes(logger, kb, inds_map)
    write_csv_enrollment_codes(logger, args.reportfile, csv_header,
                               enrolls_map)
185
# Script entry point: the command line arguments, program name excluded,
# are handed over to main()
if __name__ == '__main__':
    main(sys.argv[1:])