annotate galaxy-tools/biobank/utils/build_enrollments_import.py @ 3:43be74e62bfe draft

Uploaded
author ric
date Thu, 22 Sep 2016 08:57:04 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
43be74e62bfe Uploaded
ric
parents:
diff changeset
1 """
43be74e62bfe Uploaded
ric
parents:
diff changeset
Prepare a TSV to be imported, with a study code for each individual not
present in a specified study.
43be74e62bfe Uploaded
ric
parents:
diff changeset
4
43be74e62bfe Uploaded
ric
parents:
diff changeset
A study can also be specified: every individual enrolled in it will
be ignored.
43be74e62bfe Uploaded
ric
parents:
diff changeset
7
43be74e62bfe Uploaded
ric
parents:
diff changeset
The report file contains the enrollment codes in the other studies.
43be74e62bfe Uploaded
ric
parents:
diff changeset
9
43be74e62bfe Uploaded
ric
parents:
diff changeset
10 Codes are short hashes from numbers generated using Hashids.org with
43be74e62bfe Uploaded
ric
parents:
diff changeset
11 study label as salt parameter
43be74e62bfe Uploaded
ric
parents:
diff changeset
12
43be74e62bfe Uploaded
ric
parents:
diff changeset
13 ex:
43be74e62bfe Uploaded
ric
parents:
diff changeset
14 source study label
43be74e62bfe Uploaded
ric
parents:
diff changeset
15 V03CB1DB357B274B17B139EA56A2FFA19E AUTOIMMUNITY ORVL5KMK5
43be74e62bfe Uploaded
ric
parents:
diff changeset
16 V0BA695C2E326F4C13AD7F6052BB20539B AUTOIMMUNITY 9R0M2E12N
43be74e62bfe Uploaded
ric
parents:
diff changeset
17 V067C445E35DA04ECCA21FA3E2DF3BBCF6 AUTOIMMUNITY QGZLQJ1RV
43be74e62bfe Uploaded
ric
parents:
diff changeset
18 ...
43be74e62bfe Uploaded
ric
parents:
diff changeset
19
43be74e62bfe Uploaded
ric
parents:
diff changeset
20 """
43be74e62bfe Uploaded
ric
parents:
diff changeset
21
43be74e62bfe Uploaded
ric
parents:
diff changeset
22 import argparse
43be74e62bfe Uploaded
ric
parents:
diff changeset
23 import csv
43be74e62bfe Uploaded
ric
parents:
diff changeset
24 import string
43be74e62bfe Uploaded
ric
parents:
diff changeset
25 import sys
43be74e62bfe Uploaded
ric
parents:
diff changeset
26
43be74e62bfe Uploaded
ric
parents:
diff changeset
27 from hashids import Hashids
43be74e62bfe Uploaded
ric
parents:
diff changeset
28 from bl.vl.kb import KnowledgeBase as KB
43be74e62bfe Uploaded
ric
parents:
diff changeset
29 from bl.vl.utils import LOG_LEVELS, get_logger
43be74e62bfe Uploaded
ric
parents:
diff changeset
30 import bl.vl.utils.ome_utils as vlu
43be74e62bfe Uploaded
ric
parents:
diff changeset
31
43be74e62bfe Uploaded
ric
parents:
diff changeset
32
43be74e62bfe Uploaded
ric
parents:
diff changeset
def make_parser():
    """Build the command-line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` exposing the logging options, the
        target study label (required), an optional study label to ignore,
        the Omero connection parameters, the output file path (required)
        and the report file path (defaults to ``report.tsv``).
    """
    # (flag, keyword arguments) pairs, registered in this exact order.
    option_specs = [
        ('--logfile', dict(type=str, help='log file (default=stderr)')),
        ('--loglevel', dict(type=str, choices=LOG_LEVELS,
                            help='logger level', default='INFO')),
        ('--study', dict(type=str, help='Study label', required=True)),
        ('--study_to_be_ignored', dict(type=str,
                                       help='Study label to be ignored')),
        ('--host', dict(type=str, help='Omero hostname')),
        ('--user', dict(type=str, help='Omero user')),
        ('--passwd', dict(type=str, help='Omero password')),
        ('--ofile', dict(type=str, help='output file path', required=True)),
        ('--reportfile', dict(type=str, help='report file',
                              default='report.tsv')),
    ]

    parser = argparse.ArgumentParser(
        description='Retrieve all individuals not enrolled in the specified project')
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser
43be74e62bfe Uploaded
ric
parents:
diff changeset
49
43be74e62bfe Uploaded
ric
parents:
diff changeset
50
43be74e62bfe Uploaded
ric
parents:
diff changeset
def init_hashids(study):
    """Create the Hashids encoder used for the enrollment codes of a study.

    Args:
        study: study label, passed to Hashids as the salt.

    Returns:
        A ``Hashids`` instance with a minimum code length of 9 and an
        alphabet made of the ASCII uppercase letters and the digits.
    """
    code_alphabet = string.ascii_uppercase + string.digits
    return Hashids(salt=study, min_length=9, alphabet=code_alphabet)
43be74e62bfe Uploaded
ric
parents:
diff changeset
55
43be74e62bfe Uploaded
ric
parents:
diff changeset
56
43be74e62bfe Uploaded
ric
parents:
diff changeset
def write_csv_to_be_enrolled(logger, hashids, path, inds_map,
                             highest_id=0):
    """Write the tab-separated file of the enrollments to be imported.

    Each record of ``inds_map`` receives a new ``label`` obtained by
    encoding a progressive number, starting right after ``highest_id``,
    and is then written as one row with columns source, study, label.

    Args:
        logger: logger used for the debug message.
        hashids: encoder exposing ``encrypt(int)`` and returning the code.
        path: path of the output file (overwritten).
        inds_map: dictionary whose values are the row dictionaries; the
            values are updated in place with the ``label`` key.
        highest_id: last number already used for a code in the study.
    """
    csv_header = ['source', 'study', 'label']

    logger.debug('Writing CSV file %s' % path)
    with open(path, 'w') as f:
        writer = csv.DictWriter(f, csv_header,
                                delimiter='\t', quotechar='"',
                                restval='None')
        writer.writeheader()
        # values() works on both Python 2 and 3 (iteritems() is Python 2
        # only) and the keys were never used; enumerate replaces the
        # hand-maintained counter.
        for study_id, record in enumerate(inds_map.values(), highest_id + 1):
            record['label'] = hashids.encrypt(study_id)
            writer.writerow(record)
43be74e62bfe Uploaded
ric
parents:
diff changeset
74
43be74e62bfe Uploaded
ric
parents:
diff changeset
75
43be74e62bfe Uploaded
ric
parents:
diff changeset
def write_csv_enrollment_codes(logger, filename, csv_header, enrolls_map):
    """Write the tab-separated report of the existing enrollment codes.

    Args:
        logger: logger used for the debug message.
        filename: path of the report file (overwritten).
        csv_header: ordered list of the column names.
        enrolls_map: dictionary whose values are the row dictionaries;
            columns missing from a row are written as the string 'None'.
    """
    logger.debug('Writing CSV file %s' % filename)
    with open(filename, 'w') as f:
        writer = csv.DictWriter(f, csv_header,
                                delimiter='\t', quotechar='"',
                                restval='None')
        writer.writeheader()
        # Only the values are needed; values() also works on Python 3,
        # where iteritems() does not exist.
        writer.writerows(enrolls_map.values())
43be74e62bfe Uploaded
ric
parents:
diff changeset
86
43be74e62bfe Uploaded
ric
parents:
diff changeset
87
43be74e62bfe Uploaded
ric
parents:
diff changeset
def get_enrollments_codes(logger, kb, inds_map):
    """Collect the codes that the individuals to be enrolled already have
    in the studies stored in the knowledge base.

    Args:
        logger: logger used for progress messages.
        kb: knowledge base proxy; ``get_objects(kb.Study)`` and
            ``get_enrolled(study)`` are called on it.
        inds_map: container of the ids of the individuals to be enrolled;
            only membership of ``individual.id`` is tested.

    Returns:
        A ``(csv_header, enrolls_map)`` tuple. ``csv_header`` is
        ``['individual_uuid']`` followed by the label of every study that
        has at least one enrollment. ``enrolls_map`` maps the omero id of
        an individual to a dictionary holding its ``individual_uuid`` and
        one ``study label -> study code`` entry per enrollment found.
    """
    studies = kb.get_objects(kb.Study)
    logger.info('Retrieved %d studies from database' % len(studies))

    csv_header = ['individual_uuid']
    enrolls_map = {}
    for s in studies:
        logger.info('Retrieving enrollments for study %s' % s.label)
        enrolls = kb.get_enrolled(s)
        logger.info('%s enrollments retrieved' % len(enrolls))

        if len(enrolls) == 0:
            logger.debug('No enrollments found, skip study %s' % s.label)
            continue

        logger.debug('Building lookup dictionary....')
        # Every study with enrollments becomes a column of the report.
        csv_header.append(s.label)
        for e in enrolls:
            if e.individual.id not in inds_map:
                continue
            record = enrolls_map.setdefault(e.individual.omero_id, {})
            record['individual_uuid'] = e.individual.id
            record[s.label] = e.studyCode

    return csv_header, enrolls_map
43be74e62bfe Uploaded
ric
parents:
diff changeset
114
43be74e62bfe Uploaded
ric
parents:
diff changeset
115
43be74e62bfe Uploaded
ric
parents:
diff changeset
def main(argv):
    """Build the enrollments import file and the enrollment codes report.

    Steps:
      1. parse ``argv`` and connect to the knowledge base;
      2. start from all the individuals and drop the ones already
         enrolled in the target study (and in the study to be ignored,
         when one is given and exists);
      3. write the remaining individuals, with newly generated codes, to
         the output file;
      4. write the codes those individuals have in the other studies to
         the report file.

    Args:
        argv: list of command-line arguments, program name excluded.

    Exits through ``sys.exit`` when the connection parameters cannot be
    resolved or when the target study is not in the database.
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    logger = get_logger('inds_not_enrolled', level=args.loglevel,
                        filename=args.logfile)
    try:
        host = args.host or vlu.ome_host()
        user = args.user or vlu.ome_user()
        passwd = args.passwd or vlu.ome_passwd()
    except ValueError as ve:  # "as" form is valid on Python 2.6+ and 3
        logger.critical(ve)
        sys.exit(ve)

    out_file_path = args.ofile

    kb = KB(driver='omero')(host, user, passwd)

    inds = kb.get_objects(kb.Individual)
    logger.info('Retrieved {} individuals'.format(len(inds)))

    # One candidate row per individual, keyed by individual id.
    inds_map = {}
    for i in inds:
        inds_map[i.id] = {'source': i.id, 'study': args.study}

    study = kb.get_by_label(kb.Study, args.study)
    if not study:
        logger.critical('{} not present in the database'.format(args.study))
        # Non-zero status so that callers can detect the failure.
        sys.exit(1)
    logger.info('{} present in the database'.format(study.label))

    hashids = init_hashids(study.label)
    enrolls = kb.get_enrolled(study)
    logger.info("{} enrollments founded in {}".format(len(enrolls),
                                                      study.label))

    # Drop the individuals already enrolled and find the highest number
    # already used for a code, so that new codes continue after it.
    highest_id = 0
    for e in enrolls:
        inds_map.pop(e.individual.id, None)
        # decrypt returns a tuple of numbers, empty when the code was not
        # produced by this encoder: compare its first element, not the
        # tuple itself, and skip codes that cannot be decoded.
        decoded = hashids.decrypt(e.studyCode)
        if decoded and decoded[0] > highest_id:
            highest_id = decoded[0]

    if args.study_to_be_ignored:
        ignored_study = kb.get_by_label(kb.Study, args.study_to_be_ignored)
        if ignored_study:
            enr = kb.get_enrolled(ignored_study)
            logger.info('Retrieved {} enrollments from {}'.format(
                len(enr), args.study_to_be_ignored))
            for e in enr:
                inds_map.pop(e.individual.id, None)
        else:
            logger.warning('{} not present in the database, nothing to '
                           'ignore'.format(args.study_to_be_ignored))

    logger.info('{} individuals to be enrolled'.format(len(inds_map)))

    write_csv_to_be_enrolled(logger, hashids, out_file_path, inds_map,
                             highest_id)

    csv_header, enrolls_map = get_enrollments_codes(logger, kb, inds_map)
    write_csv_enrollment_codes(logger, args.reportfile, csv_header,
                               enrolls_map)
43be74e62bfe Uploaded
ric
parents:
diff changeset
185
43be74e62bfe Uploaded
ric
parents:
diff changeset
# Script entry point: hand the command-line arguments (program name
# excluded) to main().
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    main(cli_args)