Mercurial > repos > ric > test2
comparison galaxy-tools/biobank/utils/build_enrollments_import.py @ 0:ba6cf6ede027 draft default tip
Uploaded
| author | ric |
|---|---|
| date | Wed, 28 Sep 2016 06:03:30 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:ba6cf6ede027 |
|---|---|
| 1 """ | |
| 2 Prepare a TSV file to be imported, with a study code for each individual | |
| 3 not present in the specified study. | |
| 4 | |
| 5 A second study can also be specified: every individual enrolled in it | |
| 6 will be ignored. | |
| 7 | |
| 8 The report file contains the enrollment codes in the other studies. | |
| 9 | |
| 10 Codes are short hashes from numbers generated using Hashids.org with | |
| 11 study label as salt parameter | |
| 12 | |
| 13 ex: | |
| 14 source study label | |
| 15 V03CB1DB357B274B17B139EA56A2FFA19E AUTOIMMUNITY ORVL5KMK5 | |
| 16 V0BA695C2E326F4C13AD7F6052BB20539B AUTOIMMUNITY 9R0M2E12N | |
| 17 V067C445E35DA04ECCA21FA3E2DF3BBCF6 AUTOIMMUNITY QGZLQJ1RV | |
| 18 ... | |
| 19 | |
| 20 """ | |
| 21 | |
| 22 import argparse | |
| 23 import csv | |
| 24 import string | |
| 25 import sys | |
| 26 | |
| 27 from hashids import Hashids | |
| 28 from bl.vl.kb import KnowledgeBase as KB | |
| 29 from bl.vl.utils import LOG_LEVELS, get_logger | |
| 30 import bl.vl.utils.ome_utils as vlu | |
| 31 | |
| 32 | |
def make_parser():
    """Build the command-line parser for this script.

    ``--study`` and ``--ofile`` are required; ``--loglevel`` defaults to
    'INFO' and ``--reportfile`` to 'report.tsv'.
    """
    description = ('Retrieve all individuals not enrolled in the '
                   'specified project')
    # (flag, add_argument keyword arguments), in the order they are registered.
    options = (
        ('--logfile', dict(type=str, help='log file (default=stderr)')),
        ('--loglevel', dict(type=str, choices=LOG_LEVELS,
                            help='logger level', default='INFO')),
        ('--study', dict(type=str, help='Study label', required=True)),
        ('--study_to_be_ignored', dict(type=str,
                                       help='Study label to be ignored')),
        ('--host', dict(type=str, help='Omero hostname')),
        ('--user', dict(type=str, help='Omero user')),
        ('--passwd', dict(type=str, help='Omero password')),
        ('--ofile', dict(type=str, help='output file path', required=True)),
        ('--reportfile', dict(type=str, help='report file',
                              default='report.tsv')),
    )

    arg_parser = argparse.ArgumentParser(description=description)
    for flag, settings in options:
        arg_parser.add_argument(flag, **settings)
    return arg_parser
| 49 | |
| 50 | |
def init_hashids(study):
    """Return a Hashids instance salted with ``study``.

    The instance is configured with ``min_length=9`` and an alphabet made of
    the uppercase ASCII letters followed by the decimal digits.
    """
    code_alphabet = string.ascii_uppercase + string.digits
    return Hashids(salt=study, min_length=9, alphabet=code_alphabet)
| 55 | |
| 56 | |
def write_csv_to_be_enrolled(logger, hashids, path, inds_map,
                             highest_id=0):
    """Write the tab-separated enrollment import file.

    Every row dict in ``inds_map`` is given a 'label' entry, set in place,
    holding ``hashids.encrypt(n)`` where ``n`` counts up from
    ``highest_id + 1`` in the iteration order of ``inds_map``. The row is
    then written under the columns 'source', 'study' and 'label'; a missing
    column is written as the string 'None'.

    :param logger: object with a ``debug`` method
    :param hashids: object with an ``encrypt(int)`` method returning the code
    :param path: path of the file to (over)write
    :param inds_map: mapping whose values are the row dicts
    :param highest_id: number after which the new codes are generated
    """
    csv_header = ['source', 'study', 'label']

    logger.debug('Writing CSV file %s' % path)
    with open(path, 'w') as f:
        writer = csv.DictWriter(f, csv_header,
                                delimiter='\t', quotechar='"',
                                restval='None')
        writer.writeheader()
        # Only the rows are needed, so iterate the values; .values() (unlike
        # iteritems()) exists on both Python 2 and Python 3 dicts.
        for study_id, row in enumerate(inds_map.values(), highest_id + 1):
            row['label'] = hashids.encrypt(study_id)
            writer.writerow(row)
| 74 | |
| 75 | |
def write_csv_enrollment_codes(logger, filename, csv_header, enrolls_map):
    """Write the tab-separated report of enrollment codes.

    One row is written for each value of ``enrolls_map`` under the columns
    listed in ``csv_header``; a column missing from a row is written as the
    string 'None'.

    :param logger: object with a ``debug`` method
    :param filename: path of the file to (over)write
    :param csv_header: list of column names
    :param enrolls_map: mapping whose values are the row dicts
    """
    logger.debug('Writing CSV file %s' % filename)
    with open(filename, 'w') as f:
        writer = csv.DictWriter(f, csv_header,
                                delimiter='\t', quotechar='"',
                                restval='None')
        writer.writeheader()
        # The keys are not used, so iterate the values only; .values()
        # (unlike iteritems()) exists on both Python 2 and Python 3 dicts.
        for row in enrolls_map.values():
            writer.writerow(row)
| 86 | |
| 87 | |
def get_enrollments_codes(logger, kb, inds_map):
    """Collect, study by study, the enrollment codes of the individuals
    whose id is a key of ``inds_map``.

    Returns a ``(csv_header, enrolls_map)`` pair. ``csv_header`` starts with
    'individual_uuid' followed by the label of every study that has at least
    one enrollment. ``enrolls_map`` is keyed by the individual's ``omero_id``
    and each value maps 'individual_uuid' to the individual's id and each
    study label to the study code of that enrollment.
    """
    all_studies = kb.get_objects(kb.Study)
    logger.info('Retrieved %d studies from database' % len(all_studies))

    header = ['individual_uuid']
    codes_by_individual = {}
    for study in all_studies:
        logger.info('Retrieving enrollments for study %s' % study.label)
        enrollments = kb.get_enrolled(study)
        logger.info('%s enrollments retrieved' % len(enrollments))

        if len(enrollments) == 0:
            logger.debug('No enrollments found, skip study %s' % study.label)
            continue

        logger.debug('Building lookup dictionary....')
        # The study gets a column as soon as it has any enrollment at all.
        header.append(study.label)
        for enrollment in enrollments:
            individual = enrollment.individual
            if individual.id not in inds_map:
                continue
            row = codes_by_individual.setdefault(individual.omero_id, {})
            row['individual_uuid'] = individual.id
            row[study.label] = enrollment.studyCode

    return header, codes_by_individual
| 114 | |
| 115 | |
def main(argv):
    """Build the enrollment import file and the enrollment codes report.

    Steps:

    1. connect to the knowledge base (host, user and password come from the
       command line or, when missing, from the ``vlu.ome_*`` helpers);
    2. start from all individuals and drop those already enrolled in
       ``--study`` and, when given and present, in ``--study_to_be_ignored``;
    3. write the remaining individuals to ``--ofile``, with new study codes
       numbered after the highest number decoded from the existing codes
       of ``--study``;
    4. write to ``--reportfile`` the codes those individuals have in the
       studies that have enrollments.

    :param argv: list of command-line arguments, program name excluded

    Exits through ``sys.exit`` when the connection parameters cannot be
    resolved (``ValueError``) or when ``--study`` is not in the database.
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    logger = get_logger('inds_not_enrolled', level=args.loglevel,
                        filename=args.logfile)
    try:
        host = args.host or vlu.ome_host()
        user = args.user or vlu.ome_user()
        passwd = args.passwd or vlu.ome_passwd()
    except ValueError as ve:
        logger.critical(ve)
        sys.exit(ve)

    out_file_path = args.ofile

    kb = KB(driver='omero')(host, user, passwd)

    inds = kb.get_objects(kb.Individual)
    logger.info('Retrieved {} individuals'.format(len(inds)))

    # Candidate rows, keyed by individual id; enrolled individuals are
    # removed from this map below.
    inds_map = {}
    for i in inds:
        inds_map[i.id] = {'source': i.id, 'study': args.study}

    study = kb.get_by_label(kb.Study, args.study)
    if study:
        logger.info('{} present in the database'.format(study.label))
    else:
        logger.critical('{} not present in the database'.format(args.study))
        # A missing study is a failure: report it with a non-zero status.
        sys.exit(1)

    hashids = init_hashids(study.label)
    enrolls = kb.get_enrolled(study)
    logger.info("{} enrollments founded in {}".format(len(enrolls),
                                                      study.label))
    highest_id = 0

    for e in enrolls:
        inds_map.pop(e.individual.id, None)
        # decrypt() yields a sequence of numbers; compare its first element
        # (not the sequence itself) so that highest_id really is the maximum,
        # and skip codes that do not decode to anything.
        decoded = hashids.decrypt(e.studyCode)
        if decoded and decoded[0] > highest_id:
            highest_id = decoded[0]

    if args.study_to_be_ignored:
        ignored_study = kb.get_by_label(kb.Study, args.study_to_be_ignored)
        # An unknown label is silently skipped.
        if ignored_study:
            enr = kb.get_enrolled(ignored_study)
            logger.info('Retrieved {} enrollments from {}'.format(
                len(enr), args.study_to_be_ignored))
            for e in enr:
                inds_map.pop(e.individual.id, None)

    logger.info('{} individuals to be enrolled'.format(len(inds_map)))

    write_csv_to_be_enrolled(logger, hashids, out_file_path, inds_map,
                             highest_id)

    csv_header, enrolls_map = get_enrollments_codes(logger, kb, inds_map)
    write_csv_enrollment_codes(logger, args.reportfile, csv_header,
                               enrolls_map)
| 185 | |
if __name__ == '__main__':
    # Script entry point: pass the command-line arguments, without the
    # program name, to main().
    main(sys.argv[1:])
