Mercurial > repos > ric > test1
diff galaxy-tools/biobank/utils/build_enrollments_import.py @ 3:43be74e62bfe draft
Uploaded
author | ric |
---|---|
date | Thu, 22 Sep 2016 08:57:04 -0400 |
parents | |
children |
line wrap: on
line diff
"""
Prepare a TSV file, ready to be imported, that assigns a study code to
every individual not yet enrolled in a specified study.

Optionally a second study can be given: every individual enrolled in it
is ignored.

The report file lists the enrollment codes that the selected individuals
already have in the other studies.

Codes are short hashes of progressive numbers, generated with Hashids
(hashids.org) using the study label as the salt parameter.

ex:
source study label
V03CB1DB357B274B17B139EA56A2FFA19E AUTOIMMUNITY ORVL5KMK5
V0BA695C2E326F4C13AD7F6052BB20539B AUTOIMMUNITY 9R0M2E12N
V067C445E35DA04ECCA21FA3E2DF3BBCF6 AUTOIMMUNITY QGZLQJ1RV
...

"""

import argparse
import csv
import string
import sys

from hashids import Hashids
from bl.vl.kb import KnowledgeBase as KB
from bl.vl.utils import LOG_LEVELS, get_logger
import bl.vl.utils.ome_utils as vlu


def make_parser():
    """Build and return the command line parser of the tool."""
    parser = argparse.ArgumentParser(
        description='Retrieve all individuals not enrolled in the specified project')
    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logger level', default='INFO')
    parser.add_argument('--study', type=str, help='Study label', required=True)
    parser.add_argument('--study_to_be_ignored', type=str,
                        help='Study label to be ignored')
    parser.add_argument('--host', type=str, help='Omero hostname')
    parser.add_argument('--user', type=str, help='Omero user')
    parser.add_argument('--passwd', type=str, help='Omero password')
    parser.add_argument('--ofile', type=str, help='output file path',
                        required=True)
    parser.add_argument('--reportfile', type=str, help='report file',
                        default='report.tsv')
    return parser


def init_hashids(study):
    """Return a Hashids encoder salted with the study label.

    Codes are at least 9 characters long and only use upper case ASCII
    letters and digits.
    """
    hashids = Hashids(salt=study, min_length=9,
                      alphabet=string.ascii_uppercase + string.digits)
    return hashids


def write_csv_to_be_enrolled(logger, hashids, path, inds_map,
                             highest_id=0):
    """Write the TSV file with the enrollments to be imported.

    One row (columns ``source``, ``study``, ``label``) is written for each
    value of ``inds_map``. Labels are obtained by encoding progressive
    numbers starting from ``highest_id + 1``.

    NOTE: the generated label is also stored under the ``'label'`` key of
    each record of ``inds_map`` (the records are modified in place).
    """
    csv_header = ['source', 'study', 'label']
    study_id = highest_id

    # Write to CSV file
    logger.debug('Writing CSV file %s' % path)
    with open(path, 'w') as f:
        writer = csv.DictWriter(f, csv_header,
                                delimiter='\t', quotechar='"',
                                restval='None')
        writer.writeheader()
        # Only the records are needed; .values() works on Python 2 and 3.
        for record in inds_map.values():
            study_id += 1
            record['label'] = hashids.encrypt(study_id)
            writer.writerow(record)
    return


def write_csv_enrollment_codes(logger, filename, csv_header, enrolls_map):
    """Write the report TSV file.

    One row is written for each value of ``enrolls_map``; columns missing
    from a record are filled with the string ``None``.
    """
    logger.debug('Writing CSV file %s' % filename)
    with open(filename, 'w') as f:
        writer = csv.DictWriter(f, csv_header,
                                delimiter='\t', quotechar='"',
                                restval='None')
        writer.writeheader()
        for record in enrolls_map.values():
            writer.writerow(record)
    return


def get_enrollments_codes(logger, kb, inds_map):
    """Retrieve enrollments codes in other studies for the individuals
    to be enrolled into the specified study.

    Returns a ``(csv_header, enrolls_map)`` tuple: ``csv_header`` is
    ``'individual_uuid'`` followed by the label of every study that has at
    least one enrollment; ``enrolls_map`` maps the omero_id of each
    individual found in ``inds_map`` to a record holding its id and its
    study code for each study it is enrolled in.
    """
    # Retrieve all studies from omero
    studies = kb.get_objects(kb.Study)
    logger.info('Retrieved %d studies from database' % len(studies))

    csv_header = ['individual_uuid']
    enrolls_map = {}
    # For each study, retrieve all enrollments
    for s in studies:
        logger.info('Retrieving enrollments for study %s' % s.label)
        enrolls = kb.get_enrolled(s)
        logger.info('%s enrollments retrieved' % len(enrolls))
        if len(enrolls) > 0:
            logger.debug('Building lookup dictionary....')
            csv_header.append(s.label)  # Add study label to CSV header
            for e in enrolls:
                if e.individual.id in inds_map:
                    record = enrolls_map.setdefault(e.individual.omero_id, {})
                    record['individual_uuid'] = e.individual.id
                    record[s.label] = e.studyCode
        else:
            logger.debug('No enrollments found, skip study %s' % s.label)

    return csv_header, enrolls_map


def main(argv):
    """Entry point: build the enrollments import file and the report file.

    ``argv`` is the list of command line arguments (without the program
    name). The process exits with a non-zero status if the connection
    parameters cannot be resolved or the requested study does not exist.
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    logger = get_logger('inds_not_enrolled', level=args.loglevel,
                        filename=args.logfile)
    try:
        host = args.host or vlu.ome_host()
        user = args.user or vlu.ome_user()
        passwd = args.passwd or vlu.ome_passwd()
    except ValueError as ve:
        logger.critical(ve)
        sys.exit(ve)

    out_file_path = args.ofile

    kb = KB(driver='omero')(host, user, passwd)

    inds = kb.get_objects(kb.Individual)
    logger.info('Retrieved {} individuals'.format(len(inds)))

    # Candidate enrollments, keyed by individual id; individuals that must
    # not be enrolled are removed from this map below.
    inds_map = {}
    for i in inds:
        inds_map[i.id] = {'source': i.id, 'study': args.study}

    study = kb.get_by_label(kb.Study, args.study)
    if study:
        logger.info('{} present in the database'.format(study.label))
    else:
        logger.critical('{} not present in the database'.format(args.study))
        # A missing study is a failure: do not exit with a success status.
        sys.exit(1)

    hashids = init_hashids(study.label)
    enrolls = kb.get_enrolled(study)
    logger.info("{} enrollments founded in {}".format(len(enrolls),
                                                      study.label))

    # Drop the individuals already enrolled in the study and find the
    # highest number already used, so that new codes start right after it.
    highest_id = 0
    for e in enrolls:
        inds_map.pop(e.individual.id, None)
        # decrypt yields a sequence of numbers; it is empty when the code
        # cannot be decoded with this salt (e.g. a code not generated by
        # this tool), in which case the code is skipped.
        decoded = hashids.decrypt(e.studyCode)
        if decoded and decoded[0] > highest_id:
            highest_id = decoded[0]

    if args.study_to_be_ignored:
        ignored_study = kb.get_by_label(kb.Study, args.study_to_be_ignored)
        if ignored_study:
            enr = kb.get_enrolled(ignored_study)
            logger.info('Retrieved {} enrollments from {}'.format(
                len(enr), args.study_to_be_ignored))
            for e in enr:
                inds_map.pop(e.individual.id, None)
        else:
            logger.warning('{} not present in the database'.format(
                args.study_to_be_ignored))

    logger.info('{} individuals to be enrolled'.format(len(inds_map)))

    write_csv_to_be_enrolled(logger, hashids, out_file_path, inds_map,
                             highest_id)

    csv_header, enrolls_map = get_enrollments_codes(logger, kb, inds_map)
    write_csv_enrollment_codes(logger, args.reportfile, csv_header,
                               enrolls_map)


if __name__ == '__main__':
    main(sys.argv[1:])