Mercurial > repos > ric > test1
view galaxy-tools/biobank/utils/build_enrollments_import.py @ 12:46f08bb8dd68 draft default tip
Uploaded
author | ric |
---|---|
date | Wed, 28 Sep 2016 04:59:02 -0400 |
parents | 43be74e62bfe |
children |
line wrap: on
line source
""" prepare a tsv to be imported with a study code foreach individuals not present in a specified study. Can be specified also a study from which each individuals enrolled in will be ignored Report file contains enrollments codes in the others studies Codes are short hashes from numbers generated using Hashids.org with study label as salt parameter ex: source study label V03CB1DB357B274B17B139EA56A2FFA19E AUTOIMMUNITY ORVL5KMK5 V0BA695C2E326F4C13AD7F6052BB20539B AUTOIMMUNITY 9R0M2E12N V067C445E35DA04ECCA21FA3E2DF3BBCF6 AUTOIMMUNITY QGZLQJ1RV ... """ import argparse import csv import string import sys from hashids import Hashids from bl.vl.kb import KnowledgeBase as KB from bl.vl.utils import LOG_LEVELS, get_logger import bl.vl.utils.ome_utils as vlu def make_parser(): parser = argparse.ArgumentParser(description='Retrieve all individuals not enrolled in the specified project') parser.add_argument('--logfile', type=str, help='log file (default=stderr)') parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS, help='logger level', default='INFO') parser.add_argument('--study', type=str, help='Study label', required=True) parser.add_argument('--study_to_be_ignored', type=str, help='Study label to be ignored') parser.add_argument('--host', type=str, help='Omero hostname') parser.add_argument('--user', type=str, help='Omero user') parser.add_argument('--passwd', type=str, help='Omero password') parser.add_argument('--ofile', type=str, help='output file path', required=True) parser.add_argument('--reportfile', type=str, help='report file', default='report.tsv') return parser def init_hashids(study): hashids = Hashids(salt=study, min_length=9, alphabet=string.ascii_uppercase + string.digits) return hashids def write_csv_to_be_enrolled(logger, hashids, path, inds_map, highest_id=0): csv_header = ['source', 'study', 'label'] study_id = highest_id # Write to CSV file logger.debug('Writing CSV file %s' % path) with open(path, 'w') as f: writer = csv.DictWriter(f, csv_header, delimiter='\t', quotechar='"', restval='None') writer.writeheader() for k, v in inds_map.iteritems(): study_id += 1 v['label'] = hashids.encrypt(study_id) writer.writerow(v) return def write_csv_enrollment_codes(logger, filename, csv_header, enrolls_map): logger.debug('Writing CSV file %s' % filename) with open(filename, 'w') as f: writer = csv.DictWriter(f, csv_header, delimiter='\t', quotechar='"', restval='None') writer.writeheader() for k, v in enrolls_map.iteritems(): writer.writerow(v) return def get_enrollments_codes(logger, kb, inds_map): """Retrieve enrollments codes in other studies for the individuals to be enrolled into the specified study""" # Retrieve all studies from omero studies = kb.get_objects(kb.Study) logger.info('Retrieved %d studies from database' % len(studies)) csv_header = ['individual_uuid'] enrolls_map = {} # For each study, retrieve all enrollments for s in studies: logger.info('Retrieving enrollments for study %s' % s.label) enrolls = kb.get_enrolled(s) logger.info('%s enrollments retrieved' % len(enrolls)) if len(enrolls) > 0: logger.debug('Building lookup dictionary....') csv_header.append(s.label) # Add study label to CSV header for e in enrolls: if e.individual.id in inds_map: enrolls_map.setdefault(e.individual.omero_id, {})['individual_uuid'] = e.individual.id enrolls_map[e.individual.omero_id][s.label] = e.studyCode else: logger.debug('No enrollments found, skip study %s' % s.label) return csv_header, enrolls_map def main(argv): parser = make_parser() args = parser.parse_args(argv) logger = get_logger('inds_not_enrolled', level=args.loglevel, filename=args.logfile) try: host = args.host or vlu.ome_host() user = args.user or vlu.ome_user() passwd = args.passwd or vlu.ome_passwd() except ValueError, ve: logger.critical(ve) sys.exit(ve) out_file_path = args.ofile kb = KB(driver='omero')(host, user, passwd) inds = kb.get_objects(kb.Individual) #len_inds = len(inds) logger.info('Retrieved {} individuals'.format(len(inds))) inds_map = {} for i in inds: inds_map.setdefault(i.id, {})['source'] = i.id inds_map[i.id]['study'] = args.study study = kb.get_by_label(kb.Study, args.study) if study: logger.info('{} present in the database'.format(study.label)) else: logger.critical('{} not present in the database'.format(args.study)) sys.exit() hashids = init_hashids(study.label) enrolls = kb.get_enrolled(study) logger.info("{} enrollments founded in {}".format(len(enrolls), study.label)) highest_id = 0 #ids = [] for e in enrolls: if e.individual.id in inds_map: del inds_map[e.individual.id] _ = hashids.decrypt(e.studyCode) if _ > highest_id: highest_id = _[0] if args.study_to_be_ignored and kb.get_by_label(kb.Study, args.study_to_be_ignored): to_be_removed = [args.study_to_be_ignored] else: to_be_removed = [] for tbr_study in to_be_removed: enr = kb.get_enrolled(kb.get_by_label(kb.Study, tbr_study)) logger.info('Retrieved {} enrollments from {}'.format(len(enr), tbr_study)) for e in enr: if e.individual.id in inds_map: del inds_map[e.individual.id] logger.info('{} individuals to be enrolled'.format(len(inds_map))) write_csv_to_be_enrolled(logger, hashids, out_file_path, inds_map, highest_id) csv_header, enrolls_map = get_enrollments_codes(logger, kb, inds_map) write_csv_enrollment_codes(logger, args.reportfile, csv_header, enrolls_map) if __name__ == '__main__': main(sys.argv[1:])