view galaxy-tools/biobank/utils/build_enrollments_import.py @ 12:46f08bb8dd68 draft default tip

Uploaded
author ric
date Wed, 28 Sep 2016 04:59:02 -0400
parents 43be74e62bfe
children
line wrap: on
line source

"""
Prepare a TSV file, ready to be imported, that assigns a study code to each
individual not yet enrolled in the specified study.

A second study can also be specified: every individual enrolled in it will
be ignored.

The report file contains the enrollment codes in the other studies.

Codes are short hashes from numbers generated using Hashids.org with
study label as salt parameter

ex:
source	                            study	        label
V03CB1DB357B274B17B139EA56A2FFA19E	AUTOIMMUNITY	ORVL5KMK5
V0BA695C2E326F4C13AD7F6052BB20539B	AUTOIMMUNITY	9R0M2E12N
V067C445E35DA04ECCA21FA3E2DF3BBCF6	AUTOIMMUNITY	QGZLQJ1RV
...

"""

import argparse
import csv
import string
import sys

from hashids import Hashids
from bl.vl.kb import KnowledgeBase as KB
from bl.vl.utils import LOG_LEVELS, get_logger
import bl.vl.utils.ome_utils as vlu


def make_parser():
    """Build the command-line parser for this script.

    Options are declared in a table and registered in order, so the
    generated help lists them in the same sequence as the table.
    ``--study`` and ``--ofile`` are mandatory; every other option is
    optional.
    """
    cli = argparse.ArgumentParser(
        description='Retrieve all individuals not enrolled in the specified project')

    # (flag, keyword arguments handed to add_argument)
    option_table = (
        ('--logfile', dict(type=str, help='log file (default=stderr)')),
        ('--loglevel', dict(type=str, choices=LOG_LEVELS,
                            help='logger level', default='INFO')),
        ('--study', dict(type=str, help='Study label', required=True)),
        ('--study_to_be_ignored', dict(type=str,
                                       help='Study label to be ignored')),
        ('--host', dict(type=str, help='Omero hostname')),
        ('--user', dict(type=str, help='Omero user')),
        ('--passwd', dict(type=str, help='Omero password')),
        ('--ofile', dict(type=str, help='output file path', required=True)),
        ('--reportfile', dict(type=str, help='report file',
                              default='report.tsv')),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli


def init_hashids(study):
    """Return a Hashids instance salted with the given study label.

    Codes are at least 9 characters long and drawn from uppercase ASCII
    letters and digits only.
    """
    code_alphabet = string.ascii_uppercase + string.digits
    return Hashids(salt=study, min_length=9, alphabet=code_alphabet)


def write_csv_to_be_enrolled(logger, hashids, path, inds_map,
                             highest_id=0):
    """Write the tab-separated enrollment import file.

    One row is written per entry of ``inds_map``, under the header
    ``source``, ``study``, ``label``.  The label of each row is produced by
    ``hashids.encrypt`` from a counter that starts at ``highest_id + 1`` and
    grows by one per row, so new codes continue after the ones already used.

    logger     -- object exposing ``debug``
    hashids    -- object exposing ``encrypt(int)``
    path       -- output file path; an existing file is overwritten
    inds_map   -- mapping whose values are the row dicts; the ``label`` key
                  is added to each row dict in place
    highest_id -- highest counter value already in use (default 0)
    """
    csv_header = ['source', 'study', 'label']
    study_id = highest_id

    logger.debug('Writing CSV file %s' % path)
    with open(path, 'w') as f:
        writer = csv.DictWriter(f, csv_header,
                                delimiter='\t', quotechar='"',
                                restval='None')
        writer.writeheader()
        # Only the row dicts are needed; values() works on Python 2 and 3
        # alike, unlike iteritems().
        for row in inds_map.values():
            study_id += 1
            row['label'] = hashids.encrypt(study_id)
            writer.writerow(row)


def write_csv_enrollment_codes(logger, filename, csv_header, enrolls_map):
    """Write the tab-separated report of enrollment codes.

    logger      -- object exposing ``debug``
    filename    -- output file path; an existing file is overwritten
    csv_header  -- ordered list of column names
    enrolls_map -- mapping whose values are the row dicts; columns missing
                   from a row are written as the string ``None``
    """
    logger.debug('Writing CSV file %s' % filename)
    with open(filename, 'w') as f:
        writer = csv.DictWriter(f, csv_header,
                                delimiter='\t', quotechar='"',
                                restval='None')
        writer.writeheader()
        # Keys are not used; values() works on Python 2 and 3 alike,
        # unlike iteritems().
        for row in enrolls_map.values():
            writer.writerow(row)


def get_enrollments_codes(logger, kb, inds_map):
    """Collect, for the individuals in ``inds_map``, their codes in every study.

    Returns a ``(header, codes)`` pair.  ``header`` starts with
    ``individual_uuid`` followed by the label of each study that has at
    least one enrollment.  ``codes`` is keyed by the individual's
    ``omero_id``; each value holds the individual's id under
    ``individual_uuid`` plus one ``study label -> study code`` entry per
    enrollment found.
    """
    all_studies = kb.get_objects(kb.Study)
    logger.info('Retrieved %d studies from database' % len(all_studies))

    header = ['individual_uuid']
    codes_by_individual = {}

    for study in all_studies:
        logger.info('Retrieving enrollments for study %s' % study.label)
        enrollments = kb.get_enrolled(study)
        logger.info('%s enrollments retrieved' % len(enrollments))

        if len(enrollments) == 0:
            logger.debug('No enrollments found, skip study %s' % study.label)
            continue

        logger.debug('Building lookup dictionary....')
        # Studies with enrollments get a column even when none of them
        # concerns an individual of interest.
        header.append(study.label)
        for enrollment in enrollments:
            individual = enrollment.individual
            if individual.id not in inds_map:
                continue
            record = codes_by_individual.setdefault(individual.omero_id, {})
            record['individual_uuid'] = individual.id
            record[study.label] = enrollment.studyCode

    return header, codes_by_individual


def main(argv):
    """Build the enrollment import file and the cross-study report.

    Steps:
      1. parse ``argv`` and connect to the knowledge base;
      2. start from every individual in the knowledge base;
      3. drop the individuals already enrolled in ``--study`` and, while
         doing so, find the highest counter already encoded in their study
         codes;
      4. drop the individuals enrolled in ``--study_to_be_ignored``, if that
         study is given and exists;
      5. write the remaining individuals, with freshly generated codes, to
         ``--ofile`` and their codes in the other studies to
         ``--reportfile``.

    Exits with a non-zero status when the connection parameters cannot be
    resolved or when ``--study`` is not in the database.
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    logger = get_logger('inds_not_enrolled', level=args.loglevel,
                        filename=args.logfile)
    try:
        host = args.host or vlu.ome_host()
        user = args.user or vlu.ome_user()
        passwd = args.passwd or vlu.ome_passwd()
    except ValueError as ve:
        logger.critical(ve)
        sys.exit(ve)

    out_file_path = args.ofile

    kb = KB(driver='omero')(host, user, passwd)

    inds = kb.get_objects(kb.Individual)
    logger.info('Retrieved {} individuals'.format(len(inds)))

    # Candidate rows, keyed by individual id; entries are removed below as
    # individuals turn out to be already enrolled or to be ignored.
    inds_map = {}
    for i in inds:
        inds_map[i.id] = {'source': i.id, 'study': args.study}

    study = kb.get_by_label(kb.Study, args.study)
    if study:
        logger.info('{} present in the database'.format(study.label))
    else:
        logger.critical('{} not present in the database'.format(args.study))
        # Non-zero status so that the calling process can detect the failure.
        sys.exit(1)

    hashids = init_hashids(study.label)
    enrolls = kb.get_enrolled(study)
    logger.info("{} enrollments founded in {}".format(len(enrolls),
                                                      study.label))
    highest_id = 0
    for e in enrolls:
        inds_map.pop(e.individual.id, None)
        # decrypt() yields a sequence of numbers; compare its first element
        # (not the sequence itself) so that highest_id really is the maximum.
        # An empty result is taken to mean the code was not produced by this
        # Hashids configuration -- it is skipped instead of raising IndexError.
        decoded = hashids.decrypt(e.studyCode)
        if not decoded:
            logger.debug('Study code {} cannot be decoded, skipped'.format(
                e.studyCode))
            continue
        if decoded[0] > highest_id:
            highest_id = decoded[0]

    if args.study_to_be_ignored:
        # Look the study up once and reuse the object.
        ignored_study = kb.get_by_label(kb.Study, args.study_to_be_ignored)
        if ignored_study:
            enr = kb.get_enrolled(ignored_study)
            logger.info('Retrieved {} enrollments from {}'.format(
                len(enr), args.study_to_be_ignored))
            for e in enr:
                inds_map.pop(e.individual.id, None)
        else:
            logger.warning('{} not present in the database, nothing to '
                           'ignore'.format(args.study_to_be_ignored))

    logger.info('{} individuals to be enrolled'.format(len(inds_map)))

    write_csv_to_be_enrolled(logger, hashids, out_file_path, inds_map,
                             highest_id)

    csv_header, enrolls_map = get_enrollments_codes(logger, kb, inds_map)
    write_csv_enrollment_codes(logger, args.reportfile, csv_header,
                               enrolls_map)
    
# Script entry point: hand the command-line arguments (without the program
# name) to main() when the file is executed directly.
if __name__ == '__main__':
    main(sys.argv[1:])