diff galaxy-tools/biobank/utils/build_enrollments_import.py @ 3:43be74e62bfe draft

Uploaded
author ric
date Thu, 22 Sep 2016 08:57:04 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/build_enrollments_import.py	Thu Sep 22 08:57:04 2016 -0400
@@ -0,0 +1,187 @@
+"""
+Prepare a TSV to be imported, with a study code for each individual not
+enrolled in the specified study.
+
+A second study can also be specified: every individual enrolled in it will
+be ignored.
+
+The report file contains the enrollment codes in the other studies.
+
+Codes are short hashes of numbers, generated using Hashids (hashids.org)
+with the study label as the salt parameter.
+
+ex:
+source	                            study	        label
+V03CB1DB357B274B17B139EA56A2FFA19E	AUTOIMMUNITY	ORVL5KMK5
+V0BA695C2E326F4C13AD7F6052BB20539B	AUTOIMMUNITY	9R0M2E12N
+V067C445E35DA04ECCA21FA3E2DF3BBCF6	AUTOIMMUNITY	QGZLQJ1RV
+...
+
+"""
+
+import argparse
+import csv
+import string
+import sys
+
+from hashids import Hashids
+from bl.vl.kb import KnowledgeBase as KB
+from bl.vl.utils import LOG_LEVELS, get_logger
+import bl.vl.utils.ome_utils as vlu
+
+
+def make_parser():
+    """Build the command line parser (--study and --ofile are required)."""
+    description = 'Retrieve all individuals not enrolled in the specified project'
+    cli = argparse.ArgumentParser(description=description)
+    add = cli.add_argument
+    add('--logfile', type=str, help='log file (default=stderr)')
+    add('--loglevel', type=str, choices=LOG_LEVELS, default='INFO',
+        help='logger level')
+    add('--study', type=str, required=True, help='Study label')
+    add('--study_to_be_ignored', type=str, help='Study label to be ignored')
+    add('--host', type=str, help='Omero hostname')
+    add('--user', type=str, help='Omero user')
+    add('--passwd', type=str, help='Omero password')
+    add('--ofile', type=str, required=True, help='output file path')
+    add('--reportfile', type=str, default='report.tsv', help='report file')
+    return cli
+
+
+def init_hashids(study):
+    """Return a Hashids salted with *study* (A-Z and 0-9, min_length 9)."""
+    code_alphabet = string.ascii_uppercase + string.digits
+    return Hashids(salt=study, min_length=9, alphabet=code_alphabet)
+
+
+def write_csv_to_be_enrolled(logger, hashids, path, inds_map,
+                             highest_id=0):
+    """Write the enrollments TSV (source, study, label) to *path*.
+
+    Each record of *inds_map* gets a 'label' built by hashing a progressive
+    number that starts right after *highest_id*; records are updated in place.
+    """
+    logger.debug('Writing CSV file %s' % path)
+    columns = ['source', 'study', 'label']
+    with open(path, 'w') as out:
+        tsv = csv.DictWriter(out, columns, restval='None',
+                             delimiter='\t', quotechar='"')
+        tsv.writeheader()
+        # enumerate() supplies the progressive numbers highest_id + 1, + 2, ...
+        for number, record in enumerate(inds_map.itervalues(), highest_id + 1):
+            record['label'] = hashids.encrypt(number)
+            tsv.writerow(record)
+
+
+def write_csv_enrollment_codes(logger, filename, csv_header, enrolls_map):
+    """Write the report TSV to *filename*: one row per value of
+    *enrolls_map*, columns from *csv_header*, missing cells as 'None'."""
+    logger.debug('Writing CSV file %s' % filename)
+    with open(filename, 'w') as report:
+        tsv = csv.DictWriter(report, csv_header, restval='None',
+                             delimiter='\t', quotechar='"')
+        tsv.writeheader()
+        for row in enrolls_map.itervalues():
+            tsv.writerow(row)
+
+
+def get_enrollments_codes(logger, kb, inds_map):
+    """Collect the codes the individuals in *inds_map* have in each study.
+
+    Returns (header, rows): header is ['individual_uuid'] plus the label of
+    every study with enrollments; rows maps omero_id -> report row dict."""
+    all_studies = kb.get_objects(kb.Study)
+    logger.info('Retrieved %d studies from database' % len(all_studies))
+
+    header = ['individual_uuid']
+    rows = {}
+    for study in all_studies:
+        logger.info('Retrieving enrollments for study %s' % study.label)
+        enrollments = kb.get_enrolled(study)
+        logger.info('%s enrollments retrieved' % len(enrollments))
+        if len(enrollments) == 0:
+            logger.debug('No enrollments found, skip study %s' % study.label)
+            continue
+        logger.debug('Building lookup dictionary....')
+        header.append(study.label)  # every non-empty study gets a column
+        for enrollment in enrollments:
+            person = enrollment.individual
+            if person.id in inds_map:
+                row = rows.setdefault(person.omero_id, {})
+                row['individual_uuid'] = person.id
+                row[study.label] = enrollment.studyCode
+    return header, rows
+
+
+def main(argv):
+    """Build the enrollments TSV and the report of codes in other studies.
+
+    Every individual in the database that is enrolled neither in --study nor
+    in --study_to_be_ignored is written to --ofile with a new study code.
+    New codes continue the numbering after the highest number decoded from
+    the existing study codes of --study.
+    """
+    parser = make_parser()
+    args = parser.parse_args(argv)
+
+    logger = get_logger('inds_not_enrolled', level=args.loglevel,
+                        filename=args.logfile)
+    try:
+        host = args.host or vlu.ome_host()
+        user = args.user or vlu.ome_user()
+        passwd = args.passwd or vlu.ome_passwd()
+    except ValueError, ve:
+        logger.critical(ve)
+        sys.exit(ve)
+
+    kb = KB(driver='omero')(host, user, passwd)
+
+    inds = kb.get_objects(kb.Individual)
+    logger.info('Retrieved {} individuals'.format(len(inds)))
+
+    # Candidate rows, keyed by individual id; enrolled ones are dropped below
+    inds_map = {}
+    for i in inds:
+        inds_map[i.id] = {'source': i.id, 'study': args.study}
+
+    study = kb.get_by_label(kb.Study, args.study)
+    if study:
+        logger.info('{} present in the database'.format(study.label))
+    else:
+        logger.critical('{} not present in the database'.format(args.study))
+        sys.exit(1)  # a bare sys.exit() would report success
+
+    hashids = init_hashids(study.label)
+    enrolls = kb.get_enrolled(study)
+    logger.info("{} enrollments founded in {}".format(len(enrolls),
+                                                      study.label))
+    highest_id = 0
+    for e in enrolls:
+        inds_map.pop(e.individual.id, None)
+        # decrypt() returns a tuple of numbers, empty when the code cannot
+        # be decoded: compare its first item, never the tuple itself
+        decoded = hashids.decrypt(e.studyCode)
+        if decoded and decoded[0] > highest_id:
+            highest_id = decoded[0]
+
+    ignored = None
+    if args.study_to_be_ignored:
+        ignored = kb.get_by_label(kb.Study, args.study_to_be_ignored)
+    if ignored:
+        enr = kb.get_enrolled(ignored)
+        logger.info('Retrieved {} enrollments from {}'.format(
+            len(enr), args.study_to_be_ignored))
+        for e in enr:
+            inds_map.pop(e.individual.id, None)
+
+    logger.info('{} individuals to be enrolled'.format(len(inds_map)))
+
+    write_csv_to_be_enrolled(logger, hashids, args.ofile, inds_map,
+                             highest_id)
+
+    csv_header, enrolls_map = get_enrollments_codes(logger, kb, inds_map)
+    write_csv_enrollment_codes(logger, args.reportfile, csv_header,
+                               enrolls_map)
+    
+if __name__ == '__main__':  # run only when executed as a script
+    main(sys.argv[1:])  # drop the program name, pass only the options