diff galaxy-tools/biobank/tools/build_miniped.py @ 0:ba6cf6ede027 draft default tip

Uploaded
author ric
date Wed, 28 Sep 2016 06:03:30 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/build_miniped.py	Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,216 @@
+# BEGIN_COPYRIGHT
+# END_COPYRIGHT
+
+"""
+A rough example of basic pedigree info generation.
+"""
+
+import argparse
+import collections
+import csv
+import os
+import sys
+import yaml
+
+from bl.vl.kb import KnowledgeBase as KB
+from bl.vl.kb.drivers.omero.ehr import EHR
+import bl.vl.individual.pedigree as ped
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.utils import LOG_LEVELS, get_logger
+
+PLINK_MISSING = -9
+PLINK_UNAFFECTED = 1
+PLINK_AFFECTED = 2
+
+FIELDS = ["fam_label", "ind_label", "fat_label", "mot_label", "gender"]
+
+
def load_config(config_file):
    """Parse *config_file* (YAML) and return the resulting object.

    Uses yaml.safe_load instead of yaml.load: a configuration file must not
    be able to instantiate arbitrary Python objects, which plain yaml.load
    without an explicit Loader allows.
    """
    with open(config_file) as cfg:
        conf = yaml.safe_load(cfg)
    return conf
+
+
class Diagnosis(object):
    """Accessor for the diagnosis section of the tool's YAML config.

    The config is expected to provide an 'openEHR' mapping (the archetype
    and field used to match EHR records) and a 'diagnosis' mapping whose
    values carry a 'label' and a list of 'values' (diagnosis terms).
    """

    def __init__(self, logger, yaml_file):
        self.logger = logger
        if os.path.isfile(yaml_file):
            self.conf = load_config(yaml_file)
        else:
            self.logger.critical('The config file {} does not exist'.format(
                yaml_file))
            # exit with a non-zero status so callers can detect the failure
            # (a bare sys.exit() would report success despite the error)
            sys.exit(1)

    def get_openehr_data(self):
        """Return the (archetype, field) pair used to query EHR records."""
        return self.conf['openEHR']['archetype'], self.conf['openEHR']['field']

    def get_diagnosis_label(self):
        """Return the diagnosis labels, in the same order as get_diagnosis()."""
        return list(self.get_diagnosis())

    def get_diagnosis(self):
        """Return an OrderedDict mapping diagnosis label -> matching values.

        Entries are ordered by the sorted keys of the 'diagnosis' section,
        so the column order of the output ped file is deterministic.
        """
        results = collections.OrderedDict()
        diagnosis = collections.OrderedDict(sorted(
            self.conf['diagnosis'].items()))
        for v in diagnosis.itervalues():
            results[v['label']] = v['values']
        return results

    def get_unaffected_diagnosis_dictionary(self):
        """Return a dict mapping every diagnosis label to PLINK_UNAFFECTED."""
        # use the module constant instead of a hard-coded 1 so the PLINK
        # status encoding is defined in exactly one place
        return dict((k, PLINK_UNAFFECTED) for k in self.get_diagnosis_label())
+
+
def make_parser():
    """Build the command line parser for this tool."""
    default_config = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'build_miniped.yaml')
    parser = argparse.ArgumentParser(
        description='build the first columns of a ped file from VL')
    parser.add_argument('--logfile', type=str,
                        help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        default='INFO', help='logging level')
    parser.add_argument('--configfile', type=str, default=default_config,
                        help='config file (yaml) with diagnosis dictionary')
    parser.add_argument('-H', '--host', type=str, help='omero hostname')
    parser.add_argument('-U', '--user', type=str, help='omero user')
    parser.add_argument('-P', '--passwd', type=str, help='omero password')
    parser.add_argument('-S', '--study', type=str, required=True,
                        help="a list of comma separated studies used to "
                             "retrieve individuals that will be written to "
                             "ped file")
    parser.add_argument('--ofile', type=str, required=True,
                        help='output file path')
    parser.add_argument('--write_header', action='store_true', default=False,
                        help='Write header into the output file')
    return parser
+
+
def build_families(individuals, logger):
    """Partition *individuals* into disjoint families via pedigree analysis."""
    logger.info("individuals: %d" % len(individuals))
    logger.info("analyzing pedigree")
    # only the parent->children mapping is needed for the split; the other
    # results of the analysis (founders, non-founders, dangling, couples)
    # are discarded
    analysis = ped.analyze(individuals)
    children = analysis[-1]
    logger.info("splitting into families")
    return ped.split_disjoint(individuals, children)
+
+
+def main(argv):
+    parser = make_parser()
+    args = parser.parse_args(argv)
+
+    logger = get_logger('build_miniped', level=args.loglevel,
+                        filename=args.logfile)
+
+    dobj = Diagnosis(logger, args.configfile)
+    logger.debug("l {}".format(dobj.get_diagnosis_label()))
+
+    try:
+        host = args.host or vlu.ome_host()
+        user = args.user or vlu.ome_user()
+        passwd = args.passwd or vlu.ome_passwd()
+    except ValueError, ve:
+        logger.critical(ve)
+        sys.exit(ve)
+
+    kb = KB(driver='omero')(host, user, passwd)
+    logger.debug('Loading all individuals from omero')
+    all_inds = kb.get_objects(kb.Individual)  # store all inds to cache
+    logger.debug('%d individuals loaded' % len(all_inds))
+    studies = [kb.get_study(s) for s in args.study.split(',')]
+    # Removing None values
+    studies = set(studies)
+    try:
+        studies.remove(None)
+    except KeyError:
+        pass
+    studies = list(studies)
+    # Sorting studies
+    studies = sorted(studies, key=lambda k: k.label.lower())
+    if len(studies) == 0:
+        logger.error(
+            'No matches found for labels %s, stopping program' % args.study)
+        sys.exit(2)
+    enrolled_map = {}
+    for study in studies:
+        logger.info('Loading enrolled individuals for study %s' % study.label)
+        enrolled = kb.get_enrolled(study)
+        logger.debug('%d individuals loaded' % len(enrolled))
+        for en in enrolled:
+            if en.individual.id not in enrolled_map:
+                enrolled_map[en.individual.id] = (
+                    '%s:%s' % (en.study.label, en.studyCode),
+                    en.individual)
+            else:
+                logger.debug('Individual %s already mapped' % en.individual.id)
+        logger.debug('Loading EHR records')
+        ehr_records = kb.get_ehr_records()
+        logger.debug('%s EHR records loaded' % len(ehr_records))
+        ehr_records_map = {}
+        for r in ehr_records:
+            ehr_records_map.setdefault(r['i_id'], []).append(r)
+        affection_map = {}
+        arch, field = dobj.get_openehr_data()
+        for ind_id, ehr_recs in ehr_records_map.iteritems():
+            affection_map[ind_id] = dobj.get_unaffected_diagnosis_dictionary()
+            ehr = EHR(ehr_recs)
+            for k, v in dobj.get_diagnosis().iteritems():
+                for d in v:
+                    if ehr.matches(arch, field, d):
+                        affection_map[ind_id][k] = PLINK_AFFECTED
+
+    immuno_inds = [i for (ind_id, (st_code, i)) in enrolled_map.iteritems()]
+    families = build_families(immuno_inds, logger)
+    logger.info("found %d families" % len(families))
+
+    def resolve_label(i):
+        try:
+            return enrolled_map[i.id][0]
+        except KeyError:
+            return i.id
+
+    def resolve_pheno(i):
+        try:
+            immuno_affection = affection_map[i.id]
+        except KeyError:
+            return [(d, PLINK_MISSING) for d in dobj.get_diagnosis_label()]
+        return [(d, immuno_affection[d]) for d in dobj.get_diagnosis_label()]
+
+    kb.Gender.map_enums_values(kb)
+    gender_map = lambda x: 2 if x.enum_label() == kb.Gender.FEMALE.enum_label() \
+        else 1
+
+    for d in dobj.get_diagnosis_label():
+        FIELDS.append("_".join([d, "status"]))
+    with open(args.ofile, "w") as f:
+        writer = csv.DictWriter(f, FIELDS, delimiter="\t", lineterminator="\n")
+        if args.write_header:
+            writer.writeheader()
+        families_data = []
+        logger.info("building families data")
+        for k, fam in enumerate(families):
+            fam_label = "FAM_%d" % (k + 1)
+            for i in fam:
+                r = {"fam_label": fam_label,
+                     "ind_label": resolve_label(i),
+                     "fat_label": 0 if (i.father is None or i.father not in fam)
+                     else resolve_label(i.father),
+                     "mot_label": 0 if (i.mother is None or i.mother not in fam)
+                     else resolve_label(i.mother),
+                     "gender": gender_map(i.gender)}
+                for p in resolve_pheno(i):
+                    r["_".join([p[0], "status"])] = p[1]
+                families_data.append(r)
+        logger.info("writing miniped")
+        writer.writerows(families_data)
+
+
# Script entry point: forward the command line arguments (sans program name).
if __name__ == "__main__":
    main(sys.argv[1:])