diff galaxy-tools/biobank/tools/build_miniped.py @ 0:e54d14bed3f5 draft default tip
Uploaded
| field | value |
| --- | --- |
| author | ric |
| date | Thu, 29 Sep 2016 06:09:15 -0400 |
| parents | |
| children | |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/tools/build_miniped.py	Thu Sep 29 06:09:15 2016 -0400
@@ -0,0 +1,216 @@
+# BEGIN_COPYRIGHT
+# END_COPYRIGHT
+
+"""
+A rough example of basic pedigree info generation.
+"""
+
+import argparse
+import collections
+import csv
+import os
+import sys
+import yaml
+
+from bl.vl.kb import KnowledgeBase as KB
+from bl.vl.kb.drivers.omero.ehr import EHR
+import bl.vl.individual.pedigree as ped
+import bl.vl.utils.ome_utils as vlu
+from bl.vl.utils import LOG_LEVELS, get_logger
+
+PLINK_MISSING = -9
+PLINK_UNAFFECTED = 1
+PLINK_AFFECTED = 2
+
+FIELDS = ["fam_label", "ind_label", "fat_label", "mot_label", "gender"]
+
+
+def load_config(config_file):
+    with open(config_file) as cfg:
+        conf = yaml.load(cfg)
+    return conf
+
+
+class Diagnosis(object):
+    def __init__(self, logger, yaml_file):
+        self.logger = logger
+        if os.path.isfile(yaml_file):
+            self.conf = load_config(yaml_file)
+        else:
+            self.logger.critical('The config file {} does not exist'.format(
+                yaml_file))
+            sys.exit()
+
+    def get_openehr_data(self):
+        return self.conf['openEHR']['archetype'], self.conf['openEHR']['field']
+
+    def get_diagnosis_label(self):
+        return [l for l in self.get_diagnosis().iterkeys()]
+
+    def get_diagnosis(self):
+        results = collections.OrderedDict()
+        diagnosis = collections.OrderedDict(sorted(
+            self.conf['diagnosis'].items()))
+        for v in diagnosis.itervalues():
+            results[v['label']] = v['values']
+        return results
+
+    def get_unaffected_diagnosis_dictionary(self):
+        labels = self.get_diagnosis_label()
+        d = {}
+        for k in labels:
+            d[k] = PLINK_UNAFFECTED
+        return d
+
+
+def make_parser():
+    parser = argparse.ArgumentParser(
+        description='build the first columns of a ped file from VL')
+    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+                        help='logging level', default='INFO')
+    parser.add_argument('--configfile', type=str, default=os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), 'build_miniped.yaml'),
+        help='config file (yaml) with diagnosis dictionary')
+    parser.add_argument('-H', '--host', type=str, help='omero hostname')
+    parser.add_argument('-U', '--user', type=str, help='omero user')
+    parser.add_argument('-P', '--passwd', type=str, help='omero password')
+    parser.add_argument('-S', '--study', type=str, required=True,
+                        help="a list of comma separated studies used to "
+                             "retrieve individuals that will be written to "
+                             "ped file")
+    parser.add_argument('--ofile', type=str, help='output file path',
+                        required=True)
+    parser.add_argument('--write_header', action='store_true', default=False,
+                        help='Write header into the output file')
+    return parser
+
+
+def build_families(individuals, logger):
+    # Individuals with only one parent will be considered as founders
+    # for i in individuals:
+    #     if ((i.mother is None) or (i.father is None)):
+    #         i.mother = None
+    #         i.father = None
+    logger.info("individuals: %d" % len(individuals))
+    # logger.info("individuals: with 0 or 2 parents: %d" % len(not_one_parent))
+    logger.info("analyzing pedigree")
+    founders, non_founders, dangling, couples, children = ped.analyze(
+        individuals
+    )
+    logger.info("splitting into families")
+    return ped.split_disjoint(individuals, children)
+
+
+def main(argv):
+    parser = make_parser()
+    args = parser.parse_args(argv)
+
+    logger = get_logger('build_miniped', level=args.loglevel,
+                        filename=args.logfile)
+
+    dobj = Diagnosis(logger, args.configfile)
+    logger.debug("l {}".format(dobj.get_diagnosis_label()))
+
+    try:
+        host = args.host or vlu.ome_host()
+        user = args.user or vlu.ome_user()
+        passwd = args.passwd or vlu.ome_passwd()
+    except ValueError, ve:
+        logger.critical(ve)
+        sys.exit(ve)
+
+    kb = KB(driver='omero')(host, user, passwd)
+    logger.debug('Loading all individuals from omero')
+    all_inds = kb.get_objects(kb.Individual)  # store all inds to cache
+    logger.debug('%d individuals loaded' % len(all_inds))
+    studies = [kb.get_study(s) for s in args.study.split(',')]
+    # Removing None values
+    studies = set(studies)
+    try:
+        studies.remove(None)
+    except KeyError:
+        pass
+    studies = list(studies)
+    # Sorting studies
+    studies = sorted(studies, key=lambda k: k.label.lower())
+    if len(studies) == 0:
+        logger.error(
+            'No matches found for labels %s, stopping program' % args.study)
+        sys.exit(2)
+    enrolled_map = {}
+    for study in studies:
+        logger.info('Loading enrolled individuals for study %s' % study.label)
+        enrolled = kb.get_enrolled(study)
+        logger.debug('%d individuals loaded' % len(enrolled))
+        for en in enrolled:
+            if en.individual.id not in enrolled_map:
+                enrolled_map[en.individual.id] = (
+                    '%s:%s' % (en.study.label, en.studyCode),
+                    en.individual)
+            else:
+                logger.debug('Individual %s already mapped' % en.individual.id)
+    logger.debug('Loading EHR records')
+    ehr_records = kb.get_ehr_records()
+    logger.debug('%s EHR records loaded' % len(ehr_records))
+    ehr_records_map = {}
+    for r in ehr_records:
+        ehr_records_map.setdefault(r['i_id'], []).append(r)
+    affection_map = {}
+    arch, field = dobj.get_openehr_data()
+    for ind_id, ehr_recs in ehr_records_map.iteritems():
+        affection_map[ind_id] = dobj.get_unaffected_diagnosis_dictionary()
+        ehr = EHR(ehr_recs)
+        for k, v in dobj.get_diagnosis().iteritems():
+            for d in v:
+                if ehr.matches(arch, field, d):
+                    affection_map[ind_id][k] = PLINK_AFFECTED
+
+    immuno_inds = [i for (ind_id, (st_code, i)) in enrolled_map.iteritems()]
+    families = build_families(immuno_inds, logger)
+    logger.info("found %d families" % len(families))
+
+    def resolve_label(i):
+        try:
+            return enrolled_map[i.id][0]
+        except KeyError:
+            return i.id
+
+    def resolve_pheno(i):
+        try:
+            immuno_affection = affection_map[i.id]
+        except KeyError:
+            return [(d, PLINK_MISSING) for d in dobj.get_diagnosis_label()]
+        return [(d, immuno_affection[d]) for d in dobj.get_diagnosis_label()]
+
+    kb.Gender.map_enums_values(kb)
+    gender_map = lambda x: 2 if x.enum_label() == kb.Gender.FEMALE.enum_label() \
+        else 1
+
+    for d in dobj.get_diagnosis_label():
+        FIELDS.append("_".join([d, "status"]))
+    with open(args.ofile, "w") as f:
+        writer = csv.DictWriter(f, FIELDS, delimiter="\t", lineterminator="\n")
+        if args.write_header:
+            writer.writeheader()
+        families_data = []
+        logger.info("building families data")
+        for k, fam in enumerate(families):
+            fam_label = "FAM_%d" % (k + 1)
+            for i in fam:
+                r = {"fam_label": fam_label,
+                     "ind_label": resolve_label(i),
+                     "fat_label": 0 if (i.father is None or i.father not in fam)
                     else resolve_label(i.father),
+                     "mot_label": 0 if (i.mother is None or i.mother not in fam)
                     else resolve_label(i.mother),
+                     "gender": gender_map(i.gender)}
+                for p in resolve_pheno(i):
+                    r["_".join([p[0], "status"])] = p[1]
+                families_data.append(r)
+        logger.info("writing miniped")
+        writer.writerows(families_data)
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])

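For reference, a typical invocation would pass a comma-separated study list and an output path, e.g. `build_miniped.py -S STUDY_A --ofile out.ped --write_header`, with OMERO credentials supplied via `-H/-U/-P` or resolved through `bl.vl.utils.ome_utils`. The sketch below illustrates the rows the script writes, under the same assumptions as the config example above (the study codes, family labels, and diagnosis columns are hypothetical): columns follow `FIELDS` plus one `<label>_status` column per configured diagnosis, parents outside the family are written as 0, gender is coded 1 (male) / 2 (female), and phenotype statuses use the PLINK codes defined at the top of the script (1 unaffected, 2 affected, -9 missing).

```python
# Illustration of the tab-separated miniped output, using the same DictWriter
# setup as the script. Individual labels ("STUDY:code"), family labels and
# diagnosis columns are hypothetical examples.
import csv
import sys

fields = ["fam_label", "ind_label", "fat_label", "mot_label", "gender",
          "t1d_status", "ms_status"]

rows = [
    # founder: unknown parents are written as 0; both diagnoses unaffected (1)
    {"fam_label": "FAM_1", "ind_label": "STUDY_A:001", "fat_label": 0,
     "mot_label": 0, "gender": 1, "t1d_status": 1, "ms_status": 1},
    # child of the founder above, affected for t1d (2); an individual with no
    # EHR records at all would instead get -9 in every *_status column
    {"fam_label": "FAM_1", "ind_label": "STUDY_A:002",
     "fat_label": "STUDY_A:001", "mot_label": 0, "gender": 2,
     "t1d_status": 2, "ms_status": 1},
]

writer = csv.DictWriter(sys.stdout, fields, delimiter="\t", lineterminator="\n")
writer.writeheader()
writer.writerows(rows)
```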