Mercurial > repos > ric > test2
comparison galaxy-tools/biobank/tools/build_miniped.py @ 0:ba6cf6ede027 draft default tip
Uploaded
| author | ric |
|---|---|
| date | Wed, 28 Sep 2016 06:03:30 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:ba6cf6ede027 |
|---|---|
| 1 # BEGIN_COPYRIGHT | |
| 2 # END_COPYRIGHT | |
| 3 | |
| 4 """ | |
| 5 A rough example of basic pedigree info generation. | |
| 6 """ | |
| 7 | |
| 8 import argparse | |
| 9 import collections | |
| 10 import csv | |
| 11 import os | |
| 12 import sys | |
| 13 import yaml | |
| 14 | |
| 15 from bl.vl.kb import KnowledgeBase as KB | |
| 16 from bl.vl.kb.drivers.omero.ehr import EHR | |
| 17 import bl.vl.individual.pedigree as ped | |
| 18 import bl.vl.utils.ome_utils as vlu | |
| 19 from bl.vl.utils import LOG_LEVELS, get_logger | |
| 20 | |
| 21 PLINK_MISSING = -9 | |
| 22 PLINK_UNAFFECTED = 1 | |
| 23 PLINK_AFFECTED = 2 | |
| 24 | |
| 25 FIELDS = ["fam_label", "ind_label", "fat_label", "mot_label", "gender"] | |
| 26 | |
| 27 | |
| 28 def load_config(config_file): | |
| 29 with open(config_file) as cfg: | |
| 30 conf = yaml.load(cfg) | |
| 31 return conf | |
| 32 | |
| 33 | |
| 34 class Diagnosis(object): | |
| 35 def __init__(self, logger, yaml_file): | |
| 36 self.logger = logger | |
| 37 if os.path.isfile(yaml_file): | |
| 38 self.conf = load_config(yaml_file) | |
| 39 else: | |
| 40 self.logger.critical('The config file {} does not exists'.format( | |
| 41 yaml_file)) | |
| 42 sys.exit() | |
| 43 | |
| 44 def get_openehr_data(self): | |
| 45 return self.conf['openEHR']['archetype'], self.conf['openEHR']['field'] | |
| 46 | |
| 47 def get_diagnosis_label(self): | |
| 48 return [l for l in self.get_diagnosis().iterkeys()] | |
| 49 | |
| 50 def get_diagnosis(self): | |
| 51 results = collections.OrderedDict() | |
| 52 diagnosis = collections.OrderedDict(sorted( | |
| 53 self.conf['diagnosis'].items())) | |
| 54 for v in diagnosis.itervalues(): | |
| 55 results[v['label']] = v['values'] | |
| 56 return results | |
| 57 | |
| 58 def get_unaffected_diagnosis_dictionary(self): | |
| 59 labels = self.get_diagnosis_label() | |
| 60 d = {} | |
| 61 for k in labels: | |
| 62 d[k] = 1 | |
| 63 return d | |
| 64 | |
| 65 | |
| 66 def make_parser(): | |
| 67 parser = argparse.ArgumentParser( | |
| 68 description='build the first columns of a ped file from VL') | |
| 69 parser.add_argument('--logfile', type=str, help='log file (default=stderr)') | |
| 70 parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS, | |
| 71 help='logging level', default='INFO') | |
| 72 parser.add_argument('--configfile', type=str, default=os.path.join( | |
| 73 os.path.dirname(os.path.realpath(__file__)), 'build_miniped.yaml'), | |
| 74 help='config file (yaml) with diagnosis dictionary') | |
| 75 parser.add_argument('-H', '--host', type=str, help='omero hostname') | |
| 76 parser.add_argument('-U', '--user', type=str, help='omero user') | |
| 77 parser.add_argument('-P', '--passwd', type=str, help='omero password') | |
| 78 parser.add_argument('-S', '--study', type=str, required=True, | |
| 79 help="a list of comma separated studies used to " | |
| 80 "retrieve individuals that will be written to " | |
| 81 "ped file") | |
| 82 parser.add_argument('--ofile', type=str, help='output file path', | |
| 83 required=True) | |
| 84 parser.add_argument('--write_header', action='store_true', default=False, | |
| 85 help='Write header into the output file') | |
| 86 return parser | |
| 87 | |
| 88 | |
| 89 def build_families(individuals, logger): | |
| 90 # Individuals with only one parent will be considered like founders | |
| 91 # for i in individuals: | |
| 92 # if ((i.mother is None) or (i.father is None)): | |
| 93 # i.mother = None | |
| 94 # i.father = None | |
| 95 logger.info("individuals: %d" % len(individuals)) | |
| 96 # logger.info("individuals: with 0 or 2 parents: %d" % len(not_one_parent)) | |
| 97 logger.info("analyzing pedigree") | |
| 98 founders, non_founders, dangling, couples, children = ped.analyze( | |
| 99 individuals | |
| 100 ) | |
| 101 logger.info("splitting into families") | |
| 102 return ped.split_disjoint(individuals, children) | |
| 103 | |
| 104 | |
| 105 def main(argv): | |
| 106 parser = make_parser() | |
| 107 args = parser.parse_args(argv) | |
| 108 | |
| 109 logger = get_logger('build_miniped', level=args.loglevel, | |
| 110 filename=args.logfile) | |
| 111 | |
| 112 dobj = Diagnosis(logger, args.configfile) | |
| 113 logger.debug("l {}".format(dobj.get_diagnosis_label())) | |
| 114 | |
| 115 try: | |
| 116 host = args.host or vlu.ome_host() | |
| 117 user = args.user or vlu.ome_user() | |
| 118 passwd = args.passwd or vlu.ome_passwd() | |
| 119 except ValueError, ve: | |
| 120 logger.critical(ve) | |
| 121 sys.exit(ve) | |
| 122 | |
| 123 kb = KB(driver='omero')(host, user, passwd) | |
| 124 logger.debug('Loading all individuals from omero') | |
| 125 all_inds = kb.get_objects(kb.Individual) # store all inds to cache | |
| 126 logger.debug('%d individuals loaded' % len(all_inds)) | |
| 127 studies = [kb.get_study(s) for s in args.study.split(',')] | |
| 128 # Removing None values | |
| 129 studies = set(studies) | |
| 130 try: | |
| 131 studies.remove(None) | |
| 132 except KeyError: | |
| 133 pass | |
| 134 studies = list(studies) | |
| 135 # Sorting studies | |
| 136 studies = sorted(studies, key=lambda k: k.label.lower()) | |
| 137 if len(studies) == 0: | |
| 138 logger.error( | |
| 139 'No matches found for labels %s, stopping program' % args.study) | |
| 140 sys.exit(2) | |
| 141 enrolled_map = {} | |
| 142 for study in studies: | |
| 143 logger.info('Loading enrolled individuals for study %s' % study.label) | |
| 144 enrolled = kb.get_enrolled(study) | |
| 145 logger.debug('%d individuals loaded' % len(enrolled)) | |
| 146 for en in enrolled: | |
| 147 if en.individual.id not in enrolled_map: | |
| 148 enrolled_map[en.individual.id] = ( | |
| 149 '%s:%s' % (en.study.label, en.studyCode), | |
| 150 en.individual) | |
| 151 else: | |
| 152 logger.debug('Individual %s already mapped' % en.individual.id) | |
| 153 logger.debug('Loading EHR records') | |
| 154 ehr_records = kb.get_ehr_records() | |
| 155 logger.debug('%s EHR records loaded' % len(ehr_records)) | |
| 156 ehr_records_map = {} | |
| 157 for r in ehr_records: | |
| 158 ehr_records_map.setdefault(r['i_id'], []).append(r) | |
| 159 affection_map = {} | |
| 160 arch, field = dobj.get_openehr_data() | |
| 161 for ind_id, ehr_recs in ehr_records_map.iteritems(): | |
| 162 affection_map[ind_id] = dobj.get_unaffected_diagnosis_dictionary() | |
| 163 ehr = EHR(ehr_recs) | |
| 164 for k, v in dobj.get_diagnosis().iteritems(): | |
| 165 for d in v: | |
| 166 if ehr.matches(arch, field, d): | |
| 167 affection_map[ind_id][k] = PLINK_AFFECTED | |
| 168 | |
| 169 immuno_inds = [i for (ind_id, (st_code, i)) in enrolled_map.iteritems()] | |
| 170 families = build_families(immuno_inds, logger) | |
| 171 logger.info("found %d families" % len(families)) | |
| 172 | |
| 173 def resolve_label(i): | |
| 174 try: | |
| 175 return enrolled_map[i.id][0] | |
| 176 except KeyError: | |
| 177 return i.id | |
| 178 | |
| 179 def resolve_pheno(i): | |
| 180 try: | |
| 181 immuno_affection = affection_map[i.id] | |
| 182 except KeyError: | |
| 183 return [(d, PLINK_MISSING) for d in dobj.get_diagnosis_label()] | |
| 184 return [(d, immuno_affection[d]) for d in dobj.get_diagnosis_label()] | |
| 185 | |
| 186 kb.Gender.map_enums_values(kb) | |
| 187 gender_map = lambda x: 2 if x.enum_label() == kb.Gender.FEMALE.enum_label() \ | |
| 188 else 1 | |
| 189 | |
| 190 for d in dobj.get_diagnosis_label(): | |
| 191 FIELDS.append("_".join([d, "status"])) | |
| 192 with open(args.ofile, "w") as f: | |
| 193 writer = csv.DictWriter(f, FIELDS, delimiter="\t", lineterminator="\n") | |
| 194 if args.write_header: | |
| 195 writer.writeheader() | |
| 196 families_data = [] | |
| 197 logger.info("building families data") | |
| 198 for k, fam in enumerate(families): | |
| 199 fam_label = "FAM_%d" % (k + 1) | |
| 200 for i in fam: | |
| 201 r = {"fam_label": fam_label, | |
| 202 "ind_label": resolve_label(i), | |
| 203 "fat_label": 0 if (i.father is None or i.father not in fam) | |
| 204 else resolve_label(i.father), | |
| 205 "mot_label": 0 if (i.mother is None or i.mother not in fam) | |
| 206 else resolve_label(i.mother), | |
| 207 "gender": gender_map(i.gender)} | |
| 208 for p in resolve_pheno(i): | |
| 209 r["_".join([p[0], "status"])] = p[1] | |
| 210 families_data.append(r) | |
| 211 logger.info("writing miniped") | |
| 212 writer.writerows(families_data) | |
| 213 | |
| 214 | |
| 215 if __name__ == "__main__": | |
| 216 main(sys.argv[1:]) |
