comparison galaxy-tools/biobank/tools/build_miniped.py @ 0:ba6cf6ede027 draft default tip

Uploaded
author ric
date Wed, 28 Sep 2016 06:03:30 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ba6cf6ede027
1 # BEGIN_COPYRIGHT
2 # END_COPYRIGHT
3
4 """
5 A rough example of basic pedigree info generation.
6 """
7
8 import argparse
9 import collections
10 import csv
11 import os
12 import sys
13 import yaml
14
15 from bl.vl.kb import KnowledgeBase as KB
16 from bl.vl.kb.drivers.omero.ehr import EHR
17 import bl.vl.individual.pedigree as ped
18 import bl.vl.utils.ome_utils as vlu
19 from bl.vl.utils import LOG_LEVELS, get_logger
20
21 PLINK_MISSING = -9
22 PLINK_UNAFFECTED = 1
23 PLINK_AFFECTED = 2
24
25 FIELDS = ["fam_label", "ind_label", "fat_label", "mot_label", "gender"]
26
27
28 def load_config(config_file):
29 with open(config_file) as cfg:
30 conf = yaml.load(cfg)
31 return conf
32
33
34 class Diagnosis(object):
35 def __init__(self, logger, yaml_file):
36 self.logger = logger
37 if os.path.isfile(yaml_file):
38 self.conf = load_config(yaml_file)
39 else:
40 self.logger.critical('The config file {} does not exists'.format(
41 yaml_file))
42 sys.exit()
43
44 def get_openehr_data(self):
45 return self.conf['openEHR']['archetype'], self.conf['openEHR']['field']
46
47 def get_diagnosis_label(self):
48 return [l for l in self.get_diagnosis().iterkeys()]
49
50 def get_diagnosis(self):
51 results = collections.OrderedDict()
52 diagnosis = collections.OrderedDict(sorted(
53 self.conf['diagnosis'].items()))
54 for v in diagnosis.itervalues():
55 results[v['label']] = v['values']
56 return results
57
58 def get_unaffected_diagnosis_dictionary(self):
59 labels = self.get_diagnosis_label()
60 d = {}
61 for k in labels:
62 d[k] = 1
63 return d
64
65
66 def make_parser():
67 parser = argparse.ArgumentParser(
68 description='build the first columns of a ped file from VL')
69 parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
70 parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
71 help='logging level', default='INFO')
72 parser.add_argument('--configfile', type=str, default=os.path.join(
73 os.path.dirname(os.path.realpath(__file__)), 'build_miniped.yaml'),
74 help='config file (yaml) with diagnosis dictionary')
75 parser.add_argument('-H', '--host', type=str, help='omero hostname')
76 parser.add_argument('-U', '--user', type=str, help='omero user')
77 parser.add_argument('-P', '--passwd', type=str, help='omero password')
78 parser.add_argument('-S', '--study', type=str, required=True,
79 help="a list of comma separated studies used to "
80 "retrieve individuals that will be written to "
81 "ped file")
82 parser.add_argument('--ofile', type=str, help='output file path',
83 required=True)
84 parser.add_argument('--write_header', action='store_true', default=False,
85 help='Write header into the output file')
86 return parser
87
88
89 def build_families(individuals, logger):
90 # Individuals with only one parent will be considered like founders
91 # for i in individuals:
92 # if ((i.mother is None) or (i.father is None)):
93 # i.mother = None
94 # i.father = None
95 logger.info("individuals: %d" % len(individuals))
96 # logger.info("individuals: with 0 or 2 parents: %d" % len(not_one_parent))
97 logger.info("analyzing pedigree")
98 founders, non_founders, dangling, couples, children = ped.analyze(
99 individuals
100 )
101 logger.info("splitting into families")
102 return ped.split_disjoint(individuals, children)
103
104
105 def main(argv):
106 parser = make_parser()
107 args = parser.parse_args(argv)
108
109 logger = get_logger('build_miniped', level=args.loglevel,
110 filename=args.logfile)
111
112 dobj = Diagnosis(logger, args.configfile)
113 logger.debug("l {}".format(dobj.get_diagnosis_label()))
114
115 try:
116 host = args.host or vlu.ome_host()
117 user = args.user or vlu.ome_user()
118 passwd = args.passwd or vlu.ome_passwd()
119 except ValueError, ve:
120 logger.critical(ve)
121 sys.exit(ve)
122
123 kb = KB(driver='omero')(host, user, passwd)
124 logger.debug('Loading all individuals from omero')
125 all_inds = kb.get_objects(kb.Individual) # store all inds to cache
126 logger.debug('%d individuals loaded' % len(all_inds))
127 studies = [kb.get_study(s) for s in args.study.split(',')]
128 # Removing None values
129 studies = set(studies)
130 try:
131 studies.remove(None)
132 except KeyError:
133 pass
134 studies = list(studies)
135 # Sorting studies
136 studies = sorted(studies, key=lambda k: k.label.lower())
137 if len(studies) == 0:
138 logger.error(
139 'No matches found for labels %s, stopping program' % args.study)
140 sys.exit(2)
141 enrolled_map = {}
142 for study in studies:
143 logger.info('Loading enrolled individuals for study %s' % study.label)
144 enrolled = kb.get_enrolled(study)
145 logger.debug('%d individuals loaded' % len(enrolled))
146 for en in enrolled:
147 if en.individual.id not in enrolled_map:
148 enrolled_map[en.individual.id] = (
149 '%s:%s' % (en.study.label, en.studyCode),
150 en.individual)
151 else:
152 logger.debug('Individual %s already mapped' % en.individual.id)
153 logger.debug('Loading EHR records')
154 ehr_records = kb.get_ehr_records()
155 logger.debug('%s EHR records loaded' % len(ehr_records))
156 ehr_records_map = {}
157 for r in ehr_records:
158 ehr_records_map.setdefault(r['i_id'], []).append(r)
159 affection_map = {}
160 arch, field = dobj.get_openehr_data()
161 for ind_id, ehr_recs in ehr_records_map.iteritems():
162 affection_map[ind_id] = dobj.get_unaffected_diagnosis_dictionary()
163 ehr = EHR(ehr_recs)
164 for k, v in dobj.get_diagnosis().iteritems():
165 for d in v:
166 if ehr.matches(arch, field, d):
167 affection_map[ind_id][k] = PLINK_AFFECTED
168
169 immuno_inds = [i for (ind_id, (st_code, i)) in enrolled_map.iteritems()]
170 families = build_families(immuno_inds, logger)
171 logger.info("found %d families" % len(families))
172
173 def resolve_label(i):
174 try:
175 return enrolled_map[i.id][0]
176 except KeyError:
177 return i.id
178
179 def resolve_pheno(i):
180 try:
181 immuno_affection = affection_map[i.id]
182 except KeyError:
183 return [(d, PLINK_MISSING) for d in dobj.get_diagnosis_label()]
184 return [(d, immuno_affection[d]) for d in dobj.get_diagnosis_label()]
185
186 kb.Gender.map_enums_values(kb)
187 gender_map = lambda x: 2 if x.enum_label() == kb.Gender.FEMALE.enum_label() \
188 else 1
189
190 for d in dobj.get_diagnosis_label():
191 FIELDS.append("_".join([d, "status"]))
192 with open(args.ofile, "w") as f:
193 writer = csv.DictWriter(f, FIELDS, delimiter="\t", lineterminator="\n")
194 if args.write_header:
195 writer.writeheader()
196 families_data = []
197 logger.info("building families data")
198 for k, fam in enumerate(families):
199 fam_label = "FAM_%d" % (k + 1)
200 for i in fam:
201 r = {"fam_label": fam_label,
202 "ind_label": resolve_label(i),
203 "fat_label": 0 if (i.father is None or i.father not in fam)
204 else resolve_label(i.father),
205 "mot_label": 0 if (i.mother is None or i.mother not in fam)
206 else resolve_label(i.mother),
207 "gender": gender_map(i.gender)}
208 for p in resolve_pheno(i):
209 r["_".join([p[0], "status"])] = p[1]
210 families_data.append(r)
211 logger.info("writing miniped")
212 writer.writerows(families_data)
213
214
215 if __name__ == "__main__":
216 main(sys.argv[1:])