annotate galaxy-tools/biobank/tools/build_miniped.py @ 0:ba6cf6ede027 draft default tip

Uploaded
author ric
date Wed, 28 Sep 2016 06:03:30 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
1 # BEGIN_COPYRIGHT
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
2 # END_COPYRIGHT
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
3
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
4 """
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
5 A rough example of basic pedigree info generation.
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
6 """
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
7
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
8 import argparse
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
9 import collections
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
10 import csv
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
11 import os
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
12 import sys
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
13 import yaml
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
14
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
15 from bl.vl.kb import KnowledgeBase as KB
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
16 from bl.vl.kb.drivers.omero.ehr import EHR
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
17 import bl.vl.individual.pedigree as ped
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
18 import bl.vl.utils.ome_utils as vlu
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
19 from bl.vl.utils import LOG_LEVELS, get_logger
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
20
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
21 PLINK_MISSING = -9
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
22 PLINK_UNAFFECTED = 1
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
23 PLINK_AFFECTED = 2
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
24
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
25 FIELDS = ["fam_label", "ind_label", "fat_label", "mot_label", "gender"]
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
26
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
27
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
28 def load_config(config_file):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
29 with open(config_file) as cfg:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
30 conf = yaml.load(cfg)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
31 return conf
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
32
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
33
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
34 class Diagnosis(object):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
35 def __init__(self, logger, yaml_file):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
36 self.logger = logger
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
37 if os.path.isfile(yaml_file):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
38 self.conf = load_config(yaml_file)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
39 else:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
40 self.logger.critical('The config file {} does not exists'.format(
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
41 yaml_file))
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
42 sys.exit()
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
43
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
44 def get_openehr_data(self):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
45 return self.conf['openEHR']['archetype'], self.conf['openEHR']['field']
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
46
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
47 def get_diagnosis_label(self):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
48 return [l for l in self.get_diagnosis().iterkeys()]
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
49
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
50 def get_diagnosis(self):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
51 results = collections.OrderedDict()
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
52 diagnosis = collections.OrderedDict(sorted(
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
53 self.conf['diagnosis'].items()))
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
54 for v in diagnosis.itervalues():
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
55 results[v['label']] = v['values']
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
56 return results
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
57
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
58 def get_unaffected_diagnosis_dictionary(self):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
59 labels = self.get_diagnosis_label()
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
60 d = {}
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
61 for k in labels:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
62 d[k] = 1
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
63 return d
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
64
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
65
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
66 def make_parser():
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
67 parser = argparse.ArgumentParser(
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
68 description='build the first columns of a ped file from VL')
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
69 parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
70 parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
71 help='logging level', default='INFO')
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
72 parser.add_argument('--configfile', type=str, default=os.path.join(
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
73 os.path.dirname(os.path.realpath(__file__)), 'build_miniped.yaml'),
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
74 help='config file (yaml) with diagnosis dictionary')
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
75 parser.add_argument('-H', '--host', type=str, help='omero hostname')
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
76 parser.add_argument('-U', '--user', type=str, help='omero user')
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
77 parser.add_argument('-P', '--passwd', type=str, help='omero password')
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
78 parser.add_argument('-S', '--study', type=str, required=True,
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
79 help="a list of comma separated studies used to "
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
80 "retrieve individuals that will be written to "
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
81 "ped file")
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
82 parser.add_argument('--ofile', type=str, help='output file path',
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
83 required=True)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
84 parser.add_argument('--write_header', action='store_true', default=False,
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
85 help='Write header into the output file')
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
86 return parser
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
87
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
88
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
89 def build_families(individuals, logger):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
90 # Individuals with only one parent will be considered like founders
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
91 # for i in individuals:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
92 # if ((i.mother is None) or (i.father is None)):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
93 # i.mother = None
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
94 # i.father = None
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
95 logger.info("individuals: %d" % len(individuals))
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
96 # logger.info("individuals: with 0 or 2 parents: %d" % len(not_one_parent))
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
97 logger.info("analyzing pedigree")
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
98 founders, non_founders, dangling, couples, children = ped.analyze(
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
99 individuals
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
100 )
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
101 logger.info("splitting into families")
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
102 return ped.split_disjoint(individuals, children)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
103
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
104
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
105 def main(argv):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
106 parser = make_parser()
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
107 args = parser.parse_args(argv)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
108
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
109 logger = get_logger('build_miniped', level=args.loglevel,
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
110 filename=args.logfile)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
111
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
112 dobj = Diagnosis(logger, args.configfile)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
113 logger.debug("l {}".format(dobj.get_diagnosis_label()))
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
114
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
115 try:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
116 host = args.host or vlu.ome_host()
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
117 user = args.user or vlu.ome_user()
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
118 passwd = args.passwd or vlu.ome_passwd()
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
119 except ValueError, ve:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
120 logger.critical(ve)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
121 sys.exit(ve)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
122
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
123 kb = KB(driver='omero')(host, user, passwd)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
124 logger.debug('Loading all individuals from omero')
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
125 all_inds = kb.get_objects(kb.Individual) # store all inds to cache
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
126 logger.debug('%d individuals loaded' % len(all_inds))
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
127 studies = [kb.get_study(s) for s in args.study.split(',')]
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
128 # Removing None values
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
129 studies = set(studies)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
130 try:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
131 studies.remove(None)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
132 except KeyError:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
133 pass
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
134 studies = list(studies)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
135 # Sorting studies
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
136 studies = sorted(studies, key=lambda k: k.label.lower())
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
137 if len(studies) == 0:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
138 logger.error(
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
139 'No matches found for labels %s, stopping program' % args.study)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
140 sys.exit(2)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
141 enrolled_map = {}
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
142 for study in studies:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
143 logger.info('Loading enrolled individuals for study %s' % study.label)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
144 enrolled = kb.get_enrolled(study)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
145 logger.debug('%d individuals loaded' % len(enrolled))
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
146 for en in enrolled:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
147 if en.individual.id not in enrolled_map:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
148 enrolled_map[en.individual.id] = (
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
149 '%s:%s' % (en.study.label, en.studyCode),
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
150 en.individual)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
151 else:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
152 logger.debug('Individual %s already mapped' % en.individual.id)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
153 logger.debug('Loading EHR records')
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
154 ehr_records = kb.get_ehr_records()
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
155 logger.debug('%s EHR records loaded' % len(ehr_records))
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
156 ehr_records_map = {}
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
157 for r in ehr_records:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
158 ehr_records_map.setdefault(r['i_id'], []).append(r)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
159 affection_map = {}
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
160 arch, field = dobj.get_openehr_data()
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
161 for ind_id, ehr_recs in ehr_records_map.iteritems():
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
162 affection_map[ind_id] = dobj.get_unaffected_diagnosis_dictionary()
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
163 ehr = EHR(ehr_recs)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
164 for k, v in dobj.get_diagnosis().iteritems():
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
165 for d in v:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
166 if ehr.matches(arch, field, d):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
167 affection_map[ind_id][k] = PLINK_AFFECTED
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
168
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
169 immuno_inds = [i for (ind_id, (st_code, i)) in enrolled_map.iteritems()]
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
170 families = build_families(immuno_inds, logger)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
171 logger.info("found %d families" % len(families))
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
172
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
173 def resolve_label(i):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
174 try:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
175 return enrolled_map[i.id][0]
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
176 except KeyError:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
177 return i.id
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
178
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
179 def resolve_pheno(i):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
180 try:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
181 immuno_affection = affection_map[i.id]
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
182 except KeyError:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
183 return [(d, PLINK_MISSING) for d in dobj.get_diagnosis_label()]
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
184 return [(d, immuno_affection[d]) for d in dobj.get_diagnosis_label()]
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
185
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
186 kb.Gender.map_enums_values(kb)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
187 gender_map = lambda x: 2 if x.enum_label() == kb.Gender.FEMALE.enum_label() \
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
188 else 1
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
189
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
190 for d in dobj.get_diagnosis_label():
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
191 FIELDS.append("_".join([d, "status"]))
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
192 with open(args.ofile, "w") as f:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
193 writer = csv.DictWriter(f, FIELDS, delimiter="\t", lineterminator="\n")
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
194 if args.write_header:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
195 writer.writeheader()
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
196 families_data = []
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
197 logger.info("building families data")
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
198 for k, fam in enumerate(families):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
199 fam_label = "FAM_%d" % (k + 1)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
200 for i in fam:
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
201 r = {"fam_label": fam_label,
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
202 "ind_label": resolve_label(i),
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
203 "fat_label": 0 if (i.father is None or i.father not in fam)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
204 else resolve_label(i.father),
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
205 "mot_label": 0 if (i.mother is None or i.mother not in fam)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
206 else resolve_label(i.mother),
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
207 "gender": gender_map(i.gender)}
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
208 for p in resolve_pheno(i):
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
209 r["_".join([p[0], "status"])] = p[1]
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
210 families_data.append(r)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
211 logger.info("writing miniped")
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
212 writer.writerows(families_data)
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
213
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
214
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
215 if __name__ == "__main__":
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
216 main(sys.argv[1:])