Mercurial > repos > ric > test2
comparison galaxy-tools/biobank/tools/check_update_parents_data.py @ 0:ba6cf6ede027 draft default tip
Uploaded
| author | ric |
|---|---|
| date | Wed, 28 Sep 2016 06:03:30 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:ba6cf6ede027 |
|---|---|
| 1 import sys, csv, argparse, logging, os | |
| 2 from collections import Counter | |
| 3 | |
| 4 from bl.vl.kb import KnowledgeBase as KB | |
| 5 import bl.vl.utils.ome_utils as vlu | |
| 6 from bl.vl.utils import get_logger, LOG_LEVELS | |
| 7 | |
| 8 | |
def make_parser():
    """Create the command-line parser for the update_parents data checker.

    The parser accepts logging options (--logfile, --loglevel), connection
    options (-H/--host, -U/--user, -P/--passwd) and the two mandatory file
    options (--in_file, --out_file).

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    cli = argparse.ArgumentParser(
        description='check data that will be passed to the update_parents tool')
    # (flags, keyword arguments) pairs; they are registered in declaration
    # order so the generated help lists the options in the same sequence.
    option_table = [
        (('--logfile',), {'type': str, 'help': 'log file (default=stderr)'}),
        (('--loglevel',), {'type': str, 'choices': LOG_LEVELS,
                           'help': 'logging level (default=INFO)',
                           'default': 'INFO'}),
        (('-H', '--host'), {'type': str, 'help': 'omero hostname'}),
        (('-U', '--user'), {'type': str, 'help': 'omero user'}),
        (('-P', '--passwd'), {'type': str, 'help': 'omero password'}),
        (('--in_file',), {'type': str, 'required': True,
                          'help': 'input file'}),
        (('--out_file',), {'type': str, 'required': True,
                           'help': 'output file'}),
    ]
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli
| 22 | |
| 23 | |
def check_row(row, individuals_map, kb, logger):
    """Validate one input record before it is handed to update_parents.

    The 'individual' value must be a key of ``individuals_map``. The 'father'
    and 'mother' values are either the literal string 'None' (no check is
    performed) or keys of ``individuals_map`` whose gender matches
    ``kb.Gender.MALE`` and ``kb.Gender.FEMALE`` respectively.

    Args:
        row: mapping with the 'individual', 'father' and 'mother' keys.
        individuals_map: mapping from individual ID to individual object.
        kb: knowledge base object; only ``kb.Gender`` is read, and only when
            a parent value is not 'None'.
        logger: logger used to report every step and every rejection.

    Returns:
        bool: True if the row passed all checks, False if it was rejected.
        Rejections are logged, never raised. Note that a KeyError caused by
        a missing column in ``row`` is reported the same way as an unknown
        individual ID.
    """
    logger.debug('Checking record %r' % row)
    try:
        ind = individuals_map[row['individual']]
        logger.info('%s is a valid Individual ID' % ind.id)
        # Father is checked before mother; the expected gender is looked up
        # by name so kb.Gender is only touched when a parent is present.
        for column, gender_name in (('father', 'MALE'), ('mother', 'FEMALE')):
            if row[column] == 'None':
                logger.info('None value, no check required')
                continue
            parent = individuals_map[row[column]]
            logger.info('%s is a valid Individual ID' % parent.id)
            check_gender(parent, getattr(kb.Gender, gender_name))
            logger.info('Gender check passed')
        return True
    except KeyError as ke:
        logger.error('%s is not a valid Individual ID, rejecting row' % ke)
        return False
    except ValueError as ve:
        # Raised by check_gender on a gender mismatch.
        logger.error(ve)
        return False
| 50 | |
| 51 | |
def check_gender(individual, gender):
    """Ensure that ``individual`` has the expected ``gender``.

    The comparison is made on the values returned by ``enum_label()``.

    Args:
        individual: object exposing ``id`` and ``gender`` attributes.
        gender: the expected gender; must expose ``enum_label()``.

    Raises:
        ValueError: if the two gender labels differ.
    """
    # Each label is computed once and reused for the test and the message.
    actual = individual.gender.enum_label()
    expected = gender.enum_label()
    if actual != expected:
        raise ValueError(
            'Gender for individual %s is %s, expected %s, rejecting row'
            % (individual.id, actual, expected))
| 59 | |
| 60 | |
def main(argv):
    """Filter a tab-separated parents file down to the rows that pass checks.

    Steps:
      1. Parse ``argv`` and set up the logger.
      2. Resolve host, user and password from the arguments, falling back to
         the ``vlu.ome_*`` helpers; if that raises ValueError the error is
         logged and the program exits.
      3. Load every Individual from the knowledge base and index them by ID.
      4. Read the input file, discard every record whose 'individual' value
         appears more than once, and write the records accepted by
         ``check_row`` to the output file, in input order.

    Args:
        argv: list of command-line arguments, without the program name.
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    logger = get_logger('check_update_parents_data', level=args.loglevel,
                        filename=args.logfile)

    try:
        host = args.host or vlu.ome_host()
        user = args.user or vlu.ome_user()
        passwd = args.passwd or vlu.ome_passwd()
    except ValueError as ve:
        logger.critical(ve)
        sys.exit(ve)

    kb = KB(driver='omero')(host, user, passwd)

    logger.info('Preloading all individuals from the system')
    inds = kb.get_objects(kb.Individual)
    logger.info('%d individuals loaded' % len(inds))
    inds_lookup = dict((i.id, i) for i in inds)

    with open(args.in_file) as infile, open(args.out_file, 'w') as outfile:
        reader = csv.DictReader(infile, delimiter='\t')
        records = list(reader)

        logger.info("Check for duplicated in 'individual' column")
        recs_by_ind = {}
        for rec in records:
            recs_by_ind.setdefault(rec['individual'], []).append(rec)
        # An individual listed more than once is ambiguous: all of its
        # records are dropped, not just the extra ones.
        duplicated = set()
        for ind_id, recs in recs_by_ind.items():
            if len(recs) > 1:
                duplicated.add(ind_id)
                logger.info('Individual %s is a duplicated' % ind_id)
                for r in recs:
                    logger.info('Removing record %r' % r)
        # Filtering the original list keeps the input order and avoids the
        # quadratic cost of concatenating lists with sum().
        good_records = [rec for rec in records
                        if rec['individual'] not in duplicated]
        logger.info('Duplicated check completed')

        writer = csv.DictWriter(outfile, reader.fieldnames, delimiter='\t')
        writer.writeheader()
        logger.info('Checking records')
        for row in good_records:
            if check_row(row, inds_lookup, kb, logger):
                writer.writerow(row)
                logger.debug('Record %r written in output file' % row)
        logger.info('Records check completed')
| 111 | |
# Script entry point: run the checker on the command-line arguments, leaving
# out the program name.
if __name__ == '__main__':
    main(sys.argv[1:])
