Mercurial > repos > ric > test1
view galaxy-tools/biobank/utils/prepare_individuals_import.py @ 12:46f08bb8dd68 draft default tip
Uploaded
author | ric |
---|---|
date | Wed, 28 Sep 2016 04:59:02 -0400 |
parents | 43be74e62bfe |
children |
line wrap: on
line source
"""
Split a file like::

  individual    gender  father       mother
  ASTUDY:2141   MALE    ASTUDY:12    ASTUDY:12341
  ASTUDY:415    MALE    ASTUDY:3562  ASTUDY:13612

into two separate TSV files: the first one is used to import new
individuals and enrollments, the second one is used to update the father
and mother information for the individuals listed in the first file.
"""

import argparse
import csv
import sys

from bl.vl.utils import LOG_LEVELS, get_logger


def get_parser():
    """Build and return the command line parser for this tool."""
    # The text is passed as ``description``: the first positional parameter
    # of ArgumentParser is ``prog``, which would put the whole sentence in
    # the usage line in place of the program name.
    parser = argparse.ArgumentParser(
        description='Prepare input files for individuals import workflow')
    parser.add_argument('--logfile', type=str,
                        help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--out-individuals', type=str, required=True,
                        help='input file with Individuals definitions')
    parser.add_argument('--out-parents', type=str, required=True,
                        help='input file with parents definitions')
    return parser


def get_individual_definitions(records, logger):
    """Build the rows of the individuals file.

    Each record must provide the 'gender' and 'individual' keys, the latter
    formatted as ``STUDY:LABEL``. Records missing one of the keys, or whose
    'individual' value does not split into exactly two parts on ':', are
    logged as errors and skipped.

    Returns a list of dicts with the keys 'study', 'label', 'gender',
    'father' and 'mother'; both parents are always the string 'None'.
    """
    logger.info('Creating individual definitions')
    ind_defs = []
    for rec in records:
        try:
            gender = rec['gender']
            individual = rec['individual']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        try:
            study, label = individual.split(':')
        except (ValueError, AttributeError):
            # ValueError: not exactly one ':' separator in the label.
            # AttributeError: the value is None, which is what
            # csv.DictReader yields for columns missing from a short row.
            logger.error('Skipped record %r, wrong label format for %s',
                         rec, individual)
            continue
        ind_defs.append({
            'study': study,
            'label': label,
            'gender': gender,
            'father': 'None',
            'mother': 'None',
        })
    logger.info('Retrieved %d individual definitions', len(ind_defs))
    return ind_defs


def get_parents_definitions(records, logger):
    """Build the rows of the parents file.

    Only records where at least one of 'father' and 'mother' differs from
    the string 'None' produce a row. Records missing the 'individual',
    'father' or 'mother' key are logged as errors and skipped.

    Returns a list of dicts with the keys 'individual', 'father' and
    'mother', copied verbatim from the input records.
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        try:
            individual = rec['individual']
            father = rec['father']
            mother = rec['mother']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        if father == 'None' and mother == 'None':
            # No parent to update for this individual.
            continue
        parents_defs.append({
            'individual': individual,
            'father': father,
            'mother': mother,
        })
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs


def main(argv):
    """Read the input TSV and write the individuals and parents TSV files.

    argv is the list of command line arguments, without the program name.
    """
    parser = get_parser()
    args = parser.parse_args(argv)

    logger = get_logger('prepare_individuals_import', level=args.loglevel,
                        filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)

    # Load every row up front: the records are scanned twice, once for each
    # output file.
    with open(args.in_file) as in_file:
        reader = csv.DictReader(in_file, delimiter='\t')
        records = list(reader)
    logger.info('Loaded %d records', len(records))

    individual_defs = get_individual_definitions(records, logger)
    with open(args.out_individuals, 'w') as inds_out:
        inds_writer = csv.DictWriter(
            inds_out, ['study', 'label', 'gender', 'father', 'mother'],
            delimiter='\t')
        inds_writer.writeheader()
        inds_writer.writerows(individual_defs)

    parents_defs = get_parents_definitions(records, logger)
    with open(args.out_parents, 'w') as parents_out:
        parents_writer = csv.DictWriter(
            parents_out, ['individual', 'father', 'mother'],
            delimiter='\t')
        parents_writer.writeheader()
        parents_writer.writerows(parents_defs)

    logger.info('Job completed')


if __name__ == '__main__':
    main(sys.argv[1:])