Mercurial > repos > ric > test1
view galaxy-tools/biobank/utils/prepare_enrollments_import.py @ 3:43be74e62bfe draft
Uploaded
author | ric |
---|---|
date | Thu, 22 Sep 2016 08:57:04 -0400 |
parents | |
children |
line wrap: on
line source
"""
Prepare the input file for the enrollments import workflow.

Split a TSV file like::

  source                              enrollment
  V044DE795E7F9F42FEB9855288CF577A77  ASTUDY:2141
  V06C59B915C0FD47DABE6AE02C731780AF  BSTUDY:390

into a new TSV file where the enrollment code is separated into its
study and label parts::

  source                              study   label
  V044DE795E7F9F42FEB9855288CF577A77  ASTUDY  2141
  V06C59B915C0FD47DABE6AE02C731780AF  BSTUDY  390
"""

import sys
import argparse
import csv

from bl.vl.utils import LOG_LEVELS, get_logger


def get_parser():
    """Build the command line parser for this tool."""
    # The text must be passed as ``description``: the first positional
    # argument of ArgumentParser is ``prog`` (the program name shown in
    # the usage line), not the description.
    parser = argparse.ArgumentParser(
        description='Prepare input files for enrollments import workflow')
    parser.add_argument('--logfile', type=str,
                        help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--out-enrollments', type=str, required=True,
                        help='output file with Enrollments definitions')
    return parser


def get_enrollments_definitions(records, logger):
    """
    Build the enrollment definitions from the input records.

    Each record is a mapping that must provide the ``source`` and
    ``enrollment`` keys; the enrollment value must look like
    ``STUDY:LABEL`` (exactly one ``:``).  Records that do not satisfy
    these requirements are logged as errors and skipped.

    Return a list of dictionaries with ``source``, ``study`` and
    ``label`` keys, in the same order as the accepted input records.
    """
    logger.info('Creating enrollment definitions')
    enr_defs = []
    for rec in records:
        try:
            source = rec['source']
            enrollment = rec['enrollment']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        try:
            study, label = enrollment.split(':')
        except (ValueError, AttributeError):
            # ValueError: the value does not split in exactly two parts.
            # AttributeError: csv.DictReader fills the missing columns of
            # a short row with None, which has no split() method.
            logger.error('Skipped record %r, wrong label format for %s',
                         rec, enrollment)
            continue
        enr_defs.append({'source': source, 'study': study, 'label': label})
    logger.info('Retrieved %d enrollment definitions', len(enr_defs))
    return enr_defs


def get_parents_definitions(records, logger):
    """
    Build the parents definitions from the input records.

    Each record is a mapping that must provide the ``individual``,
    ``father`` and ``mother`` keys; records missing one of them are
    logged as errors and skipped.  Records where both ``father`` and
    ``mother`` are the string ``'None'`` are silently ignored.

    Return a list of dictionaries with ``individual``, ``father`` and
    ``mother`` keys.
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        try:
            pdef = {
                'individual': rec['individual'],
                'father': rec['father'],
                'mother': rec['mother'],
            }
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        # Keep only the individuals with at least one known parent.
        if pdef['father'] == 'None' and pdef['mother'] == 'None':
            continue
        parents_defs.append(pdef)
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs


def main(argv):
    """
    Read the input TSV file and write the enrollments definitions file.

    ``argv`` is the list of command line arguments, without the program
    name.
    """
    parser = get_parser()
    args = parser.parse_args(argv)

    logger = get_logger('prepare_enrollments_import', level=args.loglevel,
                        filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)

    with open(args.in_file) as in_file:
        reader = csv.DictReader(in_file, delimiter='\t')
        records = list(reader)
    logger.info('Loaded %d records', len(records))

    enrollment_defs = get_enrollments_definitions(records, logger)
    with open(args.out_enrollments, 'w') as enr_out:
        enr_writer = csv.DictWriter(enr_out, ['source', 'study', 'label'],
                                    delimiter='\t')
        enr_writer.writeheader()
        enr_writer.writerows(enrollment_defs)

    logger.info('Job completed')


if __name__ == '__main__':
    main(sys.argv[1:])