Mercurial > repos > ric > test2
diff galaxy-tools/biobank/utils/prepare_enrollments_import.py @ 0:ba6cf6ede027 draft default tip
Uploaded
| author | ric |
|---|---|
| date | Wed, 28 Sep 2016 06:03:30 -0400 |
| parents | |
| children |
line wrap: on
line diff
"""
Split a TAB-separated file like::

  source                              enrollment
  V044DE795E7F9F42FEB9855288CF577A77  ASTUDY:2141
  V06C59B915C0FD47DABE6AE02C731780AF  BSTUDY:390

into a new TAB-separated file where the enrollment code is split into
separate study and label columns::

  source                              study   label
  V044DE795E7F9F42FEB9855288CF577A77  ASTUDY  2141
  V06C59B915C0FD47DABE6AE02C731780AF  BSTUDY  390

"""

import argparse
import csv
import sys

from bl.vl.utils import LOG_LEVELS, get_logger


def get_parser():
    """Build and return the command line parser for this tool."""
    # The text must be passed as ``description``: the first positional
    # argument of ArgumentParser is ``prog`` and would replace the program
    # name in the usage line.
    parser = argparse.ArgumentParser(
        description='Prepare input files for enrollments import workflow')
    parser.add_argument('--logfile', type=str,
                        help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--out-enrollments', type=str, required=True,
                        help='output file with Enrollments definitions')
    return parser


def get_enrollments_definitions(records, logger):
    """Turn input records into enrollment definitions.

    Each record must provide a ``source`` key and an ``enrollment`` key whose
    value has the form ``STUDY:LABEL``. Records missing one of the keys, or
    whose enrollment value does not split into exactly two ``:``-separated
    parts, are logged as errors and skipped.

    :param records: iterable of dict-like rows
    :param logger: logger used for progress and error messages
    :return: list of dicts with the keys ``source``, ``study`` and ``label``
    """
    logger.info('Creating enrollment definitions')
    enr_defs = []
    for rec in records:
        try:
            source = rec['source']
            enrollment = rec['enrollment']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        try:
            study, label = enrollment.split(':')
        except (ValueError, AttributeError):
            # ValueError: not exactly one ':' in the value.
            # AttributeError: value is None, which csv.DictReader produces
            # for rows that are shorter than the header.
            logger.error('Skipped record %r, wrong label format for %s',
                         rec, enrollment)
            continue
        enr_defs.append({'source': source, 'study': study, 'label': label})
    logger.info('Retrieved %d enrollment definitions', len(enr_defs))
    return enr_defs


def get_parents_definitions(records, logger):
    """Turn input records into parents definitions.

    Each record must provide the ``individual``, ``father`` and ``mother``
    keys. Records where both ``father`` and ``mother`` are the string
    ``'None'`` are silently dropped; records missing one of the keys are
    logged as errors and skipped.

    Not called by :func:`main` in this file.

    :param records: iterable of dict-like rows
    :param logger: logger used for progress and error messages
    :return: list of dicts with the keys ``individual``, ``father`` and
        ``mother``
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        try:
            individual = rec['individual']
            father = rec['father']
            mother = rec['mother']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        if father == 'None' and mother == 'None':
            # Nothing to record when neither parent is known.
            continue
        parents_defs.append({
            'individual': individual,
            'father': father,
            'mother': mother,
        })
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs


def main(argv):
    """Read the input TSV and write the enrollments definitions TSV.

    :param argv: command line arguments, without the program name
    """
    parser = get_parser()
    args = parser.parse_args(argv)

    logger = get_logger('prepare_enrollments_import', level=args.loglevel,
                        filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)

    with open(args.in_file) as in_file:
        reader = csv.DictReader(in_file, delimiter='\t')
        records = list(reader)
    logger.info('Loaded %d records', len(records))

    enrollment_defs = get_enrollments_definitions(records, logger)
    with open(args.out_enrollments, 'w') as enr_out:
        enr_writer = csv.DictWriter(enr_out,
                                    ['source', 'study', 'label'],
                                    delimiter='\t')
        enr_writer.writeheader()
        enr_writer.writerows(enrollment_defs)

    logger.info('Job completed')


if __name__ == '__main__':
    main(sys.argv[1:])
