view galaxy-tools/biobank/utils/prepare_enrollments_import.py @ 3:43be74e62bfe draft

Uploaded
author ric
date Thu, 22 Sep 2016 08:57:04 -0400
parents
children
line wrap: on
line source

"""
Split a file like::

source                              enrollment
V044DE795E7F9F42FEB9855288CF577A77  ASTUDY:2141
V06C59B915C0FD47DABE6AE02C731780AF  BSTUDY:390

into a new TSV file with separate study and label columns::

source                              study  label
V044DE795E7F9F42FEB9855288CF577A77  ASTUDY 2141
V06C59B915C0FD47DABE6AE02C731780AF  BSTUDY 390

"""

import sys, argparse, csv
from bl.vl.utils import LOG_LEVELS, get_logger


def get_parser():
    """Build the command-line parser for this script.

    Options:
      --logfile          optional log file path
      --loglevel         one of LOG_LEVELS (default 'INFO')
      --in-file          required, input TSV file
      --out-enrollments  required, output file for enrollment definitions

    Returns the configured argparse.ArgumentParser.
    """
    # The text must be passed as ``description``: the first positional
    # argument of ArgumentParser is ``prog``, which would make the whole
    # sentence appear as the program name in the usage line.
    parser = argparse.ArgumentParser(
        description='Prepare input files for enrollments import workflow')
    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--out-enrollments', type=str, required=True,
                        help='output file with Enrollments definitions')
    return parser


def get_enrollments_definitions(records, logger):
    """Turn input records into enrollment definitions.

    Each record is a mapping that must provide the keys 'source' and
    'enrollment'; the enrollment value is expected in the form
    'STUDY:LABEL' with exactly one ':' separator.

    Records with a missing key or a malformed enrollment value are logged
    at error level and skipped.

    Returns a list of dicts with the keys 'source', 'study' and 'label'.
    """
    logger.info('Creating enrollment definitions')
    enr_defs = []
    for rec in records:
        try:
            source = rec['source']
            enrollment = rec['enrollment']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        try:
            study, label = enrollment.split(':')
        except (ValueError, AttributeError):
            # ValueError: not exactly one ':' in the value.
            # AttributeError: the value is not a string, e.g. the None that
            # csv.DictReader assigns to fields missing from a short row.
            logger.error('Skipped record %r, wrong label format for %s', rec, enrollment)
            continue
        enr_defs.append({'source': source, 'study': study, 'label': label})
    logger.info('Retrieved %d enrollment definitions', len(enr_defs))
    return enr_defs


def get_parents_definitions(records, logger):
    """Turn input records into parents definitions.

    Each record is a mapping that must provide the keys 'individual',
    'father' and 'mother'. Records where both 'father' and 'mother' hold
    the string 'None' are silently dropped; records with a missing key are
    logged at error level and skipped.

    Returns a list of dicts with the keys 'individual', 'father' and
    'mother'.
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        try:
            individual = rec['individual']
            father = rec['father']
            mother = rec['mother']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        # Nothing to record when neither parent is known.
        if father == 'None' and mother == 'None':
            continue
        parents_defs.append({'individual': individual,
                             'father': father,
                             'mother': mother})
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs


def main(argv):
    """Run the script: read the input TSV and write the enrollments TSV.

    argv is the list of command-line arguments (without the program name).
    The input file is read with csv.DictReader using a tab delimiter; the
    enrollment definitions built from it are written, preceded by a header
    row, to the file given by --out-enrollments with the columns
    'source', 'study' and 'label'.
    """
    args = get_parser().parse_args(argv)

    logger = get_logger('prepare_enrollments_import',
                        level=args.loglevel,
                        filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)

    with open(args.in_file) as tsv_in:
        records = list(csv.DictReader(tsv_in, delimiter='\t'))
        logger.info('Loaded %d records', len(records))

    enrollment_defs = get_enrollments_definitions(records, logger)

    out_columns = ['source', 'study', 'label']
    with open(args.out_enrollments, 'w') as tsv_out:
        writer = csv.DictWriter(tsv_out, out_columns, delimiter='\t')
        writer.writeheader()
        writer.writerows(enrollment_defs)

    logger.info('Job completed')

# Script entry point: run main() with the command-line arguments, leaving
# out the program name held in sys.argv[0].
if __name__ == '__main__':
    main(sys.argv[1:])