view galaxy-tools/biobank/utils/prepare_individuals_import.py @ 12:46f08bb8dd68 draft default tip

Uploaded
author ric
date Wed, 28 Sep 2016 04:59:02 -0400
parents 43be74e62bfe
children
line wrap: on
line source

"""
Split a file like::

  individual    gender  father       mother
  ASTUDY:2141   MALE    ASTUDY:12    ASTUDY:12341
  ASTUDY:415    MALE    ASTUDY:3562  ASTUDY:13612

into two separate TSV files: the first one will be used to import new individuals and enrollments,
the second one will be used to update father and mother information for the individuals in the first
file.
"""

import sys, argparse, csv
from bl.vl.utils import LOG_LEVELS, get_logger


def get_parser():
    """Build the command-line parser for this script.

    The parser accepts ``--logfile`` and ``--loglevel`` (optional) plus the
    required ``--in-file``, ``--out-individuals`` and ``--out-parents`` paths.

    :return: the configured ``argparse.ArgumentParser``
    """
    # The sentence must be passed as ``description``: the first positional
    # argument of ArgumentParser is ``prog``, so passing it positionally would
    # replace the program name in usage and error messages with this text.
    parser = argparse.ArgumentParser(
        description='Prepare input files for individuals import workflow')
    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--out-individuals', type=str, required=True,
                        help='input file with Individuals definitions')
    parser.add_argument('--out-parents', type=str, required=True,
                        help='input file with parents definitions')
    return parser


def get_individual_definitions(records, logger):
    """Build the rows for the individuals import file.

    Every record must provide a ``gender`` key and an ``individual`` key whose
    value has the form ``STUDY:LABEL`` (exactly one colon). The ``father`` and
    ``mother`` fields of each produced row are always the string ``'None'``.

    Records missing a required key, or whose ``individual`` value cannot be
    split into exactly a study and a label, are logged as errors and skipped.

    :param records: iterable of dict-like rows
    :param logger: object providing ``info`` and ``error`` logging methods
    :return: list of dicts with keys ``study``, ``label``, ``gender``,
             ``father`` and ``mother``
    """
    logger.info('Creating individual definitions')
    ind_defs = []
    for rec in records:
        # Only the key lookups can raise KeyError, so keep them alone in the try.
        try:
            gender = rec['gender']
            individual = rec['individual']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        try:
            study, label = individual.split(':')
        except (ValueError, AttributeError):
            # ValueError: not exactly one ':' in the value.
            # AttributeError: the value is not a string, e.g. the None that
            # csv.DictReader stores for fields missing from a short row.
            logger.error('Skipped record %r, wrong label format for %s', rec, individual)
            continue
        ind_defs.append({
            'study': study,
            'label': label,
            'gender': gender,
            'father': 'None',
            'mother': 'None',
        })
    logger.info('Retrieved %d individual definitions', len(ind_defs))
    return ind_defs


def get_parents_definitions(records, logger):
    """Build the rows for the parents update file.

    A row is produced for every record in which at least one of ``father``
    and ``mother`` differs from the string ``'None'``. Records where both are
    ``'None'`` are silently skipped; records missing the ``individual``,
    ``father`` or ``mother`` key are logged as errors and skipped.

    :param records: iterable of dict-like rows
    :param logger: object providing ``info`` and ``error`` logging methods
    :return: list of dicts with keys ``individual``, ``father`` and ``mother``
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        # All three keys are required for any record that is not skipped, so
        # read them up front; only these lookups can raise KeyError.
        try:
            individual = rec['individual']
            father = rec['father']
            mother = rec['mother']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        if father == 'None' and mother == 'None':
            # No parent information to update for this individual.
            continue
        parents_defs.append({
            'individual': individual,
            'father': father,
            'mother': mother,
        })
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs


def main(argv):
    """Split the input TSV into the individuals file and the parents file.

    Parses ``argv``, loads every row of ``--in-file`` as a dict, then writes
    the individual definitions to ``--out-individuals`` and the parents
    definitions to ``--out-parents``, both as tab-separated files with a
    header row.

    :param argv: list of command-line arguments, without the program name
    """
    args = get_parser().parse_args(argv)

    logger = get_logger('prepare_individuals_import', level=args.loglevel,
                        filename=args.logfile)

    def write_tsv(path, fieldnames, rows):
        # Write a header line followed by one tab-separated line per row.
        with open(path, 'w') as out_file:
            writer = csv.DictWriter(out_file, fieldnames, delimiter='\t')
            writer.writeheader()
            writer.writerows(rows)

    logger.info('Start processing file %s', args.in_file)

    with open(args.in_file) as in_file:
        records = list(csv.DictReader(in_file, delimiter='\t'))
        logger.info('Loaded %d records', len(records))

    # The definitions are computed before the corresponding output is opened.
    write_tsv(args.out_individuals,
              ['study', 'label', 'gender', 'father', 'mother'],
              get_individual_definitions(records, logger))

    write_tsv(args.out_parents,
              ['individual', 'father', 'mother'],
              get_parents_definitions(records, logger))

    logger.info('Job completed')

# Script entry point: pass the command-line arguments (without the program
# name) to main() when this file is executed directly.
if __name__ == '__main__':
    main(sys.argv[1:])