view galaxy-tools/biobank/utils/split_by_study.py @ 3:43be74e62bfe draft

Uploaded
author ric
date Thu, 22 Sep 2016 08:57:04 -0400
parents
children
line wrap: on
line source

"""
Split a file like::

  individual    gender  father       mother
  ASTUDY:2141   MALE    ASTUDY:12    ASTUDY:12341
  ASTUDY:415    MALE    ASTUDY:3562  ASTUDY:13612
  BSTUDY:12515  FEMALE  BSTUDY:3512  BSTUDY:124

into multiple files based on the STUDY value of the label stored in the "individual" column.
Each label in the "individual" column must have a STUDY:ENROLLMENT_CODE format, otherwise the line
will be skipped.
"""

import sys, argparse, csv, os
from bl.vl.utils import LOG_LEVELS, get_logger


def get_parser():
    """Build the command-line parser for the split-by-study tool.

    Returns:
        argparse.ArgumentParser: parser accepting ``--logfile``,
        ``--loglevel`` (default ``INFO``), ``--in-file`` (required),
        ``--record-id`` (integer) and ``--out-path`` (default ``.``).
    """
    # The text must be given as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, so passing the sentence
    # positionally would replace the program name in the usage line.
    parser = argparse.ArgumentParser(
        description='Split a file containing pedigree informations in multiple files using the study as split criteria')
    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--record-id', type=int,
                        help='Output ID record given by Galaxy')
    parser.add_argument('--out-path', type=str, help='Output directory',
                        default='.')
    return parser


def split_element(element, logger):
    """Split a ``STUDY:ENROLLMENT_CODE`` label into its two parts.

    Args:
        element: the label to split.
        logger: logger used to report malformed labels.

    Returns:
        tuple: ``(study, code)`` for a well-formed label, ``(None, None)``
        otherwise. A label is malformed when it does not contain exactly one
        ``:``, when the study or the code is empty, or when the value has no
        ``split`` method at all (e.g. ``None``).
    """
    try:
        study, code = element.split(':')
    except (ValueError, AttributeError):
        # ValueError: the label does not contain exactly one ':'.
        # AttributeError: the value is not a string (e.g. None for a column
        # missing from the row).
        study = code = None
    # An empty half (':123' or 'ASTUDY:') does not match the expected format
    # either, so it is reported exactly like any other malformed label.
    if not study or not code:
        logger.error('Label %s is not a label with format STUDY:ENROLLMENT_CODE', element)
        return None, None
    return study, code


def map_by_study(records, logger):
    """Group records by the study part of their "individual" label.

    Args:
        records: iterable of dicts, each holding the label under the
            ``'individual'`` key.
        logger: logger used for progress and skip messages.

    Returns:
        dict: maps each study name to the list of its records, in the order
        they were encountered. Records whose label cannot be split into a
        non-empty study and a non-empty code are left out.
    """
    records_map = {}
    for rec in records:
        study, code = split_element(rec['individual'], logger)
        # Skip when EITHER part is missing: testing with "and" would let a
        # half-empty label such as ':123' through and group it under the
        # empty study name.
        if not study or not code:
            logger.debug('Skipping record %r', rec)
            continue
        records_map.setdefault(study, []).append(rec)
    logger.info('Records splitted between %d studies', len(records_map))
    return records_map


def dump_records(study_label, records, header, output_path, logger, galaxy_record_id=None):
    """Write the records of one study to a tab-separated file.

    The file is created inside ``output_path``. Without a Galaxy record ID it
    is named ``<study>_individuals.tsv``; with one it is named
    ``primary_<id>_<study>_visible_tabular``, where underscores in the study
    label are replaced by dashes.

    Args:
        study_label: study name, used to build the file name.
        records: list of dicts (one per row) to write.
        header: ordered list of column names to emit. Record keys that are
            not listed here are left out of the output instead of aborting
            the dump.
        output_path: directory where the file is created.
        logger: logger used to report the dump.
        galaxy_record_id: optional integer output ID; ``None`` selects the
            plain ``<study>_individuals.tsv`` name.
    """

    def get_file_name(study, out_path, galaxy_id=None):
        # Compare against None explicitly so that an ID of 0 still selects
        # the "primary_..." naming scheme.
        if galaxy_id is None:
            file_name = '%s_individuals.tsv' % study
        else:
            # NOTE(review): underscores are presumably replaced because they
            # separate the fields of this file name -- confirm against the
            # Galaxy output-discovery convention.
            file_name = 'primary_%d_%s_visible_tabular' % (galaxy_id, study.replace('_', '-'))
        # NOTE(review): the study label is used verbatim as part of the path;
        # a label containing a path separator would escape out_path.
        return os.path.join(out_path, file_name)

    fname = get_file_name(study_label, output_path, galaxy_record_id)
    with open(fname, 'w') as ofile:
        logger.info('Dumping %d records to file %s', len(records), fname)
        # extrasaction='ignore': the header is fixed by the caller, so any
        # additional key carried by a record is dropped rather than raising
        # ValueError half-way through the file.
        writer = csv.DictWriter(ofile, header, delimiter='\t', extrasaction='ignore')
        writer.writeheader()
        writer.writerows(records)


def main(argv):
    """Split the input TSV file into one output file per study.

    Parses the command line, reads every row of ``--in-file``, groups the
    rows by study and writes each group to its own file in ``--out-path``.

    Args:
        argv: list of command-line arguments, without the program name.
    """
    parser = get_parser()
    args = parser.parse_args(argv)

    logger = get_logger('split_by_study', level=args.loglevel, filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)
    with open(args.in_file) as in_file:
        # Read every row while the file is still open; the reader is lazy.
        records = list(csv.DictReader(in_file, delimiter='\t'))

    records_map = map_by_study(records, logger)
    # Force the header of the output files in order to prevent problems when running the workflow later
    header = ['individual', 'gender', 'father', 'mother']
    # items() instead of iteritems() so the loop runs on Python 2 and 3 alike;
    # a distinct loop name avoids shadowing the full ``records`` list.
    for study, study_records in records_map.items():
        dump_records(study, study_records, header, args.out_path, logger, args.record_id)
    logger.info('Job completed')


if __name__ == '__main__':
    # Executed as a script: pass everything after the program name to main().
    cli_arguments = sys.argv[1:]
    main(cli_arguments)