Mercurial > repos > ric > test2
diff galaxy-tools/biobank/utils/prepare_individuals_import.py @ 0:ba6cf6ede027 draft default tip
Uploaded
| author | ric |
|---|---|
| date | Wed, 28 Sep 2016 06:03:30 -0400 |
| parents | |
| children |
line wrap: on
line diff
"""
Split a file like::

  individual    gender  father       mother
  ASTUDY:2141   MALE    ASTUDY:12    ASTUDY:12341
  ASTUDY:415    MALE    ASTUDY:3562  ASTUDY:13612

into two separate TSV files: the first one is used to import new individuals
and enrollments, the second one is used to update father and mother
information for the individuals in the first file.
"""

import argparse
import csv
import sys

from bl.vl.utils import LOG_LEVELS, get_logger


def get_parser():
    """Build the command line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` accepting ``--logfile``,
        ``--loglevel``, ``--in-file``, ``--out-individuals`` and
        ``--out-parents``.
    """
    # The text is passed as ``description``: the first positional parameter
    # of ArgumentParser is ``prog``, which would put the whole sentence in
    # the usage line in place of the program name.
    parser = argparse.ArgumentParser(
        description='Prepare input files for individuals import workflow')
    parser.add_argument('--logfile', type=str,
                        help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--out-individuals', type=str, required=True,
                        help='input file with Individuals definitions')
    parser.add_argument('--out-parents', type=str, required=True,
                        help='input file with parents definitions')
    return parser


def get_individual_definitions(records, logger):
    """Build the rows of the individuals file.

    Each record must provide a ``gender`` key and an ``individual`` key whose
    value has the form ``STUDY:LABEL`` (exactly one ``:``). Records that miss
    a key or carry a malformed label are logged as errors and skipped.

    Args:
        records: iterable of dict-like rows (as produced by
            ``csv.DictReader``).
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        A list of dicts with keys ``study``, ``label``, ``gender``,
        ``father`` and ``mother``; ``father`` and ``mother`` are always the
        string ``'None'`` because parents are written to the second file.
    """
    logger.info('Creating individual definitions')
    ind_defs = []
    for rec in records:
        try:
            gender = rec['gender']
            individual = rec['individual']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        try:
            study, label = individual.split(':')
        except (ValueError, AttributeError):
            # ValueError: the label does not split into exactly two parts.
            # AttributeError: the value is not a string, e.g. the None that
            # csv.DictReader uses to fill the missing fields of a short row.
            logger.error('Skipped record %r, wrong label format for %s',
                         rec, individual)
            continue
        ind_defs.append({
            'study': study,
            'label': label,
            'gender': gender,
            'father': 'None',
            'mother': 'None',
        })
    logger.info('Retrieved %d individual definitions', len(ind_defs))
    return ind_defs


def get_parents_definitions(records, logger):
    """Build the rows of the parents file.

    Only records where at least one of ``father`` / ``mother`` differs from
    the string ``'None'`` are kept. Records missing one of the
    ``individual``, ``father`` or ``mother`` keys are logged as errors and
    skipped.

    Args:
        records: iterable of dict-like rows (as produced by
            ``csv.DictReader``).
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        A list of dicts with keys ``individual``, ``father`` and ``mother``.
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        try:
            individual = rec['individual']
            father = rec['father']
            mother = rec['mother']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        if father == 'None' and mother == 'None':
            # Nothing to update for this individual.
            continue
        parents_defs.append({
            'individual': individual,
            'father': father,
            'mother': mother,
        })
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs


def main(argv):
    """Read the input TSV and write the individuals and parents TSV files.

    Args:
        argv: list of command line arguments, without the program name.
    """
    parser = get_parser()
    args = parser.parse_args(argv)

    logger = get_logger('prepare_individuals_import', level=args.loglevel,
                        filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)

    # Load every row up front: the records are scanned twice, once per
    # output file.
    with open(args.in_file) as in_file:
        records = list(csv.DictReader(in_file, delimiter='\t'))
    logger.info('Loaded %d records', len(records))

    individual_defs = get_individual_definitions(records, logger)
    with open(args.out_individuals, 'w') as inds_out:
        inds_writer = csv.DictWriter(
            inds_out,
            ['study', 'label', 'gender', 'father', 'mother'],
            delimiter='\t')
        inds_writer.writeheader()
        inds_writer.writerows(individual_defs)

    parents_defs = get_parents_definitions(records, logger)
    with open(args.out_parents, 'w') as parents_out:
        parents_writer = csv.DictWriter(
            parents_out,
            ['individual', 'father', 'mother'],
            delimiter='\t')
        parents_writer.writeheader()
        parents_writer.writerows(parents_defs)

    logger.info('Job completed')


if __name__ == '__main__':
    main(sys.argv[1:])
