Mercurial > repos > ric > test2
comparison galaxy-tools/biobank/utils/prepare_individuals_import.py @ 0:ba6cf6ede027 draft default tip
Uploaded
| author | ric |
|---|---|
| date | Wed, 28 Sep 2016 06:03:30 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:ba6cf6ede027 |
|---|---|
| 1 """ | |
| 2 Split a file like:: | |
| 3 | |
| 4 individual gender father mother | |
| 5 ASTUDY:2141 MALE ASTUDY:12 ASTUDY:12341 | |
| 6 ASTUDY:415 MALE ASTUDY:3562 ASTUDY:13612 | |
| 7 | |
| 8 into two separate TSV files: the first one will be used to import new individuals and enrollments, | |
| 9 the second one will be used to update father and mother information for the individuals in the first | |
| 10 file. | |
| 11 """ | |
| 12 | |
| 13 import sys, argparse, csv | |
| 14 from bl.vl.utils import LOG_LEVELS, get_logger | |
| 15 | |
| 16 | |
def get_parser():
    """Build the command-line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` accepting ``--logfile``,
        ``--loglevel`` (one of ``LOG_LEVELS``, default ``'INFO'``) and the
        required ``--in-file``, ``--out-individuals`` and ``--out-parents``
        options.
    """
    # The first positional parameter of ArgumentParser is ``prog``; passing
    # the summary there would replace the program name in the usage line, so
    # it is given explicitly as the description instead.
    parser = argparse.ArgumentParser(
        description='Prepare input files for individuals import workflow')
    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--out-individuals', type=str, required=True,
                        help='input file with Individuals definitions')
    parser.add_argument('--out-parents', type=str, required=True,
                        help='input file with parents definitions')
    return parser
| 29 | |
| 30 | |
def get_individual_definitions(records, logger):
    """Build the rows describing the individuals to import.

    Every record must provide a ``gender`` value and an ``individual`` value
    formatted as ``STUDY:LABEL``. Records that lack one of these keys, or
    whose ``individual`` value cannot be split into exactly two parts on
    ``:``, are reported through ``logger.error`` and skipped.

    Args:
        records: iterable of dict-like rows (e.g. produced by
            ``csv.DictReader``).
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        A list of dicts with the keys ``study``, ``label``, ``gender``,
        ``father`` and ``mother``; ``father`` and ``mother`` are always the
        string ``'None'``.
    """
    logger.info('Creating individual definitions')
    ind_defs = []
    for rec in records:
        try:
            # Parents are always emitted as the 'None' placeholder here; the
            # module docstring states they are set by the second output file.
            idef = {'father': 'None', 'mother': 'None'}
            idef['gender'] = rec['gender']
            try:
                idef['study'], idef['label'] = rec['individual'].split(':')
            except (ValueError, AttributeError):
                # ValueError: the label has too few or too many ':' parts.
                # AttributeError: the value is None, which csv.DictReader
                # uses to fill rows shorter than the header.
                logger.error('Skipped record %r, wrong label format for %s', rec, rec['individual'])
                continue
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        ind_defs.append(idef)
    logger.info('Retrieved %d individual definitions', len(ind_defs))
    return ind_defs
| 49 | |
| 50 | |
def get_parents_definitions(records, logger):
    """Build the rows describing the parents of each individual.

    A record contributes a row only when at least one of its ``father`` and
    ``mother`` values differs from the string ``'None'``. Records missing
    the ``individual``, ``father`` or ``mother`` key are reported through
    ``logger.error`` and skipped.

    Args:
        records: iterable of dict-like rows (e.g. produced by
            ``csv.DictReader``).
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        A list of dicts with the keys ``individual``, ``father`` and
        ``mother``, copied unchanged from the retained records.
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        try:
            individual = rec['individual']
            father = rec['father']
            mother = rec['mother']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        # Nothing to update when both parents are the 'None' placeholder.
        if father == 'None' and mother == 'None':
            continue
        parents_defs.append({
            'individual': individual,
            'father': father,
            'mother': mother,
        })
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs
| 69 | |
| 70 | |
def main(argv):
    """Split the input TSV into the individuals and parents TSV files.

    Args:
        argv: list of command-line arguments, without the program name.
    """
    args = get_parser().parse_args(argv)

    logger = get_logger('prepare_individuals_import', level=args.loglevel,
                        filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)

    # Load every row up front: both outputs are derived from the same records.
    with open(args.in_file) as source:
        records = list(csv.DictReader(source, delimiter='\t'))
    logger.info('Loaded %d records', len(records))

    # First output: one row per valid individual.
    individuals = get_individual_definitions(records, logger)
    individuals_columns = ['study', 'label', 'gender', 'father', 'mother']
    with open(args.out_individuals, 'w') as target:
        writer = csv.DictWriter(target, individuals_columns, delimiter='\t')
        writer.writeheader()
        writer.writerows(individuals)

    # Second output: father/mother values for records that define a parent.
    parents = get_parents_definitions(records, logger)
    parents_columns = ['individual', 'father', 'mother']
    with open(args.out_parents, 'w') as target:
        writer = csv.DictWriter(target, parents_columns, delimiter='\t')
        writer.writeheader()
        writer.writerows(parents)

    logger.info('Job completed')
| 101 | |
# Script entry point: hand the command-line arguments (minus the program
# name) to main() only when the file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])
