Mercurial > repos > ric > test2
comparison galaxy-tools/biobank/utils/prepare_enrollments_import.py @ 0:ba6cf6ede027 draft default tip
Uploaded
| author | ric |
|---|---|
| date | Wed, 28 Sep 2016 06:03:30 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:ba6cf6ede027 |
|---|---|
| 1 """ | |
| 2 Split a file like:: | |
| 3 | |
| 4 source enrollment | |
| 5 V044DE795E7F9F42FEB9855288CF577A77 ASTUDY:2141 | |
| 6 V06C59B915C0FD47DABE6AE02C731780AF BSTUDY:390 | |
| 7 | |
| 8 into a new TSV file like:: | |
| 9 | |
| 10 source study label | |
| 11 V044DE795E7F9F42FEB9855288CF577A77 ASTUDY 2141 | |
| 12 V06C59B915C0FD47DABE6AE02C731780AF BSTUDY 390 | |
| 13 | |
| 14 """ | |
| 15 | |
| 16 import sys, argparse, csv | |
| 17 from bl.vl.utils import LOG_LEVELS, get_logger | |
| 18 | |
| 19 | |
def get_parser():
    """Build the command-line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` accepting ``--logfile``,
        ``--loglevel``, ``--in-file`` (required) and ``--out-enrollments``
        (required).
    """
    # The first positional argument of ArgumentParser is ``prog``, not the
    # description: passing the sentence positionally put it in the usage
    # line in place of the program name.
    parser = argparse.ArgumentParser(
        description='Prepare input files for enrollments import workflow')
    parser.add_argument('--logfile', type=str,
                        help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    # This path is opened for writing by main(), so it is an output of the
    # script (and an input of the downstream import workflow).
    parser.add_argument('--out-enrollments', type=str, required=True,
                        help='output file with Enrollments definitions')
    return parser
| 30 | |
| 31 | |
def get_enrollments_definitions(records, logger):
    """Split the ``enrollment`` field of each record into study and label.

    Args:
        records: iterable of dict-like rows providing the ``source`` and
            ``enrollment`` keys; ``enrollment`` must look like
            ``STUDY:LABEL`` (exactly one ``:``).
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        A list of dicts with the keys ``source``, ``study`` and ``label``,
        one per valid record, in input order. Records with a missing key or
        a malformed enrollment are logged as errors and skipped.
    """
    logger.info('Creating enrollment definitions')
    enr_defs = []
    for rec in records:
        try:
            source = rec['source']
            enrollment = rec['enrollment']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        try:
            study, label = enrollment.split(':')
        except (ValueError, AttributeError):
            # ValueError: zero or more than one ':' in the value.
            # AttributeError: the value is not a string, e.g. the None that
            # csv.DictReader uses to fill the columns of a short row.
            logger.error('Skipped record %r, wrong label format for %s',
                         rec, enrollment)
            continue
        enr_defs.append({'source': source, 'study': study, 'label': label})
    logger.info('Retrieved %d enrollment definitions', len(enr_defs))
    return enr_defs
| 50 | |
| 51 | |
def get_parents_definitions(records, logger):
    """Collect the parent references of the records that declare any.

    Args:
        records: iterable of dict-like rows providing the ``individual``,
            ``father`` and ``mother`` keys; the string ``'None'`` marks an
            unknown parent.
        logger: object exposing ``info`` and ``error`` logging methods.

    Returns:
        A list of dicts with the keys ``individual``, ``father`` and
        ``mother``, one for each record where at least one parent is not
        ``'None'``, in input order. Records with a missing key are logged
        as errors and skipped.
    """
    logger.info('Creating parents definitions')
    parents_defs = []
    for rec in records:
        try:
            individual = rec['individual']
            father = rec['father']
            mother = rec['mother']
        except KeyError as ke:
            logger.error('Skipped record %r, missing key %s', rec, ke)
            continue
        # Nothing to record when both parents are unknown
        if father == 'None' and mother == 'None':
            continue
        parents_defs.append({'individual': individual,
                             'father': father,
                             'mother': mother})
    logger.info('Retrieved %d parents definitions', len(parents_defs))
    return parents_defs
| 70 | |
| 71 | |
def main(argv):
    """Read the input TSV and write the enrollments definitions TSV.

    Args:
        argv: list of command-line arguments, program name excluded.

    The rows of ``--in-file`` are loaded with ``csv.DictReader``, turned
    into enrollment definitions by ``get_enrollments_definitions`` and
    written, header first, to ``--out-enrollments`` as tab-separated
    ``source``, ``study`` and ``label`` columns.
    """
    args = get_parser().parse_args(argv)

    logger = get_logger('prepare_enrollments_import',
                        level=args.loglevel, filename=args.logfile)
    logger.info('Start processing file %s', args.in_file)

    with open(args.in_file) as tsv_in:
        records = list(csv.DictReader(tsv_in, delimiter='\t'))
    logger.info('Loaded %d records', len(records))

    # Build the definitions before opening the output file, so a failure
    # here does not leave a truncated output behind.
    enrollment_defs = get_enrollments_definitions(records, logger)
    columns = ['source', 'study', 'label']
    with open(args.out_enrollments, 'w') as tsv_out:
        writer = csv.DictWriter(tsv_out, columns, delimiter='\t')
        writer.writeheader()
        writer.writerows(enrollment_defs)

    logger.info('Job completed')
| 95 | |
# Script entry point: hand the command-line arguments (program name
# excluded) over to main().
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    main(cli_args)
