Mercurial > repos > ric > test2
comparison galaxy-tools/biobank/utils/split_by_study.py @ 0:ba6cf6ede027 draft default tip
Uploaded
| author | ric |
|---|---|
| date | Wed, 28 Sep 2016 06:03:30 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:ba6cf6ede027 |
|---|---|
| 1 """ | |
| 2 Split a file like:: | |
| 3 | |
| 4 individual gender father mother | |
| 5 ASTUDY:2141 MALE ASTUDY:12 ASTUDY:12341 | |
| 6 ASTUDY:415 MALE ASTUDY:3562 ASTUDY:13612 | |
| 7 BSTUDY:12515 FEMALE BSTUDY:3512 BSTUDY:124 | |
| 8 | |
| 9 into multiple files based on the STUDY value of the label stored in the "individual" column. | |
| 10 Each label in the "individual" column must have a STUDY:ENROLLMENT_CODE format, otherwise the line | |
| 11 will be skipped. | |
| 12 """ | |
| 13 | |
| 14 import sys, argparse, csv, os | |
| 15 from bl.vl.utils import LOG_LEVELS, get_logger | |
| 16 | |
| 17 | |
def get_parser():
    """Build the command-line parser of the split-by-study tool.

    Returns:
        argparse.ArgumentParser: parser accepting ``--logfile``,
        ``--loglevel`` (one of ``LOG_LEVELS``, default ``INFO``),
        ``--in-file`` (required), ``--record-id`` (int) and ``--out-path``
        (default ``.``).
    """
    # The first positional argument of ArgumentParser is ``prog``, not the
    # description. Passing the summary by keyword keeps the real program name
    # in usage lines and makes the summary show up in the ``--help`` output.
    parser = argparse.ArgumentParser(
        description='Split a file containing pedigree informations in '
                    'multiple files using the study as split criteria')
    parser.add_argument('--logfile', type=str,
                        help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--record-id', type=int,
                        help='Output ID record given by Galaxy')
    parser.add_argument('--out-path', type=str, help='Output directory',
                        default='.')
    return parser
| 30 | |
| 31 | |
def split_element(element, logger):
    """Split a ``STUDY:ENROLLMENT_CODE`` label into its two parts.

    Args:
        element: label to split. A well-formed label is a string with exactly
            one ``:`` separating a non-empty study from a non-empty code.
        logger: logger used to report malformed labels.

    Returns:
        tuple: ``(study, code)`` for a well-formed label, ``(None, None)``
        otherwise; in the latter case the label is logged at error level.
    """
    try:
        study, code = element.split(':')
    except (AttributeError, ValueError):
        # AttributeError: the label is not a string (csv.DictReader uses None
        # for fields missing from a short row).
        # ValueError: the label holds no ':' or more than one.
        study = code = None
    # An empty study or code (":12", "ASTUDY:") is as malformed as a label
    # without separator, so report it the same way.
    if not study or not code:
        logger.error('Label %s is not a label with format STUDY:ENROLLMENT_CODE', element)
        return None, None
    return study, code
| 39 | |
| 40 | |
def map_by_study(records, logger):
    """Group records by the study found in their ``individual`` label.

    Args:
        records: iterable of dict-like rows, each with an ``individual`` key
            holding a ``STUDY:ENROLLMENT_CODE`` label.
        logger: logger used for progress and skipped-record messages.

    Returns:
        dict: study label -> list of the records belonging to that study, in
        input order. Records whose label cannot be split are left out.
    """
    records_map = {}
    for rec in records:
        study, code = split_element(rec['individual'], logger)
        # A label is usable only when BOTH parts are present; testing with
        # ``and`` would let half-empty labels through and group them under an
        # empty study name.
        if not study or not code:
            logger.debug('Skipping record %r', rec)
            continue
        records_map.setdefault(study, []).append(rec)
    logger.info('Records splitted between %d studies', len(records_map))
    return records_map
| 51 | |
| 52 | |
def dump_records(study_label, records, header, output_path, logger, galaxy_record_id=None):
    """Write the records of one study to a tab-separated file.

    The file is created inside ``output_path``. Its name is
    ``<study>_individuals.tsv`` when no Galaxy record ID is given, otherwise
    ``primary_<id>_<study>_visible_tabular`` with the underscores of the study
    label replaced by dashes.

    Args:
        study_label: study the records belong to; used to build the file name.
        records: list of dicts to write, one per output row.
        header: ordered list of the column names to write.
        output_path: directory receiving the output file.
        logger: logger used to report the dump.
        galaxy_record_id: optional integer ID selecting the Galaxy-style name.
    """
    if not galaxy_record_id:
        file_name = '%s_individuals.tsv' % study_label
    else:
        file_name = 'primary_%d_%s_visible_tabular' % (
            galaxy_record_id, study_label.replace('_', '-'))
    fname = os.path.join(output_path, file_name)

    with open(fname, 'w') as ofile:
        logger.info('Dumping %d records to file %s', len(records), fname)
        # The header is imposed by the caller, so records may carry keys that
        # are not part of it (additional input columns, or the None key that
        # csv.DictReader uses for overflowing cells). Ignore them instead of
        # letting DictWriter abort the dump with a ValueError.
        writer = csv.DictWriter(ofile, header, delimiter='\t',
                                extrasaction='ignore')
        writer.writeheader()
        writer.writerows(records)
| 68 | |
| 69 | |
def main(argv):
    """Split the input TSV file into one output file per study.

    Args:
        argv: command-line arguments, program name excluded.
    """
    parser = get_parser()
    args = parser.parse_args(argv)

    logger = get_logger('split_by_study', level=args.loglevel, filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)
    with open(args.in_file) as in_file:
        records = list(csv.DictReader(in_file, delimiter='\t'))

    records_map = map_by_study(records, logger)
    # Force the header of the output files in order to prevent problems when running the workflow later
    header = ['individual', 'gender', 'father', 'mother']
    # dict.items() exists on both Python 2 and 3, unlike iteritems(); the loop
    # variable gets its own name so it no longer shadows the full record list.
    for study, study_records in records_map.items():
        dump_records(study, study_records, header, args.out_path, logger, args.record_id)
    logger.info('Job completed')
| 87 | |
| 88 | |
# Script entry point: hand the command-line arguments (program name excluded)
# over to main() when the module is executed directly.
if __name__ == '__main__':
    main(sys.argv[1:])
