Mercurial > repos > ric > test1
diff galaxy-tools/biobank/utils/split_by_study.py @ 3:43be74e62bfe draft
Uploaded
author | ric |
---|---|
date | Thu, 22 Sep 2016 08:57:04 -0400 |
parents | |
children |
line wrap: on
line diff
"""
Split a file like::

  individual     gender   father        mother
  ASTUDY:2141    MALE     ASTUDY:12     ASTUDY:12341
  ASTUDY:415     MALE     ASTUDY:3562   ASTUDY:13612
  BSTUDY:12515   FEMALE   BSTUDY:3512   BSTUDY:124

into multiple files based on the STUDY value of the label stored in the
"individual" column. Each label in the "individual" column must have a
STUDY:ENROLLMENT_CODE format, otherwise the line will be skipped.
"""

import argparse
import csv
import os
import sys

from bl.vl.utils import LOG_LEVELS, get_logger


def get_parser():
    """Build the command-line parser of the tool.

    Returns:
        An ``argparse.ArgumentParser`` exposing ``--logfile``, ``--loglevel``,
        ``--in-file`` (required), ``--record-id`` and ``--out-path``.
    """
    # The text must be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, so passing the sentence
    # positionally would print it in place of the program name in the
    # usage line.
    parser = argparse.ArgumentParser(
        description='Split a file containing pedigree informations in multiple '
                    'files using the study as split criteria')
    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--record-id', type=int,
                        help='Output ID record given by Galaxy')
    parser.add_argument('--out-path', type=str, help='Output directory',
                        default='.')
    return parser


def split_element(element, logger):
    """Split a ``STUDY:ENROLLMENT_CODE`` label into its two parts.

    Args:
        element: label read from the "individual" column; may be ``None``
            or empty when the input row is incomplete.
        logger: logger used to report malformed labels.

    Returns:
        A ``(study, code)`` tuple when the label contains exactly one ``:``
        separating two non-empty parts, ``(None, None)`` otherwise (an error
        is logged in that case).
    """
    parts = element.split(':') if element else []
    # Exactly two non-empty parts are required: labels such as ":123",
    # "ASTUDY:" or "A:B:C" do not match the STUDY:ENROLLMENT_CODE format.
    if len(parts) == 2 and all(parts):
        return parts[0], parts[1]
    logger.error('Label %s is not a label with format STUDY:ENROLLMENT_CODE', element)
    return None, None


def map_by_study(records, logger):
    """Group records by the study part of their "individual" label.

    Args:
        records: iterable of dict-like rows, each with an "individual" key.
        logger: logger used for progress and skip messages.

    Returns:
        A dict mapping each study label to the list of records belonging to
        it, in input order. Records whose label is malformed are skipped.
    """
    records_map = {}
    for rec in records:
        study, _ = split_element(rec['individual'], logger)
        if study is None:
            logger.debug('Skipping record %r', rec)
            continue
        records_map.setdefault(study, []).append(rec)
    logger.info('Records splitted between %d studies', len(records_map))
    return records_map


def _get_file_name(study, out_path, galaxy_id=None):
    """Return the output file path for ``study`` inside ``out_path``.

    Without a Galaxy record ID the file is named ``<study>_individuals.tsv``;
    with one, it is named ``primary_<id>_<study>_visible_tabular`` with the
    underscores of the study label replaced by dashes.
    """
    # Compare against None so that a record ID of 0 is still honoured.
    if galaxy_id is None:
        file_name = '%s_individuals.tsv' % study
    else:
        file_name = 'primary_%d_%s_visible_tabular' % (galaxy_id, study.replace('_', '-'))
    return os.path.join(out_path, file_name)


def dump_records(study_label, records, header, output_path, logger, galaxy_record_id=None):
    """Write the records of one study to a tab-separated file.

    Args:
        study_label: study the records belong to; used to build the file name.
        records: list of dict rows to write.
        header: ordered list of column names written to the file.
        output_path: directory where the file is created.
        logger: logger used for progress messages.
        galaxy_record_id: optional Galaxy output record ID; when given, it
            selects the ``primary_<id>_<study>_visible_tabular`` naming.
    """
    fname = _get_file_name(study_label, output_path, galaxy_record_id)
    with open(fname, 'w') as ofile:
        logger.info('Dumping %d records to file %s', len(records), fname)
        # The header is forced by the caller, so input columns that are not
        # part of it are dropped instead of making DictWriter raise ValueError.
        writer = csv.DictWriter(ofile, header, delimiter='\t',
                                extrasaction='ignore')
        writer.writeheader()
        writer.writerows(records)


def main(argv):
    """Run the tool: read the input TSV and write one file per study.

    Args:
        argv: command-line arguments, without the program name.
    """
    parser = get_parser()
    args = parser.parse_args(argv)

    logger = get_logger('split_by_study', level=args.loglevel, filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)
    with open(args.in_file) as in_file:
        records = list(csv.DictReader(in_file, delimiter='\t'))

    records_map = map_by_study(records, logger)
    # Force the header of the output files in order to prevent problems when
    # running the workflow later
    header = ['individual', 'gender', 'father', 'mother']
    # ``items()`` works on both Python 2 and Python 3 (``iteritems()`` is
    # Python 2 only).
    for study, study_records in records_map.items():
        dump_records(study, study_records, header, args.out_path, logger, args.record_id)
    logger.info('Job completed')


if __name__ == '__main__':
    main(sys.argv[1:])