Mercurial > repos > ric > test1
diff galaxy-tools/biobank/utils/format_vessels_by_individual_output.py @ 3:43be74e62bfe draft
Uploaded
author | ric |
---|---|
date | Thu, 22 Sep 2016 08:57:04 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/galaxy-tools/biobank/utils/format_vessels_by_individual_output.py Thu Sep 22 08:57:04 2016 -0400 @@ -0,0 +1,91 @@ +# This tool format output files from kb_query vessels_by_individual +# into a tabular format with all data related to an individual grouped +# in each row. The tool needs as input a mapping file like +# +# individual_id label +# V12311 A_STUDY:A_CODE +# V135115 A_STUDY:B_CODE +# +# in order to use a known label and not VIDs for each row + +import csv, sys, argparse, logging + +LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s' +LOG_DATEFMT = '%Y-%m-%d %H:%M:%S' +LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] + +def make_parser(): + parser = argparse.ArgumentParser(description='format kb_query vessels_by_individual output file to tabular format') + parser.add_argument('--logfile', type=str, help='log file (default=stderr)') + parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS, + help='logging level', default='INFO') + parser.add_argument('--in_file', type=str, required=True, + help='input file (obtained using kb_query vessels by individual tool)') + parser.add_argument('--map_file', type=str, required=True, + help='mapping file') + parser.add_argument('--out_file', type=str, required=True, + help='output file') + return parser + +def get_mapping(records, grouper_field, grouped_field): + mapping = {} + for rec in records: + mapping.setdefault(rec[grouper_field], []).append(rec[grouped_field]) + return mapping + +def get_labels_mapping(reader, logger): + rows = [r for r in reader] + lmap = get_mapping(rows, 'individual', 'label') + logger.info('%d labels grouped for %d individuals' % (len(rows), + len(lmap))) + return lmap + +def get_vessels_mapping(reader, logger): + rows = [r for r in reader] + vmap = get_mapping(rows, 'individual', 'vessel_label') + logger.info('%d vessels grouped for %d individuals' % (len(rows), + len(vmap))) + return vmap + +def build_record(label, vessels): + record = {'individual_label' : '--'.join(label)} + for v in vessels: + record['vessel_%d' % (vessels.index(v) + 1)] = v + return record + +def main(argv): + parser = make_parser() + args = parser.parse_args(argv) + + log_level = getattr(logging, args.loglevel) + kwargs = {'format' : LOG_FORMAT, + 'datefmt' : LOG_DATEFMT, + 'level' : log_level} + if args.logfile: + kwargs['filename'] = args.logfile + logging.basicConfig(**kwargs) + logger = logging.getLogger() + + with open(args.map_file) as mf: + reader = csv.DictReader(mf, delimiter='\t') + labels_map = get_labels_mapping(reader, logger) + + with open(args.in_file) as inf: + reader = csv.DictReader(inf, delimiter='\t') + vessels_map = get_vessels_mapping(reader, logger) + + max_vessels_count = max([len(v) for v in vessels_map.values()]) + csv_fields = ['individual_label'] + for x in xrange(max_vessels_count): + csv_fields.append('vessel_%d' % (x+1)) + + with open(args.out_file, 'w') as ofile: + writer = csv.DictWriter(ofile, csv_fields, delimiter='\t') + writer.writeheader() + for ind, vessels in vessels_map.iteritems(): + writer.writerow(build_record(labels_map[ind], vessels)) + + logger.info('Job completed') + +if __name__ == '__main__': + main(sys.argv[1:])