Mercurial > repos > ric > test2
comparison galaxy-tools/biobank/utils/format_vessels_by_individual_output.py @ 0:ba6cf6ede027 draft default tip
Uploaded
| author | ric |
|---|---|
| date | Wed, 28 Sep 2016 06:03:30 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:ba6cf6ede027 |
|---|---|
| 1 # This tool formats output files from kb_query vessels_by_individual | |
| 2 # into a tabular format with all data related to an individual grouped | |
| 3 # in each row. The tool needs as input a mapping file like | |
| 4 # | |
| 5 # individual label | |
| 6 # V12311 A_STUDY:A_CODE | |
| 7 # V135115 A_STUDY:B_CODE | |
| 8 # | |
| 9 # in order to use a known label and not VIDs for each row | |
| 10 | |
| 11 import csv, sys, argparse, logging | |
| 12 | |
# Layout of a log line: timestamp, level name (left-aligned, width 8)
# and message, separated by pipes.
LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'

# Date format handed to logging.basicConfig as ``datefmt``.
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'

# Level names accepted by the --loglevel command-line option.
LOG_LEVELS = [
    'DEBUG',
    'INFO',
    'WARNING',
    'ERROR',
    'CRITICAL',
]
| 16 | |
def make_parser():
    """Build the command-line parser for this tool.

    The parser accepts two optional logging settings (``--logfile`` and
    ``--loglevel``) and three required file paths (``--in_file``,
    ``--map_file`` and ``--out_file``).

    Returns the configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(
        description='format kb_query vessels_by_individual output file to tabular format'
    )

    # Logging options.
    cli.add_argument('--logfile', type=str,
                     help='log file (default=stderr)')
    cli.add_argument('--loglevel', type=str, default='INFO',
                     choices=LOG_LEVELS, help='logging level')

    # Required file paths; they all share the same option settings.
    required_paths = (
        ('--in_file',
         'input file (obtained using kb_query vessels by individual tool)'),
        ('--map_file', 'mapping file'),
        ('--out_file', 'output file'),
    )
    for flag, help_text in required_paths:
        cli.add_argument(flag, type=str, required=True, help=help_text)

    return cli
| 29 | |
def get_mapping(records, grouper_field, grouped_field):
    """Group the values of one field by the value of another field.

    records: iterable of dict-like rows.
    grouper_field: key whose value identifies the group of a row.
    grouped_field: key whose value is collected into the group.

    Returns a dict mapping each distinct ``grouper_field`` value to the
    list of ``grouped_field`` values, in the order the rows were seen.
    Raises KeyError if a row lacks one of the two fields.
    """
    grouped = {}
    for row in records:
        group_key = row[grouper_field]
        if group_key not in grouped:
            grouped[group_key] = []
        grouped[group_key].append(row[grouped_field])
    return grouped
| 35 | |
def get_labels_mapping(reader, logger):
    """Read the mapping file rows and group labels by individual.

    reader: iterable of rows exposing the 'individual' and 'label' keys.
    logger: logger used to report how many rows and individuals were read.

    Returns a dict mapping each 'individual' value to its list of labels.
    """
    records = list(reader)
    labels_by_individual = get_mapping(records, 'individual', 'label')
    summary = '%d labels grouped for %d individuals' % (
        len(records), len(labels_by_individual))
    logger.info(summary)
    return labels_by_individual
| 42 | |
def get_vessels_mapping(reader, logger):
    """Read the input file rows and group vessel labels by individual.

    reader: iterable of rows exposing the 'individual' and 'vessel_label'
        keys.
    logger: logger used to report how many rows and individuals were read.

    Returns a dict mapping each 'individual' value to its list of vessel
    labels.
    """
    records = list(reader)
    vessels_by_individual = get_mapping(records, 'individual', 'vessel_label')
    summary = '%d vessels grouped for %d individuals' % (
        len(records), len(vessels_by_individual))
    logger.info(summary)
    return vessels_by_individual
| 49 | |
def build_record(label, vessels):
    """Build one output row for an individual.

    label: list of label strings for the individual; they are joined with
        '--' into the 'individual_label' column.
    vessels: list of vessel labels; the i-th one (1-based) is stored under
        the 'vessel_<i>' key.

    Returns a dict suitable for csv.DictWriter.writerow.
    """
    record = {'individual_label': '--'.join(label)}
    # Number the columns by position, not with vessels.index(v): index()
    # always returns the first occurrence, so repeated vessel labels would
    # overwrite one column and leave later columns empty.
    for position, vessel in enumerate(vessels, 1):
        record['vessel_%d' % position] = vessel
    return record
| 55 | |
def main(argv):
    """Convert a vessels_by_individual file into one row per individual.

    argv: list of command-line arguments, without the program name.

    Reads the mapping file and the input file (both tab-separated with a
    header row), then writes a tab-separated output file whose columns are
    'individual_label' followed by 'vessel_1' .. 'vessel_N', where N is
    the largest number of vessels found for a single individual.

    Raises KeyError if an individual of the input file has no entry in the
    mapping file.
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    # Configure the root logger; it logs to stderr unless --logfile is given.
    log_level = getattr(logging, args.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if args.logfile:
        kwargs['filename'] = args.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger()

    with open(args.map_file) as mf:
        reader = csv.DictReader(mf, delimiter='\t')
        labels_map = get_labels_mapping(reader, logger)

    with open(args.in_file) as inf:
        reader = csv.DictReader(inf, delimiter='\t')
        vessels_map = get_vessels_mapping(reader, logger)

    # max() raises ValueError on an empty sequence, so an input file
    # without data rows is handled explicitly: only the header is written.
    if vessels_map:
        max_vessels_count = max(len(v) for v in vessels_map.values())
    else:
        logger.warning('no vessels found in input file %s', args.in_file)
        max_vessels_count = 0

    # range() and dict.items() are used instead of xrange()/iteritems() so
    # the script runs on both Python 2 and Python 3.
    csv_fields = ['individual_label']
    for x in range(max_vessels_count):
        csv_fields.append('vessel_%d' % (x + 1))

    with open(args.out_file, 'w') as ofile:
        writer = csv.DictWriter(ofile, csv_fields, delimiter='\t')
        writer.writeheader()
        for ind, vessels in vessels_map.items():
            try:
                labels = labels_map[ind]
            except KeyError:
                # Same exception as before, but say which individual is
                # missing from the mapping file before propagating it.
                logger.error('no label found in mapping file for '
                             'individual %s', ind)
                raise
            writer.writerow(build_record(labels, vessels))

    logger.info('Job completed')
| 89 | |
# Script entry point: forward the command-line arguments (without the
# program name) to main() when the file is executed directly.
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    main(cli_args)
