annotate galaxy-tools/biobank/utils/format_vessels_by_individual_output.py @ 0:ba6cf6ede027 draft default tip

Uploaded
author ric
date Wed, 28 Sep 2016 06:03:30 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
1 # This tool formats output files from kb_query vessels_by_individual
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
2 # into a tabular format with all data related to an individual grouped
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
3 # in each row. The tool needs as input a mapping file like
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
4 #
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
5 # individual_id label
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
6 # V12311 A_STUDY:A_CODE
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
7 # V135115 A_STUDY:B_CODE
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
8 #
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
9 # in order to use a known label and not VIDs for each row
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
10
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
11 import csv, sys, argparse, logging
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
12
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
# Layout of each log line: timestamp, level name left-justified in an
# 8-character field, then the message, separated by pipes.
LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'

# Date/time pattern handed to logging.basicConfig as ``datefmt``.
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'

# Values accepted by the --loglevel option; main() resolves the chosen
# name with getattr(logging, name).
LOG_LEVELS = [
    'DEBUG',
    'INFO',
    'WARNING',
    'ERROR',
    'CRITICAL',
]
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
16
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
def make_parser():
    """Build the command-line parser for this tool.

    Options, in declaration order: ``--logfile`` (optional),
    ``--loglevel`` (one of LOG_LEVELS, default ``INFO``) and the three
    required file paths ``--in_file``, ``--map_file`` and ``--out_file``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(
        description='format kb_query vessels_by_individual output file to tabular format'
    )

    # Logging options.
    cli.add_argument('--logfile', type=str, help='log file (default=stderr)')
    cli.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                     help='logging level', default='INFO')

    # Mandatory file paths; all share the same type/required settings.
    required_files = (
        ('--in_file',
         'input file (obtained using kb_query vessels by individual tool)'),
        ('--map_file', 'mapping file'),
        ('--out_file', 'output file'),
    )
    for flag, description in required_files:
        cli.add_argument(flag, type=str, required=True, help=description)

    return cli
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
29
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
def get_mapping(records, grouper_field, grouped_field):
    """Group one field of each record by the value of another field.

    Args:
        records: iterable of mappings (e.g. rows from csv.DictReader).
        grouper_field: key whose value identifies the group.
        grouped_field: key whose value is collected into the group.

    Returns:
        A dict mapping each distinct ``grouper_field`` value to the list
        of ``grouped_field`` values, in the order they were encountered.

    Raises:
        KeyError: if a record lacks either field.
    """
    groups = {}
    for record in records:
        group_key = record[grouper_field]
        if group_key not in groups:
            groups[group_key] = []
        groups[group_key].append(record[grouped_field])
    return groups
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
35
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
def get_labels_mapping(reader, logger):
    """Read the mapping file rows and group labels by individual.

    Args:
        reader: iterable of row dicts exposing the ``individual`` and
            ``label`` keys.
            NOTE(review): the example in the file header names the first
            column ``individual_id``, but this code reads ``individual``
            -- confirm which header the mapping file really carries.
        logger: logger used to report how many rows were grouped.

    Returns:
        A dict mapping each individual to the list of its labels.
    """
    records = list(reader)
    labels_by_individual = get_mapping(records, 'individual', 'label')
    summary = '%d labels grouped for %d individuals' % (
        len(records), len(labels_by_individual))
    logger.info(summary)
    return labels_by_individual
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
42
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
def get_vessels_mapping(reader, logger):
    """Read the input file rows and group vessel labels by individual.

    Args:
        reader: iterable of row dicts exposing the ``individual`` and
            ``vessel_label`` keys.
        logger: logger used to report how many rows were grouped.

    Returns:
        A dict mapping each individual to the list of its vessel labels.
    """
    records = list(reader)
    vessels_by_individual = get_mapping(records, 'individual', 'vessel_label')
    summary = '%d vessels grouped for %d individuals' % (
        len(records), len(vessels_by_individual))
    logger.info(summary)
    return vessels_by_individual
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
49
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
def build_record(label, vessels):
    """Build one output row for an individual.

    Args:
        label: iterable of label strings for the individual; they are
            joined with ``--`` into the ``individual_label`` column.
        vessels: sequence of vessel labels; the i-th one (1-based) is
            stored under the ``vessel_<i>`` key.

    Returns:
        A dict suitable for ``csv.DictWriter.writerow``.
    """
    record = {'individual_label': '--'.join(label)}
    # Number by position rather than by list.index(): index() returns the
    # first match, so repeated vessel labels would all land on the same
    # column and leave later columns empty.
    for position, vessel in enumerate(vessels, 1):
        record['vessel_%d' % position] = vessel
    return record
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
55
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
def main(argv):
    """Convert a vessels-by-individual listing into one row per individual.

    Reads the mapping file (individual -> labels) and the input file
    (individual -> vessel labels), then writes a tab-separated file with
    an ``individual_label`` column followed by ``vessel_1`` ..
    ``vessel_N`` columns, where N is the largest number of vessels found
    for any individual.

    Args:
        argv: command-line arguments, without the program name.

    Raises:
        KeyError: if an individual in the input file has no entry in the
            mapping file. Raised before the output file is opened.
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    log_level = getattr(logging, args.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    # Without a filename basicConfig keeps its default stream handler.
    if args.logfile:
        kwargs['filename'] = args.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger()

    with open(args.map_file) as mf:
        reader = csv.DictReader(mf, delimiter='\t')
        labels_map = get_labels_mapping(reader, logger)

    with open(args.in_file) as inf:
        reader = csv.DictReader(inf, delimiter='\t')
        vessels_map = get_vessels_mapping(reader, logger)

    # Fail before touching the output file when a label is missing, so no
    # partially written output is left behind.
    missing = [ind for ind in vessels_map if ind not in labels_map]
    if missing:
        msg = 'no label found in mapping file for individuals: %s' % (
            ', '.join(missing))
        logger.error(msg)
        raise KeyError(msg)

    # max() raises ValueError on an empty sequence; an input file with no
    # data rows simply produces a header-only output.
    vessel_counts = [len(v) for v in vessels_map.values()]
    max_vessels_count = max(vessel_counts) if vessel_counts else 0
    csv_fields = ['individual_label']
    # range()/items() behave the same on Python 2 and Python 3, unlike
    # xrange()/iteritems() which exist only on Python 2.
    for x in range(max_vessels_count):
        csv_fields.append('vessel_%d' % (x + 1))

    with open(args.out_file, 'w') as ofile:
        writer = csv.DictWriter(ofile, csv_fields, delimiter='\t')
        writer.writeheader()
        for ind, vessels in vessels_map.items():
            writer.writerow(build_record(labels_map[ind], vessels))

    logger.info('Job completed')
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
89
ba6cf6ede027 Uploaded
ric
parents:
diff changeset
# Script entry point: pass everything after the program name to main().
if __name__ == '__main__':
    main(sys.argv[1:])