"""
Split a file like::

    individual gender father mother
    ASTUDY:2141 MALE ASTUDY:12 ASTUDY:12341
    ASTUDY:415 MALE ASTUDY:3562 ASTUDY:13612
    BSTUDY:12515 FEMALE BSTUDY:3512 BSTUDY:124

into multiple files based on the STUDY value of the label stored in the
"individual" column.
Each label in the "individual" column must have a STUDY:ENROLLMENT_CODE
format, otherwise the line will be skipped.
"""
|
|
13
|
|
14 import sys, argparse, csv, os
|
|
15 from bl.vl.utils import LOG_LEVELS, get_logger
|
|
16
|
|
17
|
|
def get_parser():
    """Build and return the command-line parser for this script."""
    description = ('Split a file containing pedigree informations in multiple '
                   'files using the study as split criteria')
    parser = argparse.ArgumentParser(description)
    # Logging configuration
    parser.add_argument('--logfile', type=str,
                        help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        default='INFO', help='logging level')
    # Input/output configuration
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--record-id', type=int,
                        help='Output ID record given by Galaxy')
    parser.add_argument('--out-path', type=str, default='.',
                        help='Output directory')
    return parser
|
|
30
|
|
31
|
|
def split_element(element, logger):
    """Parse a STUDY:ENROLLMENT_CODE label into its two components.

    Returns a (study, code) pair on success; logs an error and returns
    (None, None) when *element* does not contain exactly one ':' separator.
    """
    parts = element.split(':')
    if len(parts) == 2:
        return parts[0], parts[1]
    logger.error('Label %s is not a label with format STUDY:ENROLLMENT_CODE', element)
    return None, None
|
|
39
|
|
40
|
|
def map_by_study(records, logger):
    """Group pedigree records by the STUDY part of their 'individual' label.

    :param records: iterable of dict records, each with an 'individual' key
                    holding a STUDY:ENROLLMENT_CODE label
    :param logger: logger used to report skipped records
    :return: dict mapping study label -> list of records
    """
    records_map = {}
    for rec in records:
        study, code = split_element(rec['individual'], logger)
        # Require BOTH parts to be non-empty: a label like ':123' or 'a:'
        # parses but violates the STUDY:ENROLLMENT_CODE contract, so the
        # record must be skipped rather than grouped under '' or kept with
        # an empty enrollment code.  (The original `and` check let these
        # degenerate labels through.)
        if not study or not code:
            logger.debug('Skipping record %r', rec)
            continue
        records_map.setdefault(study, []).append(rec)
    logger.info('Records split between %d studies', len(records_map))
    return records_map
|
|
51
|
|
52
|
|
def dump_records(study_label, records, header, output_path, logger, galaxy_record_id=None):
    """Write *records* belonging to one study to a TSV file in *output_path*.

    When *galaxy_record_id* is given, a Galaxy-style file name is used
    ('primary_<id>_<study>_visible_tabular', with '_' in the study label
    replaced by '-'); otherwise the file is named '<study>_individuals.tsv'.
    """
    if galaxy_record_id:
        # Galaxy dataset naming convention; '_' is reserved as a separator.
        base_name = 'primary_%d_%s_visible_tabular' % (galaxy_record_id,
                                                       study_label.replace('_', '-'))
    else:
        base_name = '%s_individuals.tsv' % study_label
    fname = os.path.join(output_path, base_name)

    with open(fname, 'w') as ofile:
        logger.info('Dumping %d records to file %s', len(records), fname)
        tsv_writer = csv.DictWriter(ofile, header, delimiter='\t')
        tsv_writer.writeheader()
        tsv_writer.writerows(records)
|
|
68
|
|
69
|
|
def main(argv):
    """Entry point: parse *argv*, read the input TSV and write one file per study.

    :param argv: command-line arguments (excluding the program name)
    """
    parser = get_parser()
    args = parser.parse_args(argv)

    logger = get_logger('split_by_study', level=args.loglevel, filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)
    with open(args.in_file) as in_file:
        reader = csv.DictReader(in_file, delimiter='\t')
        records = [row for row in reader]

    records_map = map_by_study(records, logger)
    # Force the header of the output files in order to prevent problems
    # when running the workflow later.
    header = ['individual', 'gender', 'father', 'mother']
    # .items() replaces the Python-2-only .iteritems(), behaving identically
    # on Python 2 and keeping the script runnable on Python 3.  The loop
    # variable is also renamed so it no longer shadows `records`.
    for study, study_records in records_map.items():
        dump_records(study, study_records, header, args.out_path, logger, args.record_id)
    logger.info('Job completed')
|
|
87
|
|
88
|
|
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main(sys.argv[1:])