annotate galaxy-tools/biobank/utils/before_prepare_seq_dsample_inputs.py @ 3:43be74e62bfe draft

Uploaded
author ric
date Thu, 22 Sep 2016 08:57:04 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
43be74e62bfe Uploaded
ric
parents:
diff changeset
1 """
43be74e62bfe Uploaded
ric
parents:
diff changeset
2 This tool produces the files used as input by 'prepare_seq_dsample_inputs'.
43be74e62bfe Uploaded
ric
parents:
diff changeset
3
43be74e62bfe Uploaded
ric
parents:
diff changeset
4 It needs an Illumina samplesheet as input and produces as output:
43be74e62bfe Uploaded
ric
parents:
diff changeset
5 the same samplesheet with a new column 'SampleProject'
43be74e62bfe Uploaded
ric
parents:
diff changeset
6 a configuration file
43be74e62bfe Uploaded
ric
parents:
diff changeset
7 The configuration file is a YAML file with the following structure:
43be74e62bfe Uploaded
ric
parents:
diff changeset
8
43be74e62bfe Uploaded
ric
parents:
diff changeset
9 config_parameters:
43be74e62bfe Uploaded
ric
parents:
diff changeset
10 study_label: study_label
43be74e62bfe Uploaded
ric
parents:
diff changeset
11
43be74e62bfe Uploaded
ric
parents:
diff changeset
12 where study_label is mandatory
43be74e62bfe Uploaded
ric
parents:
diff changeset
13 """
43be74e62bfe Uploaded
ric
parents:
diff changeset
14
43be74e62bfe Uploaded
ric
parents:
diff changeset
15 import csv, sys, argparse, logging, yaml
43be74e62bfe Uploaded
ric
parents:
diff changeset
16
43be74e62bfe Uploaded
ric
parents:
diff changeset
17
43be74e62bfe Uploaded
ric
parents:
diff changeset
# Logging settings: LOG_LEVELS restricts the values accepted by --loglevel,
# while LOG_FORMAT / LOG_DATEFMT are passed to logging.basicConfig() in main().
LOG_LEVELS = [
    'DEBUG',
    'INFO',
    'WARNING',
    'ERROR',
    'CRITICAL',
]
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
43be74e62bfe Uploaded
ric
parents:
diff changeset
21
43be74e62bfe Uploaded
ric
parents:
diff changeset
def make_parser():
    """Build the command-line parser for this tool.

    Options:
      --logfile                  log destination (stderr when omitted)
      --loglevel                 one of LOG_LEVELS, defaults to 'INFO'
      --in-file / -i             input file (required)
      --samplesheet-output-file  defaults to './samplesheet.tsv'
      --config-output-file       defaults to './config_parameters.yaml'
      --study                    study label for the YAML file (required)

    Returns the configured argparse.ArgumentParser.
    """
    cli = argparse.ArgumentParser(
        description='prepare inputs for VLUTIL.prepare_seq_dsample_inputs')

    # (flags, keyword options) pairs, registered in the order they are listed
    # so the generated --help output keeps the same ordering.
    option_specs = [
        (('--logfile',),
         dict(type=str, help='log file (default=stderr)')),
        (('--loglevel',),
         dict(type=str, choices=LOG_LEVELS, help='logging level',
              default='INFO')),
        (('--in-file', '-i'),
         dict(type=str, required=True, help='input file')),
        (('--samplesheet-output-file',),
         dict(type=str, help='output file containing flowcell samplesheet',
              default='./samplesheet.tsv')),
        (('--config-output-file',),
         dict(type=str, help='output file containing config definitions',
              default='./config_parameters.yaml')),
        (('--study',),
         dict(type=str, required=True,
              help='study label parameter for YAML file')),
    ]
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)
    return cli
43be74e62bfe Uploaded
ric
parents:
diff changeset
38
43be74e62bfe Uploaded
ric
parents:
diff changeset
39
43be74e62bfe Uploaded
ric
parents:
diff changeset
def read_samplesheet(in_file):
    """Read a tab-separated samplesheet.

    The first line of the file is used as the header.

    :param in_file: path of the tab-separated file to read
    :return: a ``(recs, fieldnames)`` tuple, where ``recs`` is the list of
             rows (one mapping per row, keyed by column name) and
             ``fieldnames`` is the list of header column names
             (``None`` when the file is empty)
    """
    # The 'U' mode flag is only valid on Python 2; it was deprecated in
    # Python 3 and rejected with ValueError from 3.11 on.  On Python 3 the
    # csv module expects files opened with newline='' so that it can handle
    # line endings (including '\r\n') itself.
    if sys.version_info[0] >= 3:
        handle = open(in_file, 'r', newline='')
    else:
        handle = open(in_file, 'rU')
    with handle:
        reader = csv.DictReader(handle, delimiter='\t')
        recs = list(reader)
        fieldnames = reader.fieldnames
    return recs, fieldnames
43be74e62bfe Uploaded
ric
parents:
diff changeset
46
43be74e62bfe Uploaded
ric
parents:
diff changeset
47
43be74e62bfe Uploaded
ric
parents:
diff changeset
def write_samplesheet_extended(recs, fields, out_file, sample_project_label):
    """Write the samplesheet with an extra 'SampleProject' column.

    Every row is written with 'SampleProject' set to
    ``sample_project_label``.  The output is tab-separated and starts with
    a header line.

    :param recs: iterable of row mappings keyed by column name
    :param fields: list of the input column names, in output order
    :param out_file: path of the file to write
    :param sample_project_label: value stored in the 'SampleProject' column
                                 of every row
    :raises ValueError: if a row contains a key that is not a column name
                        (csv.DictWriter default behaviour)
    """
    # Work on a copy: appending to the caller's list would add the column
    # again on every call and leak the change outside this function.
    header = list(fields)
    if 'SampleProject' not in header:
        header.append('SampleProject')

    # csv wants a binary file on Python 2 but a text file opened with
    # newline='' on Python 3 ('wb' makes the writer fail with TypeError).
    if sys.version_info[0] >= 3:
        handle = open(out_file, 'w', newline='')
    else:
        handle = open(out_file, 'wb')
    with handle:
        writer = csv.DictWriter(handle, header, delimiter='\t')
        writer.writeheader()
        for row in recs:
            # Copy the row so the caller's records are left untouched.
            out_row = dict(row)
            out_row['SampleProject'] = sample_project_label
            writer.writerow(out_row)
43be74e62bfe Uploaded
ric
parents:
diff changeset
56
43be74e62bfe Uploaded
ric
parents:
diff changeset
57
43be74e62bfe Uploaded
ric
parents:
diff changeset
def write_yaml_config_file(out_file, study_label):
    """Write the YAML configuration file.

    The file has the following structure::

        config_parameters:
          study_label: <study_label>

    :param out_file: path of the YAML file to write
    :param study_label: value stored under config_parameters/study_label
    """
    config_data = {'config_parameters': {'study_label': study_label}}
    with open(out_file, 'w') as f:
        # Request block style explicitly: the default of default_flow_style
        # changed in PyYAML 5.1, and older releases write the inner mapping
        # inline as '{study_label: ...}'.  Both forms load to the same data,
        # but only block style matches the structure documented above.
        yaml.dump(config_data, f, default_flow_style=False)
43be74e62bfe Uploaded
ric
parents:
diff changeset
62
43be74e62bfe Uploaded
ric
parents:
diff changeset
63
43be74e62bfe Uploaded
ric
parents:
diff changeset
def main(argv):
    """Run the tool.

    Parses ``argv``, configures logging, reads the input samplesheet, then
    writes the extended samplesheet and the YAML configuration file.

    :param argv: list of command-line arguments, without the program name
    """
    args = make_parser().parse_args(argv)

    # --loglevel is restricted to level names, which are all attributes of
    # the logging module.
    log_config = {
        'format': LOG_FORMAT,
        'datefmt': LOG_DATEFMT,
        'level': getattr(logging, args.loglevel),
    }
    # Without 'filename', basicConfig() logs to stderr.
    if args.logfile:
        log_config['filename'] = args.logfile
    logging.basicConfig(**log_config)
    logger = logging.getLogger('before_prepare_seq_dsample_inputs')

    # Arguments are passed to the logger instead of being %-formatted up
    # front, so the message is only built when the record is emitted.
    logger.info('Loading data from file %s', args.in_file)
    recs, out_file_header = read_samplesheet(args.in_file)
    logger.debug('Retrieved %d records', len(recs))

    logger.info('Writing file %s', args.samplesheet_output_file)
    write_samplesheet_extended(recs, out_file_header,
                               args.samplesheet_output_file, args.study)

    logger.info('Writing file %s', args.config_output_file)
    write_yaml_config_file(args.config_output_file, args.study)


if __name__ == '__main__':
    main(sys.argv[1:])