annotate galaxy-tools/biobank/utils/prepare_seq_out_inputs.py @ 3:43be74e62bfe draft

Uploaded
author ric
date Thu, 22 Sep 2016 08:57:04 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
43be74e62bfe Uploaded
ric
parents:
diff changeset
1 """
43be74e62bfe Uploaded
ric
parents:
diff changeset
This tool produces files that can be used as input for importing

 * SequencerOutput data samples
 * SequencerOutput data objects

into OMERO.biobank using the import applications.

The input file must be a tab-separated file with a header line, like

run_directory                 path
130418_SN194_0303_BC1NYHACXX  file:///SHARE/USERFS/els7/users/sequencing_data/completed/130418_SN194_0303_BC1NYHACXX/raw
160418_SN194_0304_BCAZYHACXX  file:///SHARE/USERFS/els7/users/sequencing_data/completed/160418_SN194_0304_BCAZYHACXX/raw
....
43be74e62bfe Uploaded
ric
parents:
diff changeset
13 """
43be74e62bfe Uploaded
ric
parents:
diff changeset
14
43be74e62bfe Uploaded
ric
parents:
diff changeset
15 import csv, sys, argparse, logging
43be74e62bfe Uploaded
ric
parents:
diff changeset
16
43be74e62bfe Uploaded
ric
parents:
diff changeset
# Layout of every log line: timestamp|level name (left-aligned, 8 wide)|message
LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
# Timestamp format used for %(asctime)s
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
# Level names accepted on the command line; each is looked up as an attribute
# of the logging module
LOG_LEVELS = [
    'DEBUG',
    'INFO',
    'WARNING',
    'ERROR',
    'CRITICAL',
]
43be74e62bfe Uploaded
ric
parents:
diff changeset
20
43be74e62bfe Uploaded
ric
parents:
diff changeset
21
43be74e62bfe Uploaded
ric
parents:
diff changeset
def make_parser():
    """Build the command-line parser for this tool.

    The parser accepts two logging options (``--logfile``, ``--loglevel``),
    the required input file (``--in-file`` / ``-i``) and study label
    (``--study`` / ``-s``), and the paths of the two output files
    (``--dsamples-out-file``, ``--dobjects-out-file``), which default to
    files in the current directory.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(
        description='build sequencer output import files')

    # Logging options
    cli.add_argument('--logfile', type=str,
                     help='log file (default=stderr)')
    cli.add_argument('--loglevel', type=str, default='INFO',
                     choices=LOG_LEVELS, help='logging level')

    # Input
    cli.add_argument('--in-file', '-i', type=str, required=True,
                     help='input file')

    # Outputs
    cli.add_argument('--dsamples-out-file', type=str,
                     default='./seq_out_dsamples.tsv',
                     help='output file containing data samples definitions')
    cli.add_argument('--dobjects-out-file', type=str,
                     default='./seq_out_dobjects.tsv',
                     help='output file containing data objects definitions')

    # Study label written in the "study" column of both output files
    cli.add_argument('--study', '-s', type=str, required=True)

    return cli
43be74e62bfe Uploaded
ric
parents:
diff changeset
35
43be74e62bfe Uploaded
ric
parents:
diff changeset
36
43be74e62bfe Uploaded
ric
parents:
diff changeset
def write_dsamples_file(records, out_file, study_label):
    """Write the data samples definitions to a tab-separated file.

    One row is written per run-directory label, in the order in which the
    labels are supplied. All labels are parsed before ``out_file`` is opened,
    so a malformed label never leaves a truncated or partial output behind.

    Args:
        records: iterable of run-directory labels, each made of exactly four
            ``_``-separated fields (e.g. ``130418_SN194_0303_BC1NYHACXX``).
        out_file: path of the TSV file to create (overwritten if present).
        study_label: value written in the ``study`` column of every row.

    Raises:
        ValueError: if a label does not have exactly four ``_``-separated
            fields.
    """

    def parse_run_directory(run_dir):
        """Return ``(device, flowcell)`` extracted from a run-directory label.

        The device is the second field; the flowcell is the fourth field
        without its first character.
        """
        fields = run_dir.split('_')
        if len(fields) != 4:
            raise ValueError(
                'invalid run directory %r: expected 4 "_"-separated fields, '
                'found %d' % (run_dir, len(fields)))
        _, device, _, flowcell = fields
        # NOTE(review): the dropped leading character is presumably a
        # position/side prefix of the flowcell ID -- confirm.
        return device, flowcell[1:]

    out_file_header = ['study', 'label', 'source', 'source_type',
                       'seq_dsample_type', 'status', 'device']

    # Build every row up front: any parsing error surfaces before the output
    # file is opened (and therefore before an existing file is truncated).
    rows = []
    for label in records:
        device, flowcell = parse_run_directory(label)
        rows.append({'study': study_label,
                     'label': label,
                     'source': flowcell,
                     'source_type': 'FlowCell',
                     'seq_dsample_type': 'SequencerOutput',
                     'status': 'USABLE',
                     'device': device})

    with open(out_file, 'w') as ofile:
        writer = csv.DictWriter(ofile, out_file_header, delimiter='\t')
        writer.writeheader()
        writer.writerows(rows)
43be74e62bfe Uploaded
ric
parents:
diff changeset
57
43be74e62bfe Uploaded
ric
parents:
diff changeset
58
43be74e62bfe Uploaded
ric
parents:
diff changeset
def write_dobjects_file(records, out_file, study_label):
    """Write the data objects definitions to a tab-separated file.

    One row is written per input record. The ``mimetype``, ``size`` and
    ``sha1`` columns carry the same fixed values on every row.

    Args:
        records: iterable of mappings providing the ``path`` and
            ``run_directory`` keys.
        out_file: path of the TSV file to create (overwritten if present).
        study_label: value written in the ``study`` column of every row.
    """
    columns = ['study', 'path', 'data_sample', 'mimetype', 'size', 'sha1']
    with open(out_file, 'w') as tsv:
        dobjects = csv.DictWriter(tsv, columns, delimiter='\t')
        dobjects.writeheader()
        for rec in records:
            # Start from the values shared by all rows, then add the
            # per-record columns.
            row = {
                'study': study_label,
                'mimetype': 'x-vl/illumina-run-folder',
                'size': '-1',
                'sha1': 'N.A.',
            }
            row['path'] = rec['path']
            row['data_sample'] = rec['run_directory']
            dobjects.writerow(row)
43be74e62bfe Uploaded
ric
parents:
diff changeset
71
43be74e62bfe Uploaded
ric
parents:
diff changeset
72
43be74e62bfe Uploaded
ric
parents:
diff changeset
def main(argv):
    """Run the tool: read the input TSV and write the two import files.

    Args:
        argv: command-line arguments, without the program name.

    The input file is read as tab-separated values with a header line; the
    ``run_directory`` and ``path`` columns are used. The data samples file
    gets one row per distinct ``run_directory`` value, the data objects file
    one row per input row.
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    # Configure logging; messages go to stderr unless --logfile is given.
    log_level = getattr(logging, args.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if args.logfile:
        kwargs['filename'] = args.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger('prepare_seq_dsample_inputs')

    with open(args.in_file) as f:
        logger.info('Loading data from file %s', args.in_file)
        reader = csv.DictReader(f, delimiter='\t')
        recs = list(reader)

    logger.info('Writing DataSample data to file %s', args.dsamples_out_file)
    # The same run directory may appear on several input rows: keep one entry
    # per distinct label, sorted so that the output does not depend on set
    # iteration order and is identical from one run to the next.
    run_dirs = sorted(set(r['run_directory'] for r in recs))
    write_dsamples_file(run_dirs, args.dsamples_out_file, args.study)
    logger.info('Writing DataObjects data to file %s', args.dobjects_out_file)
    write_dobjects_file(recs, args.dobjects_out_file, args.study)
43be74e62bfe Uploaded
ric
parents:
diff changeset
96
43be74e62bfe Uploaded
ric
parents:
diff changeset
97
43be74e62bfe Uploaded
ric
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: pass the command-line arguments (without the
    # program name) to main().
    cli_args = sys.argv[1:]
    main(cli_args)