"""
This tool produces files that can be used as input to import

 * SequencerOutput data samples
 * SequencerOutput data objects

into OMERO.biobank using the import applications.

The input file must be a tab-separated file with a header line, for example:

run_directory path
130418_SN194_0303_BC1NYHACXX file:///SHARE/USERFS/els7/users/sequencing_data/completed/130418_SN194_0303_BC1NYHACXX/raw
160418_SN194_0304_BCAZYHACXX file:///SHARE/USERFS/els7/users/sequencing_data/completed/160418_SN194_0304_BCAZYHACXX/raw
....
"""

import argparse
import csv
import logging
import sys

LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']


def make_parser():
    parser = argparse.ArgumentParser(description='build sequencer output import files')
    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', '-i', type=str, required=True,
                        help='input file')
    parser.add_argument('--dsamples-out-file', type=str, default='./seq_out_dsamples.tsv',
                        help='output file containing data samples definitions')
    parser.add_argument('--dobjects-out-file', type=str, default='./seq_out_dobjects.tsv',
                        help='output file containing data objects definitions')
    parser.add_argument('--study', '-s', type=str, required=True,
                        help='study label')
    return parser


def write_dsamples_file(records, out_file, study_label):

    def parse_run_directory(run_dir):
        # Run directory names follow the Illumina convention
        # <date>_<device>_<run number>_<position+flowcell>, e.g.
        # 130418_SN194_0303_BC1NYHACXX -> device 'SN194', flowcell 'C1NYHACXX'
        # (the leading character of the last field is the flowcell position
        # and is dropped).
        _, device, _, flowcell = run_dir.split('_')
        return device, flowcell[1:]

    with open(out_file, 'w') as ofile:
        out_file_header = ['study', 'label', 'source', 'source_type', 'seq_dsample_type',
                           'status', 'device']
        writer = csv.DictWriter(ofile, out_file_header, delimiter='\t')
        writer.writeheader()
        for r in records:
            device, flowcell = parse_run_directory(r)
            writer.writerow({'study': study_label,
                             'label': r,
                             'source': flowcell,
                             'source_type': 'FlowCell',
                             'seq_dsample_type': 'SequencerOutput',
                             'status': 'USABLE',
                             'device': device})


def write_dobjects_file(records, out_file, study_label):
    with open(out_file, 'w') as ofile:
        out_file_header = ['study', 'path', 'data_sample', 'mimetype', 'size', 'sha1']
        writer = csv.DictWriter(ofile, out_file_header, delimiter='\t')
        writer.writeheader()
        for r in records:
            # size and sha1 are not computed here; only placeholder values
            # are written
            writer.writerow({'study': study_label,
                             'path': r['path'],
                             'data_sample': r['run_directory'],
                             'mimetype': 'x-vl/illumina-run-folder',
                             'size': '-1',
                             'sha1': 'N.A.'})


def main(argv):
    parser = make_parser()
    args = parser.parse_args(argv)

    log_level = getattr(logging, args.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if args.logfile:
        kwargs['filename'] = args.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger('prepare_seq_dsample_inputs')

    with open(args.in_file) as f:
        logger.info('Loading data from file %s', args.in_file)
        reader = csv.DictReader(f, delimiter='\t')
        recs = [r for r in reader]

    # one data sample per distinct run directory, one data object per input row
    logger.info('Writing DataSample data to file %s', args.dsamples_out_file)
    write_dsamples_file(set([r['run_directory'] for r in recs]),
                        args.dsamples_out_file, args.study)
    logger.info('Writing DataObjects data to file %s', args.dobjects_out_file)
    write_dobjects_file(recs, args.dobjects_out_file, args.study)


if __name__ == '__main__':
    main(sys.argv[1:])