3
|
1 """
|
|
2 This tool produces files that can be used as input to import
|
|
3 * samples
|
|
4 * flowcells
|
|
5 * lanes
|
|
6 * laneslots
|
|
7 within OMERO.biobank using import applications.
|
|
8 If the optional 'study-output-file' parameter is given as input, the
|
|
9 script will produce the input file for a new study definition.
|
|
10 If the optional 'tubes-subsamples-output-file' is given, the script
|
|
11 will generate another file with tubes definitions where each tube is
|
|
12 produced applying a specific laboratory protocol to an existing
|
|
13 tube. Existing tubes are the ones in tubes-out-file, new tubes' labels
|
|
14 are created using the pattern <tube_label>::<protocol>
|
|
15 The config_parameters field must point to a YAML configuration file
|
|
16 with the following structure:
|
|
17
|
|
18 config_parameters:
|
|
19 study_label: study_label
|
|
20 namespace: namespace
|
|
21
|
|
22 where study_label is mandatory
|
|
23 """
|
|
24
|
|
25 import csv, sys, argparse, logging, yaml
|
|
26 # Needed to import flowcell data
|
|
27 from bioblend.galaxy import GalaxyInstance
|
|
28 import nglimsclient, os
|
|
29
|
|
# Record layout given to logging.basicConfig() in main():
# timestamp|level name (left-aligned, padded to 8 characters)|message
LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
# strftime pattern given to logging.basicConfig() as 'datefmt'
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
# Values accepted by the --loglevel command line option; main() resolves
# the chosen name to the matching attribute of the logging module
LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
|
|
33
|
|
def make_parser():
    """Build the command line parser for this tool.

    The parser defines the logging options, the mandatory input file and
    YAML configuration file, four output files that have a default path
    and two optional output files without a default.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(description='split sequencing samplesheet')

    # Logging options
    cli.add_argument('--logfile', type=str, help='log file (default=stderr)')
    cli.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                     help='logging level', default='INFO')

    # Input samplesheet
    cli.add_argument('--in-file', '-i', type=str, required=True,
                     help='input file')

    # Output files with a default path: (flag, help text, default)
    defaulted_outputs = (
        ('--tubes-out-file',
         'output file containing tube definitions',
         './tubes_def.tsv'),
        ('--flowcells-out-file',
         'output file containing flowcell definitions',
         './flowcells_def.tsv'),
        ('--lanes-out-file',
         'output file containing lane definitions',
         './lanes_def.tsv'),
        ('--laneslots-out-file',
         'output file containing laneslot definitions',
         './laneslots_def.tsv'),
    )
    for flag, help_text, default_path in defaulted_outputs:
        cli.add_argument(flag, type=str, help=help_text, default=default_path)

    # Configuration file
    cli.add_argument('--config-parameters', type=str, required=True,
                     help='a YAML configuration file containing study label and labels namespace, '
                          'namespace is optional')

    # Optional output files (no default)
    cli.add_argument('--study-output-file', type=str,
                     help='output file containing study definition')
    cli.add_argument('--tubes-subsamples-output-file', type=str,
                     help='output file containing tubes subsamples (samples produced applying a '
                          'laboratory protocol to existing samples)')
    return cli
|
|
62
|
|
63
|
|
def get_samplesheet_translator(samplesheet_type='default'):
    """Return the mapping from logical field names to samplesheet columns.

    Args:
        samplesheet_type: name of the samplesheet layout; only 'default'
            is defined.

    Returns:
        A new dict whose keys are the logical names used by the writer
        functions and whose values are the column headers of the
        samplesheet.

    Raises:
        KeyError: if ``samplesheet_type`` is not a known layout.
    """
    default_columns = dict(
        flowcell_id='FCID',
        tube_id='SampleID',
        lane_id='Lane',
        sample_tag='Index',
        protocol='Recipe',
        operator='Operator',
        sample_project='SampleProject',
    )
    layouts = {'default': default_columns}
    return layouts[samplesheet_type]
|
|
74
|
|
def add_namespace(namespace, label, separator='|'):
    """Return ``label`` prefixed by ``namespace``.

    The two strings are joined with ``separator`` ('|' by default).
    """
    parts = (namespace, label)
    return separator.join(parts)
|
|
77
|
|
def write_tubes_file(records, study_label, translator, ofile,
                     namespace=None, logger=None):
    """Write the tab-separated tube definitions file.

    One row is written for every distinct tube id found in ``records``.
    Each tube is described as a DNA tube with status 'UNKNOWN' and no
    source (the 'source' column holds the literal string 'None').

    Args:
        records: samplesheet rows, as dicts keyed by column header.
        study_label: value written in the 'study' column of every row.
        translator: mapping of logical field names to column headers;
            only 'tube_id' is used here.
        ofile: path of the file to write (overwritten if it exists).
        namespace: optional prefix added to each label through
            add_namespace(); falsy values leave the labels untouched.
        logger: unused, accepted for symmetry with the other writers.
    """
    ofile_fields = ['study', 'label', 'vessel_type', 'vessel_content',
                    'vessel_status', 'source', 'source_type']
    with open(ofile, 'w') as out_file:
        writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t')
        writer.writeheader()
        # Strip the cell value (not the column name) so that labels that
        # differ only by surrounding whitespace collapse into one tube,
        # as the other writer functions do
        tubes_def = {r[translator['tube_id']].strip() for r in records}
        for tube_label in tubes_def:
            if namespace:
                label = add_namespace(namespace, tube_label)
            else:
                label = tube_label
            writer.writerow({'study': study_label,
                             'label': label,
                             'vessel_type': 'Tube',
                             'vessel_content': 'DNA',
                             'vessel_status': 'UNKNOWN',
                             'source': 'None',
                             'source_type': 'NO_SOURCE'})
|
|
94
|
|
95
|
|
def write_subsamples_file(records, study_label, translator, ofile,
                          namespace = None, logger = None):
    """Write the tab-separated file describing tubes derived by protocol.

    For every distinct (tube id, protocol) pair found in ``records`` a
    row is written for a new tube labelled ``<tube id>::<protocol>``
    whose source is the original tube; the protocol is stored in the
    'options' column as ``protocol=<protocol>``.

    Args:
        records: samplesheet rows, as dicts keyed by column header.
        study_label: value written in the 'study' column of every row.
        translator: mapping of logical field names to column headers;
            'tube_id' and 'protocol' are used here.
        ofile: path of the file to write (overwritten if it exists).
        namespace: optional prefix added to both label and source through
            add_namespace(); falsy values leave them untouched.
        logger: unused.
    """
    columns = ['study', 'label', 'vessel_type', 'vessel_content',
               'vessel_status', 'source', 'source_type', 'options']
    with open(ofile, 'w') as handle:
        tsv = csv.DictWriter(handle, columns, delimiter='\t')
        tsv.writeheader()
        # Collect the distinct (label, source tube, protocol) triples
        subsamples = set()
        for rec in records:
            tube = rec[translator['tube_id']].strip()
            protocol = rec[translator['protocol']].strip()
            subsamples.add(('%s::%s' % (tube, protocol), tube, protocol))
        for label, source, protocol in subsamples:
            if namespace:
                label = add_namespace(namespace, label)
                source = add_namespace(namespace, source)
            tsv.writerow({'study': study_label,
                          'label': label,
                          'vessel_type': 'Tube',
                          'vessel_content': 'DNA',
                          'vessel_status': 'UNKNOWN',
                          'source': source,
                          'source_type': 'Tube',
                          'options': 'protocol=%s' % protocol})
|
|
115
|
|
116
|
|
def write_flowcells_file(records, study_label, translator, ofile,
                         namespace = None, logger=None):
    """Write the tab-separated flowcell definitions file.

    One row is written for every distinct flowcell id found in
    ``records``. Label and barcode carry the same value, the status is
    'INSTOCK' and the number of slots is always '8'.

    Args:
        records: samplesheet rows, as dicts keyed by column header.
        study_label: value written in the 'study' column of every row.
        translator: mapping of logical field names to column headers;
            only 'flowcell_id' is used here.
        ofile: path of the file to write (overwritten if it exists).
        namespace: optional prefix added to label and barcode through
            add_namespace(); falsy values leave them untouched.
        logger: unused.
    """
    columns = ['study', 'label', 'barcode', 'container_status',
               'number_of_slots']
    with open(ofile, 'w') as handle:
        tsv = csv.DictWriter(handle, columns, delimiter='\t')
        tsv.writeheader()
        flowcell_ids = set()
        for rec in records:
            flowcell_ids.add(rec[translator['flowcell_id']].strip())
        for flowcell_id in flowcell_ids:
            if namespace:
                full_label = add_namespace(namespace, flowcell_id)
            else:
                full_label = flowcell_id
            tsv.writerow({'study': study_label,
                          'label': full_label,
                          'barcode': full_label,
                          'container_status': 'INSTOCK',
                          'number_of_slots': '8'})
|
|
131
|
|
132
|
|
def write_lanes_file(records, study_label, translator, ofile,
                     namespace = None, logger=None):
    """Write the tab-separated lane definitions file.

    One row is written for every distinct (flowcell id, lane id) pair
    found in ``records``; every lane gets the status 'INSTOCK'.

    Args:
        records: samplesheet rows, as dicts keyed by column header.
        study_label: value written in the 'study' column of every row.
        translator: mapping of logical field names to column headers;
            'flowcell_id' and 'lane_id' are used here.
        ofile: path of the file to write (overwritten if it exists).
        namespace: optional prefix added to the flowcell label through
            add_namespace(); the slot is never prefixed.
        logger: unused.
    """
    columns = ['study', 'flow_cell', 'slot', 'container_status']
    with open(ofile, 'w') as handle:
        tsv = csv.DictWriter(handle, columns, delimiter='\t')
        tsv.writeheader()
        lanes = set()
        for rec in records:
            flowcell = rec[translator['flowcell_id']].strip()
            slot = rec[translator['lane_id']].strip()
            lanes.add((flowcell, slot))
        for flowcell, slot in lanes:
            if namespace:
                flowcell = add_namespace(namespace, flowcell)
            tsv.writerow({'study': study_label,
                          'flow_cell': flowcell,
                          'slot': slot,
                          'container_status': 'INSTOCK'})
|
|
147
|
|
148
|
|
def write_laneslots_file(records, study_label, translator, ofile,
                         subsamples_enabled=False,
                         namespace=None, logger=None):
    """Write the tab-separated lane slot definitions file.

    The adapter of each sample is read from the nglims service of the
    Galaxy server configured through the NGLIMS_GALAXY_HOST and
    NGLIMS_GALAXY_API_KEY environment variables. The flowcell queried is
    the one of the first record (all records are assumed to belong to
    the same flowcell).

    Args:
        records: samplesheet rows, as dicts keyed by column header.
        study_label: value written in the 'study' column of every row.
        translator: mapping of logical field names to column headers.
        ofile: path of the file to write (overwritten if it exists).
        subsamples_enabled: when true the source of each lane slot is
            the ``<tube id>::<protocol>`` tube instead of the plain tube.
        namespace: optional prefix added to flowcell and source labels
            through add_namespace().
        logger: optional logger used for a debug message.

    Raises:
        SystemExit: if one of the environment variables is not set.
        RuntimeError: if the flowcell is not known to nglims or if no
            adapter is found for a sample.
    """
    # The logger is optional (defaults to None), so guard its use
    if logger is not None:
        logger.debug ('subsamples_ensabled: %r' % subsamples_enabled)
    ofile_fields = ['study', 'lane', 'tag', 'content', 'source',
                    'source_type', 'options']
    # Get NGLIMS host and key
    try:
        galaxy_host = os.environ['NGLIMS_GALAXY_HOST']
        api_key = os.environ['NGLIMS_GALAXY_API_KEY']
    except KeyError as ke:
        msg = 'No environment variables %s set to configure access to the Galaxy server' % ke
        sys.exit(msg)
    if not records:
        # Nothing to look up: write a header-only file, as the other
        # writer functions do for an empty samplesheet
        with open(ofile, 'w') as out_file:
            csv.DictWriter(out_file, ofile_fields, delimiter='\t').writeheader()
        return
    # Get flowcell label (assuming label is the same for all records)
    fc_id = records[0][translator['flowcell_id']].strip()
    # Get flowcell details from nglims
    gi = nglimsclient.setup(GalaxyInstance(galaxy_host, api_key))
    if not gi.nglims.exists_flowcell_id(fc_id):
        # Fail before touching the output file instead of hitting an
        # undefined fc_data later on
        raise RuntimeError('Flowcell %s not found in nglims' % fc_id)
    # NOTE(review): fc_data['details'] is used below as an iterable of
    # mappings with 'name', 'lane' and 'adapter' keys -- confirm against
    # the nglimsclient documentation
    fc_data = gi.nglims.flowcell_complete_details(fc_id)
    with open(ofile, 'w') as out_file:
        writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t')
        writer.writeheader()
        laneslots_def = set()
        for r in records:
            # Strip before adding the namespace so that the lane label
            # matches the flowcell label written by write_lanes_file
            flowcell_label = r[translator['flowcell_id']].strip()
            if namespace:
                flowcell_label = add_namespace(namespace, flowcell_label)
            tube_id = r[translator['tube_id']].strip()
            lane_id = r[translator['lane_id']].strip()
            protocol = r[translator['protocol']].strip()
            if subsamples_enabled:
                source_tube_id = '%s::%s' % (tube_id, protocol)
            else:
                source_tube_id = tube_id
            # Identify adapter
            adapters = [i['adapter'] for i in fc_data['details']
                        if i['name'] == tube_id and i['lane'] == int(lane_id)]
            if not adapters:
                raise RuntimeError('No adapter found in nglims for sample %s '
                                   'in lane %s of flowcell %s'
                                   % (tube_id, lane_id, fc_id))
            laneslots_def.add(('%s:%s' % (flowcell_label, lane_id),
                               r[translator['sample_tag']].strip(),
                               source_tube_id,
                               protocol,
                               r[translator['operator']].strip(),
                               r[translator['sample_project']].strip(),
                               adapters[0]))
        for x in laneslots_def:
            writer.writerow({'study': study_label,
                             'lane': x[0],
                             'tag': x[1],
                             'content': 'DNA',
                             'source': x[2] if not namespace else
                                       add_namespace(namespace, x[2]),
                             'source_type': 'Tube',
                             'options': 'protocol=%s,operator=%s,sample_project=%s,adapter=%s' %
                                        (x[3], x[4], x[5], x[6])})
|
|
199
|
|
200
|
|
def write_study_file(study_label, records, translator, ofile, logger=None):
    """Write the tab-separated study definition file.

    The file holds a header and a single row whose 'label' column is
    ``study_label``; the 'description' column is left empty.

    Args:
        study_label: label of the study to define.
        records: unused.
        translator: unused.
        ofile: path of the file to write (overwritten if it exists).
        logger: unused.
    """
    header = ['label', 'description']
    study_row = {'label': study_label}
    with open(ofile, 'w') as handle:
        tsv = csv.DictWriter(handle, header, delimiter='\t')
        tsv.writeheader()
        # 'description' is not in the row: DictWriter fills it with ''
        tsv.writerow(study_row)
|
|
207
|
|
208
|
|
def main(argv):
    """Run the tool: read the samplesheet and write the definition files.

    The tube, flowcell, lane and lane slot files are always written; the
    study file and the tubes subsamples file are written only when the
    matching command line option is given.

    Args:
        argv: command line arguments, without the program name.

    Raises:
        RuntimeError: if the YAML configuration file has no
            'config_parameters' mapping or no 'study_label' in it.
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    log_level = getattr(logging, args.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if args.logfile:
        kwargs['filename'] = args.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger('prepare_seq_dsample_inputs')

    # Python 2 needs the 'U' flag for universal newlines; Python 3 text
    # mode has them by default and rejects 'U' from version 3.11 on
    read_mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(args.in_file, read_mode) as f:
        logger.info('Loading data from file %s' % args.in_file)
        reader = csv.DictReader(f, delimiter='\t')
        recs = list(reader)
    translator = get_samplesheet_translator()

    with open(args.config_parameters) as cfgf:
        # safe_load only builds plain Python objects, which is all the
        # configuration needs; yaml.load without a Loader is unsafe
        conf = yaml.safe_load(cfgf)
    # An empty file yields None and a scalar yields a non-dict value
    params = conf.get('config_parameters') if isinstance(conf, dict) else None
    if not isinstance(params, dict):
        raise RuntimeError('Bad configuration file')
    try:
        study_label = params['study_label']
    except KeyError:
        raise RuntimeError('No study_label provided')
    namespace = params.get('namespace')

    if args.study_output_file:
        logger.info('Writing Study definition file %s' % args.study_output_file)
        write_study_file(study_label, recs, translator, args.study_output_file, logger)
        logger.info('Done writing file %s' % args.study_output_file)

    logger.info('Writing Tube definitions file %s' % args.tubes_out_file)
    write_tubes_file(recs, study_label, translator,
                     args.tubes_out_file, namespace,
                     logger)
    logger.info('Done writing file %s' % args.tubes_out_file)

    # The attribute always exists on the argparse namespace (None when the
    # option is not given), so test its value rather than its presence
    subsamples_enabled = bool(args.tubes_subsamples_output_file)
    if subsamples_enabled:
        logger.info('Writing Tubes\' subsamples definitions file %s' \
                    % args.tubes_subsamples_output_file)
        write_subsamples_file(recs, study_label, translator,
                              args.tubes_subsamples_output_file,
                              namespace, logger)
        logger.info('Done writing file %s' % args.tubes_subsamples_output_file)

    logger.info('Writing FlowCell definitions file %s' % args.flowcells_out_file)
    write_flowcells_file(recs, study_label, translator,
                         args.flowcells_out_file, namespace,
                         logger)
    logger.info('Done writing file %s' % args.flowcells_out_file)

    logger.info('Writing Lane definitions file %s' % args.lanes_out_file)
    write_lanes_file(recs, study_label, translator,
                     args.lanes_out_file, namespace,
                     logger)
    logger.info('Done writing file %s' % args.lanes_out_file)

    logger.info('Writing LaneSlot definitions file %s' % args.laneslots_out_file)
    write_laneslots_file(recs, study_label, translator,
                         args.laneslots_out_file,
                         subsamples_enabled,  # lane slots use subsample tubes only if they were written
                         namespace,
                         logger)
    logger.info('Done writing file %s' % args.laneslots_out_file)
|
|
280
|
|
281
|
|
# Script entry point: run main() with the command line arguments,
# leaving out the program name
if __name__ == '__main__':
    main(sys.argv[1:])
|