comparison galaxy-tools/biobank/utils/prepare_seq_dsample_inputs.py @ 3:43be74e62bfe draft

Uploaded
author ric
date Thu, 22 Sep 2016 08:57:04 -0400
parents
children
comparison
equal deleted inserted replaced
2:47bf0086e082 3:43be74e62bfe
1 """
2 This tool produces files that can be used as input to import
3 * samples
4 * flowcells
5 * lanes
6 * laneslots
7 within OMERO.biobank using import applications.
8 If the optional 'study-output-file' parameter is given as input, the
9 script will produce the input file for a new study definition.
10 If the optional 'tubes-subsamples-output-file' is given, the script
11 will generate another file with tubes definitions where each tube is
12 produced applying a specific laboratory protocol to an existing
13 tube. Existing tubes are the ones in tubes-out-file, new tubes' labels
14 are created using the pattern <tube_label>::<protocol>
15 The config_parameters field must point to a YAML configuration file
16 with the following structure:
17
18 config_parameters:
19   study_label: study_label
20   namespace: namespace
21
22 where study_label is mandatory
23 """
24
25 import csv, sys, argparse, logging, yaml
26 # Needed to import flowcell data
27 from bioblend.galaxy import GalaxyInstance
28 import nglimsclient, os
29
# Layout of each log record: timestamp|LEVEL (left-aligned, 8 wide)|message
LOG_FORMAT = '|'.join(['%(asctime)s', '%(levelname)-8s', '%(message)s'])
# Layout of the timestamp part of each log record
LOG_DATEFMT = ' '.join(['%Y-%m-%d', '%H:%M:%S'])
# Level names accepted by the --loglevel command line option
LOG_LEVELS = 'DEBUG INFO WARNING ERROR CRITICAL'.split()
33
def make_parser():
    """Build the command line parser of the tool.

    Mandatory options are ``--in-file`` and ``--config-parameters``; the
    four ``*-out-file`` options default to TSV files in the current
    directory, while the study and tubes-subsamples outputs have no
    default.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(description='split sequencing samplesheet')

    # Logging setup
    cli.add_argument('--logfile', type=str, help='log file (default=stderr)')
    cli.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                     help='logging level', default='INFO')

    # Input samplesheet
    cli.add_argument('--in-file', '-i', type=str, required=True,
                     help='input file')

    # Output files that are always produced: (flag, object name, default path)
    always_written = (
        ('--tubes-out-file', 'tube', './tubes_def.tsv'),
        ('--flowcells-out-file', 'flowcell', './flowcells_def.tsv'),
        ('--lanes-out-file', 'lane', './lanes_def.tsv'),
        ('--laneslots-out-file', 'laneslot', './laneslots_def.tsv'),
    )
    for flag, object_name, default_path in always_written:
        cli.add_argument(flag, type=str,
                         help='output file containing %s definitions' % object_name,
                         default=default_path)

    # YAML file carrying the study label and the optional namespace
    config_help = ('a YAML configuration file containing study label and labels namespace, '
                   'namespace is optional')
    cli.add_argument('--config-parameters', type=str, required=True,
                     help=config_help)

    # Optional outputs, written only when a path is given
    cli.add_argument('--study-output-file', type=str,
                     help='output file containing study definition')
    subsamples_help = ('output file containing tubes subsamples (samples produced applying a '
                       'laboratory protocol to existing samples)')
    cli.add_argument('--tubes-subsamples-output-file', type=str,
                     help=subsamples_help)
    return cli
62
63
def get_samplesheet_translator(samplesheet_type='default'):
    """Return the field-name to samplesheet-column mapping for a layout.

    Args:
        samplesheet_type: name of the samplesheet layout; only
            ``'default'`` is defined.

    Returns:
        A dict mapping the internal field names used by the writers
        (``flowcell_id``, ``tube_id``, ...) to the column headers of the
        samplesheet.

    Raises:
        KeyError: if ``samplesheet_type`` is not a known layout.
    """
    default_columns = dict(
        flowcell_id='FCID',
        tube_id='SampleID',
        lane_id='Lane',
        sample_tag='Index',
        protocol='Recipe',
        operator='Operator',
        sample_project='SampleProject',
    )
    known_layouts = {'default': default_columns}
    return known_layouts[samplesheet_type]
74
def add_namespace(namespace, label, separator='|'):
    """Return ``label`` prefixed by ``namespace``, joined with ``separator``."""
    pieces = (namespace, label)
    return separator.join(pieces)
77
def write_tubes_file(records, study_label, translator, ofile,
                     namespace=None, logger=None):
    """Write the tube definitions file, one row per distinct tube.

    Args:
        records: samplesheet rows, each a dict keyed by column header.
        study_label: value written in the ``study`` column of every row.
        translator: mapping of internal field names to samplesheet column
            headers; only ``tube_id`` is used here.
        ofile: path of the tab-separated file to write.
        namespace: optional prefix applied to tube labels through
            ``add_namespace``.
        logger: accepted for signature symmetry with the other writers;
            not used.

    Raises:
        KeyError: if a record lacks the tube column.
    """
    ofile_fields = ['study', 'label', 'vessel_type', 'vessel_content',
                    'vessel_status', 'source', 'source_type']
    tube_column = translator['tube_id']
    # Strip the record VALUE (the original stripped the column name), so
    # that labels match the ones written by the other writers and padded
    # duplicates collapse into a single tube.
    tube_labels = set(r[tube_column].strip() for r in records)
    with open(ofile, 'w') as out_file:
        writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t')
        writer.writeheader()
        for tube in tube_labels:
            writer.writerow({
                'study': study_label,
                'label': tube if not namespace else add_namespace(namespace, tube),
                'vessel_type': 'Tube',
                'vessel_content': 'DNA',
                'vessel_status': 'UNKNOWN',
                'source': 'None',
                'source_type': 'NO_SOURCE',
            })
94
95
def write_subsamples_file(records, study_label, translator, ofile,
                          namespace = None, logger = None):
    """Write the tube subsamples file, one row per (tube, protocol) pair.

    Each subsample is labelled ``<tube>::<protocol>`` and has the tube it
    derives from as its source; the protocol is also stored in the
    ``options`` column as ``protocol=<protocol>``.

    Args:
        records: samplesheet rows, each a dict keyed by column header.
        study_label: value written in the ``study`` column of every row.
        translator: mapping of internal field names to samplesheet column
            headers; ``tube_id`` and ``protocol`` are used.
        ofile: path of the tab-separated file to write.
        namespace: optional prefix applied to both the subsample label and
            its source through ``add_namespace``.
        logger: not used.
    """
    columns = ['study', 'label', 'vessel_type', 'vessel_content',
               'vessel_status', 'source', 'source_type', 'options']

    def qualified(name):
        # Labels are left untouched when no namespace is configured
        if not namespace:
            return name
        return add_namespace(namespace, name)

    with open(ofile, 'w') as out_file:
        writer = csv.DictWriter(out_file, columns, delimiter='\t')
        writer.writeheader()
        # A set drops the pairs repeated across samplesheet rows
        pairs = set()
        for rec in records:
            tube = rec[translator['tube_id']].strip()
            protocol = rec[translator['protocol']].strip()
            pairs.add(('%s::%s' % (tube, protocol), tube, protocol))
        for subsample_label, tube, protocol in pairs:
            row = dict(study=study_label,
                       label=qualified(subsample_label),
                       vessel_type='Tube',
                       vessel_content='DNA',
                       vessel_status='UNKNOWN',
                       source=qualified(tube),
                       source_type='Tube',
                       options='protocol=%s' % protocol)
            writer.writerow(row)
115
116
def write_flowcells_file(records, study_label, translator, ofile,
                         namespace = None, logger=None):
    """Write the flowcell definitions file, one row per distinct flowcell.

    The flowcell identifier (namespaced when a namespace is given) is used
    both as label and as barcode; every flowcell is declared with 8 slots
    and status ``INSTOCK``.

    Args:
        records: samplesheet rows, each a dict keyed by column header.
        study_label: value written in the ``study`` column of every row.
        translator: mapping of internal field names to samplesheet column
            headers; only ``flowcell_id`` is used.
        ofile: path of the tab-separated file to write.
        namespace: optional prefix applied through ``add_namespace``.
        logger: not used.
    """
    columns = ['study', 'label', 'barcode', 'container_status',
               'number_of_slots']
    with open(ofile, 'w') as out_file:
        writer = csv.DictWriter(out_file, columns, delimiter='\t')
        writer.writeheader()
        # Collapse the samplesheet rows to the distinct flowcell ids
        flowcell_ids = set()
        for rec in records:
            flowcell_ids.add(rec[translator['flowcell_id']].strip())
        for flowcell_id in flowcell_ids:
            if namespace:
                full_label = add_namespace(namespace, flowcell_id)
            else:
                full_label = flowcell_id
            writer.writerow(dict(study=study_label,
                                 label=full_label,
                                 barcode=full_label,
                                 container_status='INSTOCK',
                                 number_of_slots='8'))
131
132
def write_lanes_file(records, study_label, translator, ofile,
                     namespace = None, logger=None):
    """Write the lane definitions file, one row per (flowcell, lane) pair.

    Args:
        records: samplesheet rows, each a dict keyed by column header.
        study_label: value written in the ``study`` column of every row.
        translator: mapping of internal field names to samplesheet column
            headers; ``flowcell_id`` and ``lane_id`` are used.
        ofile: path of the tab-separated file to write.
        namespace: optional prefix applied to the flowcell label through
            ``add_namespace``; the slot is never namespaced.
        logger: not used.
    """
    columns = ['study', 'flow_cell', 'slot', 'container_status']
    with open(ofile, 'w') as out_file:
        writer = csv.DictWriter(out_file, columns, delimiter='\t')
        writer.writeheader()
        # Collapse the samplesheet rows to the distinct lanes
        lanes = set()
        for rec in records:
            flowcell_id = rec[translator['flowcell_id']].strip()
            lane_id = rec[translator['lane_id']].strip()
            lanes.add((flowcell_id, lane_id))
        for flowcell_id, lane_id in lanes:
            if namespace:
                flow_cell = add_namespace(namespace, flowcell_id)
            else:
                flow_cell = flowcell_id
            writer.writerow(dict(study=study_label,
                                 flow_cell=flow_cell,
                                 slot=lane_id,
                                 container_status='INSTOCK'))
147
148
def write_laneslots_file(records, study_label, translator, ofile,
                         subsamples_enabled=False,
                         namespace=None, logger=None):
    """Write the laneslot definitions file, one row per distinct laneslot.

    The adapter of each laneslot is not in the samplesheet: it is fetched
    from the nglims service of the Galaxy server configured through the
    ``NGLIMS_GALAXY_HOST`` and ``NGLIMS_GALAXY_API_KEY`` environment
    variables. Flowcell details are requested once, for the flowcell of
    the first record (all records are assumed to share one flowcell).

    Args:
        records: samplesheet rows, each a dict keyed by column header.
        study_label: value written in the ``study`` column of every row.
        translator: mapping of internal field names to samplesheet column
            headers.
        ofile: path of the tab-separated file to write.
        subsamples_enabled: when true the laneslot source is the subsample
            ``<tube>::<protocol>`` instead of the tube itself.
        namespace: optional prefix applied to flowcell and source labels
            through ``add_namespace``.
        logger: optional logger; a module logger is used when omitted.

    Raises:
        SystemExit: if one of the two environment variables is not set.
        RuntimeError: if nglims does not know the flowcell, or has no
            adapter for one of the (tube, lane) pairs of the samplesheet.
    """
    if logger is None:
        # The parameter defaults to None: never call methods on it blindly
        logger = logging.getLogger(__name__)
    logger.debug('subsamples_enabled: %r', subsamples_enabled)
    ofile_fields = ['study', 'lane', 'tag', 'content', 'source',
                    'source_type', 'options']
    # Get NGLIMS host and key
    try:
        galaxy_host = os.environ['NGLIMS_GALAXY_HOST']
        api_key = os.environ['NGLIMS_GALAXY_API_KEY']
    except KeyError as ke:
        msg = 'No environment variables %s set to configure access to the Galaxy server' % ke
        sys.exit(msg)

    def qualified(label):
        # Labels are left untouched when no namespace is configured
        return label if not namespace else add_namespace(namespace, label)

    laneslots_def = set()
    # With no records there is no flowcell to look up: the output below is
    # then header-only, like the one of the other writers.
    if records:
        # Get flowcell label (assuming label is the same for all records)
        lookup_fc_id = records[0][translator['flowcell_id']].strip()
        # Get flowcell details from nglims
        gi = nglimsclient.setup(GalaxyInstance(galaxy_host, api_key))
        if not gi.nglims.exists_flowcell_id(lookup_fc_id):
            raise RuntimeError('Flowcell %s not found in nglims' % lookup_fc_id)
        fc_data = gi.nglims.flowcell_complete_details(lookup_fc_id)

        # Rows are collected before the output file is opened, so that a
        # failure does not leave a truncated file behind.
        for r in records:
            # Strip BEFORE namespacing, so the lane label matches the
            # labels written in the flowcells and lanes files.
            fc_label = qualified(r[translator['flowcell_id']].strip())
            tube_id = r[translator['tube_id']].strip()
            lane_id = r[translator['lane_id']].strip()
            protocol = r[translator['protocol']].strip()
            if subsamples_enabled:
                source_tube_id = '%s::%s' % (tube_id, protocol)
            else:
                source_tube_id = tube_id
            # Identify adapter: the nglims detail entry with the same
            # sample name and the same (integer) lane number
            lane_number = int(lane_id)
            adapters = [i['adapter'] for i in fc_data['details']
                        if i['name'] == tube_id and i['lane'] == lane_number]
            if not adapters:
                raise RuntimeError('No adapter found in nglims for sample %s on lane %s '
                                   'of flowcell %s' % (tube_id, lane_id, lookup_fc_id))
            laneslots_def.add(('%s:%s' % (fc_label, lane_id),
                               r[translator['sample_tag']].strip(),
                               source_tube_id,
                               protocol,
                               r[translator['operator']].strip(),
                               r[translator['sample_project']].strip(),
                               adapters[0]))

    with open(ofile, 'w') as out_file:
        writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t')
        writer.writeheader()
        for lane, tag, source, protocol, operator, project, adapter in laneslots_def:
            writer.writerow({
                'study': study_label,
                'lane': lane,
                'tag': tag,
                'content': 'DNA',
                'source': qualified(source),
                'source_type': 'Tube',
                'options': 'protocol=%s,operator=%s,sample_project=%s,adapter=%s' %
                           (protocol, operator, project, adapter),
            })
199
200
def write_study_file(study_label, records, translator, ofile, logger=None):
    """Write the study definition file: a header and a single study row.

    Only the ``label`` column is filled; ``description`` is left empty.

    Args:
        study_label: label of the study to define.
        records: not used.
        translator: not used.
        ofile: path of the tab-separated file to write.
        logger: not used.
    """
    columns = ['label', 'description']
    study_row = dict(label=study_label)
    with open(ofile, 'w') as out_file:
        writer = csv.DictWriter(out_file, columns, delimiter='\t')
        writer.writeheader()
        writer.writerow(study_row)
207
208
def main(argv):
    """Run the tool: read the samplesheet and write the definition files.

    Tubes, flowcells, lanes and laneslots files are always written; the
    study and tubes-subsamples files only when their output option is
    given on the command line.

    Args:
        argv: command line options, without the program name.

    Raises:
        RuntimeError: if the YAML configuration has no
            ``config_parameters`` section or no ``study_label`` in it.
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    log_level = getattr(logging, args.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if args.logfile:
        kwargs['filename'] = args.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger('prepare_seq_dsample_inputs')

    # 'U' asks Python 2 for universal newlines; Python 3 text mode has them
    # by default and rejects the flag from 3.11 on.
    read_mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(args.in_file, read_mode) as f:
        logger.info('Loading data from file %s', args.in_file)
        reader = csv.DictReader(f, delimiter='\t')
        recs = list(reader)
    translator = get_samplesheet_translator()

    with open(args.config_parameters) as cfgf:
        # safe_load: the file only holds plain mappings, and plain load()
        # can build arbitrary objects from a crafted file.
        conf = yaml.safe_load(cfgf)
    # An empty YAML file loads as None, hence the type checks
    if not isinstance(conf, dict) or 'config_parameters' not in conf:
        raise RuntimeError('Bad configuration file')
    params = conf['config_parameters']
    if not isinstance(params, dict) or 'study_label' not in params:
        raise RuntimeError('No study_label provided')
    study_label = params['study_label']
    namespace = params.get('namespace')

    if args.study_output_file:
        logger.info('Writing Study definition file %s', args.study_output_file)
        write_study_file(study_label, recs, translator, args.study_output_file, logger)
        logger.info('Done writing file %s', args.study_output_file)

    logger.info('Writing Tube definitions file %s', args.tubes_out_file)
    write_tubes_file(recs, study_label, translator,
                     args.tubes_out_file, namespace,
                     logger)
    logger.info('Done writing file %s', args.tubes_out_file)

    # argparse always sets the attribute (to None when the option is
    # omitted), so test its value rather than its presence in the namespace.
    subsamples_enabled = bool(args.tubes_subsamples_output_file)
    if subsamples_enabled:
        logger.info('Writing Tubes\' subsamples definitions file %s',
                    args.tubes_subsamples_output_file)
        write_subsamples_file(recs, study_label, translator,
                              args.tubes_subsamples_output_file,
                              namespace, logger)
        logger.info('Done writing file %s', args.tubes_subsamples_output_file)

    logger.info('Writing FlowCell definitions file %s', args.flowcells_out_file)
    write_flowcells_file(recs, study_label, translator,
                         args.flowcells_out_file, namespace,
                         logger)
    logger.info('Done writing file %s', args.flowcells_out_file)

    logger.info('Writing Lane definitions file %s', args.lanes_out_file)
    write_lanes_file(recs, study_label, translator,
                     args.lanes_out_file, namespace,
                     logger)
    logger.info('Done writing file %s', args.lanes_out_file)

    logger.info('Writing LaneSlot definitions file %s', args.laneslots_out_file)
    # Laneslots point at subsamples only if the subsamples file was written
    write_laneslots_file(recs, study_label, translator,
                         args.laneslots_out_file,
                         subsamples_enabled,
                         namespace,
                         logger)
    logger.info('Done writing file %s', args.laneslots_out_file)
280
281
if __name__ == '__main__':
    # Drop argv[0] (the script path): main() takes the options only
    cli_options = sys.argv[1:]
    main(cli_options)