Mercurial > repos > ric > test1
comparison galaxy-tools/biobank/utils/prepare_seq_dsample_inputs.py @ 3:43be74e62bfe draft
Uploaded
author | ric |
---|---|
date | Thu, 22 Sep 2016 08:57:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:47bf0086e082 | 3:43be74e62bfe |
---|---|
1 """ | |
2 This tool produces files that can be used as input to import | |
3 * samples | |
4 * flowcells | |
5 * lanes | |
6 * laneslots | |
7 within OMERO.biobank using import applications. | |
8 If the optional 'study-output-file' parameter is given as input, the | |
9 script will produce the input file for a new study definition. | |
10 If the optional 'tubes-subsamples-output-file' is given, the script | |
11 will generate another file with tubes definitions where each tube is | |
12 produced applying a specific laboratory protocol to an existing | |
13 tube. Existing tubes are the ones in tubes-out-file, new tubes' labels | |
14 are created using the pattern <tube_label>::<protocol> | |
15 The config_parameters field must point to a YAML configuration file | |
16 with the following structure: | |
17 | |
18 config_parameters: | |
19 study_label: study_label | |
20 namespace: namespace | |
21 | |
22 where study_label is mandatory | |
23 """ | |
24 | |
25 import csv, sys, argparse, logging, yaml | |
26 # Needed to import flowcell data | |
27 from bioblend.galaxy import GalaxyInstance | |
28 import nglimsclient, os | |
29 | |
# Layout of every log line: timestamp|level name left-aligned in 8 columns|message.
LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
# strftime pattern used to render the %(asctime)s field of LOG_FORMAT.
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
# Accepted values for the --loglevel command line option.
LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
33 | |
def make_parser():
    """
    Build the command line parser of the tool.

    The options are declared in a table (flags, keyword arguments) and
    registered in that order, so the generated help keeps the same layout.
    Returns the configured argparse.ArgumentParser.
    """
    option_specs = [
        (('--logfile',),
         {'type': str, 'help': 'log file (default=stderr)'}),
        (('--loglevel',),
         {'type': str, 'choices': LOG_LEVELS,
          'help': 'logging level', 'default': 'INFO'}),
        (('--in-file', '-i'),
         {'type': str, 'required': True, 'help': 'input file'}),
        (('--tubes-out-file',),
         {'type': str,
          'help': 'output file containing tube definitions',
          'default': './tubes_def.tsv'}),
        (('--flowcells-out-file',),
         {'type': str,
          'help': 'output file containing flowcell definitions',
          'default': './flowcells_def.tsv'}),
        (('--lanes-out-file',),
         {'type': str,
          'help': 'output file containing lane definitions',
          'default': './lanes_def.tsv'}),
        (('--laneslots-out-file',),
         {'type': str,
          'help': 'output file containing laneslot definitions',
          'default': './laneslots_def.tsv'}),
        (('--config-parameters',),
         {'type': str, 'required': True,
          'help': 'a YAML configuration file containing study label and labels namespace, '
                  'namespace is optional'}),
        (('--study-output-file',),
         {'type': str,
          'help': 'output file containing study definition'}),
        (('--tubes-subsamples-output-file',),
         {'type': str,
          'help': 'output file containing tubes subsamples (samples produced applying a '
                  'laboratory protocol to existing samples)'}),
    ]
    cli = argparse.ArgumentParser(description='split sequencing samplesheet')
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)
    return cli
62 | |
63 | |
def get_samplesheet_translator(samplesheet_type='default'):
    """
    Return the mapping from internal field names to samplesheet column
    headers for the given samplesheet type.

    Only the 'default' layout is defined; any other type raises KeyError.
    """
    default_columns = dict(
        flowcell_id='FCID',
        tube_id='SampleID',
        lane_id='Lane',
        sample_tag='Index',
        protocol='Recipe',
        operator='Operator',
        sample_project='SampleProject',
    )
    layouts = {'default': default_columns}
    return layouts[samplesheet_type]
74 | |
def add_namespace(namespace, label, separator='|'):
    """Return label prefixed by namespace, the two joined with separator."""
    parts = (namespace, label)
    return separator.join(parts)
77 | |
def write_tubes_file(records, study_label, translator, ofile,
                     namespace=None, logger=None):
    """
    Write the tube definitions TSV file.

    One row is written for each distinct tube id found in records. The
    tube id is read from the column named by translator['tube_id'] and
    stripped of surrounding whitespace, like every other writer in this
    module does, so the labels written here match the 'source' values of
    the subsamples and laneslots files. Rows are sorted by tube id to
    make the output reproducible.

    records     -- list of dicts, one per samplesheet row
    study_label -- value of the 'study' column
    translator  -- mapping of internal field names to column headers
    ofile       -- path of the file to write
    namespace   -- optional prefix applied to labels via add_namespace
    logger      -- unused, kept for interface uniformity
    """
    ofile_fields = ['study', 'label', 'vessel_type', 'vessel_content',
                    'vessel_status', 'source', 'source_type']
    with open(ofile, 'w') as out_file:
        writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t')
        writer.writeheader()
        # strip the cell value, not the column name
        tubes_def = set(r[translator['tube_id']].strip() for r in records)
        for tube in sorted(tubes_def):
            writer.writerow({'study': study_label,
                             'label': tube if not namespace else add_namespace(namespace, tube),
                             'vessel_type': 'Tube',
                             'vessel_content': 'DNA',
                             'vessel_status': 'UNKNOWN',
                             'source': 'None',
                             'source_type': 'NO_SOURCE'})
94 | |
95 | |
def write_subsamples_file(records, study_label, translator, ofile,
                          namespace=None, logger=None):
    """
    Write the TSV file describing tubes derived from existing tubes.

    Each distinct (tube id, protocol) pair found in records yields one
    row whose label is '<tube id>::<protocol>' and whose source is the
    original tube. When a namespace is given, both the label and the
    source are prefixed through add_namespace.
    """
    columns = ['study', 'label', 'vessel_type', 'vessel_content',
               'vessel_status', 'source', 'source_type', 'options']
    with open(ofile, 'w') as handle:
        tsv = csv.DictWriter(handle, columns, delimiter='\t')
        tsv.writeheader()
        derived = set()
        for rec in records:
            parent = rec[translator['tube_id']].strip()
            protocol = rec[translator['protocol']].strip()
            derived.add(('%s::%s' % (parent, protocol), parent, protocol))
        for label, parent, protocol in derived:
            if namespace:
                label = add_namespace(namespace, label)
                parent = add_namespace(namespace, parent)
            row = {
                'study': study_label,
                'label': label,
                'vessel_type': 'Tube',
                'vessel_content': 'DNA',
                'vessel_status': 'UNKNOWN',
                'source': parent,
                'source_type': 'Tube',
                'options': 'protocol=%s' % protocol,
            }
            tsv.writerow(row)
115 | |
116 | |
def write_flowcells_file(records, study_label, translator, ofile,
                         namespace=None, logger=None):
    """
    Write the flowcell definitions TSV file.

    One row is produced per distinct flowcell id found in records; the
    (optionally namespaced) id is used both as label and as barcode and
    every flowcell is declared with 8 slots.
    """
    columns = ['study', 'label', 'barcode', 'container_status',
               'number_of_slots']
    with open(ofile, 'w') as handle:
        tsv = csv.DictWriter(handle, columns, delimiter='\t')
        tsv.writeheader()
        flowcells = set()
        for rec in records:
            flowcells.add(rec[translator['flowcell_id']].strip())
        for flowcell in flowcells:
            if namespace:
                label = add_namespace(namespace, flowcell)
                barcode = add_namespace(namespace, flowcell)
            else:
                label = barcode = flowcell
            tsv.writerow({
                'study': study_label,
                'label': label,
                'barcode': barcode,
                'container_status': 'INSTOCK',
                'number_of_slots': '8',
            })
131 | |
132 | |
def write_lanes_file(records, study_label, translator, ofile,
                     namespace=None, logger=None):
    """
    Write the lane definitions TSV file.

    One row is produced per distinct (flowcell id, lane id) pair found in
    records. Only the flowcell id receives the optional namespace prefix;
    the lane id is written as the slot value unchanged.
    """
    columns = ['study', 'flow_cell', 'slot', 'container_status']
    with open(ofile, 'w') as handle:
        tsv = csv.DictWriter(handle, columns, delimiter='\t')
        tsv.writeheader()
        lanes = set()
        for rec in records:
            flowcell = rec[translator['flowcell_id']].strip()
            slot = rec[translator['lane_id']].strip()
            lanes.add((flowcell, slot))
        for flowcell, slot in lanes:
            if namespace:
                flowcell = add_namespace(namespace, flowcell)
            tsv.writerow({
                'study': study_label,
                'flow_cell': flowcell,
                'slot': slot,
                'container_status': 'INSTOCK',
            })
147 | |
148 | |
def write_laneslots_file(records, study_label, translator, ofile,
                         subsamples_enabled=False,
                         namespace=None, logger=None):
    """
    Write the laneslot definitions TSV file.

    For every record the adapter of the sample is looked up in the
    flowcell details retrieved from NGLIMS (through the Galaxy server
    configured by the NGLIMS_GALAXY_HOST and NGLIMS_GALAXY_API_KEY
    environment variables). One row is written per distinct laneslot.

    records            -- list of dicts, one per samplesheet row; the
                          flowcell id of the first record is the one
                          queried on NGLIMS (all records are assumed to
                          belong to the same flowcell)
    study_label        -- value of the 'study' column
    translator         -- mapping of internal field names to column headers
    ofile              -- path of the file to write
    subsamples_enabled -- when true the source of each laneslot is the
                          derived tube '<tube id>::<protocol>' instead of
                          the tube itself
    namespace          -- optional prefix applied to lane and source labels
    logger             -- logger to use; a module logger is used when None

    Exits through sys.exit when the environment variables are missing or
    the flowcell is unknown to NGLIMS; raises RuntimeError when no adapter
    is registered for a sample/lane pair. With no records only the header
    line is written.
    """
    if logger is None:
        # the signature allows logger=None, so never dereference it blindly
        logger = logging.getLogger(__name__)
    logger.debug('subsamples_ensabled: %r' % subsamples_enabled)
    ofile_fields = ['study', 'lane', 'tag', 'content', 'source',
                    'source_type', 'options']
    # Get NGLIMS host and key
    try:
        galaxy_host = os.environ['NGLIMS_GALAXY_HOST']
        api_key = os.environ['NGLIMS_GALAXY_API_KEY']
    except KeyError as ke:
        msg = 'No environment variables %s set to configure access to the Galaxy server' % ke
        sys.exit(msg)
    if not records:
        # Nothing to look up on NGLIMS: behave like the other writers and
        # produce a header-only file instead of failing on records[0].
        logger.warning('No records found, writing an empty laneslots file')
        with open(ofile, 'w') as out_file:
            csv.DictWriter(out_file, ofile_fields, delimiter='\t').writeheader()
        return
    # Get flowcell label (assuming label is the same for all records)
    fc_id = records[0][translator['flowcell_id']].strip()
    # Get flowcell details from nglims
    gi = nglimsclient.setup(GalaxyInstance(galaxy_host, api_key))
    if not gi.nglims.exists_flowcell_id(fc_id):
        # Previously this path fell through and failed later with a
        # NameError on the undefined flowcell details.
        sys.exit('Flowcell %s not found in NGLIMS' % fc_id)
    fc_data = gi.nglims.flowcell_complete_details(fc_id)
    with open(ofile, 'w') as out_file:
        writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t')
        writer.writeheader()
        laneslots_def = set()
        for r in records:
            # Strip the raw values before namespacing so that lane labels
            # match the ones produced for the flowcells and lanes files.
            record_fc_id = r[translator['flowcell_id']].strip()
            if namespace:
                record_fc_id = add_namespace(namespace, record_fc_id)
            tube_id = r[translator['tube_id']].strip()
            lane_id = r[translator['lane_id']].strip()
            protocol = r[translator['protocol']].strip()
            if subsamples_enabled:
                source_tube_id = '%s::%s' % (tube_id, protocol)
            else:
                source_tube_id = tube_id
            # Identify adapter
            lane_number = int(lane_id)
            adapter = [i['adapter'] for i in fc_data['details']
                       if i['name'] == tube_id and i['lane'] == lane_number]
            if not adapter:
                raise RuntimeError('No adapter found in NGLIMS for sample %s on lane %s '
                                   'of flowcell %s' % (tube_id, lane_id, fc_id))
            laneslots_def.add(('%s:%s' % (record_fc_id, lane_id),
                               r[translator['sample_tag']].strip(),
                               source_tube_id,
                               protocol,
                               r[translator['operator']].strip(),
                               r[translator['sample_project']].strip(),
                               adapter[0]))
        for x in laneslots_def:
            writer.writerow({'study': study_label,
                             'lane': x[0],
                             'tag': x[1],
                             'content': 'DNA',
                             'source': x[2] if not namespace else
                                       add_namespace(namespace, x[2]),
                             'source_type': 'Tube',
                             'options': 'protocol=%s,operator=%s,sample_project=%s,adapter=%s' %
                                        (x[3], x[4], x[5], x[6])})
199 | |
200 | |
def write_study_file(study_label, records, translator, ofile, logger=None):
    """
    Write the study definition TSV file: a header line followed by a
    single row carrying the study label (description is left empty).

    records, translator and logger are accepted but not used.
    """
    header = ['label', 'description']
    with open(ofile, 'w') as handle:
        tsv = csv.DictWriter(handle, fieldnames=header, delimiter='\t')
        tsv.writeheader()
        study_row = dict(label=study_label)
        tsv.writerow(study_row)
207 | |
208 | |
def main(argv):
    """
    Entry point of the tool.

    Parses argv, configures logging, loads the tab separated samplesheet
    and the YAML configuration file, then writes the study (optional),
    tubes, tubes subsamples (optional), flowcells, lanes and laneslots
    definition files.

    Raises RuntimeError when the configuration file has no
    'config_parameters' mapping or no 'study_label' inside it.
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    log_level = getattr(logging, args.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if args.logfile:
        kwargs['filename'] = args.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger('prepare_seq_dsample_inputs')

    # 'U' is needed for universal newlines on Python 2 but is rejected by
    # Python 3.11+, where universal newlines are already the default.
    read_mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(args.in_file, read_mode) as f:
        logger.info('Loading data from file %s' % args.in_file)
        reader = csv.DictReader(f, delimiter='\t')
        recs = [r for r in reader]
    translator = get_samplesheet_translator()

    with open(args.config_parameters) as cfgf:
        # safe_load: the configuration only needs plain mappings and
        # strings, and yaml.load() without an explicit Loader can build
        # arbitrary objects (and is refused by recent PyYAML releases).
        conf = yaml.safe_load(cfgf)
    # An empty file yields None and a scalar document yields a string:
    # report both as a bad configuration instead of an AttributeError.
    if not isinstance(conf, dict) or 'config_parameters' not in conf:
        raise RuntimeError('Bad configuration file')
    config_parameters = conf['config_parameters']
    if config_parameters is None:
        # 'config_parameters:' with no children
        config_parameters = {}
    if not isinstance(config_parameters, dict):
        raise RuntimeError('Bad configuration file')
    try:
        study_label = config_parameters['study_label']
    except KeyError:
        raise RuntimeError('No study_label provided')
    namespace = config_parameters.get('namespace')

    if args.study_output_file:
        logger.info('Writing Study definition file %s' % args.study_output_file)
        write_study_file(study_label, recs, translator, args.study_output_file, logger)
        logger.info('Done writing file %s' % args.study_output_file)

    logger.info('Writing Tube definitions file %s' % args.tubes_out_file)
    write_tubes_file(recs, study_label, translator,
                     args.tubes_out_file, namespace,
                     logger)
    logger.info('Done writing file %s' % args.tubes_out_file)

    # argparse always defines the attribute (None when the option is not
    # given), so "'tubes_subsamples_output_file' in args" is always true;
    # test the value instead.
    subsamples_enabled = bool(args.tubes_subsamples_output_file)

    if subsamples_enabled:
        logger.info('Writing Tubes\' subsamples definitions file %s'
                    % args.tubes_subsamples_output_file)
        write_subsamples_file(recs, study_label, translator,
                              args.tubes_subsamples_output_file,
                              namespace, logger)
        logger.info('Done writing file %s' % args.tubes_subsamples_output_file)

    logger.info('Writing FlowCell definitions file %s' % args.flowcells_out_file)
    write_flowcells_file(recs, study_label, translator,
                         args.flowcells_out_file, namespace,
                         logger)
    logger.info('Done writing file %s' % args.flowcells_out_file)

    logger.info('Writing Lane definitions file %s' % args.lanes_out_file)
    write_lanes_file(recs, study_label, translator,
                     args.lanes_out_file, namespace,
                     logger)
    logger.info('Done writing file %s' % args.lanes_out_file)

    logger.info('Writing LaneSlot definitions file %s' % args.laneslots_out_file)
    write_laneslots_file(recs, study_label, translator,
                         args.laneslots_out_file,
                         subsamples_enabled,  # laneslots point at subsamples only if they were created
                         namespace,
                         logger)
    logger.info('Done writing file %s' % args.laneslots_out_file)
280 | |
281 | |
# Run the tool only when executed as a script; the program name is dropped
# from the argument list before it is handed to main().
if __name__ == '__main__':
    main(sys.argv[1:])