Mercurial > repos > ric > test2
comparison galaxy-tools/biobank/utils/prepare_seq_dsample_inputs.py @ 0:ba6cf6ede027 draft default tip
Uploaded
| author | ric |
|---|---|
| date | Wed, 28 Sep 2016 06:03:30 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:ba6cf6ede027 |
|---|---|
| 1 """ | |
| 2 This tool produces files that can be used as input to import | |
| 3 * samples | |
| 4 * flowcells | |
| 5 * lanes | |
| 6 * laneslots | |
| 7 within OMERO.biobank using import applications. | |
| 8 If the optional 'study-output-file' parameter is given as input, the | |
| 9 script will produce the input file for a new study definition. | |
| 10 If the optional 'tubes-subsamples-output-file' is given, the script | |
| 11 will generate another file with tubes definitions where each tube is | |
| 12 produced applying a specific laboratory protocol to an existing | |
| 13 tube. Existing tubes are the ones in tubes-out-file, new tubes' labels | |
| 14 are created using the pattern <tube_label>::<protocol> | |
| 15 The config_parameters field must point to a YAML configuration file | |
| 16 with the following structure: | |
| 17 | |
| 18 config_parameters: | |
| 19 study_label: study_label | |
| 20 namespace: namespace | |
| 21 | |
| 22 where study_label is mandatory | |
| 23 """ | |
| 24 | |
| 25 import csv, sys, argparse, logging, yaml | |
| 26 # Needed to import flowcell data | |
| 27 from bioblend.galaxy import GalaxyInstance | |
| 28 import nglimsclient, os | |
| 29 | |
| 30 LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s' | |
| 31 LOG_DATEFMT = '%Y-%m-%d %H:%M:%S' | |
| 32 LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] | |
| 33 | |
def make_parser():
    """Build the command line parser for this tool.

    The parser accepts logging options, the mandatory input samplesheet
    and YAML configuration file, four output files that have default
    paths and two optional output files (study definition and tubes
    subsamples).
    """
    cli = argparse.ArgumentParser(description='split sequencing samplesheet')

    # Logging controls
    cli.add_argument('--logfile', type=str, help='log file (default=stderr)')
    cli.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                     help='logging level', default='INFO')

    # Input samplesheet
    cli.add_argument('--in-file', '-i', type=str, required=True,
                     help='input file')

    # Output files that always get a path, either given or default
    defaulted_outputs = (
        ('--tubes-out-file',
         'output file containing tube definitions',
         './tubes_def.tsv'),
        ('--flowcells-out-file',
         'output file containing flowcell definitions',
         './flowcells_def.tsv'),
        ('--lanes-out-file',
         'output file containing lane definitions',
         './lanes_def.tsv'),
        ('--laneslots-out-file',
         'output file containing laneslot definitions',
         './laneslots_def.tsv'),
    )
    for flag, help_text, default_path in defaulted_outputs:
        cli.add_argument(flag, type=str, help=help_text, default=default_path)

    # YAML configuration (study label and optional namespace)
    cli.add_argument('--config-parameters', type=str, required=True,
                     help='a YAML configuration file containing study label and labels namespace, '
                     'namespace is optional')

    # Optional outputs, no default path
    cli.add_argument('--study-output-file', type=str,
                     help='output file containing study definition')
    cli.add_argument('--tubes-subsamples-output-file', type=str,
                     help='output file containing tubes subsamples (samples produced applying a '
                     'laboratory protocol to existing samples)')
    return cli
| 62 | |
| 63 | |
def get_samplesheet_translator(samplesheet_type='default'):
    """Return the mapping from internal field names to samplesheet columns.

    Only the 'default' samplesheet type is defined; any other value
    raises KeyError.
    """
    default_columns = dict(
        flowcell_id='FCID',
        tube_id='SampleID',
        lane_id='Lane',
        sample_tag='Index',
        protocol='Recipe',
        operator='Operator',
        sample_project='SampleProject',
    )
    known_translators = {'default': default_columns}
    return known_translators[samplesheet_type]
| 74 | |
def add_namespace(namespace, label, separator='|'):
    """Prefix label with namespace, joining the two with separator."""
    pieces = (namespace, label)
    return separator.join(pieces)
| 77 | |
def write_tubes_file(records, study_label, translator, ofile,
                     namespace=None, logger=None):
    """Write the tab-separated tube definitions file.

    One row is written for each distinct tube id found in records.

    records     -- list of samplesheet rows (dicts keyed by column name)
    study_label -- value written in the 'study' column of every row
    translator  -- mapping of internal field names to samplesheet columns;
                   only 'tube_id' is used here
    ofile       -- path of the file to write (overwritten if it exists)
    namespace   -- optional prefix added to each tube label
    logger      -- accepted for symmetry with the other writers, unused
    """
    ofile_fields = ['study', 'label', 'vessel_type', 'vessel_content',
                    'vessel_status', 'source', 'source_type']
    tube_column = translator['tube_id']
    # Strip the cell value (not the column name) so that labels match the
    # stripped tube ids used by the subsample and laneslot writers.
    # Sorting makes the row order reproducible between runs.
    tube_labels = sorted(set(r[tube_column].strip() for r in records))
    with open(ofile, 'w') as out_file:
        writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t')
        writer.writeheader()
        for tube in tube_labels:
            writer.writerow({
                'study': study_label,
                'label': tube if not namespace else add_namespace(namespace, tube),
                'vessel_type': 'Tube',
                'vessel_content': 'DNA',
                'vessel_status': 'UNKNOWN',
                'source': 'None',
                'source_type': 'NO_SOURCE',
            })
| 94 | |
| 95 | |
def write_subsamples_file(records, study_label, translator, ofile,
                          namespace = None, logger = None):
    """Write the tab-separated tubes subsamples definitions file.

    Each distinct (tube id, protocol) pair found in records produces one
    row whose label is '<tube_id>::<protocol>' and whose source is the
    original tube. When a namespace is given it is prepended to both the
    label and the source. The logger argument is not used.
    """
    columns = ['study', 'label', 'vessel_type', 'vessel_content',
               'vessel_status', 'source', 'source_type', 'options']

    def qualified(name):
        # Apply the namespace prefix only when one was requested
        if namespace:
            return add_namespace(namespace, name)
        return name

    with open(ofile, 'w') as out_file:
        writer = csv.DictWriter(out_file, columns, delimiter='\t')
        writer.writeheader()
        # Collect unique (subsample label, source tube, protocol) triples
        unique_subsamples = set()
        for rec in records:
            tube = rec[translator['tube_id']].strip()
            protocol = rec[translator['protocol']].strip()
            unique_subsamples.add(('%s::%s' % (tube, protocol), tube, protocol))
        for subsample_label, source_tube, protocol in unique_subsamples:
            row = dict(study=study_label,
                       label=qualified(subsample_label),
                       vessel_type='Tube',
                       vessel_content='DNA',
                       vessel_status='UNKNOWN',
                       source=qualified(source_tube),
                       source_type='Tube',
                       options='protocol=%s' % protocol)
            writer.writerow(row)
| 115 | |
| 116 | |
def write_flowcells_file(records, study_label, translator, ofile,
                         namespace = None, logger=None):
    """Write the tab-separated flowcell definitions file.

    One row is written for each distinct flowcell id found in records;
    the same (optionally namespaced) value is used as label and barcode.
    Every flowcell is declared INSTOCK with 8 slots. The logger argument
    is not used.
    """
    columns = ['study', 'label', 'barcode', 'container_status',
               'number_of_slots']
    with open(ofile, 'w') as out_file:
        writer = csv.DictWriter(out_file, columns, delimiter='\t')
        writer.writeheader()
        # Collect unique flowcell ids
        flowcell_ids = set()
        for rec in records:
            flowcell_ids.add(rec[translator['flowcell_id']].strip())
        for flowcell in flowcell_ids:
            if namespace:
                full_label = add_namespace(namespace, flowcell)
            else:
                full_label = flowcell
            writer.writerow(dict(study=study_label,
                                 label=full_label,
                                 barcode=full_label,
                                 container_status='INSTOCK',
                                 number_of_slots='8'))
| 131 | |
| 132 | |
def write_lanes_file(records, study_label, translator, ofile,
                     namespace = None, logger=None):
    """Write the tab-separated lane definitions file.

    One row is written for each distinct (flowcell id, lane id) pair
    found in records. The flowcell reference is namespaced when a
    namespace is given; the slot is the lane id as read from the
    samplesheet. The logger argument is not used.
    """
    columns = ['study', 'flow_cell', 'slot', 'container_status']
    with open(ofile, 'w') as out_file:
        writer = csv.DictWriter(out_file, columns, delimiter='\t')
        writer.writeheader()
        # Collect unique (flowcell, lane) pairs
        unique_lanes = set()
        for rec in records:
            flowcell = rec[translator['flowcell_id']].strip()
            lane = rec[translator['lane_id']].strip()
            unique_lanes.add((flowcell, lane))
        for flowcell, lane in unique_lanes:
            if namespace:
                flowcell_ref = add_namespace(namespace, flowcell)
            else:
                flowcell_ref = flowcell
            writer.writerow(dict(study=study_label,
                                 flow_cell=flowcell_ref,
                                 slot=lane,
                                 container_status='INSTOCK'))
| 147 | |
| 148 | |
def write_laneslots_file(records, study_label, translator, ofile,
                         subsamples_enabled=False,
                         namespace=None, logger=None):
    """Write the tab-separated laneslot definitions file.

    For every distinct combination of lane, tag, source tube, protocol,
    operator, sample project and adapter found in records one row is
    written. The adapter is looked up in the flowcell details retrieved
    from NGLIMS through the Galaxy API.

    records            -- list of samplesheet rows (dicts keyed by column
                          name); the flowcell id of the first row is the
                          one queried on NGLIMS
    study_label        -- value written in the 'study' column
    translator         -- mapping of internal field names to samplesheet
                          columns
    ofile              -- path of the file to write
    subsamples_enabled -- when true the source of each laneslot is the
                          '<tube_id>::<protocol>' subsample instead of
                          the tube itself
    namespace          -- optional prefix for flowcell and source labels
    logger             -- optional logger used for debug output

    The process exits (sys.exit) with a message when the
    NGLIMS_GALAXY_HOST / NGLIMS_GALAXY_API_KEY environment variables are
    missing, when the flowcell is unknown to NGLIMS or when no adapter is
    found for a sample.
    """
    # logger defaults to None, so guard the call
    if logger is not None:
        logger.debug('subsamples_enabled: %r' % subsamples_enabled)
    ofile_fields = ['study', 'lane', 'tag', 'content', 'source',
                    'source_type', 'options']
    # Get NGLIMS host and key
    try:
        galaxy_host = os.environ['NGLIMS_GALAXY_HOST']
        api_key = os.environ['NGLIMS_GALAXY_API_KEY']
    except KeyError as ke:
        msg = 'No environment variables %s set to configure access to the Galaxy server' % ke
        sys.exit(msg)

    if not records:
        # Nothing to look up on NGLIMS: produce a header-only file, as
        # the other writers do for an empty samplesheet.
        with open(ofile, 'w') as out_file:
            csv.DictWriter(out_file, ofile_fields, delimiter='\t').writeheader()
        return

    # Get flowcell label (assuming label is the same for all records)
    fc_label = records[0][translator['flowcell_id']].strip()
    # Get flowcell details from nglims
    gi = nglimsclient.setup(GalaxyInstance(galaxy_host, api_key))
    if not gi.nglims.exists_flowcell_id(fc_label):
        # Without flowcell details no adapter can be resolved
        sys.exit('Flowcell %s not found in NGLIMS' % fc_label)
    fc_data = gi.nglims.flowcell_complete_details(fc_label)

    laneslots_def = set()
    for r in records:
        flowcell_id = r[translator['flowcell_id']].strip()
        if namespace:
            # Namespace the stripped id, as write_lanes_file does, so
            # that lane references match the lanes definitions file.
            flowcell_id = add_namespace(namespace, flowcell_id)
        tube_id = r[translator['tube_id']].strip()
        lane_id = r[translator['lane_id']].strip()
        protocol = r[translator['protocol']].strip()
        if subsamples_enabled:
            source_tube_id = '%s::%s' % (tube_id, protocol)
        else:
            source_tube_id = tube_id
        # Identify adapter: match on sample name and (integer) lane
        lane_number = int(lane_id)
        adapters = [i['adapter'] for i in fc_data['details']
                    if i['name'] == tube_id and i['lane'] == lane_number]
        if not adapters:
            sys.exit('No adapter found in NGLIMS for sample %s on lane %s of flowcell %s'
                     % (tube_id, lane_id, fc_label))
        laneslots_def.add(('%s:%s' % (flowcell_id, lane_id),
                           r[translator['sample_tag']].strip(),
                           source_tube_id,
                           protocol,
                           r[translator['operator']].strip(),
                           r[translator['sample_project']].strip(),
                           adapters[0]))

    with open(ofile, 'w') as out_file:
        writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t')
        writer.writeheader()
        for x in laneslots_def:
            writer.writerow({
                'study': study_label,
                'lane': x[0],
                'tag': x[1],
                'content': 'DNA',
                'source': x[2] if not namespace else add_namespace(namespace, x[2]),
                'source_type': 'Tube',
                'options': 'protocol=%s,operator=%s,sample_project=%s,adapter=%s' %
                           (x[3], x[4], x[5], x[6]),
            })
| 199 | |
| 200 | |
def write_study_file(study_label, records, translator, ofile, logger=None):
    """Write the tab-separated study definition file.

    The file has a 'label' and a 'description' column and a single row
    carrying study_label; the description is left empty. The records,
    translator and logger arguments are not used.
    """
    columns = ['label', 'description']
    study_row = dict(label=study_label)
    with open(ofile, 'w') as out_file:
        study_writer = csv.DictWriter(out_file, columns, delimiter='\t')
        study_writer.writeheader()
        study_writer.writerow(study_row)
| 207 | |
| 208 | |
def main(argv):
    """Run the tool: read the samplesheet and write the definition files.

    argv is the list of command line arguments (without the program
    name). The samplesheet is read as a tab-separated file with a header
    row; the YAML configuration must contain a 'config_parameters'
    mapping with a mandatory 'study_label' and an optional 'namespace'.

    Raises RuntimeError when the configuration file is malformed or does
    not provide a study label.
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    log_level = getattr(logging, args.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if args.logfile:
        kwargs['filename'] = args.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger('prepare_seq_dsample_inputs')

    # The 'U' flag is needed for universal newlines on Python 2 only; it
    # is the default behaviour on Python 3 and was removed in 3.11.
    read_mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(args.in_file, read_mode) as f:
        logger.info('Loading data from file %s' % args.in_file)
        reader = csv.DictReader(f, delimiter='\t')
        recs = list(reader)
    translator = get_samplesheet_translator()

    with open(args.config_parameters) as cfgf:
        # NOTE: this used to be yaml.load(cfgf), which can build arbitrary
        # Python objects from the file and requires an explicit Loader on
        # recent PyYAML. The configuration only holds plain mappings and
        # strings, so the safe loader is sufficient.
        conf = yaml.safe_load(cfgf)
    # An empty file yields None and a scalar file yields a string: both
    # are reported as a bad configuration instead of crashing.
    params = conf.get('config_parameters') if isinstance(conf, dict) else None
    if not isinstance(params, dict):
        raise RuntimeError('Bad configuration file')
    if 'study_label' not in params:
        raise RuntimeError('No study_label provided')
    study_label = params['study_label']
    namespace = params.get('namespace')

    if args.study_output_file:
        logger.info('Writing Study definition file %s' % args.study_output_file)
        write_study_file(study_label, recs, translator, args.study_output_file, logger)
        logger.info('Done writing file %s' % args.study_output_file)

    logger.info('Writing Tube definitions file %s' % args.tubes_out_file)
    write_tubes_file(recs, study_label, translator,
                     args.tubes_out_file, namespace,
                     logger)
    logger.info('Done writing file %s' % args.tubes_out_file)

    # argparse always defines the attribute (None when the option is not
    # given), so "'tubes_subsamples_output_file' in args" was always true;
    # test the value instead.
    subsamples_enabled = bool(args.tubes_subsamples_output_file)
    if subsamples_enabled:
        logger.info('Writing Tubes\' subsamples definitions file %s'
                    % args.tubes_subsamples_output_file)
        write_subsamples_file(recs, study_label, translator,
                              args.tubes_subsamples_output_file,
                              namespace, logger)
        logger.info('Done writing file %s' % args.tubes_subsamples_output_file)

    logger.info('Writing FlowCell definitions file %s' % args.flowcells_out_file)
    write_flowcells_file(recs, study_label, translator,
                         args.flowcells_out_file, namespace,
                         logger)
    logger.info('Done writing file %s' % args.flowcells_out_file)

    logger.info('Writing Lane definitions file %s' % args.lanes_out_file)
    write_lanes_file(recs, study_label, translator,
                     args.lanes_out_file, namespace,
                     logger)
    logger.info('Done writing file %s' % args.lanes_out_file)

    logger.info('Writing LaneSlot definitions file %s' % args.laneslots_out_file)
    write_laneslots_file(recs, study_label, translator,
                         args.laneslots_out_file,
                         subsamples_enabled,  # laneslots point to subsamples only if they were created
                         namespace,
                         logger)
    logger.info('Done writing file %s' % args.laneslots_out_file)
| 280 | |
| 281 | |
| 282 if __name__ == '__main__': | |
| 283 main(sys.argv[1:]) |
