diff galaxy-tools/biobank/utils/prepare_seq_dsample_inputs.py @ 0:ba6cf6ede027 draft default tip

Uploaded
author ric
date Wed, 28 Sep 2016 06:03:30 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_seq_dsample_inputs.py	Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,283 @@
+"""
+This tool produces files that can be used as input to import
+* samples
+* flowcells
+* lanes
+* laneslots
+within OMERO.biobank using import applications.
+If the optional 'study-output-file' parameter is given as input, the
+script will produce the input file for a new study definition.
+If the optional 'tubes-subsamples-output-file' is given, the script
+will generate another file with tubes definitions where each tube is
+produced applying a specific laboratory protocol to an existing
+tube. Existing tubes are the ones in tubes-out-file, new tubes' labels
+are created using the pattern <tube_label>::<protocol>
+The config_parameters field must point to a YAML configuration file
+with the following structure:
+
+ config_parameters:
+   study_label: study_label
+   namespace: namespace
+
+where study_label is mandatory
+"""
+
+import csv, sys, argparse, logging, yaml
+# Needed to import flowcell data
+from bioblend.galaxy import GalaxyInstance
+import nglimsclient, os
+
+# Layout of every log record: timestamp|level name (padded to 8)|message.
+LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
+# Date/time pattern handed to the logging configuration as 'datefmt'.
+LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
+# Values accepted by the --loglevel command line option.
+LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+def make_parser():
+    """
+    Build the command line parser of the tool.
+
+    --in-file and --config-parameters are required.  The tubes, flowcells,
+    lanes and laneslots output options default to files in the current
+    directory, while the study and tubes-subsamples outputs have no
+    default value.
+    """
+    # (flags, settings) pairs, registered in the order they are listed.
+    option_specs = [
+        (('--logfile',),
+         dict(type=str, help='log file (default=stderr)')),
+        (('--loglevel',),
+         dict(type=str, choices=LOG_LEVELS, help='logging level',
+              default='INFO')),
+        (('--in-file', '-i'),
+         dict(type=str, required=True, help='input file')),
+        (('--tubes-out-file',),
+         dict(type=str, help='output file containing tube definitions',
+              default='./tubes_def.tsv')),
+        (('--flowcells-out-file',),
+         dict(type=str, help='output file containing flowcell definitions',
+              default='./flowcells_def.tsv')),
+        (('--lanes-out-file',),
+         dict(type=str, help='output file containing lane definitions',
+              default='./lanes_def.tsv')),
+        (('--laneslots-out-file',),
+         dict(type=str, help='output file containing laneslot definitions',
+              default='./laneslots_def.tsv')),
+        (('--config-parameters',),
+         dict(type=str, required=True,
+              help='a YAML configuration file containing study label and labels namespace, '
+                   'namespace is optional')),
+        (('--study-output-file',),
+         dict(type=str, help='output file containing study definition')),
+        (('--tubes-subsamples-output-file',),
+         dict(type=str,
+              help='output file containing tubes subsamples (samples produced applying a '
+                   'laboratory protocol to existing samples)')),
+    ]
+    cli = argparse.ArgumentParser(description='split sequencing samplesheet')
+    for flags, settings in option_specs:
+        cli.add_argument(*flags, **settings)
+    return cli
+
+
+def get_samplesheet_translator(samplesheet_type='default'):
+    """
+    Return the dictionary mapping the field names used by this script to
+    the column names of the samplesheet.
+
+    Only the 'default' samplesheet type is defined, any other value of
+    samplesheet_type raises a KeyError.
+    """
+    default_columns = dict(
+        flowcell_id='FCID',
+        tube_id='SampleID',
+        lane_id='Lane',
+        sample_tag='Index',
+        protocol='Recipe',
+        operator='Operator',
+        sample_project='SampleProject',
+    )
+    known_types = {'default': default_columns}
+    return known_types[samplesheet_type]
+
+def add_namespace(namespace, label, separator='|'):
+    """
+    Return label prefixed by namespace, with separator between the two.
+    """
+    parts = (namespace, label)
+    return separator.join(parts)
+
+def write_tubes_file(records, study_label, translator, ofile,
+                     namespace = None, logger = None):
+    """
+    Write the tab-separated file with the tubes definitions.
+
+    One row is written for each distinct tube id found in records, after
+    removing leading and trailing whitespace from the id.
+
+    records     -- samplesheet rows, dictionaries keyed by column name
+    study_label -- value of the 'study' column of every row
+    translator  -- mapping that gives the samplesheet column name under the
+                   'tube_id' key
+    ofile       -- path of the file to write (overwritten if it exists)
+    namespace   -- when given, tube labels are prefixed using add_namespace
+    logger      -- accepted for symmetry with the other writers, not used
+    """
+    ofile_fields = ['study', 'label', 'vessel_type', 'vessel_content',
+                    'vessel_status', 'source', 'source_type']
+    with open(ofile, 'w') as out_file:
+        writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t')
+        writer.writeheader()
+        # Strip the cell value, not the column name: ids that differ only
+        # by surrounding whitespace must collapse into a single tube and
+        # match the labels produced by the other writers.
+        tubes_def = set([r[translator['tube_id']].strip() for r in records])
+        for x in tubes_def:
+            writer.writerow({'study'          : study_label,
+                             'label'          : x if not namespace else add_namespace(namespace, x),
+                             'vessel_type'    : 'Tube',
+                             'vessel_content' : 'DNA',
+                             'vessel_status'  : 'UNKNOWN',
+                             'source'         : 'None',
+                             'source_type'    : 'NO_SOURCE'})
+
+
+def write_subsamples_file(records, study_label, translator, ofile,
+                          namespace = None, logger = None):
+    """
+    Write the tab-separated file with the tubes subsamples definitions.
+
+    Each distinct (tube id, protocol) pair found in records produces one
+    row whose label is <tube id>::<protocol> and whose source is the tube
+    id; both are prefixed using add_namespace when namespace is given.
+    The logger argument is not used.
+    """
+    columns = ['study', 'label', 'vessel_type', 'vessel_content',
+               'vessel_status', 'source', 'source_type', 'options']
+    with open(ofile, 'w') as handle:
+        out = csv.DictWriter(handle, columns, delimiter='\t')
+        out.writeheader()
+        subsamples = set()
+        for rec in records:
+            label = '%s::%s' % (rec[translator['tube_id']].strip(),
+                                rec[translator['protocol']].strip())
+            tube = rec[translator['tube_id']].strip()
+            protocol = rec[translator['protocol']].strip()
+            subsamples.add((label, tube, protocol))
+        for label, tube, protocol in subsamples:
+            if namespace:
+                row_label = add_namespace(namespace, label)
+                row_source = add_namespace(namespace, tube)
+            else:
+                row_label = label
+                row_source = tube
+            row = {}
+            row['study'] = study_label
+            row['label'] = row_label
+            row['vessel_type'] = 'Tube'
+            row['vessel_content'] = 'DNA'
+            row['vessel_status'] = 'UNKNOWN'
+            row['source'] = row_source
+            row['source_type'] = 'Tube'
+            row['options'] = 'protocol=%s' % protocol
+            out.writerow(row)
+
+
+def write_flowcells_file(records, study_label, translator, ofile,
+                         namespace = None, logger=None):
+    """
+    Write the tab-separated file with the flowcells definitions.
+
+    One row is written for each distinct flowcell id found in records.
+    The id, prefixed using add_namespace when namespace is given, is used
+    both as label and as barcode; every flowcell is declared INSTOCK with
+    8 slots.  The logger argument is not used.
+    """
+    columns = ['study', 'label', 'barcode', 'container_status',
+               'number_of_slots']
+    with open(ofile, 'w') as handle:
+        out = csv.DictWriter(handle, columns, delimiter='\t')
+        out.writeheader()
+        flowcells = set()
+        for rec in records:
+            flowcells.add(rec[translator['flowcell_id']].strip())
+        for flowcell in flowcells:
+            if namespace:
+                label = add_namespace(namespace, flowcell)
+                barcode = add_namespace(namespace, flowcell)
+            else:
+                label = barcode = flowcell
+            row = dict(study=study_label, label=label, barcode=barcode,
+                       container_status='INSTOCK', number_of_slots='8')
+            out.writerow(row)
+
+
+def write_lanes_file(records, study_label, translator, ofile,
+                     namespace = None, logger=None):
+    """
+    Write the tab-separated file with the lanes definitions.
+
+    One row is written for each distinct (flowcell id, lane id) pair found
+    in records.  The flowcell id is prefixed using add_namespace when
+    namespace is given, the lane id is written as the slot and every lane
+    is declared INSTOCK.  The logger argument is not used.
+    """
+    columns = ['study', 'flow_cell', 'slot', 'container_status']
+    with open(ofile, 'w') as handle:
+        out = csv.DictWriter(handle, columns, delimiter='\t')
+        out.writeheader()
+        lanes = set()
+        for rec in records:
+            flowcell = rec[translator['flowcell_id']].strip()
+            slot = rec[translator['lane_id']].strip()
+            lanes.add((flowcell, slot))
+        for flowcell, slot in lanes:
+            if namespace:
+                flowcell = add_namespace(namespace, flowcell)
+            row = dict(study=study_label, flow_cell=flowcell, slot=slot,
+                       container_status='INSTOCK')
+            out.writerow(row)
+
+
+def write_laneslots_file(records, study_label, translator, ofile,
+                         subsamples_enabled=False,
+                         namespace = None, logger=None):
+    """
+    Write the tab-separated file with the laneslots definitions.
+
+    Each distinct combination of lane, tag, source tube, protocol,
+    operator, sample project and adapter found in records produces one
+    row.  The adapter of each sample is taken from the flowcell details
+    returned by nglims, reached through the Galaxy server configured with
+    the NGLIMS_GALAXY_HOST and NGLIMS_GALAXY_API_KEY environment variables.
+
+    records            -- samplesheet rows, dictionaries keyed by column
+                          name; all rows are assumed to belong to the
+                          flowcell of the first one
+    study_label        -- value of the 'study' column of every row
+    translator         -- mapping from field names to samplesheet columns
+    ofile              -- path of the file to write (overwritten if it
+                          exists)
+    subsamples_enabled -- when true the source of a laneslot is the
+                          <tube id>::<protocol> subsample instead of the
+                          tube itself
+    namespace          -- when given, lane and source labels are prefixed
+                          using add_namespace
+    logger             -- logger to use, a default one is created when
+                          omitted
+
+    The process is terminated through sys.exit when the environment
+    variables are missing, when nglims does not know the flowcell or when
+    no adapter is registered for a sample on its lane.  With no records
+    only the header line is written.
+    """
+    if logger is None:
+        # logger is optional in the signature, never dereference None
+        logger = logging.getLogger('prepare_seq_dsample_inputs')
+    logger.debug('subsamples_enabled: %r' % subsamples_enabled)
+    ofile_fields = ['study', 'lane', 'tag', 'content', 'source',
+                    'source_type', 'options']
+    # Get NGLIMS host and key
+    try:
+        galaxy_host = os.environ['NGLIMS_GALAXY_HOST']
+        api_key = os.environ['NGLIMS_GALAXY_API_KEY']
+    except KeyError as ke:
+        msg = 'No environment variables %s set to configure access to the Galaxy server' % ke
+        sys.exit(msg)
+    # Rows are collected before the output file is opened, so that a
+    # failure does not leave a truncated file behind.
+    laneslots_def = set()
+    if records:
+        # Get flowcell label (assuming label is the same for all records)
+        fc_label = records[0][translator['flowcell_id']].strip()
+        # Get flowcell details from nglims
+        gi = nglimsclient.setup(GalaxyInstance(galaxy_host, api_key))
+        if not gi.nglims.exists_flowcell_id(fc_label):
+            # Without the details no adapter can be resolved
+            sys.exit('Flowcell %s not found in nglims' % fc_label)
+        fc_data = gi.nglims.flowcell_complete_details(fc_label)
+        for r in records:
+            tube_id = r[translator['tube_id']].strip()
+            lane_id = r[translator['lane_id']].strip()
+            protocol = r[translator['protocol']].strip()
+            # Strip before adding the namespace, otherwise whitespace
+            # around the id ends up in the middle of the lane label
+            fc_id = r[translator['flowcell_id']].strip()
+            if namespace:
+                fc_id = add_namespace(namespace, fc_id)
+            if subsamples_enabled:
+                source_tube_id = '%s::%s' % (tube_id, protocol)
+            else:
+                source_tube_id = tube_id
+            # Identify adapter
+            lane_number = int(lane_id)
+            adapter = [i['adapter'] for i in fc_data['details']
+                       if i['name'] == tube_id and i['lane'] == lane_number]
+            if not adapter:
+                sys.exit('No adapter found in nglims for sample %s on lane %s of flowcell %s'
+                         % (tube_id, lane_id, fc_label))
+            laneslots_def.add(('%s:%s' % (fc_id, lane_id),
+                               r[translator['sample_tag']].strip(),
+                               source_tube_id,
+                               protocol,
+                               r[translator['operator']].strip(),
+                               r[translator['sample_project']].strip(),
+                               adapter[0]))
+    else:
+        logger.warning('No records found, writing a laneslots file with the header only')
+    with open(ofile, 'w') as out_file:
+        writer = csv.DictWriter(out_file, ofile_fields, delimiter='\t')
+        writer.writeheader()
+        for x in laneslots_def:
+            writer.writerow({'study'       : study_label,
+                             'lane'        : x[0],
+                             'tag'         : x[1],
+                             'content'     : 'DNA',
+                             'source'      : x[2] if not namespace else \
+                                               add_namespace(namespace, x[2]),
+                             'source_type' : 'Tube',
+                             'options'     : 'protocol=%s,operator=%s,sample_project=%s,adapter=%s' %
+                                              (x[3], x[4], x[5], x[6])})
+
+
+def write_study_file(study_label, records, translator, ofile, logger=None):
+    """
+    Write the tab-separated file with the study definition.
+
+    The file holds the header and a single row carrying study_label in the
+    'label' column; the 'description' column is left empty.  The records,
+    translator and logger arguments are not used.
+    """
+    columns = ['label', 'description']
+    study_row = {'label': study_label}
+    with open(ofile, 'w') as handle:
+        out = csv.DictWriter(handle, columns, delimiter='\t')
+        out.writeheader()
+        out.writerow(study_row)
+
+
+def main(argv):
+    """
+    Entry point of the tool.
+
+    Parses argv, configures logging, loads the tab-separated samplesheet
+    and the YAML configuration, then writes the tubes, flowcells, lanes
+    and laneslots definition files.  The study and the tubes subsamples
+    files are written only when the matching option is given.
+
+    Raises RuntimeError when the configuration file has no
+    'config_parameters' mapping or when the mapping has no 'study_label'.
+    """
+    parser = make_parser()
+    args = parser.parse_args(argv)
+
+    log_level = getattr(logging, args.loglevel)
+    kwargs = {'format'  : LOG_FORMAT,
+              'datefmt' : LOG_DATEFMT,
+              'level'   : log_level}
+    if args.logfile:
+        kwargs['filename'] = args.logfile
+    logging.basicConfig(**kwargs)
+    logger = logging.getLogger('prepare_seq_dsample_inputs')
+
+    # NOTE(review): the 'rU' mode is kept from the original script; it is
+    # rejected by recent Python 3 releases, confirm the target interpreter
+    # before changing it.
+    with open(args.in_file, 'rU') as f:
+        logger.info('Loading data from file %s', args.in_file)
+        reader = csv.DictReader(f, delimiter='\t')
+        recs = [r for r in reader]
+    translator = get_samplesheet_translator()
+
+    with open(args.config_parameters) as cfgf:
+        # safe_load instead of load: the configuration only holds plain
+        # values, and load() without an explicit Loader can build arbitrary
+        # objects (newer PyYAML releases refuse the call altogether).
+        conf = yaml.safe_load(cfgf)
+    # An empty file is loaded as None and a bare list or string is not a
+    # valid configuration either.
+    if not isinstance(conf, dict) or 'config_parameters' not in conf:
+        raise RuntimeError('Bad configuration file')
+    params = conf['config_parameters']
+    if not isinstance(params, dict):
+        raise RuntimeError('Bad configuration file')
+    try:
+        study_label = params['study_label']
+    except KeyError:
+        raise RuntimeError('No study_label provided')
+    # namespace is optional
+    namespace = params.get('namespace')
+
+    if args.study_output_file:
+        logger.info('Writing Study definition file %s', args.study_output_file)
+        write_study_file(study_label, recs, translator, args.study_output_file, logger)
+        logger.info('Done writing file %s', args.study_output_file)
+
+    logger.info('Writing Tube definitions file %s', args.tubes_out_file)
+    write_tubes_file(recs, study_label, translator,
+                     args.tubes_out_file, namespace,
+                     logger)
+    logger.info('Done writing file %s', args.tubes_out_file)
+
+    # The option is always present in args (None when not given), so its
+    # value has to be tested: "'tubes_subsamples_output_file' in args" is
+    # true for every invocation.
+    subsamples_enabled = bool(args.tubes_subsamples_output_file)
+    if subsamples_enabled:
+        logger.info('Writing Tubes\' subsamples definitions file %s',
+                    args.tubes_subsamples_output_file)
+        write_subsamples_file(recs, study_label, translator,
+                              args.tubes_subsamples_output_file,
+                              namespace, logger)
+        logger.info('Done writing file %s', args.tubes_subsamples_output_file)
+
+    logger.info('Writing FlowCell definitions file %s', args.flowcells_out_file)
+    write_flowcells_file(recs, study_label, translator,
+                         args.flowcells_out_file, namespace,
+                         logger)
+    logger.info('Done writing file %s', args.flowcells_out_file)
+
+    logger.info('Writing Lane definitions file %s', args.lanes_out_file)
+    write_lanes_file(recs, study_label, translator,
+                     args.lanes_out_file, namespace,
+                     logger)
+    logger.info('Done writing file %s', args.lanes_out_file)
+
+    logger.info('Writing LaneSlot definitions file %s', args.laneslots_out_file)
+    # Laneslots reference subsamples only when the subsamples file has
+    # actually been produced
+    write_laneslots_file(recs, study_label, translator,
+                         args.laneslots_out_file,
+                         subsamples_enabled,
+                         namespace,
+                         logger)
+    logger.info('Done writing file %s', args.laneslots_out_file)
+
+
+# When executed as a script, run the tool on the command line arguments
+# (the program name is dropped).
+if __name__ == '__main__':
+    main(sys.argv[1:])