view galaxy-tools/biobank/utils/prepare_seq_out_inputs.py @ 3:43be74e62bfe draft

Uploaded
author ric
date Thu, 22 Sep 2016 08:57:04 -0400
parents
children
line wrap: on
line source

"""
This tool produces files that can be used as input to import
 * SequencerOutput data samples
 * SequencerOutput data objects
within OMERO.biobank using import applications.

The input file must be tab-separated, with a header row, like

 run_directory                   path
 130418_SN194_0303_BC1NYHACXX    file:///SHARE/USERFS/els7/users/sequencing_data/completed/130418_SN194_0303_BC1NYHACXX/raw
 160418_SN194_0304_BCAZYHACXX    file:///SHARE/USERFS/els7/users/sequencing_data/completed/160418_SN194_0304_BCAZYHACXX/raw
 ....
"""

import csv, sys, argparse, logging

# Record layout and timestamp format handed to logging.basicConfig() in main()
LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
# Values accepted by the --loglevel command line option
LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']


def make_parser():
    """Build the command line parser for this tool.

    :return: an ``argparse.ArgumentParser`` accepting the logging options,
      the input file, the two output files and the study label
    """
    parser = argparse.ArgumentParser(description='build sequencer output import files')
    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', '-i', type=str, required=True,
                        help='input file')
    parser.add_argument('--dsamples-out-file', type=str, default='./seq_out_dsamples.tsv',
                        help='output file containing data samples definitions')
    parser.add_argument('--dobjects-out-file', type=str, default='./seq_out_dobjects.tsv',
                        help='output file containing data objects definitions')
    # Required option: document it so that --help explains what to pass.
    parser.add_argument('--study', '-s', type=str, required=True,
                        help='study label written to the "study" column of '
                             'both output files')
    return parser


def write_dsamples_file(records, out_file, study_label):
    """Write the SequencerOutput data samples definitions as a TSV file.

    One row is written for each item of ``records``, in iteration order.
    The ``device`` and ``source`` columns are derived from the run directory
    name, which is also used as the row ``label``.

    :param records: iterable of run directory names, each made of four
      underscore-separated fields (e.g. ``130418_SN194_0303_BC1NYHACXX``)
    :param out_file: path of the file to write; overwritten if it exists
    :param study_label: value written to the ``study`` column of every row
    :raises ValueError: if a run directory name does not have exactly four
      underscore-separated fields; ``out_file`` is not opened in that case
    """

    def parse_run_directory(run_dir):
        # The second field is the device and the fourth the flowcell; the
        # first character of the flowcell field is dropped (presumably the
        # flowcell position letter -- confirm against the sequencer naming).
        fields = run_dir.split('_')
        if len(fields) != 4:
            raise ValueError(
                'Invalid run directory name %r: expected 4 fields separated '
                'by "_", found %d' % (run_dir, len(fields)))
        return fields[1], fields[3][1:]

    # Parse every name before opening the output, so that a malformed name
    # does not leave a truncated file behind.
    rows = []
    for run_dir in records:
        device, flowcell = parse_run_directory(run_dir)
        rows.append({'study': study_label,
                     'label': run_dir,
                     'source': flowcell,
                     'source_type': 'FlowCell',
                     'seq_dsample_type': 'SequencerOutput',
                     'status': 'USABLE',
                     'device': device})

    with open(out_file, 'w') as ofile:
        out_file_header = ['study', 'label', 'source', 'source_type', 'seq_dsample_type',
                           'status', 'device']
        writer = csv.DictWriter(ofile, out_file_header, delimiter='\t')
        writer.writeheader()
        for row in rows:
            writer.writerow(row)


def write_dobjects_file(records, out_file, study_label):
    """Write the SequencerOutput data objects definitions as a TSV file.

    One row is written for each item of ``records``, in iteration order.

    :param records: iterable of mappings providing the ``path`` and
      ``run_directory`` keys; ``run_directory`` goes to the ``data_sample``
      column
    :param out_file: path of the file to write; overwritten if it exists
    :param study_label: value written to the ``study`` column of every row
    :raises KeyError: if a record lacks ``path`` or ``run_directory``
    """
    columns = ['study', 'path', 'data_sample', 'mimetype', 'size', 'sha1']
    # Column values that are the same on every row.
    constant_fields = {'study': study_label,
                       'mimetype': 'x-vl/illumina-run-folder',
                       'size': '-1',
                       'sha1': 'N.A.'}
    with open(out_file, 'w') as tsv:
        dobjects = csv.DictWriter(tsv, columns, delimiter='\t')
        dobjects.writeheader()
        for rec in records:
            dobjects.writerow(dict(constant_fields,
                                   path=rec['path'],
                                   data_sample=rec['run_directory']))


def main(argv):
    """Build the data samples and data objects import files from the input.

    The input file is read as tab-separated values with a header row. One
    data sample row is written for each distinct ``run_directory`` value,
    while one data object row is written for every input record.

    :param argv: command line arguments, without the program name
    :raises KeyError: if a record has no ``run_directory`` (or ``path``)
      column
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    log_level = getattr(logging, args.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    # Without a 'filename' entry basicConfig() logs to stderr.
    if args.logfile:
        kwargs['filename'] = args.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger('prepare_seq_dsample_inputs')

    with open(args.in_file) as f:
        logger.info('Loading data from file %s', args.in_file)
        recs = list(csv.DictReader(f, delimiter='\t'))

    # The same run directory may appear on several input lines: keep one
    # entry per distinct name, sorted so that the row order of the data
    # samples file is the same on every run (set iteration order is not).
    run_directories = sorted(set(r['run_directory'] for r in recs))

    logger.info('Writing DataSample data to file %s', args.dsamples_out_file)
    write_dsamples_file(run_directories, args.dsamples_out_file, args.study)
    logger.info('Writing DataObjects data to file %s', args.dobjects_out_file)
    write_dobjects_file(recs, args.dobjects_out_file, args.study)


# Script entry point: pass the command line arguments (program name
# excluded) to main().
if __name__ == '__main__':
    main(sys.argv[1:])