Mercurial > repos > ric > test1
diff galaxy-tools/biobank/utils/prepare_seq_out_inputs.py @ 3:43be74e62bfe draft
Uploaded
author | ric |
---|---|
date | Thu, 22 Sep 2016 08:57:04 -0400 |
parents | |
children |
line wrap: on
line diff
"""
This tool produces files that can be used as input to import
 * SequencerOutput data samples
 * SequencerOutput data objects
within OMERO.biobank using import applications.

The input file must be tab-separated and start with a header line naming
the ``run_directory`` and ``path`` columns, like

  run_directory                 path
  130418_SN194_0303_BC1NYHACXX  file:///SHARE/USERFS/els7/users/sequencing_data/completed/130418_SN194_0303_BC1NYHACXX/raw
  160418_SN194_0304_BCAZYHACXX  file:///SHARE/USERFS/els7/users/sequencing_data/completed/160418_SN194_0304_BCAZYHACXX/raw
  ....
"""

import argparse
import csv
import logging
import sys

LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']

# Columns that main() reads from every input record.
_REQUIRED_COLUMNS = ('run_directory', 'path')


def make_parser():
    """Build the command line parser for this tool."""
    parser = argparse.ArgumentParser(description='build sequencer output import files')
    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', '-i', type=str, required=True,
                        help='input file')
    parser.add_argument('--dsamples-out-file', type=str, default='./seq_out_dsamples.tsv',
                        help='output file containing data samples definitions')
    parser.add_argument('--dobjects-out-file', type=str, default='./seq_out_dobjects.tsv',
                        help='output file containing data objects definitions')
    parser.add_argument('--study', '-s', type=str, required=True,
                        help='study label written to both output files')
    return parser


def _parse_run_directory(run_dir):
    """Split a run directory name into its (device, flowcell) pair.

    The name must consist of exactly four underscore-separated fields; the
    second one is returned as the device and the fourth one, without its
    first character, as the flowcell.

    Raises ValueError if the name does not have exactly four fields.
    """
    fields = run_dir.split('_')
    if len(fields) != 4:
        raise ValueError(
            'invalid run directory name %r: expected 4 underscore-separated '
            'fields, found %d' % (run_dir, len(fields)))
    _, device, _, flowcell = fields
    # NOTE(review): the dropped leading character is presumably the flowcell
    # position letter of the sequencer -- confirm against the naming scheme.
    return device, flowcell[1:]


def write_dsamples_file(records, out_file, study_label):
    """Write the data samples definitions as a tab-separated file.

    records is an iterable of run directory names; one row is written for
    each of them, in iteration order. out_file is the path of the file to
    create and study_label fills the 'study' column of every row.

    Raises ValueError if a run directory name cannot be parsed; in that case
    out_file is left untouched.
    """
    out_file_header = ['study', 'label', 'source', 'source_type', 'seq_dsample_type',
                       'status', 'device']
    # Build every row before opening the output so that a malformed run
    # directory name cannot leave a truncated file behind.
    rows = []
    for run_dir in records:
        device, flowcell = _parse_run_directory(run_dir)
        rows.append({'study': study_label,
                     'label': run_dir,
                     'source': flowcell,
                     'source_type': 'FlowCell',
                     'seq_dsample_type': 'SequencerOutput',
                     'status': 'USABLE',
                     'device': device})
    with open(out_file, 'w') as ofile:
        writer = csv.DictWriter(ofile, out_file_header, delimiter='\t')
        writer.writeheader()
        writer.writerows(rows)


def write_dobjects_file(records, out_file, study_label):
    """Write the data objects definitions as a tab-separated file.

    records is an iterable of mappings providing the 'path' and
    'run_directory' keys; one row is written for each of them. out_file is
    the path of the file to create and study_label fills the 'study' column
    of every row.
    """
    out_file_header = ['study', 'path', 'data_sample', 'mimetype', 'size', 'sha1']
    with open(out_file, 'w') as ofile:
        writer = csv.DictWriter(ofile, out_file_header, delimiter='\t')
        writer.writeheader()
        for rec in records:
            # Size and checksum are not computed here: fixed placeholder
            # values are written instead.
            writer.writerow({'study': study_label,
                             'path': rec['path'],
                             'data_sample': rec['run_directory'],
                             'mimetype': 'x-vl/illumina-run-folder',
                             'size': '-1',
                             'sha1': 'N.A.'})


def main(argv=None):
    """Read the input file and write the two import files.

    argv is the list of command line arguments (without the program name);
    when it is None, argparse falls back to sys.argv[1:].

    Raises ValueError if the input file header lacks a required column or
    if a run directory name cannot be parsed.
    """
    parser = make_parser()
    args = parser.parse_args(argv)

    log_level = getattr(logging, args.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if args.logfile:
        kwargs['filename'] = args.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger('prepare_seq_dsample_inputs')

    with open(args.in_file) as f:
        logger.info('Loading data from file %s', args.in_file)
        reader = csv.DictReader(f, delimiter='\t')
        recs = list(reader)
        # Read while the file is still open; None means the file was empty.
        fieldnames = reader.fieldnames

    if fieldnames is not None:
        missing = [col for col in _REQUIRED_COLUMNS if col not in fieldnames]
        if missing:
            msg = 'input file %s lacks required column(s): %s' % (
                args.in_file, ', '.join(missing))
            logger.error(msg)
            raise ValueError(msg)

    # Keep one entry per run directory, in order of first appearance, so
    # that the data samples file is reproducible from run to run.
    run_dirs = []
    seen = set()
    for rec in recs:
        run_dir = rec['run_directory']
        if run_dir not in seen:
            seen.add(run_dir)
            run_dirs.append(run_dir)

    logger.info('Writing DataSample data to file %s', args.dsamples_out_file)
    write_dsamples_file(run_dirs, args.dsamples_out_file, args.study)
    logger.info('Writing DataObjects data to file %s', args.dobjects_out_file)
    write_dobjects_file(recs, args.dobjects_out_file, args.study)


if __name__ == '__main__':
    main(sys.argv[1:])