diff galaxy-tools/biobank/utils/prepare_seq_out_inputs.py @ 3:43be74e62bfe draft

Uploaded
author ric
date Thu, 22 Sep 2016 08:57:04 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_seq_out_inputs.py	Thu Sep 22 08:57:04 2016 -0400
@@ -0,0 +1,99 @@
+"""
+This tool produces files that can be used as input to import
+ * SequencerOutput data samples
+ * SequencerOutput data objects
+within OMERO.biobank using import applications.
+
+The input file must be tab-separated, with a header row, like
+
+ run_directory                   path
+ 130418_SN194_0303_BC1NYHACXX    file:///SHARE/USERFS/els7/users/sequencing_data/completed/130418_SN194_0303_BC1NYHACXX/raw
+ 160418_SN194_0304_BCAZYHACXX    file:///SHARE/USERFS/els7/users/sequencing_data/completed/160418_SN194_0304_BCAZYHACXX/raw
+ ....
+"""
+
+import csv, sys, argparse, logging
+
+# Layout of each log record: timestamp|level name (padded to 8 characters)|message
+LOG_FORMAT = '%(asctime)s|%(levelname)-8s|%(message)s'
+# Timestamp layout used for %(asctime)s
+LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'
+# Values accepted by the --loglevel command line option
+LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+
+def make_parser():
+    """
+    Build the command line parser for this tool.
+
+    Options:
+     * --logfile            log destination (stderr when omitted)
+     * --loglevel           one of LOG_LEVELS, default INFO
+     * --in-file / -i       input file, required
+     * --dsamples-out-file  data samples output, default ./seq_out_dsamples.tsv
+     * --dobjects-out-file  data objects output, default ./seq_out_dobjects.tsv
+     * --study / -s         study label, required
+
+    Returns the configured argparse.ArgumentParser.
+    """
+    parser = argparse.ArgumentParser(description='build sequencer output import files')
+    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+                        help='logging level', default='INFO')
+    parser.add_argument('--in-file', '-i', type=str, required=True,
+                        help='input file')
+    parser.add_argument('--dsamples-out-file', type=str, default='./seq_out_dsamples.tsv',
+                        help='output file containing data samples definitions')
+    parser.add_argument('--dobjects-out-file', type=str, default='./seq_out_dobjects.tsv',
+                        help='output file containing data objects definitions')
+    # The study label ends up in the 'study' column of both output files.
+    parser.add_argument('--study', '-s', type=str, required=True,
+                        help='study label written to the output files')
+    return parser
+
+
+def write_dsamples_file(records, out_file, study_label):
+    """
+    Write the SequencerOutput data samples import file.
+
+    records     -- iterable of run directory names, each made of four
+                   '_'-separated fields with the device in the second and
+                   the flowcell in the fourth one
+                   (e.g. 130418_SN194_0303_BC1NYHACXX)
+    out_file    -- path of the tab-separated file to write (overwritten)
+    study_label -- value stored in the 'study' column of every row
+
+    Rows are written sorted by run directory name, so the output does not
+    depend on the iteration order of records (main() passes a set).
+
+    Raises ValueError, before out_file is opened, if a run directory name
+    does not have exactly four fields.
+    """
+
+    def parse_run_directory(run_dir):
+        fields = run_dir.split('_')
+        if len(fields) != 4:
+            raise ValueError('invalid run directory name %r: expected 4 '
+                             '"_"-separated fields, found %d'
+                             % (run_dir, len(fields)))
+        _, device, _, flowcell = fields
+        # The first character of the last field is dropped, the remainder
+        # is used as the source (FlowCell) label.
+        return device, flowcell[1:]
+
+    # Build every row up front: a malformed name must not leave a truncated
+    # or partially written output file behind.
+    rows = []
+    for run_dir in sorted(records):
+        device, flowcell = parse_run_directory(run_dir)
+        rows.append({'study': study_label,
+                     'label': run_dir,
+                     'source': flowcell,
+                     'source_type': 'FlowCell',
+                     'seq_dsample_type': 'SequencerOutput',
+                     'status': 'USABLE',
+                     'device': device})
+
+    out_file_header = ['study', 'label', 'source', 'source_type', 'seq_dsample_type',
+                       'status', 'device']
+    with open(out_file, 'w') as ofile:
+        writer = csv.DictWriter(ofile, out_file_header, delimiter='\t')
+        writer.writeheader()
+        writer.writerows(rows)
+
+def write_dobjects_file(records, out_file, study_label):
+    """
+    Write the SequencerOutput data objects import file.
+
+    One row is written to the tab-separated out_file for each item of
+    records, taken from its 'path' and 'run_directory' keys; study_label
+    fills the 'study' column.  The mimetype, size and sha1 columns always
+    carry the same fixed values.
+    """
+    columns = ['study', 'path', 'data_sample', 'mimetype', 'size', 'sha1']
+
+    def to_row(record):
+        return {
+            'study': study_label,
+            'path': record['path'],
+            'data_sample': record['run_directory'],
+            'mimetype': 'x-vl/illumina-run-folder',
+            'size': '-1',
+            'sha1': 'N.A.',
+        }
+
+    with open(out_file, 'w') as ofile:
+        writer = csv.DictWriter(ofile, columns, delimiter='\t')
+        writer.writeheader()
+        for record in records:
+            writer.writerow(to_row(record))
+
+
+def main(argv):
+    """
+    Tool entry point.
+
+    Parses argv, configures logging, loads the tab-separated input file and
+    writes the data samples file followed by the data objects file.
+    """
+    args = make_parser().parse_args(argv)
+
+    log_config = {
+        'format': LOG_FORMAT,
+        'datefmt': LOG_DATEFMT,
+        'level': getattr(logging, args.loglevel),
+    }
+    if args.logfile:
+        # Without a log file, basicConfig falls back to stderr.
+        log_config['filename'] = args.logfile
+    logging.basicConfig(**log_config)
+    logger = logging.getLogger('prepare_seq_dsample_inputs')
+
+    with open(args.in_file) as in_stream:
+        logger.info('Loading data from file %s', args.in_file)
+        recs = list(csv.DictReader(in_stream, delimiter='\t'))
+
+    logger.info('Writing DataSample data to file %s', args.dsamples_out_file)
+    # A run directory yields a single data sample, however many input rows
+    # mention it.
+    run_directories = set(rec['run_directory'] for rec in recs)
+    write_dsamples_file(run_directories, args.dsamples_out_file, args.study)
+
+    logger.info('Writing DataObjects data to file %s', args.dobjects_out_file)
+    write_dobjects_file(recs, args.dobjects_out_file, args.study)
+
+
+# When executed as a script, run the tool on the command line arguments
+# (program name excluded).
+if __name__ == '__main__':
+    main(sys.argv[1:])
\ No newline at end of file