diff galaxy-tools/biobank/utils/prepare_enrollments_import.py @ 0:ba6cf6ede027 draft default tip

Uploaded
author ric
date Wed, 28 Sep 2016 06:03:30 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_enrollments_import.py	Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,97 @@
+"""
+Split a file like::
+
+source                              enrollment
+V044DE795E7F9F42FEB9855288CF577A77  ASTUDY:2141
+V06C59B915C0FD47DABE6AE02C731780AF  BSTUDY:390
+
+into a new TSV file like::
+
+source                              study  label
+V044DE795E7F9F42FEB9855288CF577A77  ASTUDY 2141
+V06C59B915C0FD47DABE6AE02C731780AF  BSTUDY 390
+
+"""
+
+import sys, argparse, csv
+from bl.vl.utils import LOG_LEVELS, get_logger
+
+
+def get_parser():
+    """
+    Build the command line parser for this script.
+
+    The parser accepts ``--logfile``, ``--loglevel`` (one of LOG_LEVELS,
+    default ``INFO``) and the two required options ``--in-file`` and
+    ``--out-enrollments``.
+
+    :return: the configured ``argparse.ArgumentParser``
+    """
+    # The text must be passed as ``description``: the first positional
+    # parameter of ArgumentParser is ``prog``, so passing it positionally
+    # would replace the program name shown in the usage line.
+    parser = argparse.ArgumentParser(
+        description='Prepare input files for enrollments import workflow'
+    )
+    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+                        help='logging level', default='INFO')
+    parser.add_argument('--in-file', type=str, required=True,
+                        help='input TSV file')
+    # This option names the file written by main(), hence "output file".
+    parser.add_argument('--out-enrollments', type=str, required=True,
+                        help='output file with Enrollments definitions')
+    return parser
+
+
+def get_enrollments_definitions(records, logger):
+    """
+    Convert input records into enrollment definitions.
+
+    Each record must provide a ``source`` key and an ``enrollment`` key whose
+    value has the form ``STUDY:LABEL``. A record missing one of the two keys,
+    or whose enrollment value does not split into exactly two parts on ``:``,
+    is reported through ``logger.error`` and skipped.
+
+    :param records: iterable of dict-like rows
+    :param logger: object exposing logging-style ``info`` and ``error``
+    :return: list of dicts with ``source``, ``study`` and ``label`` keys
+    """
+    logger.info('Creating enrollment definitions')
+    enr_defs = []
+    for rec in records:
+        try:
+            source = rec['source']
+            enrollment = rec['enrollment']
+        except KeyError as ke:
+            logger.error('Skipped record %r, missing key %s', rec, ke)
+            continue
+        try:
+            study, label = enrollment.split(':')
+        except (ValueError, AttributeError):
+            # ValueError: the value does not split into exactly two parts.
+            # AttributeError: the value is None, which is what csv.DictReader
+            # stores for fields missing from a short row.
+            logger.error('Skipped record %r, wrong label format for %s', rec, enrollment)
+            continue
+        enr_defs.append({'source': source, 'study': study, 'label': label})
+    logger.info('Retrieved %d enrollment definitions', len(enr_defs))
+    return enr_defs
+
+
+def get_parents_definitions(records, logger):
+    """
+    Extract parents definitions from input records.
+
+    A definition is produced for every record in which at least one of
+    ``father`` and ``mother`` differs from the string ``'None'``. Records
+    missing the ``individual``, ``father`` or ``mother`` key are reported
+    through ``logger.error`` and skipped.
+
+    :param records: iterable of dict-like rows
+    :param logger: object exposing logging-style ``info`` and ``error``
+    :return: list of dicts with ``individual``, ``father`` and ``mother`` keys
+    """
+    logger.info('Creating parents definitions')
+    parents_defs = []
+    for rec in records:
+        try:
+            individual = rec['individual']
+            father = rec['father']
+            mother = rec['mother']
+        except KeyError as ke:
+            logger.error('Skipped record %r, missing key %s', rec, ke)
+            continue
+        # Nothing to record when both parents are the 'None' placeholder.
+        if father == 'None' and mother == 'None':
+            continue
+        parents_defs.append({
+            'individual': individual,
+            'father': father,
+            'mother': mother,
+        })
+    logger.info('Retrieved %d parents definitions', len(parents_defs))
+    return parents_defs
+
+
+def main(argv):
+    """
+    Script entry point.
+
+    Parses ``argv``, loads every row of the tab-separated input file, turns
+    the rows into enrollment definitions and writes them, preceded by a
+    header line, to the tab-separated output file.
+
+    :param argv: list of command line arguments, program name excluded
+    """
+    args = get_parser().parse_args(argv)
+
+    logger = get_logger('prepare_enrollments_import', level=args.loglevel,
+                        filename=args.logfile)
+
+    logger.info('Start processing file %s', args.in_file)
+
+    # Load the whole input up front so the file can be closed before writing.
+    with open(args.in_file) as tsv_in:
+        records = list(csv.DictReader(tsv_in, delimiter='\t'))
+        logger.info('Loaded %d records', len(records))
+
+    enrollment_defs = get_enrollments_definitions(records, logger)
+
+    out_columns = ['source', 'study', 'label']
+    with open(args.out_enrollments, 'w') as tsv_out:
+        writer = csv.DictWriter(tsv_out, out_columns, delimiter='\t')
+        writer.writeheader()
+        writer.writerows(enrollment_defs)
+
+    logger.info('Job completed')
+
+# When run as a script, hand the command line arguments (without the
+# program name) to main().
+if __name__ == '__main__':
+    main(sys.argv[1:])