diff galaxy-tools/biobank/utils/prepare_individuals_import.py @ 0:ba6cf6ede027 draft default tip

Uploaded
author ric
date Wed, 28 Sep 2016 06:03:30 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/prepare_individuals_import.py	Wed Sep 28 06:03:30 2016 -0400
@@ -0,0 +1,103 @@
+"""
+Split a file like::
+
+  individual    gender  father       mother
+  ASTUDY:2141   MALE    ASTUDY:12    ASTUDY:12341
+  ASTUDY:415    MALE    ASTUDY:3562  ASTUDY:13612
+
+into two separate TSV files: the first one is used to import new individuals and enrollments,
+the second one is used to update father and mother information for the individuals in the first
+file.
+"""
+
+import sys, argparse, csv
+from bl.vl.utils import LOG_LEVELS, get_logger
+
+
+def get_parser():
+    parser = argparse.ArgumentParser('Prepare input files for individuals import workflow')
+    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
+    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
+                        help='logging level', default='INFO')
+    parser.add_argument('--in-file', type=str, required=True,
+                        help='input TSV file')
+    parser.add_argument('--out-individuals', type=str, required=True,
+                        help='input file with Individuals definitions')
+    parser.add_argument('--out-parents', type=str, required=True,
+                        help='input file with parents definitions')
+    return parser
+
+
+def get_individual_definitions(records, logger):
+    logger.info('Creating individual definitions')
+    ind_defs = []
+    for rec in records:
+        try:
+            idef = {'father': 'None', 'mother': 'None'}
+            idef['gender'] = rec['gender']
+            try:
+                idef['study'], idef['label'] = rec['individual'].split(':')
+            except ValueError:
+                logger.error('Skipped record %r, wrong label format for %s', rec, rec['individual'])
+                continue
+        except KeyError, ke:
+            logger.error('Skipped record %r, missing key %s', rec, ke)
+            continue
+        ind_defs.append(idef)
+    logger.info('Retrieved %d individual definitions', len(ind_defs))
+    return ind_defs
+
+
+def get_parents_definitions(records, logger):
+    logger.info('Creating parents definitions')
+    parents_defs = []
+    for rec in records:
+        try:
+            pdef = dict()
+            pdef['individual'] = rec['individual']
+            if rec['father'] != 'None' or rec['mother'] != 'None':
+                pdef['father'] = rec['father']
+                pdef['mother'] = rec['mother']
+                parents_defs.append(pdef)
+            else:
+                continue
+        except KeyError, ke:
+            logger.error('Skipped record %r, missing key %s', rec, ke)
+            continue
+    logger.info('Retrieved %d parents definitions', len(parents_defs))
+    return parents_defs
+
+
+def main(argv):
+    parser = get_parser()
+    args = parser.parse_args(argv)
+
+    logger = get_logger('prepare_individuals_import', level=args.loglevel,
+                        filename=args.logfile)
+
+    logger.info('Start processing file %s', args.in_file)
+
+    with open(args.in_file) as in_file:
+        reader = csv.DictReader(in_file, delimiter='\t')
+        records = [row for row in reader]
+        logger.info('Loaded %d records', len(records))
+
+    individual_defs = get_individual_definitions(records, logger)
+    with open(args.out_individuals, 'w') as inds_out:
+        inds_writer = csv.DictWriter(inds_out,
+                                     ['study', 'label', 'gender', 'father', 'mother'],
+                                     delimiter='\t')
+        inds_writer.writeheader()
+        inds_writer.writerows(individual_defs)
+
+    parents_defs = get_parents_definitions(records, logger)
+    with open(args.out_parents, 'w') as parents_out:
+        parents_writer = csv.DictWriter(parents_out, ['individual', 'father', 'mother'],
+                                        delimiter='\t')
+        parents_writer.writeheader()
+        parents_writer.writerows(parents_defs)
+
+    logger.info('Job completed')
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
\ No newline at end of file