diff galaxy-tools/biobank/utils/split_by_study.py @ 3:43be74e62bfe draft

Uploaded
author ric
date Thu, 22 Sep 2016 08:57:04 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-tools/biobank/utils/split_by_study.py	Thu Sep 22 08:57:04 2016 -0400
@@ -0,0 +1,90 @@
+"""
+Split a file like::
+
+  individual    gender  father       mother
+  ASTUDY:2141   MALE    ASTUDY:12    ASTUDY:12341
+  ASTUDY:415    MALE    ASTUDY:3562  ASTUDY:13612
+  BSTUDY:12515  FEMALE  BSTUDY:3512  BSTUDY:124
+
+into multiple files based on the STUDY value of the label stored in the "individual" column.
+Each label in the "individual" column must have a STUDY:ENROLLMENT_CODE format, otherwise the line
+will be skipped.
+"""
+
+import sys, argparse, csv, os
+from bl.vl.utils import LOG_LEVELS, get_logger
+
+
def get_parser():
    """Build the command-line parser for the split-by-study tool.

    Returns:
        argparse.ArgumentParser: parser accepting logging options, the
        required input TSV file, an optional Galaxy record ID and an
        output directory (defaults to the current directory).
    """
    # BUGFIX: the sentence was previously passed as the first positional
    # argument of ArgumentParser, which is `prog` (the program name shown in
    # usage lines), not `description`. Pass it as `description` so --help
    # renders correctly.
    parser = argparse.ArgumentParser(
        description='Split a file containing pedigree information into multiple files using the study as split criteria')
    parser.add_argument('--logfile', type=str, help='log file (default=stderr)')
    parser.add_argument('--loglevel', type=str, choices=LOG_LEVELS,
                        help='logging level', default='INFO')
    parser.add_argument('--in-file', type=str, required=True,
                        help='input TSV file')
    parser.add_argument('--record-id', type=int,
                        help='Output ID record given by Galaxy')
    parser.add_argument('--out-path', type=str, help='Output directory',
                        default='.')
    return parser
+
+
def split_element(element, logger):
    """Break a label of the form ``STUDY:ENROLLMENT_CODE`` into its two parts.

    Args:
        element: the label string to split.
        logger: logger used to report malformed labels.

    Returns:
        tuple: ``(study, code)`` on success, ``(None, None)`` when *element*
        does not contain exactly one ``:`` separator.
    """
    parts = element.split(':')
    # A well-formed label yields exactly two fields; anything else
    # (no colon, or more than one) is rejected.
    if len(parts) != 2:
        logger.error('Label %s is not a label with format STUDY:ENROLLMENT_CODE', element)
        return None, None
    return parts[0], parts[1]
+
+
def map_by_study(records, logger):
    """Group pedigree records by the STUDY prefix of their 'individual' label.

    Records whose 'individual' value is not in STUDY:ENROLLMENT_CODE format
    are logged and skipped.

    Args:
        records: iterable of dicts, each with an 'individual' key.
        logger: logger for error/progress reporting.

    Returns:
        dict: study label -> list of records belonging to that study.
    """
    grouped = {}
    for record in records:
        label = record['individual']
        try:
            study, code = label.split(':')
        except ValueError:
            # Wrong number of ':'-separated fields: not a valid label.
            logger.error('Label %s is not a label with format STUDY:ENROLLMENT_CODE', label)
            study = code = None
        if not study and not code:
            logger.debug('Skipping record %r', record)
            continue
        grouped.setdefault(study, []).append(record)
    logger.info('Records splitted between %d studies', len(grouped))
    return grouped
+
+
def dump_records(study_label, records, header, output_path, logger, galaxy_record_id=None):
    """Write the records of a single study to a TSV file under *output_path*.

    Args:
        study_label: the study the records belong to (used in the file name).
        records: list of dicts to dump.
        header: column names written as the TSV header row.
        output_path: destination directory.
        logger: logger for progress reporting.
        galaxy_record_id: when given, the file is named with Galaxy's
            multi-output dataset convention instead of the plain
            ``<study>_individuals.tsv`` form.
    """
    if galaxy_record_id:
        # Galaxy's discovered-dataset naming scheme; underscores in the
        # study label are swapped for dashes to keep the name parseable.
        file_name = 'primary_%d_%s_visible_tabular' % (
            galaxy_record_id, study_label.replace('_', '-'))
    else:
        file_name = '%s_individuals.tsv' % study_label
    destination = os.path.join(output_path, file_name)
    with open(destination, 'w') as out_stream:
        logger.info('Dumping %d records to file %s', len(records), destination)
        tsv_writer = csv.DictWriter(out_stream, header, delimiter='\t')
        tsv_writer.writeheader()
        tsv_writer.writerows(records)
+
+
def main(argv):
    """Entry point: read the input TSV, split it by study, dump one file per study.

    Args:
        argv: command-line arguments (without the program name),
            as produced by ``sys.argv[1:]``.
    """
    parser = get_parser()
    args = parser.parse_args(argv)

    logger = get_logger('split_by_study', level=args.loglevel, filename=args.logfile)

    logger.info('Start processing file %s', args.in_file)
    with open(args.in_file) as in_file:
        reader = csv.DictReader(in_file, delimiter='\t')
        records = list(reader)

    records_map = map_by_study(records, logger)
    # Force the header of the output files in order to prevent problems when running the workflow later
    header = ['individual', 'gender', 'father', 'mother']
    # BUGFIX: .iteritems() is Python-2-only; .items() behaves identically
    # under Python 2 here (the dict is small) and keeps the script runnable
    # under Python 3.
    for study, study_records in records_map.items():
        dump_records(study, study_records, header, args.out_path, logger, args.record_id)
    logger.info('Job completed')
+
+
if __name__ == '__main__':
    # Script entry point: forward the CLI arguments (minus the program name) to main().
    main(sys.argv[1:])
\ No newline at end of file