Mercurial > repos > morinlab > cnv2igv
changeset 1:a81a8c524d3d draft
Uploaded
author | morinlab |
---|---|
date | Mon, 05 Dec 2016 14:22:04 -0500 |
parents | 55f246f1bdb3 |
children | 41b99b6dcfdc |
files | cnv2igv.py |
diffstat | 1 files changed, 348 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cnv2igv.py Mon Dec 05 14:22:04 2016 -0500 @@ -0,0 +1,348 @@ +import argparse +import re +import math + +""" +Input: +Requires filepath to a segmentation file. This segmentation +file will be converted to IGV format. Additionally, the +user is able to specify whether the segmentation file is +generated by 'sequenza' or 'titan' using the '--mode' +parameter. If 'sequenza' is specified in the '--mode' +parameter, the '--sequenza_sample' parameter must also be set. +The '--sequenza_sample' parameter specifies the sample name +for the 'sequenza' segmentation file. If '--include_loh' is +present, a column indicating whether the segment represents +a loss of heterozygosity. The '--include_loh' parameter only +works when '--mode' is to 'sequenza' or 'titan'. Alternatively, +if the segmentation file is neither 'sequenza' or 'titan', +then the user must specify the column indices (1-based numbering) +for the sample name ('--sample_col'), chromosome ('--chrm_col'), +start position ('--start_col'), end position ('--end_col') +in the segmentation file. The user must also specify the +column indices (1-based numbering) of either the log ratio +column ('--log_r_col'), the depth ratio column +('--d_ratio_col') or the absolute copy number column +('--abs_cn_col') in the segmentation file. If '--d_ratio_col' +or '--abs_cn_col' is specified then the values in the +specified columns will be used in calculating the log2 value. + +Output: +Outputs an IGV formatted segmentation file to standard output. +""" + + +def main(): + args = parse_args() + + check_arguments(args) + + args = add_to_MODES(args) + + header_checked = False + + mode = args.mode + seg_file = args.seg_file + use_abs_cn = args.use_abs_cn + as_integer = args.as_integer + force_header = args.force_header + prepend_chr = args.prepend_chr + human_readable_state = False + drop_header = args.drop_header + if args.oncocircos: + use_abs_cn = True + human_readable_state = True + drop_header = True + if args.gistic: + #as_integer = True + use_abs_cn = True + prepend_chr = True + #drop_header = True + include_loh = args.include_loh + + if force_header: + print "Sample\tChromosome\tStart\tEnd\tNum_Probes\tSegment_Mean" + elif drop_header: + pass + else: + if mode in [ 'sequenza', 'titan' ] and include_loh: + print 'ID\tchrom\tstart\tend\tLOH_flag\tlog.ratio' + else: + print 'ID\tchrom\tstart\tend\tlog.ratio' + + for seg in seg_file: + + if not header_checked: + p = re.compile('start', re.I) + m = re.search(p, seg) + + header_checked = True + + if m: + continue + + fields = seg.split('\t') + + string = fields[MODES[mode]['chrm_col']] + if string.startswith('"') and string.endswith('"'): + string = string[1:-1] + if prepend_chr: + if not string.startswith("chr"): + fields[MODES[mode]['chrm_col']] = "chr%s" % string + else: + fields[MODES[mode]['chrm_col']] = string + else: + fields[MODES[mode]['chrm_col']] = string + vals = [ fields[MODES[mode]['chrm_col']], fields[MODES[mode]['start_col']], + fields[MODES[mode]['end_col']] ] + + if mode == 'sequenza': + + vals = [ args.sequenza_sample ] + vals + + if include_loh == 'neutral': + a_allele = fields[ MODES[ mode ][ 'a_allele_col' ] ] + abs_cn = int(fields[ MODES[ mode ][ 'abs_cn_col' ] ]) + + if a_allele == 'NA': + vals = vals + [ 'NA' ] + elif abs_cn == 2 and int(a_allele) == 2: + vals = vals + [ '1' ] + else: + vals = vals + [ '0' ] + + elif include_loh == 'deletion': + a_allele = fields[ MODES[ mode ][ 'a_allele_col' ] ] + b_allele = fields[ MODES[ mode ][ 'b_allele_col' ] ] + abs_cn = int(fields[ MODES[ mode ][ 'abs_cn_col' ] ]) + + if a_allele == 'NA' or b_allele == 'NA': + vals = vals + [ 'NA' ] + elif abs_cn == 1 and (int(a_allele) + int(b_allele)) == 1: + vals = vals + [ '1' ] + else: + vals = vals + [ '0' ] + + elif include_loh == 'any': + b_allele = fields[ MODES[ mode ][ 'b_allele_col' ] ] + abs_cn = int(fields[ MODES[ mode ][ 'abs_cn_col' ] ]) + + if b_allele == 'NA': + vals = vals + [ 'NA' ] + elif abs_cn <= 2 and int(b_allele) == 0: + vals = vals + [ '1' ] + else: + vals = vals + [ '0' ] + + log_ratio = None + + if use_abs_cn: + abs_cn = int(fields[ MODES[ mode ][ 'abs_cn_col' ] ]) + + if abs_cn == 0: + log_ratio = '-Inf' + else: + log_ratio = math.log(abs_cn, 2) - 1 + if as_integer: + vals = vals + [abs_cn] + else: + vals = vals + [ log_ratio ] + else: + log_ratio = math.log(float(fields[MODES[mode]['d_ratio_col']]),2) + vals = vals + [ log_ratio ] + + if human_readable_state: + vals = vals + [ toState(fields[MODES[mode]['abs_cn_col'] ]) ] + + elif mode == 'titan': + vals = [ fields[ MODES[ mode ][ 'sample_col' ] ] ] + vals + + if include_loh: + loh_call = fields[ MODES[ mode ][ 'call_col' ] ] + + if include_loh == 'neutral': + + if loh_call == 'NLOH': + vals = vals + [ '1' ] + else: + vals = vals + [ '0' ] + + elif include_loh == 'deletion': + + if loh_call == 'DLOH': + vals = vals + [ ' 1' ] + else: + vals = vals + [ '0' ] + + elif include_loh == 'any': + + if 'LOH' in loh_call: + vals = vals + [ '1' ] + else: + vals = vals + [ '0' ] + if use_abs_cn: + abs_cn = int(fields[ MODES[ mode ][ 'abs_cn_col' ] ]) + if abs_cn == 0: + log_ratio = '-Inf' + else: + log_ratio = math.log(abs_cn, 2) - 1 + vals = vals + [log_ratio] + else: + vals = vals + [ fields[MODES[mode]['log_r_col'] ] ] + if human_readable_state: + vals = vals + [ toState(fields[MODES[mode]['abs_cn_col'] ]) ] + elif mode == 'other': + + if 'd_ratio_col' in MODES[mode]: + vals = vals + [ math.log(fields[MODES[mode]['d_ratio_col'] ],2) ] + + elif 'abs_cn_col' in MODES[mode]: + abs_cn = int(fields[ MODES[ mode ][ 'abs_cn_col' ] ]) + + log_ratio = None + + if abs_cn == 0: + log_ratio = '-Inf' + else: + log_ratio = math.log(abs_cn, 2) - 1 + if as_integer: + print "using INT: " + print abs_cn + print "\n" + vals = vals + [abs_cn] + else: + vals = vals + [ log_ratio ] + + elif 'log_r_col' in MODES[mode]: + vals = vals + [ fields[MODES[mode]['log_r_col'] ] ] + + if include_loh: + print '{}\t{}\t{}\t{}\t{}\t{}'.format(*vals) + else: + + if human_readable_state: + print '{}\t{}\t{}\t{}\t10\t{}\t{}'.format(*vals) + else: + print '{}\t{}\t{}\t{}\t10\t{}'.format(*vals) + + return + + +def add_to_MODES(args): + mode = args.mode + + if mode not in MODES.keys(): + mode = 'other' + args.mode = mode + MODES[mode] = {} + + MODES[mode]['sample_col'] = args.sample_col - 1 + MODES[mode]['chrm_col'] = args.chrm_col - 1 + MODES[mode]['start_col'] = args.start_col - 1 + MODES[mode]['end_col'] = args.end_col - 1 + + if args.log_r_col: + MODES[mode]['log_r_col'] = args.log_r_col - 1 + elif args.d_ratio_col: + MODES[mode]['d_ratio_col'] = args.d_ratio_col - 1 + elif args.abs_cn_col: + MODES[mode]['abs_cn_col'] = args.abs_cn_col - 1 + + return args + +def toState(cns): + cns = int(cns) + cnstate = "NEUT" + if cns > 2: + if cns > 3: + cnstate = "AMP" + else: + cnstate = "GAIN" + elif cns == 1: + cnstate = "HETD" + elif cns == 0: + cnstate = "HOMD" + return cnstate + + +def check_arguments(args): + + if args.mode == 'sequenza' and not args.sequenza_sample: + raise ValueError("Must specify '--sequenza_sample' when '--mode' is set to 'sequenza'.") + + if not args.mode: + if not all([args.sample_col, args.chrm_col, args.start_col, args.end_col]) and not any([args.log_r_col, args.d_ratio_col, args.abs_cn_col]): + raise ValueError("Specify '--sample_col', '--chrm_col', '--start_col', '--end_col'. Also specify either '--log_r_col', '--d_ratio_col' or '--abs_cn_col'.") + + if args.log_r_col and args.d_ratio_col: + raise ValueError("Cannot specify both '--log_r_col' and '--d_ratio_col'.") + + return + + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument('seg_file', type=argparse.FileType('r'), + help='Segmentation file to convert to IGV format.') + parser.add_argument('--mode', choices=MODES.keys(), + help='Indiciate whether segmentation file is Sequenza or TITAN.') + parser.add_argument('--sequenza_sample', + help='Specify sample name for Sequenza segmentation file.') + parser.add_argument('--use_abs_cn', action='store_true', + help="Use absolute copy number when calculating log2 value instead of \ + depth ratio when '--mode' is set to 'sequenza'.") + parser.add_argument('--include_loh', choices=['neutral', 'deletion', 'any'], + help="Include a column indicating whether the segment represents a loss \ + heterozygosity. Only works when '--mode' is set to 'sequenza or 'titan'.") + parser.add_argument('--sample_col', type=int, + help='1-based index of sample name column in segmentation file.') + parser.add_argument('--chrm_col', type=int, + help='1-based index of chromosome column in segmentation file.') + parser.add_argument('--start_col', type=int, + help='1-based index of segment start position column in segmentation file.') + parser.add_argument('--end_col', type=int, + help='1-based index of segment end position column in segmentation file.') + parser.add_argument('--log_r_col', type=int, + help='1-based index of log ratio column in segmentation file.') + parser.add_argument('--d_ratio_col', type=int, + help='1-based index of depth ratio column in segmentation file.') + parser.add_argument('--abs_cn_col', type=int, + help='1-based index of absolute copy number column in segmentation file.') + parser.add_argument('--drop_header', action='store_true', help='Flag to force script to not write a header to the output') + parser.add_argument('--as_integer', action='store_true', + help='Flag to use untransformed value of absolute copy number instead of log2 ratio') + parser.add_argument('--prepend_chr', action='store_true', help='Flag to force a chr prefex be added to all rows (unless already present)') + parser.add_argument('--oncocircos',action='store_true',help='special mode to generate input for the Oncoricos tool') + parser.add_argument('--gistic',action='store_true',help='special mode to generate input for the GISTIC2.0 tool') + parser.add_argument('--force_header',action='store_true') + + args = parser.parse_args() + + return args + +MODES = { + 'titan' : + { + 'sample_col': 0, + 'chrm_col': 1, + 'start_col': 2, + 'end_col': 3, + 'log_r_col': 6, # Log Ratio + 'call_col': 8, + 'abs_cn_col': 9 #copy number based on state + }, + 'sequenza' : + { + 'chrm_col': 0, + 'start_col': 1, + 'end_col': 2, + 'd_ratio_col': 6, # Depth Ratio + 'abs_cn_col': 9, # Absolute copy number + 'a_allele_col': 10, + 'b_allele_col': 11 + } + } + +if __name__ == '__main__': + main()