changeset 1:a81a8c524d3d draft

Uploaded
author morinlab
date Mon, 05 Dec 2016 14:22:04 -0500
parents 55f246f1bdb3
children 41b99b6dcfdc
files cnv2igv.py
diffstat 1 files changed, 348 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cnv2igv.py	Mon Dec 05 14:22:04 2016 -0500
@@ -0,0 +1,348 @@
+import argparse
+import re
+import math
+
+"""
+Input:
+Requires filepath to a segmentation file. This segmentation
+file will be converted to IGV format. Additionally, the
+user is able to specify whether the segmentation file is
+generated by 'sequenza' or 'titan' using the '--mode'
+parameter. If 'sequenza' is specified in the '--mode'
+parameter, the '--sequenza_sample' parameter must also be set.
+The '--sequenza_sample' parameter specifies the sample name
+for the 'sequenza' segmentation file. If '--include_loh' is
+present, a column indicating whether the segment represents
+a loss of heterozygosity. The '--include_loh' parameter only
+works when '--mode' is to 'sequenza' or 'titan'. Alternatively,
+if the segmentation file is neither 'sequenza' or 'titan',
+then the user must specify the column indices (1-based numbering)
+for the sample name ('--sample_col'), chromosome ('--chrm_col'),
+start position ('--start_col'), end position ('--end_col')
+in the segmentation file. The user must also specify the
+column indices (1-based numbering) of either the log ratio
+column ('--log_r_col'), the depth ratio column
+('--d_ratio_col') or the absolute copy number column
+('--abs_cn_col') in the segmentation file. If '--d_ratio_col'
+or '--abs_cn_col' is specified then the values in the
+specified columns will be used in calculating the log2 value.
+
+Output:
+Outputs an IGV formatted segmentation file to standard output.
+"""
+
+
+def main():
+    args = parse_args()
+
+    check_arguments(args)
+
+    args = add_to_MODES(args)
+
+    header_checked = False
+
+    mode = args.mode
+    seg_file = args.seg_file
+    use_abs_cn = args.use_abs_cn
+    as_integer = args.as_integer
+    force_header = args.force_header
+    prepend_chr = args.prepend_chr
+    human_readable_state = False
+    drop_header = args.drop_header
+    if args.oncocircos:
+        use_abs_cn = True
+        human_readable_state = True
+        drop_header = True
+    if args.gistic:
+        #as_integer = True
+        use_abs_cn = True
+        prepend_chr = True
+        #drop_header = True
+    include_loh = args.include_loh
+    
+    if force_header:
+        print "Sample\tChromosome\tStart\tEnd\tNum_Probes\tSegment_Mean"
+    elif drop_header:
+        pass
+    else:
+        if mode in [ 'sequenza', 'titan' ] and include_loh:
+            print 'ID\tchrom\tstart\tend\tLOH_flag\tlog.ratio' 
+        else:
+            print 'ID\tchrom\tstart\tend\tlog.ratio'
+
+    for seg in seg_file:
+
+        if not header_checked:
+            p = re.compile('start', re.I)
+            m = re.search(p, seg)
+
+            header_checked = True
+
+            if m:
+                continue
+
+        fields = seg.split('\t')
+        
+        string = fields[MODES[mode]['chrm_col']]
+        if string.startswith('"') and string.endswith('"'):
+            string = string[1:-1]
+        if prepend_chr:
+            if not string.startswith("chr"):
+                fields[MODES[mode]['chrm_col']] = "chr%s" % string
+            else:
+                fields[MODES[mode]['chrm_col']] = string
+        else:
+            fields[MODES[mode]['chrm_col']] = string
+        vals = [ fields[MODES[mode]['chrm_col']], fields[MODES[mode]['start_col']],
+                 fields[MODES[mode]['end_col']] ]
+
+        if mode == 'sequenza':
+            
+            vals = [ args.sequenza_sample ] + vals
+
+            if include_loh == 'neutral':
+                a_allele = fields[ MODES[ mode ][ 'a_allele_col' ] ]
+                abs_cn = int(fields[ MODES[ mode ][ 'abs_cn_col' ] ])
+
+                if a_allele == 'NA': 
+                    vals = vals + [ 'NA' ]
+                elif abs_cn == 2 and int(a_allele) == 2:
+                    vals = vals + [ '1' ]
+                else:
+                    vals = vals + [ '0' ]
+
+            elif include_loh == 'deletion':
+                a_allele = fields[ MODES[ mode ][ 'a_allele_col' ] ]
+                b_allele = fields[ MODES[ mode ][ 'b_allele_col' ] ]
+                abs_cn = int(fields[ MODES[ mode ][ 'abs_cn_col' ] ])
+
+                if a_allele == 'NA' or b_allele == 'NA':
+                    vals = vals + [ 'NA' ]
+                elif abs_cn == 1 and (int(a_allele) + int(b_allele)) == 1:
+                    vals = vals + [ '1' ]
+                else:
+                    vals = vals + [ '0' ]
+
+            elif include_loh == 'any':
+                b_allele = fields[ MODES[ mode ][ 'b_allele_col' ] ]
+                abs_cn = int(fields[ MODES[ mode ][ 'abs_cn_col' ] ])
+
+                if b_allele == 'NA':
+                    vals = vals + [ 'NA' ]
+                elif abs_cn <= 2 and int(b_allele) == 0:
+                    vals = vals + [ '1' ]
+                else:
+                    vals = vals + [ '0' ]
+
+            log_ratio = None
+
+            if use_abs_cn:
+                abs_cn = int(fields[ MODES[ mode ][ 'abs_cn_col' ] ])
+
+                if abs_cn == 0:
+                    log_ratio = '-Inf'
+                else:
+                    log_ratio = math.log(abs_cn, 2) - 1
+                if as_integer:
+                    vals = vals + [abs_cn]
+                else:
+                    vals = vals + [ log_ratio ]
+            else:
+                log_ratio = math.log(float(fields[MODES[mode]['d_ratio_col']]),2)
+                vals = vals + [ log_ratio ]
+
+            if human_readable_state:
+                vals = vals + [ toState(fields[MODES[mode]['abs_cn_col'] ]) ]
+
+        elif mode == 'titan':
+            vals = [ fields[ MODES[ mode ][ 'sample_col' ] ] ] + vals 
+
+            if include_loh:
+                loh_call = fields[ MODES[ mode ][ 'call_col' ] ]
+
+                if include_loh == 'neutral':
+
+                    if loh_call == 'NLOH':
+                        vals = vals + [ '1' ]
+                    else:
+                        vals = vals + [ '0' ]
+
+                elif include_loh == 'deletion':
+
+                    if loh_call == 'DLOH':
+                        vals = vals + [ ' 1' ]
+                    else:
+                        vals = vals + [ '0' ]
+
+                elif include_loh == 'any':
+
+                    if 'LOH' in loh_call:
+                        vals = vals + [ '1' ]
+                    else:
+                        vals = vals + [ '0' ]
+            if use_abs_cn:
+                abs_cn = int(fields[ MODES[ mode ][ 'abs_cn_col' ] ])
+                if abs_cn == 0:
+                    log_ratio = '-Inf'
+                else:
+                    log_ratio = math.log(abs_cn, 2) - 1
+                vals = vals + [log_ratio]
+            else:
+                vals = vals + [ fields[MODES[mode]['log_r_col'] ] ]
+            if human_readable_state:
+                vals = vals + [ toState(fields[MODES[mode]['abs_cn_col'] ]) ]
+        elif mode == 'other':
+
+            if 'd_ratio_col' in MODES[mode]:
+                vals = vals + [ math.log(fields[MODES[mode]['d_ratio_col'] ],2) ]
+
+            elif 'abs_cn_col' in MODES[mode]:
+                abs_cn = int(fields[ MODES[ mode ][ 'abs_cn_col' ] ])
+
+                log_ratio = None
+
+                if abs_cn == 0:
+                    log_ratio = '-Inf'
+                else:
+                    log_ratio = math.log(abs_cn, 2) - 1
+                if as_integer:
+                    print "using INT: "
+                    print abs_cn
+                    print "\n"
+                    vals = vals + [abs_cn]
+                else:
+                    vals = vals + [ log_ratio ]
+
+            elif 'log_r_col' in MODES[mode]:
+                vals = vals + [ fields[MODES[mode]['log_r_col'] ] ]
+
+        if include_loh:
+            print '{}\t{}\t{}\t{}\t{}\t{}'.format(*vals)
+        else:
+            
+            if human_readable_state:
+                print '{}\t{}\t{}\t{}\t10\t{}\t{}'.format(*vals)
+            else:
+                print '{}\t{}\t{}\t{}\t10\t{}'.format(*vals)
+
+    return
+
+
+def add_to_MODES(args):
+    mode = args.mode
+
+    if mode not in MODES.keys():
+        mode = 'other'
+        args.mode = mode
+        MODES[mode] = {}
+
+        MODES[mode]['sample_col'] = args.sample_col - 1
+        MODES[mode]['chrm_col'] = args.chrm_col - 1
+        MODES[mode]['start_col'] = args.start_col - 1
+        MODES[mode]['end_col'] = args.end_col - 1
+
+        if args.log_r_col:
+            MODES[mode]['log_r_col'] = args.log_r_col - 1
+        elif args.d_ratio_col:
+            MODES[mode]['d_ratio_col'] = args.d_ratio_col - 1
+        elif args.abs_cn_col:
+            MODES[mode]['abs_cn_col'] = args.abs_cn_col - 1
+
+    return args
+
+def toState(cns):
+    cns = int(cns)
+    cnstate = "NEUT"
+    if cns > 2:
+        if cns > 3:
+            cnstate = "AMP"
+        else:
+            cnstate = "GAIN"
+    elif cns == 1:
+        cnstate = "HETD"
+    elif cns == 0:
+        cnstate = "HOMD"
+    return cnstate
+    
+
+def check_arguments(args):
+
+    if args.mode == 'sequenza' and not args.sequenza_sample:
+        raise ValueError("Must specify '--sequenza_sample' when '--mode' is set to 'sequenza'.")
+
+    if not args.mode:
+        if not all([args.sample_col, args.chrm_col, args.start_col, args.end_col]) and not any([args.log_r_col, args.d_ratio_col, args.abs_cn_col]):
+            raise ValueError("Specify '--sample_col', '--chrm_col', '--start_col', '--end_col'. Also specify either '--log_r_col', '--d_ratio_col' or '--abs_cn_col'.")
+
+        if args.log_r_col and args.d_ratio_col:
+            raise ValueError("Cannot specify both '--log_r_col' and '--d_ratio_col'.")
+
+    return
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('seg_file', type=argparse.FileType('r'),
+                        help='Segmentation file to convert to IGV format.')
+    parser.add_argument('--mode', choices=MODES.keys(),
+                        help='Indiciate whether segmentation file is Sequenza or TITAN.')
+    parser.add_argument('--sequenza_sample',
+                        help='Specify sample name for Sequenza segmentation file.')
+    parser.add_argument('--use_abs_cn', action='store_true',
+                        help="Use absolute copy number when calculating log2 value instead of \
+                        depth ratio when '--mode' is set to 'sequenza'.")
+    parser.add_argument('--include_loh', choices=['neutral', 'deletion', 'any'],
+                        help="Include a column indicating whether the segment represents a loss \
+                        heterozygosity. Only works when '--mode' is set to 'sequenza or 'titan'.")
+    parser.add_argument('--sample_col', type=int,
+                        help='1-based index of sample name column in segmentation file.')
+    parser.add_argument('--chrm_col', type=int,
+                        help='1-based index of chromosome column in segmentation file.')
+    parser.add_argument('--start_col', type=int,
+                        help='1-based index of segment start position column in segmentation file.')
+    parser.add_argument('--end_col', type=int,
+                        help='1-based index of segment end position column in segmentation file.')
+    parser.add_argument('--log_r_col', type=int,
+                        help='1-based index of log ratio column in segmentation file.')
+    parser.add_argument('--d_ratio_col', type=int,
+                        help='1-based index of depth ratio column in segmentation file.')
+    parser.add_argument('--abs_cn_col', type=int,
+                        help='1-based index of absolute copy number column in segmentation file.')
+    parser.add_argument('--drop_header', action='store_true', help='Flag to force script to not write a header to the output')
+    parser.add_argument('--as_integer', action='store_true',
+                                                help='Flag to use untransformed value of absolute copy number instead of log2 ratio')
+    parser.add_argument('--prepend_chr', action='store_true', help='Flag to force a chr prefex be added to all rows (unless already present)')
+    parser.add_argument('--oncocircos',action='store_true',help='special mode to generate input for the Oncoricos tool')
+    parser.add_argument('--gistic',action='store_true',help='special mode to generate input for the GISTIC2.0 tool')
+    parser.add_argument('--force_header',action='store_true')
+    
+    args = parser.parse_args()
+
+    return args
+
+MODES = {
+          'titan' : 
+                    {
+                      'sample_col': 0,
+                      'chrm_col': 1,
+                      'start_col': 2,
+                      'end_col': 3,
+                      'log_r_col': 6, # Log Ratio
+                        'call_col': 8,
+                        'abs_cn_col': 9 #copy number based on state
+                     },
+          'sequenza' :
+                       {
+                         'chrm_col': 0,
+                         'start_col': 1,
+                         'end_col': 2,
+                         'd_ratio_col': 6, # Depth Ratio
+                         'abs_cn_col': 9, # Absolute copy number
+                         'a_allele_col': 10,
+                         'b_allele_col': 11
+                        }
+        }
+
+if __name__ == '__main__':
+    main()