Mercurial > repos > devteam > mutate_snp_codon
diff mutate_snp_codon.py @ 0:50333b5ac867 draft default tip
Uploaded tool tarball.
author | devteam |
---|---|
date | Tue, 20 Aug 2013 11:03:11 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutate_snp_codon.py Tue Aug 20 11:03:11 2013 -0400 @@ -0,0 +1,112 @@ +#!/usr/bin/env python +""" +Script to mutate SNP codons. +Dan Blankenberg +""" + +import sys, string + +def strandify( fields, column ): + strand = '+' + if column >= 0 and column < len( fields ): + strand = fields[ column ] + if strand not in [ '+', '-' ]: + strand = '+' + return strand + +def main(): + # parse command line + input_file = sys.argv[1] + out = open( sys.argv[2], 'wb+' ) + codon_chrom_col = int( sys.argv[3] ) - 1 + codon_start_col = int( sys.argv[4] ) - 1 + codon_end_col = int( sys.argv[5] ) - 1 + codon_strand_col = int( sys.argv[6] ) - 1 + codon_seq_col = int( sys.argv[7] ) - 1 + + snp_chrom_col = int( sys.argv[8] ) - 1 + snp_start_col = int( sys.argv[9] ) - 1 + snp_end_col = int( sys.argv[10] ) - 1 + snp_strand_col = int( sys.argv[11] ) - 1 + snp_observed_col = int( sys.argv[12] ) - 1 + + max_field_index = max( codon_chrom_col, codon_start_col, codon_end_col, codon_strand_col, codon_seq_col, snp_chrom_col, snp_start_col, snp_end_col, snp_strand_col, snp_observed_col ) + + DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) + skipped_lines = 0 + errors = {} + for name, message in [ ('max_field_index','not enough fields'), ( 'codon_len', 'codon length must be 3' ), ( 'codon_seq', 'codon sequence must have length 3' ), ( 'snp_len', 'SNP length must be 3' ), ( 'snp_observed', 'SNP observed values must have length 3' ), ( 'empty_comment', 'empty or comment'), ( 'no_overlap', 'codon and SNP do not overlap' ) ]: + errors[ name ] = { 'count':0, 'message':message } + line_count = 0 + for line_count, line in enumerate( open( input_file ) ): + line = line.rstrip( '\n\r' ) + if line and not line.startswith( '#' ): + fields = line.split( '\t' ) + if max_field_index >= len( fields ): + skipped_lines += 1 + errors[ 'max_field_index' ]['count'] += 1 + continue + + #read codon info + codon_chrom = fields[codon_chrom_col] + codon_start = int( fields[codon_start_col] ) + codon_end = int( fields[codon_end_col] ) + if codon_end - codon_start != 3: + #codons must be length 3 + skipped_lines += 1 + errors[ 'codon_len' ]['count'] += 1 + continue + codon_strand = strandify( fields, codon_strand_col ) + codon_seq = fields[codon_seq_col].upper() + if len( codon_seq ) != 3: + #codon sequence must have length 3 + skipped_lines += 1 + errors[ 'codon_seq' ]['count'] += 1 + continue + + #read snp info + snp_chrom = fields[snp_chrom_col] + snp_start = int( fields[snp_start_col] ) + snp_end = int( fields[snp_end_col] ) + if snp_end - snp_start != 1: + #snps must be length 1 + skipped_lines += 1 + errors[ 'snp_len' ]['count'] += 1 + continue + snp_strand = strandify( fields, snp_strand_col ) + snp_observed = fields[snp_observed_col].split( '/' ) + snp_observed = [ observed for observed in snp_observed if len( observed ) == 1 ] + if not snp_observed: + #sequence replacements must be length 1 + skipped_lines += 1 + errors[ 'snp_observed' ]['count'] += 1 + continue + + #Determine index of replacement for observed values into codon + offset = snp_start - codon_start + #Extract DNA on neg strand codons will have positions reversed relative to interval positions; i.e. position 0 == position 2 + if codon_strand == '-': + offset = 2 - offset + if offset < 0 or offset > 2: #assert offset >= 0 and offset <= 2, ValueError( 'Impossible offset determined: %s' % offset ) + #codon and snp do not overlap + skipped_lines += 1 + errors[ 'no_overlap' ]['count'] += 1 + continue + + for observed in snp_observed: + if codon_strand != snp_strand: + #if our SNP is on a different strand than our codon, take complement of provided observed SNP base + observed = observed.translate( DNA_COMP ) + snp_codon = [ char for char in codon_seq ] + snp_codon[offset] = observed.upper() + snp_codon = ''.join( snp_codon ) + + if codon_seq != snp_codon: #only output when we actually have a different codon + out.write( "%s\t%s\n" % ( line, snp_codon ) ) + else: + skipped_lines += 1 + errors[ 'empty_comment' ]['count'] += 1 + if skipped_lines: + print "Skipped %i (%4.2f%%) of %i lines; reasons: %s" % ( skipped_lines, ( float( skipped_lines )/float( line_count ) ) * 100, line_count, ', '.join( [ "%s (%i)" % ( error['message'], error['count'] ) for error in errors.itervalues() if error['count'] ] ) ) + +if __name__ == "__main__": main()