Mercurial > repos > devteam > substitution_rates
diff substitution_rates.py @ 0:d51dd9e4b517 draft
Imported from capsule None
author | devteam |
---|---|
date | Tue, 01 Apr 2014 09:11:59 -0400 |
parents | |
children | 412a03dc24a2 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/substitution_rates.py Tue Apr 01 09:11:59 2014 -0400 @@ -0,0 +1,123 @@ +#!/usr/bin/env python +#guruprasad Ananda +""" +Estimates substitution rates from pairwise alignments using JC69 model. +""" + +from galaxy import eggs +from galaxy.tools.util.galaxyops import * +from galaxy.tools.util import maf_utilities +import bx.align.maf +import fileinput +import sys + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + + +if len(sys.argv) < 3: + stop_err("Incorrect number of arguments.") + +inp_file = sys.argv[1] +out_file = sys.argv[2] +fout = open(out_file, 'w') +int_file = sys.argv[3] +if int_file != "None": #The user has specified an interval file + dbkey_i = sys.argv[4] + chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] ) + + +def rateEstimator(block): + global alignlen, mismatches + + src1 = block.components[0].src + sequence1 = block.components[0].text + start1 = block.components[0].start + end1 = block.components[0].end + len1 = int(end1)-int(start1) + len1_withgap = len(sequence1) + mismatch = 0.0 + + for seq in range (1, len(block.components)): + src2 = block.components[seq].src + sequence2 = block.components[seq].text + start2 = block.components[seq].start + end2 = block.components[seq].end + len2 = int(end2)-int(start2) + for nt in range(len1_withgap): + if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': # Not a gap or masked character + if sequence1[nt].upper() != sequence2[nt].upper(): + mismatch += 1 + + if int_file == "None": + p = mismatch/min(len1, len2) + print >> fout, "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%.4f" % ( src1, start1, end1, src2, start2, end2, min(len1, len2), mismatch, p ) + else: + mismatches += mismatch + alignlen += min(len1, len2) + + +def main(): + skipped = 0 + not_pairwise = 0 + + if int_file == "None": + try: + maf_reader = bx.align.maf.Reader( open(inp_file, 'r') ) + except: + stop_err("Your MAF file appears to be malformed.") + print >> fout, "#Seq1\tStart1\tEnd1\tSeq2\tStart2\tEnd2\tL\tN\tp" + for block in maf_reader: + if len(block.components) != 2: + not_pairwise += 1 + continue + try: + rateEstimator(block) + except: + skipped += 1 + else: + index, index_filename = maf_utilities.build_maf_index( inp_file, species = [dbkey_i] ) + if index is None: + print >> sys.stderr, "Your MAF file appears to be malformed." + sys.exit() + win = NiceReaderWrapper( fileinput.FileInput( int_file ), + chrom_col=chr_col_i, + start_col=start_col_i, + end_col=end_col_i, + strand_col=strand_col_i, + fix_strand=True) + species = None + mincols = 0 + global alignlen, mismatches + + for interval in win: + alignlen = 0 + mismatches = 0.0 + src = "%s.%s" % ( dbkey_i, interval.chrom ) + for block in maf_utilities.get_chopped_blocks_for_region( index, src, interval, species, mincols ): + if len(block.components) != 2: + not_pairwise += 1 + continue + try: + rateEstimator(block) + except: + skipped += 1 + if alignlen: + p = mismatches/alignlen + else: + p = 'NA' + interval.fields.append(str(alignlen)) + interval.fields.append(str(mismatches)) + interval.fields.append(str(p)) + print >> fout, "\t".join(interval.fields) + #num_blocks += 1 + + if not_pairwise: + print "Skipped %d non-pairwise blocks" % (not_pairwise) + if skipped: + print "Skipped %d blocks as invalid" % (skipped) + + +if __name__ == "__main__": + main()