diff substitution_rates.py @ 0:d51dd9e4b517 draft

Imported from capsule None
author devteam
date Tue, 01 Apr 2014 09:11:59 -0400
parents
children 412a03dc24a2
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/substitution_rates.py	Tue Apr 01 09:11:59 2014 -0400
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+#guruprasad Ananda
+"""
+Estimates substitution rates from pairwise alignments using JC69 model.
+"""
+
+from galaxy import eggs
+from galaxy.tools.util.galaxyops import *
+from galaxy.tools.util import maf_utilities
+import bx.align.maf
+import fileinput
+import sys
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+
+if len(sys.argv) < 3:
+    stop_err("Incorrect number of arguments.")
+
+inp_file = sys.argv[1]
+out_file = sys.argv[2]
+fout = open(out_file, 'w')
+int_file = sys.argv[3]
+if int_file != "None":     #The user has specified an interval file
+    dbkey_i = sys.argv[4]
+    chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )
+
+
+def rateEstimator(block):
+    global alignlen, mismatches
+
+    src1 = block.components[0].src
+    sequence1 = block.components[0].text
+    start1 = block.components[0].start
+    end1 = block.components[0].end
+    len1 = int(end1)-int(start1)
+    len1_withgap = len(sequence1)
+    mismatch = 0.0
+    
+    for seq in range (1, len(block.components)):
+        src2 = block.components[seq].src
+        sequence2 = block.components[seq].text
+        start2 = block.components[seq].start
+        end2 = block.components[seq].end
+        len2 = int(end2)-int(start2)
+        for nt in range(len1_withgap):
+            if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?':  # Not a gap or masked character
+                if sequence1[nt].upper() != sequence2[nt].upper():
+                    mismatch += 1
+    
+    if int_file == "None":
+        p = mismatch/min(len1, len2)
+        print >> fout, "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%.4f" % ( src1, start1, end1, src2, start2, end2, min(len1, len2), mismatch, p )
+    else:
+        mismatches += mismatch
+        alignlen += min(len1, len2)
+
+
+def main():
+    skipped = 0
+    not_pairwise = 0
+    
+    if int_file == "None":
+        try:
+            maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
+        except:
+            stop_err("Your MAF file appears to be malformed.")
+        print >> fout, "#Seq1\tStart1\tEnd1\tSeq2\tStart2\tEnd2\tL\tN\tp"
+        for block in maf_reader:
+            if len(block.components) != 2:
+                not_pairwise += 1
+                continue
+            try:
+                rateEstimator(block)
+            except:
+                skipped += 1
+    else:
+        index, index_filename = maf_utilities.build_maf_index( inp_file, species = [dbkey_i] )
+        if index is None:
+            print >> sys.stderr, "Your MAF file appears to be malformed."
+            sys.exit()
+        win = NiceReaderWrapper( fileinput.FileInput( int_file ),
+                                chrom_col=chr_col_i,
+                                start_col=start_col_i,
+                                end_col=end_col_i,
+                                strand_col=strand_col_i,
+                                fix_strand=True)
+        species = None
+        mincols = 0
+        global alignlen, mismatches
+        
+        for interval in win:
+            alignlen = 0
+            mismatches = 0.0
+            src = "%s.%s" % ( dbkey_i, interval.chrom )
+            for block in maf_utilities.get_chopped_blocks_for_region( index, src, interval, species, mincols ):
+                if len(block.components) != 2:
+                    not_pairwise += 1
+                    continue
+                try:
+                    rateEstimator(block)
+                except:
+                    skipped += 1
+            if alignlen:
+                p = mismatches/alignlen
+            else:
+                p = 'NA'
+            interval.fields.append(str(alignlen))
+            interval.fields.append(str(mismatches))
+            interval.fields.append(str(p))
+            print >> fout, "\t".join(interval.fields)
+            #num_blocks += 1
+    
+    if not_pairwise:
+        print "Skipped %d non-pairwise blocks" % (not_pairwise)
+    if skipped:
+        print "Skipped %d blocks as invalid" % (skipped)
+
+
+if __name__ == "__main__":
+    main()