Mercurial > repos > devteam > substitution_rates
changeset 0:d51dd9e4b517 draft
Imported from capsule None
author | devteam |
---|---|
date | Tue, 01 Apr 2014 09:11:59 -0400 |
parents | |
children | 412a03dc24a2 |
files | substitution_rates.py substitution_rates.xml test-data/Interval2Maf_pairwise_out.maf test-data/subRates1.out |
diffstat | 4 files changed, 246 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/substitution_rates.py Tue Apr 01 09:11:59 2014 -0400 @@ -0,0 +1,123 @@ +#!/usr/bin/env python +#guruprasad Ananda +""" +Estimates substitution rates from pairwise alignments using JC69 model. +""" + +from galaxy import eggs +from galaxy.tools.util.galaxyops import * +from galaxy.tools.util import maf_utilities +import bx.align.maf +import fileinput +import sys + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + + +if len(sys.argv) < 3: + stop_err("Incorrect number of arguments.") + +inp_file = sys.argv[1] +out_file = sys.argv[2] +fout = open(out_file, 'w') +int_file = sys.argv[3] +if int_file != "None": #The user has specified an interval file + dbkey_i = sys.argv[4] + chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] ) + + +def rateEstimator(block): + global alignlen, mismatches + + src1 = block.components[0].src + sequence1 = block.components[0].text + start1 = block.components[0].start + end1 = block.components[0].end + len1 = int(end1)-int(start1) + len1_withgap = len(sequence1) + mismatch = 0.0 + + for seq in range (1, len(block.components)): + src2 = block.components[seq].src + sequence2 = block.components[seq].text + start2 = block.components[seq].start + end2 = block.components[seq].end + len2 = int(end2)-int(start2) + for nt in range(len1_withgap): + if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': # Not a gap or masked character + if sequence1[nt].upper() != sequence2[nt].upper(): + mismatch += 1 + + if int_file == "None": + p = mismatch/min(len1, len2) + print >> fout, "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%.4f" % ( src1, start1, end1, src2, start2, end2, min(len1, len2), mismatch, p ) + else: + mismatches += mismatch + alignlen += min(len1, len2) + + +def main(): + skipped = 0 + not_pairwise = 0 + + if int_file == "None": + try: + maf_reader = bx.align.maf.Reader( open(inp_file, 'r') ) + except: + stop_err("Your MAF file appears to be malformed.") + print >> fout, "#Seq1\tStart1\tEnd1\tSeq2\tStart2\tEnd2\tL\tN\tp" + for block in maf_reader: + if len(block.components) != 2: + not_pairwise += 1 + continue + try: + rateEstimator(block) + except: + skipped += 1 + else: + index, index_filename = maf_utilities.build_maf_index( inp_file, species = [dbkey_i] ) + if index is None: + print >> sys.stderr, "Your MAF file appears to be malformed." + sys.exit() + win = NiceReaderWrapper( fileinput.FileInput( int_file ), + chrom_col=chr_col_i, + start_col=start_col_i, + end_col=end_col_i, + strand_col=strand_col_i, + fix_strand=True) + species = None + mincols = 0 + global alignlen, mismatches + + for interval in win: + alignlen = 0 + mismatches = 0.0 + src = "%s.%s" % ( dbkey_i, interval.chrom ) + for block in maf_utilities.get_chopped_blocks_for_region( index, src, interval, species, mincols ): + if len(block.components) != 2: + not_pairwise += 1 + continue + try: + rateEstimator(block) + except: + skipped += 1 + if alignlen: + p = mismatches/alignlen + else: + p = 'NA' + interval.fields.append(str(alignlen)) + interval.fields.append(str(mismatches)) + interval.fields.append(str(p)) + print >> fout, "\t".join(interval.fields) + #num_blocks += 1 + + if not_pairwise: + print "Skipped %d non-pairwise blocks" % (not_pairwise) + if skipped: + print "Skipped %d blocks as invalid" % (skipped) + + +if __name__ == "__main__": + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/substitution_rates.xml Tue Apr 01 09:11:59 2014 -0400 @@ -0,0 +1,61 @@ +<tool id="subRate1" name="Estimate substitution rates " version="1.0.0"> + <description> for non-coding regions</description> + <command interpreter="python"> + substitution_rates.py + $input + $out_file1 + #if $region.type == "win": + ${region.input2} ${region.input2.dbkey} ${region.input2.metadata.chromCol},$region.input2.metadata.startCol,$region.input2.metadata.endCol,$region.input2.metadata.strandCol + #else: + "None" + #end if + </command> + <inputs> + <param format="maf" name="input" type="data" label="Select pair-wise alignment data"/> + <conditional name="region"> + <param name="type" type="select" label="Estimate rates corresponding to" multiple="false"> + <option value="align">Alignment block</option> + <option value="win">Intervals in your history</option> + </param> + <when value="win"> + <param format="interval" name="input2" type="data" label="Choose intervals"> + <validator type="unspecified_build" /> + </param> + </when> + <when value="align" /> + </conditional> + </inputs> + <outputs> + <data format="tabular" name="out_file1" metadata_source="input"/> + </outputs> + + <tests> + <test> + <param name="input" value="Interval2Maf_pairwise_out.maf"/> + <param name="type" value="align"/> + <output name="out_file1" file="subRates1.out"/> + </test> + </tests> + + <help> + +.. class:: infomark + +**What it does** + +This tool takes a pairwise MAF file as input and estimates substitution rate according to Jukes-Cantor JC69 model. The 3 new columns appended to the output are explained below: + +- L: number of nucleotides compared +- N: number of different nucleotides +- p = N/L + +----- + +.. class:: warningmark + +**Note** + +Any block/s not containing exactly two sequences, will be omitted. + + </help> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/Interval2Maf_pairwise_out.maf Tue Apr 01 09:11:59 2014 -0400 @@ -0,0 +1,49 @@ +##maf version=1 +a score=10458 +s hg17.chrX 3816458 525 - 154824264 tGATGCACGTTTTCTCTTTTCATACTCATCTT--CAGTTATTCCTTGACACAAACCTGGTTA---CTAGTGCTATTGGCTACATGGCCCCTGTGCCTTAACTTCCCTCATGGCTCCTCCTCCCTGCAGCCTCTGAAGGTTCTCTGTGTAATGAATT--AATTAGCTTTGCTTTTTTCTCCCTTTCCCCC-----CCTCTTTTTCCTGGCCTCT---------AGAAGAAAACACCAGCAGCCCCAGCAAAGAAAACCAGCACTACCTTCAACATCGTGGGGACCACCTATCCCATCAACCTGGCCAAGGACACTGAATTTTCCACCATCTCCAAGGGCGCTGCTCCCAGTGCCTCCTCAACCCCAACAATCATTGCTTCACCCAAGGCCACCTACGTGCAGGACAGCCCGACTGAGACCAAGA---CCTACAACAGTGTCAGCAAGGTTGACAAAATTTCCCGCATCATCTTTCCTGTGCTCTTTGCCATATTCAATCTGGTCTATTGGGCCACATATGTCAACCGGGAGTCAGCTATCAAGGGCATGA +s fr1.chrUn 343715247 529 - 349519338 TGGTCAACATTTTTTTCTTTTCTACTTTTCTTTTCAGGAACCGTTTCAGGCTAACATGGCCAGGATTAAAGATAT-GGTTAGGCTTTTGTTTTGTTCTTGTGTGTGTCGTGTCTCGTGGT----GTTGCTTCTATTTCCTCTTCATGAGTTCAAATGGAAGTATTGTTAAGTTTGTTGCCCCCGACCCTAACGACCTCTGCTTTATGGCCTCTTCTCTTTTCAGAGGAGAGAATCTGAGAATCTGACCAAAAAGACGAACAACACCTTCAACATCGTCGGGACGACCTACGCCATCAACGTGGCGAAGGACCAGGGCTTAACCACCATCTCCAAAAGCGCCACCACCGGCGCGCC----GGCCAAACAGCACTACCTGCACAAAA-----------TGGACGACCCCTACACAGAGGCCAAGAAATCCTACAACCGAGTCAGCATAGTGGACAAAATATCACGCATCATTTTCCCAGTTCTGTTCTTCATCTTCAACTTGGGCTACTGGGCCACATATATCAACAGGAAGCCCATTATTATGAACGCGA + +a score=12779 +s hg17.chrX 3795168 357 - 154824264 aGGGCCATTTTGAGTGGGGTAAAGCTTCTCCTTTCAAATAGGAAA----TTAGTGATTTCATCG--AATAAGC-------TCTCAATGTCTTCCAGTTGCAACTGGTTCTCATCTTTCCTCT----CCTCAGGTGTCACCACTGTGCTTACCATGACCACCTTGAGTATCAGTGCCAGAAATTCCTTACCTAAAGTGGCATATGCGACGGCCATGGACTGGTTCATAGCCGTCTGTTATGCCTTTGTATTTTCTGCACTGATTGAATTTGCCACTGTCAACTATTTCACCAAGCGGAGTTGGGCTTGGGAAGGCAAGAAGGTGCCAGAGGCCCTGGAGATGAAGGTGAGATA----TAGCAAAAGAGAAACTGAGAAG +s fr1.chrUn 343710815 364 - 349519338 TGAGCCAATTT--------TGAAGTGACAATTTTCAAATGTGCCACATTTTAGTAGCAGCAAAGTCAGAAAGCGACTTTTCCTCTACATCACTTCTCTGATCTGTGTTCTGATGTTTTCCTTTCATCCTCAGGGGTCACCACTGTGCTTACCATGACCACCCTGAGCATCAGTGCTAGAAACTCCCTCCCTAAGGTGGCCTACGCCACCGCCATGGACTGGTTCATGGCGGTGTGCTACGCCTTTGTCTTCTCTGCCTTGATTGAGTTTGCGACGGTCAACTACTTCACCAAACGCAGCTGGGCATGGGATGGGAAAAA------AGAGGGCATGGAAATAAGGGTGAGTGATTTTTAATCAAATATATACTGATACG + +a score=10363 +s hg17.chrX 3787425 174 - 154824264 TACTCTCTCTTTAGGAGAATATGTCGTCATGACAACCCACTTCCATCTCAAGCGAAAAATTGGCTACTTTGTGATCCAGACCTACTTGCCATGTATCATGACTGTCATTCTGTCACAAGTGTCGTTCTGGCTCAACAGAGAGTCTGTTCCTGCCCGTACAGTCTTTGGTGAGTG +s fr1.chrUn 343708230 174 - 349519338 TCCTCCCGCTGCAGGCGAATATGTTGTGATGACAACATATTTCCATTTGAAAAGGAAAATTGGCTATTTCGTGATTCAAACCTACCTCCCGTGCATCATGACTGTCATTCTGTCTCAAGTCTCCTTCTGGCTGAACCGAGAATCGGTCCCTGCTCGCACCGTGTTCGGTGAGTG + +a score=31 +s hg17.chrX 3787284 100 - 154824264 ATTTTGTGATTATTAAAGGGCTCTAGTTATATGAACTTGGGAGATGCTGGAATATTGAATATAAG----ATTAAAGAGCTTTGTGAAT--------TGAATT-----GGGGCCCAAC +s fr1.chrUn 62078707 109 - 349519338 AGTTTGTGAGGCCTAAATGCCAAAAGT-------ACTTGA-ACAGGCTTAAGAATGGATCATGAATTTTCTTAAATAAATCTGTTAATCCAGAAAGTAAACTCTGGGGGGGTCCAGA + +a score=4963 +s hg17.chrX 3776942 285 - 154824264 aCTTCAGTTGAAGACAGACCACATACCCAGAACTCTTTGAAGATGGAAGTTTTATTTCGATCTCATTTGAGTTTAGTATCAAGGGTAATCATTCTCCATTTCCTTCAACTTCTCTTTATTGACCCTTCTCTGCC--TTAGATGCCTATACAACAGCTGAAGTGGTTTATTCTTGGACTCTCGGAAAGAACAAATCCGTGGAAGTGGCACAGGATGGTTCTCGCTTGAACCAGTATGACCTTTTGGGCCATGTTGTTGGGACAGAGATAATCCGGTCTAGTACAGGTA +s fr1.chrUn 343707053 283 - 349519338 AACTTAAATCAACACCAAGTAAAGGCAAAGAATCTTTTAAACTAAGAA----TCTATTAAACTAGATAAACTAAAGCATCGACACTGACGCGTTGATCTGTCATCCACGTTTCCCCTGTTTCCTTTCCTTCTCCATCTAGACGCTTACACCAACAATGAGGTGATTTACCTGTGGACTCAGGGGGACGAGAGGTCTGTGTCTGTCGCAGAGGACGGCTCCAGGCTGAACCAGTATGATTTACTGGGACACGTTATTGGCAAAGAAACCATCAGCTCCAGCACAGGTA + +a score=5912 +s hg17.chrX 3760375 93 - 154824264 TTAGGTTAACAATTCATGCTGAGTGTCCCATGCATTTGGAAGATTTTCCCATGGATGTGCATGCCTGCCCACTGAAGTTTGGAAGCTGTATGT +s fr1.chrUn 343706399 93 - 349519338 TCAGGTTGACCATCCACGCTGAGTGCCCCATGCACCTGGAGGACTTCCCCATGGACTCGCACGCCTGTCCTCTCAAGTTTGGAAGTTGTAAGT + +a score=4332 +s hg17.chrX 3733405 476 - 154824264 ATCAAGGAGAATTTGGGGAAACAGAGCATTAGGGAAGGAATAGATGGAAGAGAGGCATGCTTGTAATAAATTATAGGAAAGAGAAGAGTTTGAAAAAGAGATAAGAGTTCACACAATTGTGGAAAGAATGGAGTGGAGTATATGTAAACAGAAAAGAAGCTGTATGGAATGAGGTGGATGGAGGGACAGAATAATGAGGAGGAGAT---GTGGATGGGCAGAAGACAGTAAGCAAATGGAAGAGAAGCAAGA-----GGAAGGAGAGAGTAATGGGCAGGAAAGGATCTGAACCCAGTAGAGATTGCAAATAGTGAGGAGAAATAGAGAATGAATGAAAAGAACAGG--ATAGAGGAGAAACAACAATAAAAGGAATAAAGCATAGAGAATAGGTTTAAGTTTTGGAGGAGGGATAGAGAA-----GAGAGAAGAAAGGAGAGAGGATGGGAATATTGAAATACAAAAAAATAGAAGGTGCAGGTTGAGGA +s fr1.chrUn 303515824 444 + 349519338 AACACGGAATATGGAGGGAAAACGGGGAATATGGAGGGAACAG--GGAATATGG-------------------TGGGAACACAGAGGGAATATGGAGGGAACACGGG-------GAATATGGAGGGAA---AACGGGGAATATGGAGGGAACGGGGAATATGGAGGGAACGAGGAATATGGAGGGAACGGGGAATATGGAGGGAACACAGGGAATATGGAGGGAACACAGGGAATATGGAGGGAACACAGGGAATATGGAGGGAACGAGGAATATGGAGGGAACACAGGGAATATGGAGGGAACGAGGAATATGGAGGGAACACAGGGAAT--ATGGAGGGAACGGGGAATATGGAGGGAACACAGGGAATATGGAGGGAACACAGGGAATAT--------------GGAGGGAACGAGGAATATGGAGGGAACACAGGGAATATGGAGGGAACGAGGAATATGGAGGGAACACGGGGTACTCAGTGAGAA + +a score=719 +s hg17.chrX 3731355 108 - 154824264 TATGTAAGTAGAATGCAGTCAAGAAACTTGTAGGTGGATAGGTAGAGGCAGAAGGAAGAGAGGAAGGACGTTCTAGACAGAGAAAACCAAGTGAGCAGTTGCAGGGAA +s fr1.chrUn 303515724 91 + 349519338 TATGGAGGGAACACG------GGGAATATGGAGG-GAATATAGAGGGGAGACGGGGAATATGGAGGGA------ACACGGGGAATATGGAGGGAACA----CAGGGAA + +a score=3647 +s hg17.chrX 3730591 447 - 154824264 CAAGGAAAAAAGGATAATACTTCAGAGAGAAGTTGGATGAGAGATGGGAGGAA--AAATGAACATAGAGGAGAAATGAGCATAGAGTGGAGGATAAAATAATGAAGAAGGAAAGTGGATAGAAGTGGGAAGATGAGGGTCAAGAGGGGAGGAGAAGACTAATTACACAGAGTACATAGAGGCATGTATATTGGAGAAATGAGAATGGTGAAAAGAAAGAGAGTATGAGAAGAGAGGACAAAAAGAGTAGAGAAGTTGAGAAGACAAAGGATAAGATGGTGTAAAATAAAGAAGCTAGATTAGATATTAGGATATTCTAAAGAGATAGGCGAATATAATGAAGTGGATGTAGTAGATGGGAATAAAGAGGAAAAGACATAGAGAAGTAGGGGAAGGGACATGGGAGGAGATGAACAAGAGGGAGGGAAATTGTAGACGAAAGAAAGAGAA +s fr1.chrUn 303515378 346 + 349519338 CACGGGGAATATGGCGGGAACACAGAGGGAATATGGAGGGAACACGGGGAATATGGAGGGAATATAGAGGGGAGACGGG--GAATATGGAGGG-AACATGGGGAATATGGAGGGAATATAGA---GGGGAGACGGGGAATATGGAGGGA--------------ACACGGGGAATATGGAGG---GAACACAGG------GAATATGGCGGGAACACAGGGAATATG----GAGGGAACACGGGGAATATGGAGG----GAACACAGGGAATATGGAGG----AAATATAGAGGGAAAACGGGAAAT------------------------------ATGGAGGGAATATGGT----GGGAACACAGGGAATA----------------TGGAGGGAACACGGGG----------AATATGGAGGGAA--TATAGAGGGGAGACGGGGAA + +a score=13113 +s hg17.chrX 3729219 238 - 154824264 CCCCTAGGAGTACACTATTGATGTATTTTTTCGGCAGACATGGCATGATGAAAGACTGAAATTTGATGGCCCCATGAAGATCCTTCCACTGAACAATCTCCTGGCTAGTAAGATCTGGACACCGGACACCTTCTTCCACAATGGCAAGAAATCAGTGGCTCATAACATGACCACGCCCAACAAGCTGCTCAGATTGGTGGACAACGGAACCCTCCTCTATACAATGAGGTCAGAGAGT +s fr1.chrUn 343703525 238 - 349519338 CCCTCAGGAGTACACCATCGACGTCTTCTTCCGTCAGAGTTGGAGAGACGAGAGGCTAAAGTTTGATGGACCTATGCAGGTTTTACCCCTGAACAACCTCCTGGCCAGTAAGATTTGGACTCCTGACACCTTTTTCCACAACGGAAAGAAGTCTGTTGCCCACAACATGACGACTCCGAACAAACTGCTGCGCCTGGTCGACAACGGCACGCTTCTATATACAATGAGGTGAGGCGGT + +a score=3528 +s hg17.chrX 3700391 307 - 154824264 AGTAAGTTGGCTGTCATTCAGAGT-ACTGCACTG-GACTCT---CTCTGAGCTAAGACAAAAGCCTTTCCTGACATTCTGCTTTTCTACTTTCTTTCACGCAGATGCAGTGACTGAAGTGAAGACTGACATCTACGTGACCAGTTTTGGCCCTGTGTCAGACACTGACATGGTA---------AGTTCTTGCAAAGGTAAATGTA---AAAGAGGATGCACT----AGACCAAAGAAAATAAATTGGAGCTTTCCTAAGGC---AGGGCCTTGTAGCTATATTCTAAGGAGTGGTCTTTTCTCA--TATGAATTT--CCTACCCTCGCCACCCTA +s fr1.chrUn 241017738 330 - 349519338 ATTCATTTGTATGTCCTTTTGTGTTACTAAGCTGTGTCTCTAATTCCAGAGCGAAAACAAAACTCATTAGATTTGTGTCATTTGTTCATTTGCTC---CATAGGTCCTGTAACTGAGGTCAAGACAGATATTTATGTGACAAGTTTTGGGCCTGTGTCAGATGTTGAAATGGTACGTACACCGAGTGTTATTGATGCTAAATTTACCTCAGGAGGGGACACTTTTAATACCTTATAGAACATTATAGAGATTCTGGGACGTGCAAGGGCAGTGCAGGAAAGGCC--AGGTGGGGTCAGGTCTTAGTTATGAAGCGGACATACTGTCGCAGCCCTA + +a score=5471 +s hg17.chrX 3639441 205 - 154824264 TCTTGAATTACGATATCTA-GTCATGT---ATTTTTTTTCCTATTAAGAACTATTATTTTTTTTCTTTTC---CTTGCATTGCCTCAGGCTGTCTC---CTAAGCATGCCCCAGATATTCCTGATGACAGCACTGACAACATCACTATCTTCACCAGAATCTTGGATCGTCTTCTGGACGGCTATGACAACCGGCTGCGACCTGGGCTTGGAGGT +s fr1.chrUn 333536350 213 + 349519338 TCATTAATCACAAAGACCACGTCATGTTTGACGTCTTGTGGTTTTAAGAAGCGCAAGCCTTGTTCTTTTCACTCTCCCCTTCCTCTCGTCTCTTCCAGGCTCTGCTTGTCCCAGTCTGAGCCGCCTAC--CAACGACAACAGCACAGTTTTCACCAGGATCCTGGATGGGCTCCTTGACGGTTACGACAACAGGCTGCGACCTGGGCTTGGGGGT +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/subRates1.out Tue Apr 01 09:11:59 2014 -0400 @@ -0,0 +1,13 @@ +#Seq1 Start1 End1 Seq2 Start2 End2 L N p +hg17.chrX 3816458 3816983 fr1.chrUn 343715247 343715776 525 188 0.3581 +hg17.chrX 3795168 3795525 fr1.chrUn 343710815 343711179 357 92 0.2577 +hg17.chrX 3787425 3787599 fr1.chrUn 343708230 343708404 174 37 0.2126 +hg17.chrX 3787284 3787384 fr1.chrUn 62078707 62078816 100 33 0.3300 +hg17.chrX 3776942 3777227 fr1.chrUn 343707053 343707336 283 122 0.4311 +hg17.chrX 3760375 3760468 fr1.chrUn 343706399 343706492 93 20 0.2151 +hg17.chrX 3733405 3733881 fr1.chrUn 303515824 303516268 444 186 0.4189 +hg17.chrX 3731355 3731463 fr1.chrUn 303515724 303515815 91 36 0.3956 +hg17.chrX 3730591 3731038 fr1.chrUn 303515378 303515724 346 126 0.3642 +hg17.chrX 3729219 3729457 fr1.chrUn 343703525 343703763 238 57 0.2395 +hg17.chrX 3700391 3700698 fr1.chrUn 241017738 241018068 307 112 0.3648 +hg17.chrX 3639441 3639646 fr1.chrUn 333536350 333536563 205 66 0.3220