Mercurial > repos > iuc > progressivemauve
changeset 1:272c6e61dd0c draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/progressivemauve commit 936ca4b4641eae2bdfa93885f35f5a9263f9dae1
author | iuc |
---|---|
date | Fri, 22 May 2015 22:50:24 -0400 |
parents | aaa94e13ac60 |
children | 17ab54e158cd |
files | macros.xml progressivemauve.xml static/images/hemolysin.jpg static/images/xmfa2gff3.png test-data/1.gff3 tool_dependencies.xml xmfa2gff3.py xmfa2gff3.xml |
diffstat | 8 files changed, 269 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/macros.xml Thu May 21 22:49:06 2015 -0400 +++ b/macros.xml Fri May 22 22:50:24 2015 -0400 @@ -3,6 +3,9 @@ <xml name="requirements"> <requirements> <requirement type="package" version="2.4.0">progressivemauve</requirement> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.65">biopython</requirement> + <requirement type="package" version="0.6.2">bcbiogff</requirement> <yield/> </requirements> </xml>
--- a/progressivemauve.xml Thu May 21 22:49:06 2015 -0400 +++ b/progressivemauve.xml Fri May 22 22:50:24 2015 -0400 @@ -239,6 +239,10 @@ Y. pestis genomes can be aligned in under a minute, while a group of 9 divergent Enterobacterial genomes can be aligned in a few hours. +progressiveMauve XMFA alignment visualized with the Mauve tool: + +.. image:: $PATH_TO_IMAGES/hemolysin.jpg + Example Usage =============
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/1.gff3 Fri May 22 22:50:24 2015 -0400 @@ -0,0 +1,25 @@ +##gff-version 3 +##sequence-region karma 1 1 +karma progressiveMauve match 83032 140930 . - . ID=phagey;target=phagey +karma progressiveMauve match_part 83032 83041 50.0 + . Parent=phagey +karma progressiveMauve match_part 83042 83051 60.0 + . Parent=phagey +karma progressiveMauve match_part 83060 83069 60.0 + . Parent=phagey +karma progressiveMauve match_part 83070 83079 40.0 + . Parent=phagey +karma progressiveMauve match_part 83080 83089 40.0 + . Parent=phagey +karma progressiveMauve match_part 83108 83117 40.0 + . Parent=phagey +karma progressiveMauve match_part 83125 83134 70.0 + . Parent=phagey +karma progressiveMauve match_part 83144 83153 60.0 + . Parent=phagey +karma progressiveMauve match_part 83154 83163 40.0 + . Parent=phagey +karma progressiveMauve match_part 83164 83173 70.0 + . Parent=phagey +karma progressiveMauve match_part 83196 83205 40.0 + . Parent=phagey +karma progressiveMauve match_part 83206 83215 30.0 + . Parent=phagey +karma progressiveMauve match_part 140791 140800 40.0 + . Parent=phagey +karma progressiveMauve match_part 140801 140810 20.0 + . Parent=phagey +karma progressiveMauve match_part 140811 140820 60.0 + . Parent=phagey +karma progressiveMauve match_part 140821 140830 50.0 + . Parent=phagey +karma progressiveMauve match_part 140831 140840 50.0 + . Parent=phagey +karma progressiveMauve match_part 140841 140850 60.0 + . Parent=phagey +karma progressiveMauve match_part 140851 140860 20.0 + . Parent=phagey +karma progressiveMauve match_part 140861 140870 50.0 + . Parent=phagey +karma progressiveMauve match_part 140871 140880 40.0 + . Parent=phagey +karma progressiveMauve match_part 140906 140915 80.0 + . Parent=phagey
--- a/tool_dependencies.xml Thu May 21 22:49:06 2015 -0400 +++ b/tool_dependencies.xml Fri May 22 22:50:24 2015 -0400 @@ -3,4 +3,13 @@ <package name="progressivemauve" version="2.4.0"> <repository changeset_revision="2a5b50fd549f" name="package_progressivemauve_2_4_0" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" /> </package> + <package name="python" version="2.7"> + <repository changeset_revision="00d2c3e43807" name="package_python_2_7" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" /> + </package> + <package name="biopython" version="1.65"> + <repository changeset_revision="f8d72690eeae" name="package_biopython_1_65" owner="biopython" toolshed="https://testtoolshed.g2.bx.psu.edu" /> + </package> + <package name="bcbiogff" version="0.6.2"> + <repository changeset_revision="1195382a6172" name="package_python_2_7_bcbiogff_0_6_2" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" /> + </package> </tool_dependency>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xmfa2gff3.py Fri May 22 22:50:24 2015 -0400 @@ -0,0 +1,162 @@ +#!/usr/bin/env python +import sys +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import SeqFeature, FeatureLocation +import argparse +from BCBio import GFF +import logging +logging.basicConfig(level=logging.INFO) +log = logging.getLogger(__name__) + + +def parse_xmfa(xmfa): + """Simple XMFA parser until https://github.com/biopython/biopython/pull/544 + """ + current_lcb = [] + current_seq = {} + for line in xmfa.readlines(): + if line.startswith('#'): + continue + + if line.strip() == '=': + if 'id' in current_seq: + current_lcb.append(current_seq) + current_seq = {} + yield current_lcb + current_lcb = [] + else: + line = line.strip() + if line.startswith('>'): + if 'id' in current_seq: + current_lcb.append(current_seq) + current_seq = {} + data = line.strip().split() + id, loc = data[1].split(':') + start, end = loc.split('-') + current_seq = { + 'rid': '_'.join(data[1:]), + 'id': id, + 'start': int(start), + 'end': int(end), + 'strand': 1 if data[2] == '+' else -1, + 'seq': '' + } + else: + current_seq['seq'] += line.strip() + + +def _percent_identity(a, b): + """Calculate % identity, ignoring gaps in the host sequence + """ + match = 0 + mismatch = 0 + for char_a, char_b in zip(list(a), list(b)): + if char_a == '-': + continue + if char_a == char_b: + match += 1 + else: + mismatch += 1 + + if match + mismatch == 0: + return 0 + return 100 * float(match) / (match + mismatch) + + +def _id_tn_dict(sequences): + """Figure out sequence IDs + """ + label_convert = {} + if sequences is not None: + if len(sequences) == 1: + for i, record in enumerate(SeqIO.parse(sequences[0], 'fasta')): + label_convert[str(i + 1)] = record.id + else: + for i, sequence in enumerate(sequences): + for record in SeqIO.parse(sequence, 'fasta'): + label_convert[str(i + 1)] = record.id + continue + return label_convert + + +def convert_xmfa_to_gff3(xmfa_file, relative_to='1', sequences=None, window_size=1000): + label_convert = _id_tn_dict(sequences) + + lcbs = parse_xmfa(xmfa_file) + + records = [SeqRecord(Seq("A"), id=label_convert.get(relative_to, relative_to))] + for lcb in lcbs: + ids = [seq['id'] for seq in lcb] + + # Doesn't match part of our sequence + if relative_to not in ids: + continue + + # Skip sequences that are JUST our "relative_to" genome + if len(ids) == 1: + continue + + parent = [seq for seq in lcb if seq['id'] == relative_to][0] + others = [seq for seq in lcb if seq['id'] != relative_to] + + for other in others: + other['feature'] = SeqFeature( + FeatureLocation(parent['start'], parent['end'] + 1), + type="match", strand=parent['strand'], + qualifiers={ + "source": "progressiveMauve", + "target": label_convert.get(other['id'], other['id']), + "ID": label_convert.get(other['id'], 'xmfa_' + other['rid']) + } + ) + + for i in range(0, len(lcb[0]['seq']), window_size): + block_seq = parent['seq'][i:i + window_size] + real_window_size = len(block_seq) + real_start = abs(parent['start']) - parent['seq'][0:i].count('-') + i + real_end = real_start + real_window_size - block_seq.count('-') + + if (real_end - real_start) < 10: + continue + + if parent['start'] < 0: + strand = -1 + else: + strand = 1 + + for other in others: + pid = _percent_identity(block_seq, other['seq'][i:i + real_window_size]) + # Ignore 0% identity sequences + if pid == 0: + continue + other['feature'].sub_features.append( + SeqFeature( + FeatureLocation(real_start, real_end), + type="match_part", strand=strand, + qualifiers={ + "source": "progressiveMauve", + 'score': pid + } + ) + ) + + for other in others: + records[0].features.append(other['feature']) + return records + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Convert XMFA alignments to gff3', prog='xmfa2gff3') + parser.add_argument('xmfa_file', type=file, help='XMFA File') + parser.add_argument('--window_size', type=int, help='Window size for analysis', default=1000) + parser.add_argument('--relative_to', type=str, help='Index of the parent sequence in the MSA', default='1') + parser.add_argument('--sequences', type=file, nargs='+', + help='Fasta files (in same order) passed to parent for reconstructing proper IDs') + parser.add_argument('--version', action='version', version='%(prog)s 1.0') + + args = parser.parse_args() + + result = convert_xmfa_to_gff3(**vars(args)) + GFF.write(result, sys.stdout)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xmfa2gff3.xml Fri May 22 22:50:24 2015 -0400 @@ -0,0 +1,66 @@ +<?xml version="1.0"?> +<tool id="xmfa2gff3" name="Convert XMFA to gapped GFF3" version="@WRAPPER_VERSION@.0"> + <description></description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements"/> + <expand macro="stdio"/> + <version_command>python xmfa2gff3.py --version</version_command> + <command interpreter="python"><![CDATA[ +xmfa2gff3.py +$xmfa +--window_size $window_size +--relative_to $relative_to +--sequences +#for file in $sequences: + "${file}" +#end for +> $output + +]]></command> + <inputs> + <param type="data" format="xmfa" name="xmfa" + label="XMFA MSA" /> + + <param type="data" format="fasta" name="sequences" multiple="True" + label="Select sequences used in progressiveMauve alignment" + help="Should be provided in the same order as in original progressiveMauve run"/> + + <param type="integer" label="Window Size" value="100" name="window_size" + help="maximum length of a match_part feature. Lower number = larger GFF output" /> + + <param type="integer" label="Generate relative to Nth sequence in XMFA file" value="1" name="relative_to" + help="progressiveMauve (un)helpfully strips original sequence IDs and replaces with integer numbers. This option lets you generate the alignment relative to e.g. the second genome if you wished to display against that base sequence" /> + </inputs> + <outputs> + <data format="gff3" name="output"> + </data> + </outputs> + <tests> + <test> + <param name="xmfa" value="1.xmfa"/> + <param name="sequences" value="karma.fa,phagey.fa" /> + <param name="window_size" value="10" /> + <param name="relative_to" value="1" /> + <output name="output" file="1.gff3"/> + </test> + </tests> + <help><![CDATA[ +What it does +============ + +XMFA Alignments are great, but now you need a way to visualize this data! + +This tool provides a conversion step to GFF3 formatted output consiting of +``match``es and ``match_part``s with scores indicating percent identity over +that chunk of sequence. + +A rendering of the output GFF3 file in JBrowse is shown below: + +.. image:: $PATH_TO_IMAGES/xmfa2gff3.png + +]]></help> + <citations> + </citations> +</tool>