# HG changeset patch # User iuc # Date 1432349424 14400 # Node ID 272c6e61dd0c4c86e19a60bd657011d3c71ec852 # Parent aaa94e13ac6038d2a3fb0a84b043435a6fe5c25e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/progressivemauve commit 936ca4b4641eae2bdfa93885f35f5a9263f9dae1 diff -r aaa94e13ac60 -r 272c6e61dd0c macros.xml --- a/macros.xml Thu May 21 22:49:06 2015 -0400 +++ b/macros.xml Fri May 22 22:50:24 2015 -0400 @@ -3,6 +3,9 @@ progressivemauve + python + biopython + bcbiogff diff -r aaa94e13ac60 -r 272c6e61dd0c progressivemauve.xml --- a/progressivemauve.xml Thu May 21 22:49:06 2015 -0400 +++ b/progressivemauve.xml Fri May 22 22:50:24 2015 -0400 @@ -239,6 +239,10 @@ Y. pestis genomes can be aligned in under a minute, while a group of 9 divergent Enterobacterial genomes can be aligned in a few hours. +progressiveMauve XMFA alignment visualized with the Mauve tool: + +.. image:: $PATH_TO_IMAGES/hemolysin.jpg + Example Usage ============= diff -r aaa94e13ac60 -r 272c6e61dd0c static/images/hemolysin.jpg Binary file static/images/hemolysin.jpg has changed diff -r aaa94e13ac60 -r 272c6e61dd0c static/images/xmfa2gff3.png Binary file static/images/xmfa2gff3.png has changed diff -r aaa94e13ac60 -r 272c6e61dd0c test-data/1.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/1.gff3 Fri May 22 22:50:24 2015 -0400 @@ -0,0 +1,25 @@ +##gff-version 3 +##sequence-region karma 1 1 +karma progressiveMauve match 83032 140930 . - . ID=phagey;target=phagey +karma progressiveMauve match_part 83032 83041 50.0 + . Parent=phagey +karma progressiveMauve match_part 83042 83051 60.0 + . Parent=phagey +karma progressiveMauve match_part 83060 83069 60.0 + . Parent=phagey +karma progressiveMauve match_part 83070 83079 40.0 + . Parent=phagey +karma progressiveMauve match_part 83080 83089 40.0 + . Parent=phagey +karma progressiveMauve match_part 83108 83117 40.0 + . Parent=phagey +karma progressiveMauve match_part 83125 83134 70.0 + . Parent=phagey +karma progressiveMauve match_part 83144 83153 60.0 + . Parent=phagey +karma progressiveMauve match_part 83154 83163 40.0 + . Parent=phagey +karma progressiveMauve match_part 83164 83173 70.0 + . Parent=phagey +karma progressiveMauve match_part 83196 83205 40.0 + . Parent=phagey +karma progressiveMauve match_part 83206 83215 30.0 + . Parent=phagey +karma progressiveMauve match_part 140791 140800 40.0 + . Parent=phagey +karma progressiveMauve match_part 140801 140810 20.0 + . Parent=phagey +karma progressiveMauve match_part 140811 140820 60.0 + . Parent=phagey +karma progressiveMauve match_part 140821 140830 50.0 + . Parent=phagey +karma progressiveMauve match_part 140831 140840 50.0 + . Parent=phagey +karma progressiveMauve match_part 140841 140850 60.0 + . Parent=phagey +karma progressiveMauve match_part 140851 140860 20.0 + . Parent=phagey +karma progressiveMauve match_part 140861 140870 50.0 + . Parent=phagey +karma progressiveMauve match_part 140871 140880 40.0 + . Parent=phagey +karma progressiveMauve match_part 140906 140915 80.0 + . Parent=phagey diff -r aaa94e13ac60 -r 272c6e61dd0c tool_dependencies.xml --- a/tool_dependencies.xml Thu May 21 22:49:06 2015 -0400 +++ b/tool_dependencies.xml Fri May 22 22:50:24 2015 -0400 @@ -3,4 +3,13 @@ + + + + + + + + + diff -r aaa94e13ac60 -r 272c6e61dd0c xmfa2gff3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xmfa2gff3.py Fri May 22 22:50:24 2015 -0400 @@ -0,0 +1,162 @@ +#!/usr/bin/env python +import sys +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import SeqFeature, FeatureLocation +import argparse +from BCBio import GFF +import logging +logging.basicConfig(level=logging.INFO) +log = logging.getLogger(__name__) + + +def parse_xmfa(xmfa): + """Simple XMFA parser until https://github.com/biopython/biopython/pull/544 + """ + current_lcb = [] + current_seq = {} + for line in xmfa.readlines(): + if line.startswith('#'): + continue + + if line.strip() == '=': + if 'id' in current_seq: + current_lcb.append(current_seq) + current_seq = {} + yield current_lcb + current_lcb = [] + else: + line = line.strip() + if line.startswith('>'): + if 'id' in current_seq: + current_lcb.append(current_seq) + current_seq = {} + data = line.strip().split() + id, loc = data[1].split(':') + start, end = loc.split('-') + current_seq = { + 'rid': '_'.join(data[1:]), + 'id': id, + 'start': int(start), + 'end': int(end), + 'strand': 1 if data[2] == '+' else -1, + 'seq': '' + } + else: + current_seq['seq'] += line.strip() + + +def _percent_identity(a, b): + """Calculate % identity, ignoring gaps in the host sequence + """ + match = 0 + mismatch = 0 + for char_a, char_b in zip(list(a), list(b)): + if char_a == '-': + continue + if char_a == char_b: + match += 1 + else: + mismatch += 1 + + if match + mismatch == 0: + return 0 + return 100 * float(match) / (match + mismatch) + + +def _id_tn_dict(sequences): + """Figure out sequence IDs + """ + label_convert = {} + if sequences is not None: + if len(sequences) == 1: + for i, record in enumerate(SeqIO.parse(sequences[0], 'fasta')): + label_convert[str(i + 1)] = record.id + else: + for i, sequence in enumerate(sequences): + for record in SeqIO.parse(sequence, 'fasta'): + label_convert[str(i + 1)] = record.id + continue + return label_convert + + +def convert_xmfa_to_gff3(xmfa_file, relative_to='1', sequences=None, window_size=1000): + label_convert = _id_tn_dict(sequences) + + lcbs = parse_xmfa(xmfa_file) + + records = [SeqRecord(Seq("A"), id=label_convert.get(relative_to, relative_to))] + for lcb in lcbs: + ids = [seq['id'] for seq in lcb] + + # Doesn't match part of our sequence + if relative_to not in ids: + continue + + # Skip sequences that are JUST our "relative_to" genome + if len(ids) == 1: + continue + + parent = [seq for seq in lcb if seq['id'] == relative_to][0] + others = [seq for seq in lcb if seq['id'] != relative_to] + + for other in others: + other['feature'] = SeqFeature( + FeatureLocation(parent['start'], parent['end'] + 1), + type="match", strand=parent['strand'], + qualifiers={ + "source": "progressiveMauve", + "target": label_convert.get(other['id'], other['id']), + "ID": label_convert.get(other['id'], 'xmfa_' + other['rid']) + } + ) + + for i in range(0, len(lcb[0]['seq']), window_size): + block_seq = parent['seq'][i:i + window_size] + real_window_size = len(block_seq) + real_start = abs(parent['start']) - parent['seq'][0:i].count('-') + i + real_end = real_start + real_window_size - block_seq.count('-') + + if (real_end - real_start) < 10: + continue + + if parent['start'] < 0: + strand = -1 + else: + strand = 1 + + for other in others: + pid = _percent_identity(block_seq, other['seq'][i:i + real_window_size]) + # Ignore 0% identity sequences + if pid == 0: + continue + other['feature'].sub_features.append( + SeqFeature( + FeatureLocation(real_start, real_end), + type="match_part", strand=strand, + qualifiers={ + "source": "progressiveMauve", + 'score': pid + } + ) + ) + + for other in others: + records[0].features.append(other['feature']) + return records + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Convert XMFA alignments to gff3', prog='xmfa2gff3') + parser.add_argument('xmfa_file', type=file, help='XMFA File') + parser.add_argument('--window_size', type=int, help='Window size for analysis', default=1000) + parser.add_argument('--relative_to', type=str, help='Index of the parent sequence in the MSA', default='1') + parser.add_argument('--sequences', type=file, nargs='+', + help='Fasta files (in same order) passed to parent for reconstructing proper IDs') + parser.add_argument('--version', action='version', version='%(prog)s 1.0') + + args = parser.parse_args() + + result = convert_xmfa_to_gff3(**vars(args)) + GFF.write(result, sys.stdout) diff -r aaa94e13ac60 -r 272c6e61dd0c xmfa2gff3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xmfa2gff3.xml Fri May 22 22:50:24 2015 -0400 @@ -0,0 +1,66 @@ + + + + + macros.xml + + + + python xmfa2gff3.py --version + $output + +]]> + + + + + + + + + + + + + + + + + + + + + + + + + +