# HG changeset patch
# User iuc
# Date 1432349424 14400
# Node ID 272c6e61dd0c4c86e19a60bd657011d3c71ec852
# Parent aaa94e13ac6038d2a3fb0a84b043435a6fe5c25e
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/progressivemauve commit 936ca4b4641eae2bdfa93885f35f5a9263f9dae1
diff -r aaa94e13ac60 -r 272c6e61dd0c macros.xml
--- a/macros.xml Thu May 21 22:49:06 2015 -0400
+++ b/macros.xml Fri May 22 22:50:24 2015 -0400
@@ -3,6 +3,9 @@
progressivemauve
+ python
+ biopython
+ bcbiogff
diff -r aaa94e13ac60 -r 272c6e61dd0c progressivemauve.xml
--- a/progressivemauve.xml Thu May 21 22:49:06 2015 -0400
+++ b/progressivemauve.xml Fri May 22 22:50:24 2015 -0400
@@ -239,6 +239,10 @@
Y. pestis genomes can be aligned in under a minute, while a group of 9
divergent Enterobacterial genomes can be aligned in a few hours.
+progressiveMauve XMFA alignment visualized with the Mauve tool:
+
+.. image:: $PATH_TO_IMAGES/hemolysin.jpg
+
Example Usage
=============
diff -r aaa94e13ac60 -r 272c6e61dd0c static/images/hemolysin.jpg
Binary file static/images/hemolysin.jpg has changed
diff -r aaa94e13ac60 -r 272c6e61dd0c static/images/xmfa2gff3.png
Binary file static/images/xmfa2gff3.png has changed
diff -r aaa94e13ac60 -r 272c6e61dd0c test-data/1.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/1.gff3 Fri May 22 22:50:24 2015 -0400
@@ -0,0 +1,25 @@
+##gff-version 3
+##sequence-region karma 1 1
+karma progressiveMauve match 83032 140930 . - . ID=phagey;target=phagey
+karma progressiveMauve match_part 83032 83041 50.0 + . Parent=phagey
+karma progressiveMauve match_part 83042 83051 60.0 + . Parent=phagey
+karma progressiveMauve match_part 83060 83069 60.0 + . Parent=phagey
+karma progressiveMauve match_part 83070 83079 40.0 + . Parent=phagey
+karma progressiveMauve match_part 83080 83089 40.0 + . Parent=phagey
+karma progressiveMauve match_part 83108 83117 40.0 + . Parent=phagey
+karma progressiveMauve match_part 83125 83134 70.0 + . Parent=phagey
+karma progressiveMauve match_part 83144 83153 60.0 + . Parent=phagey
+karma progressiveMauve match_part 83154 83163 40.0 + . Parent=phagey
+karma progressiveMauve match_part 83164 83173 70.0 + . Parent=phagey
+karma progressiveMauve match_part 83196 83205 40.0 + . Parent=phagey
+karma progressiveMauve match_part 83206 83215 30.0 + . Parent=phagey
+karma progressiveMauve match_part 140791 140800 40.0 + . Parent=phagey
+karma progressiveMauve match_part 140801 140810 20.0 + . Parent=phagey
+karma progressiveMauve match_part 140811 140820 60.0 + . Parent=phagey
+karma progressiveMauve match_part 140821 140830 50.0 + . Parent=phagey
+karma progressiveMauve match_part 140831 140840 50.0 + . Parent=phagey
+karma progressiveMauve match_part 140841 140850 60.0 + . Parent=phagey
+karma progressiveMauve match_part 140851 140860 20.0 + . Parent=phagey
+karma progressiveMauve match_part 140861 140870 50.0 + . Parent=phagey
+karma progressiveMauve match_part 140871 140880 40.0 + . Parent=phagey
+karma progressiveMauve match_part 140906 140915 80.0 + . Parent=phagey
diff -r aaa94e13ac60 -r 272c6e61dd0c tool_dependencies.xml
--- a/tool_dependencies.xml Thu May 21 22:49:06 2015 -0400
+++ b/tool_dependencies.xml Fri May 22 22:50:24 2015 -0400
@@ -3,4 +3,13 @@
+
+
+
+
+
+
+
+
+
diff -r aaa94e13ac60 -r 272c6e61dd0c xmfa2gff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xmfa2gff3.py Fri May 22 22:50:24 2015 -0400
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+import sys
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import SeqFeature, FeatureLocation
+import argparse
+from BCBio import GFF
+import logging
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+
+
+def parse_xmfa(xmfa):
+ """Simple XMFA parser until https://github.com/biopython/biopython/pull/544
+ """
+ current_lcb = []
+ current_seq = {}
+ for line in xmfa.readlines():
+ if line.startswith('#'):
+ continue
+
+ if line.strip() == '=':
+ if 'id' in current_seq:
+ current_lcb.append(current_seq)
+ current_seq = {}
+ yield current_lcb
+ current_lcb = []
+ else:
+ line = line.strip()
+ if line.startswith('>'):
+ if 'id' in current_seq:
+ current_lcb.append(current_seq)
+ current_seq = {}
+ data = line.strip().split()
+ id, loc = data[1].split(':')
+ start, end = loc.split('-')
+ current_seq = {
+ 'rid': '_'.join(data[1:]),
+ 'id': id,
+ 'start': int(start),
+ 'end': int(end),
+ 'strand': 1 if data[2] == '+' else -1,
+ 'seq': ''
+ }
+ else:
+ current_seq['seq'] += line.strip()
+
+
+def _percent_identity(a, b):
+ """Calculate % identity, ignoring gaps in the host sequence
+ """
+ match = 0
+ mismatch = 0
+ for char_a, char_b in zip(list(a), list(b)):
+ if char_a == '-':
+ continue
+ if char_a == char_b:
+ match += 1
+ else:
+ mismatch += 1
+
+ if match + mismatch == 0:
+ return 0
+ return 100 * float(match) / (match + mismatch)
+
+
+def _id_tn_dict(sequences):
+ """Figure out sequence IDs
+ """
+ label_convert = {}
+ if sequences is not None:
+ if len(sequences) == 1:
+ for i, record in enumerate(SeqIO.parse(sequences[0], 'fasta')):
+ label_convert[str(i + 1)] = record.id
+ else:
+ for i, sequence in enumerate(sequences):
+ for record in SeqIO.parse(sequence, 'fasta'):
+ label_convert[str(i + 1)] = record.id
+ continue
+ return label_convert
+
+
+def convert_xmfa_to_gff3(xmfa_file, relative_to='1', sequences=None, window_size=1000):
+ label_convert = _id_tn_dict(sequences)
+
+ lcbs = parse_xmfa(xmfa_file)
+
+ records = [SeqRecord(Seq("A"), id=label_convert.get(relative_to, relative_to))]
+ for lcb in lcbs:
+ ids = [seq['id'] for seq in lcb]
+
+ # Doesn't match part of our sequence
+ if relative_to not in ids:
+ continue
+
+ # Skip sequences that are JUST our "relative_to" genome
+ if len(ids) == 1:
+ continue
+
+ parent = [seq for seq in lcb if seq['id'] == relative_to][0]
+ others = [seq for seq in lcb if seq['id'] != relative_to]
+
+ for other in others:
+ other['feature'] = SeqFeature(
+ FeatureLocation(parent['start'], parent['end'] + 1),
+ type="match", strand=parent['strand'],
+ qualifiers={
+ "source": "progressiveMauve",
+ "target": label_convert.get(other['id'], other['id']),
+ "ID": label_convert.get(other['id'], 'xmfa_' + other['rid'])
+ }
+ )
+
+ for i in range(0, len(lcb[0]['seq']), window_size):
+ block_seq = parent['seq'][i:i + window_size]
+ real_window_size = len(block_seq)
+ real_start = abs(parent['start']) - parent['seq'][0:i].count('-') + i
+ real_end = real_start + real_window_size - block_seq.count('-')
+
+ if (real_end - real_start) < 10:
+ continue
+
+ if parent['start'] < 0:
+ strand = -1
+ else:
+ strand = 1
+
+ for other in others:
+ pid = _percent_identity(block_seq, other['seq'][i:i + real_window_size])
+ # Ignore 0% identity sequences
+ if pid == 0:
+ continue
+ other['feature'].sub_features.append(
+ SeqFeature(
+ FeatureLocation(real_start, real_end),
+ type="match_part", strand=strand,
+ qualifiers={
+ "source": "progressiveMauve",
+ 'score': pid
+ }
+ )
+ )
+
+ for other in others:
+ records[0].features.append(other['feature'])
+ return records
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='Convert XMFA alignments to gff3', prog='xmfa2gff3')
+ parser.add_argument('xmfa_file', type=file, help='XMFA File')
+ parser.add_argument('--window_size', type=int, help='Window size for analysis', default=1000)
+ parser.add_argument('--relative_to', type=str, help='Index of the parent sequence in the MSA', default='1')
+ parser.add_argument('--sequences', type=file, nargs='+',
+ help='Fasta files (in same order) passed to parent for reconstructing proper IDs')
+ parser.add_argument('--version', action='version', version='%(prog)s 1.0')
+
+ args = parser.parse_args()
+
+ result = convert_xmfa_to_gff3(**vars(args))
+ GFF.write(result, sys.stdout)
diff -r aaa94e13ac60 -r 272c6e61dd0c xmfa2gff3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xmfa2gff3.xml Fri May 22 22:50:24 2015 -0400
@@ -0,0 +1,66 @@
+
+
+
+
+ macros.xml
+
+
+
+ python xmfa2gff3.py --version
+ $output
+
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+