Mercurial > repos > iuc > progressivemauve

--- a/macros.xml	Thu May 21 22:49:06 2015 -0400
+++ b/macros.xml	Fri May 22 22:50:24 2015 -0400
@@ -3,6 +3,9 @@
   <xml name="requirements">
     <requirements>
       <requirement type="package" version="2.4.0">progressivemauve</requirement>
+      <requirement type="package" version="2.7">python</requirement>
+      <requirement type="package" version="1.65">biopython</requirement>
+      <requirement type="package" version="0.6.2">bcbiogff</requirement>
       <yield/>
     </requirements>
   </xml>
--- a/progressivemauve.xml	Thu May 21 22:49:06 2015 -0400
+++ b/progressivemauve.xml	Fri May 22 22:50:24 2015 -0400
@@ -239,6 +239,10 @@
 Y. pestis genomes can be aligned in under a minute, while a group of 9
 divergent Enterobacterial genomes can be aligned in a few hours.

+progressiveMauve XMFA alignment visualized with the Mauve tool:
+
+.. image:: $PATH_TO_IMAGES/hemolysin.jpg
+
 Example Usage
 =============
Binary file static/images/hemolysin.jpg has changed
Binary file static/images/xmfa2gff3.png has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/1.gff3	Fri May 22 22:50:24 2015 -0400
@@ -0,0 +1,25 @@
+##gff-version 3
+##sequence-region karma 1 1
+karma	progressiveMauve	match	83032	140930	.	-	.	ID=phagey;target=phagey
+karma	progressiveMauve	match_part	83032	83041	50.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	83042	83051	60.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	83060	83069	60.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	83070	83079	40.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	83080	83089	40.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	83108	83117	40.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	83125	83134	70.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	83144	83153	60.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	83154	83163	40.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	83164	83173	70.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	83196	83205	40.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	83206	83215	30.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	140791	140800	40.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	140801	140810	20.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	140811	140820	60.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	140821	140830	50.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	140831	140840	50.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	140841	140850	60.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	140851	140860	20.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	140861	140870	50.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	140871	140880	40.0	+	.	Parent=phagey
+karma	progressiveMauve	match_part	140906	140915	80.0	+	.	Parent=phagey
--- a/tool_dependencies.xml	Thu May 21 22:49:06 2015 -0400
+++ b/tool_dependencies.xml	Fri May 22 22:50:24 2015 -0400
@@ -3,4 +3,13 @@
   <package name="progressivemauve" version="2.4.0">
     <repository changeset_revision="2a5b50fd549f" name="package_progressivemauve_2_4_0" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
   </package>
+  <package name="python" version="2.7">
+    <repository changeset_revision="00d2c3e43807" name="package_python_2_7" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+  </package>
+  <package name="biopython" version="1.65">
+    <repository changeset_revision="f8d72690eeae" name="package_biopython_1_65" owner="biopython" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+  </package>
+  <package name="bcbiogff" version="0.6.2">
+    <repository changeset_revision="1195382a6172" name="package_python_2_7_bcbiogff_0_6_2" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+  </package>
 </tool_dependency>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xmfa2gff3.py	Fri May 22 22:50:24 2015 -0400
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+import sys
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import SeqFeature, FeatureLocation
+import argparse
+from BCBio import GFF
+import logging
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+
+
+def parse_xmfa(xmfa):
+    """Simple XMFA parser until https://github.com/biopython/biopython/pull/544
+    """
+    current_lcb = []
+    current_seq = {}
+    for line in xmfa.readlines():
+        if line.startswith('#'):
+            continue
+
+        if line.strip() == '=':
+            if 'id' in current_seq:
+                current_lcb.append(current_seq)
+                current_seq = {}
+            yield current_lcb
+            current_lcb = []
+        else:
+            line = line.strip()
+            if line.startswith('>'):
+                if 'id' in current_seq:
+                    current_lcb.append(current_seq)
+                    current_seq = {}
+                data = line.strip().split()
+                id, loc = data[1].split(':')
+                start, end = loc.split('-')
+                current_seq = {
+                    'rid': '_'.join(data[1:]),
+                    'id': id,
+                    'start': int(start),
+                    'end': int(end),
+                    'strand': 1 if data[2] == '+' else -1,
+                    'seq': ''
+                }
+            else:
+                current_seq['seq'] += line.strip()
+
+
+def _percent_identity(a, b):
+    """Calculate % identity, ignoring gaps in the host sequence
+    """
+    match = 0
+    mismatch = 0
+    for char_a, char_b in zip(list(a), list(b)):
+        if char_a == '-':
+            continue
+        if char_a == char_b:
+            match += 1
+        else:
+            mismatch += 1
+
+    if match + mismatch == 0:
+        return 0
+    return 100 * float(match) / (match + mismatch)
+
+
+def _id_tn_dict(sequences):
+    """Figure out sequence IDs
+    """
+    label_convert = {}
+    if sequences is not None:
+        if len(sequences) == 1:
+            for i, record in enumerate(SeqIO.parse(sequences[0], 'fasta')):
+                label_convert[str(i + 1)] = record.id
+        else:
+            for i, sequence in enumerate(sequences):
+                for record in SeqIO.parse(sequence, 'fasta'):
+                    label_convert[str(i + 1)] = record.id
+                    continue
+    return label_convert
+
+
+def convert_xmfa_to_gff3(xmfa_file, relative_to='1', sequences=None, window_size=1000):
+    label_convert = _id_tn_dict(sequences)
+
+    lcbs = parse_xmfa(xmfa_file)
+
+    records = [SeqRecord(Seq("A"), id=label_convert.get(relative_to, relative_to))]
+    for lcb in lcbs:
+        ids = [seq['id'] for seq in lcb]
+
+        # Doesn't match part of our sequence
+        if relative_to not in ids:
+            continue
+
+        # Skip sequences that are JUST our "relative_to" genome
+        if len(ids) == 1:
+            continue
+
+        parent = [seq for seq in lcb if seq['id'] == relative_to][0]
+        others = [seq for seq in lcb if seq['id'] != relative_to]
+
+        for other in others:
+            other['feature'] = SeqFeature(
+                FeatureLocation(parent['start'], parent['end'] + 1),
+                type="match", strand=parent['strand'],
+                qualifiers={
+                    "source": "progressiveMauve",
+                    "target": label_convert.get(other['id'], other['id']),
+                    "ID": label_convert.get(other['id'], 'xmfa_' + other['rid'])
+                }
+            )
+
+        for i in range(0, len(lcb[0]['seq']), window_size):
+            block_seq = parent['seq'][i:i + window_size]
+            real_window_size = len(block_seq)
+            real_start = abs(parent['start']) - parent['seq'][0:i].count('-') + i
+            real_end = real_start + real_window_size - block_seq.count('-')
+
+            if (real_end - real_start) < 10:
+                continue
+
+            if parent['start'] < 0:
+                strand = -1
+            else:
+                strand = 1
+
+            for other in others:
+                pid = _percent_identity(block_seq, other['seq'][i:i + real_window_size])
+                # Ignore 0% identity sequences
+                if pid == 0:
+                    continue
+                other['feature'].sub_features.append(
+                    SeqFeature(
+                        FeatureLocation(real_start, real_end),
+                        type="match_part", strand=strand,
+                        qualifiers={
+                            "source": "progressiveMauve",
+                            'score': pid
+                        }
+                    )
+                )
+
+        for other in others:
+            records[0].features.append(other['feature'])
+    return records
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Convert XMFA alignments to gff3', prog='xmfa2gff3')
+    parser.add_argument('xmfa_file', type=file, help='XMFA File')
+    parser.add_argument('--window_size', type=int, help='Window size for analysis', default=1000)
+    parser.add_argument('--relative_to', type=str, help='Index of the parent sequence in the MSA', default='1')
+    parser.add_argument('--sequences', type=file, nargs='+',
+                        help='Fasta files (in same order) passed to parent for reconstructing proper IDs')
+    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
+
+    args = parser.parse_args()
+
+    result = convert_xmfa_to_gff3(**vars(args))
+    GFF.write(result, sys.stdout)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xmfa2gff3.xml	Fri May 22 22:50:24 2015 -0400
@@ -0,0 +1,66 @@
+<?xml version="1.0"?>
+<tool id="xmfa2gff3" name="Convert XMFA to gapped GFF3" version="@WRAPPER_VERSION@.0">
+  <description></description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <expand macro="requirements"/>
+  <expand macro="stdio"/>
+  <version_command>python xmfa2gff3.py --version</version_command>
+  <command interpreter="python"><![CDATA[
+xmfa2gff3.py
+$xmfa
+--window_size $window_size
+--relative_to $relative_to
+--sequences
+#for file in $sequences:
+    "${file}"
+#end for
+> $output
+
+]]></command>
+  <inputs>
+      <param type="data" format="xmfa" name="xmfa"
+          label="XMFA MSA" />
+
+      <param type="data" format="fasta" name="sequences" multiple="True"
+          label="Select sequences used in progressiveMauve alignment"
+          help="Should be provided in the same order as in original progressiveMauve run"/>
+
+      <param type="integer" label="Window Size" value="100" name="window_size"
+          help="maximum length of a match_part feature. Lower number = larger GFF output" />
+
+      <param type="integer" label="Generate relative to Nth sequence in XMFA file" value="1" name="relative_to"
+          help="progressiveMauve (un)helpfully strips original sequence IDs and replaces with integer numbers. This option lets you generate the alignment relative to e.g. the second genome if you wished to display against that base sequence" />
+  </inputs>
+  <outputs>
+    <data format="gff3" name="output">
+    </data>
+  </outputs>
+  <tests>
+      <test>
+          <param name="xmfa" value="1.xmfa"/>
+          <param name="sequences" value="karma.fa,phagey.fa" />
+          <param name="window_size" value="10" />
+          <param name="relative_to" value="1" />
+          <output name="output" file="1.gff3"/>
+      </test>
+  </tests>
+  <help><![CDATA[
+What it does
+============
+
+XMFA Alignments are great, but now you need a way to visualize this data!
+
+This tool provides a conversion step to GFF3 formatted output consiting of
+``match``es and ``match_part``s with scores indicating percent identity over
+that chunk of sequence.
+
+A rendering of the output GFF3 file in JBrowse is shown below:
+
+.. image:: $PATH_TO_IMAGES/xmfa2gff3.png
+
+]]></help>
+  <citations>
+  </citations>
+</tool>