changeset 0:35bcefc9176b draft default tip

planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit dbd9af1e941e35ec9ca2a9f75af02edea67a5981
author sanbi-uwc
date Thu, 15 Jun 2017 07:41:38 -0400
parents
children
files tb2neo.xml test-data/sample.gff3 write_db_summary.py
diffstat 3 files changed, 175 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tb2neo.xml	Thu Jun 15 07:41:38 2017 -0400
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<tool id="tb2neo" name="M. tuberculosis annotation database builder" version="1.0.0">
+    <description>Parses TB GFF file and other annotation to build a
+        Neo4j Graph database.</description>
+    <requirements>
+      <requirement type="package" version="4.0.0b2">py2neo</requirement>
+      <requirement type="package" version="0.0.7">tb2neo</requirement>
+    </requirements>
+    <!-- tb2neo writes under the output's files_path; write_db_summary.py then prints the HTML file list that becomes the dataset's primary file. -->
+    <command detect_errors="aggressive"><![CDATA[
+        tb2neo --docker --outputdir '${outputDb.files_path}' init ${test_mode}
+          '${input_gff}' &&
+        python '$__tool_directory__/write_db_summary.py'
+          '${outputDb.files_path}' '${outputDb.name}' >'${outputDb}'
+        ]]>
+    </command>
+    <inputs>
+        <param name="input_gff" type="data"
+          format="gff" label="TB genome annotation"
+          help="Specify the M. tuberculosis annotation file (sourced from Ensembl or NCBI)"
+          optional="False" />
+        <param name="test_mode" type="boolean" truevalue="-d -R -U -P -G -O"
+          falsevalue="-d -r -u -p -g -o" label="Quick load mode"
+          help="Only load GFF3 features (for test mode only)" />
+    </inputs>
+    <outputs>
+        <data format="neostore" name="outputDb"/>
+    </outputs>
+    <tests>
+        <test>
+          <param name="test_mode" value="True" />
+          <param name="input_gff" value="sample.gff3" />
+          <assert_stdout>
+            <has_text text="Extract and load features to Neo4j" />
+          </assert_stdout>
+        </test>
+    </tests>
+    <help><![CDATA[
+      The tb2neo_ tool builds a Neo4j database from M. tuberculosis genome
+      annotation (supplied in GFF3 format) and other annotation sources
+      (Uniprot, GO, InterPro and Tuberculist). The output is a Neo4j database
+      stored as a Galaxy neostore datatype that can be explored with the Neo4j
+      Interactive Environment or saved for use outside Galaxy.
+
+      .. _tb2neo: https://github.com/sanbi-sa/tb2neo
+      ]]></help>
+    <citations></citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample.gff3	Thu Jun 15 07:41:38 2017 -0400
@@ -0,0 +1,100 @@
+##gff-version   3
+##sequence-region   Chromosome 1 4410929
+#!genome-build European Nucleotide Archive ASM19595v2
+#!genome-version GCA_000195955.2
+#!genome-date 2015-02
+#!genome-build-accession GCA_000195955.2
+#!genebuild-last-updated 2015-02
+Chromosome	ena	gene	1	1524	.	+	.	ID=gene:Rv0001;Name=dnaA;biotype=protein_coding;description=Chromosomal replication initiator protein DnaA;gene_id=Rv0001;logic_name=ena;version=1
+Chromosome	ena	transcript	1	1524	.	+	.	ID=transcript:CCP42723;Parent=gene:Rv0001;Name=dnaA-1;biotype=protein_coding;transcript_id=CCP42723;version=1
+Chromosome	ena	exon	1	1524	.	+	.	Parent=transcript:CCP42723;Name=CCP42723-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42723-1;rank=1;version=1
+Chromosome	ena	CDS	1	1524	.	+	0	ID=CDS:CCP42723;Parent=transcript:CCP42723;protein_id=CCP42723
+###
+Chromosome	ena	gene	2052	3260	.	+	.	ID=gene:Rv0002;Name=dnaN;biotype=protein_coding;description=DNA polymerase III (beta chain) DnaN (DNA nucleotidyltransferase);gene_id=Rv0002;logic_name=ena;version=1
+Chromosome	ena	transcript	2052	3260	.	+	.	ID=transcript:CCP42724;Parent=gene:Rv0002;Name=dnaN-1;biotype=protein_coding;transcript_id=CCP42724;version=1
+Chromosome	ena	exon	2052	3260	.	+	.	Parent=transcript:CCP42724;Name=CCP42724-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42724-1;rank=1;version=1
+Chromosome	ena	CDS	2052	3260	.	+	0	ID=CDS:CCP42724;Parent=transcript:CCP42724;protein_id=CCP42724
+###
+Chromosome	ena	gene	3280	4437	.	+	.	ID=gene:Rv0003;Name=recF;biotype=protein_coding;description=DNA replication and repair protein RecF (single-strand DNA binding protein);gene_id=Rv0003;logic_name=ena;version=1
+Chromosome	ena	transcript	3280	4437	.	+	.	ID=transcript:CCP42725;Parent=gene:Rv0003;Name=recF-1;biotype=protein_coding;transcript_id=CCP42725;version=1
+Chromosome	ena	exon	3280	4437	.	+	.	Parent=transcript:CCP42725;Name=CCP42725-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42725-1;rank=1;version=1
+Chromosome	ena	CDS	3280	4437	.	+	0	ID=CDS:CCP42725;Parent=transcript:CCP42725;protein_id=CCP42725
+###
+Chromosome	ena	gene	4434	4997	.	+	.	ID=gene:Rv0004;biotype=protein_coding;description=Conserved hypothetical protein;gene_id=Rv0004;logic_name=ena;version=1
+Chromosome	ena	transcript	4434	4997	.	+	.	ID=transcript:CCP42726;Parent=gene:Rv0004;biotype=protein_coding;transcript_id=CCP42726;version=1
+Chromosome	ena	exon	4434	4997	.	+	.	Parent=transcript:CCP42726;Name=CCP42726-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42726-1;rank=1;version=1
+Chromosome	ena	CDS	4434	4997	.	+	0	ID=CDS:CCP42726;Parent=transcript:CCP42726;protein_id=CCP42726
+###
+Chromosome	ena	gene	5240	7267	.	+	.	ID=gene:Rv0005;Name=gyrB;biotype=protein_coding;description=DNA gyrase (subunit B) GyrB (DNA topoisomerase (ATP-hydrolysing)) (DNA topoisomerase II) (type II DNA topoisomerase);gene_id=Rv0005;logic_name=ena;version=1
+Chromosome	ena	transcript	5240	7267	.	+	.	ID=transcript:CCP42727;Parent=gene:Rv0005;Name=gyrB-1;biotype=protein_coding;transcript_id=CCP42727;version=1
+Chromosome	ena	exon	5240	7267	.	+	.	Parent=transcript:CCP42727;Name=CCP42727-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42727-1;rank=1;version=1
+Chromosome	ena	CDS	5240	7267	.	+	0	ID=CDS:CCP42727;Parent=transcript:CCP42727;protein_id=CCP42727
+###
+Chromosome	ena	gene	7302	9818	.	+	.	ID=gene:Rv0006;Name=gyrA;biotype=protein_coding;description=DNA gyrase (subunit A) GyrA (DNA topoisomerase (ATP-hydrolysing)) (DNA topoisomerase II) (type II DNA topoisomerase);gene_id=Rv0006;logic_name=ena;version=1
+Chromosome	ena	transcript	7302	9818	.	+	.	ID=transcript:CCP42728;Parent=gene:Rv0006;Name=gyrA-1;biotype=protein_coding;transcript_id=CCP42728;version=1
+Chromosome	ena	exon	7302	9818	.	+	.	Parent=transcript:CCP42728;Name=CCP42728-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42728-1;rank=1;version=1
+Chromosome	ena	CDS	7302	9818	.	+	0	ID=CDS:CCP42728;Parent=transcript:CCP42728;protein_id=CCP42728
+###
+Chromosome	ena	gene	9914	10828	.	+	.	ID=gene:Rv0007;biotype=protein_coding;description=Possible conserved membrane protein;gene_id=Rv0007;logic_name=ena;version=1
+Chromosome	ena	transcript	9914	10828	.	+	.	ID=transcript:CCP42729;Parent=gene:Rv0007;biotype=protein_coding;transcript_id=CCP42729;version=1
+Chromosome	ena	exon	9914	10828	.	+	.	Parent=transcript:CCP42729;Name=CCP42729-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42729-1;rank=1;version=1
+Chromosome	ena	CDS	9914	10828	.	+	0	ID=CDS:CCP42729;Parent=transcript:CCP42729;protein_id=CCP42729
+###
+Chromosome	ena	tRNA_gene	10887	10960	.	+	.	ID=gene:EBG00000313329;Name=ileT;biotype=tRNA;gene_id=EBG00000313329;logic_name=ena_rna;version=1
+Chromosome	ena	transcript	10887	10960	.	+	.	ID=transcript:EBG00000313329-1;Parent=gene:EBG00000313329;Name=ileT;biotype=tRNA;transcript_id=EBG00000313329-1;version=1
+Chromosome	ena	exon	10887	10960	.	+	.	Parent=transcript:EBG00000313329-1;Name=EBG00000313329-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=EBG00000313329-1;rank=1;version=1
+###
+Chromosome	ena_gene	biological_region	10887	10960	.	+	.	external_name=AL123456.3:gene:10887..10960;logic_name=ena_gene
+Chromosome	ena	tRNA_gene	11112	11184	.	+	.	ID=gene:EBG00000313365;Name=alaT;biotype=tRNA;gene_id=EBG00000313365;logic_name=ena_rna;version=1
+Chromosome	ena	transcript	11112	11184	.	+	.	ID=transcript:EBG00000313365-1;Parent=gene:EBG00000313365;Name=alaT;biotype=tRNA;transcript_id=EBG00000313365-1;version=1
+Chromosome	ena	exon	11112	11184	.	+	.	Parent=transcript:EBG00000313365-1;Name=EBG00000313365-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=EBG00000313365-1;rank=1;version=1
+###
+Chromosome	ena_gene	biological_region	11112	11184	.	+	.	external_name=AL123456.3:gene:11112..11184;logic_name=ena_gene
+Chromosome	ena	gene	11874	12311	.	-	.	ID=gene:Rv0008c;biotype=protein_coding;description=Possible membrane protein;gene_id=Rv0008c;logic_name=ena;version=1
+Chromosome	ena	transcript	11874	12311	.	-	.	ID=transcript:CCP42730;Parent=gene:Rv0008c;biotype=protein_coding;transcript_id=CCP42730;version=1
+Chromosome	ena	exon	11874	12311	.	-	.	Parent=transcript:CCP42730;Name=CCP42730-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42730-1;rank=1;version=1
+Chromosome	ena	CDS	11874	12311	.	-	0	ID=CDS:CCP42730;Parent=transcript:CCP42730;protein_id=CCP42730
+###
+Chromosome	ena	gene	12468	13016	.	+	.	ID=gene:Rv0009;Name=ppiA;biotype=protein_coding;description=Probable iron-regulated peptidyl-prolyl cis-trans isomerase A PpiA (PPIase A) (rotamase A);gene_id=Rv0009;logic_name=ena;version=1
+Chromosome	ena	transcript	12468	13016	.	+	.	ID=transcript:CCP42731;Parent=gene:Rv0009;Name=ppiA-1;biotype=protein_coding;transcript_id=CCP42731;version=1
+Chromosome	ena	exon	12468	13016	.	+	.	Parent=transcript:CCP42731;Name=CCP42731-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42731-1;rank=1;version=1
+Chromosome	ena	CDS	12468	13016	.	+	0	ID=CDS:CCP42731;Parent=transcript:CCP42731;protein_id=CCP42731
+###
+Chromosome	ena	gene	13133	13558	.	-	.	ID=gene:Rv0010c;biotype=protein_coding;description=Probable conserved membrane protein;gene_id=Rv0010c;logic_name=ena;version=1
+Chromosome	ena	transcript	13133	13558	.	-	.	ID=transcript:CCP42732;Parent=gene:Rv0010c;biotype=protein_coding;transcript_id=CCP42732;version=1
+Chromosome	ena	exon	13133	13558	.	-	.	Parent=transcript:CCP42732;Name=CCP42732-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42732-1;rank=1;version=1
+Chromosome	ena	CDS	13133	13558	.	-	0	ID=CDS:CCP42732;Parent=transcript:CCP42732;protein_id=CCP42732
+###
+Chromosome	ena	gene	13714	13995	.	-	.	ID=gene:Rv0011c;biotype=protein_coding;description=Probable conserved transmembrane protein;gene_id=Rv0011c;logic_name=ena;version=1
+Chromosome	ena	transcript	13714	13995	.	-	.	ID=transcript:CCP42733;Parent=gene:Rv0011c;biotype=protein_coding;transcript_id=CCP42733;version=1
+Chromosome	ena	exon	13714	13995	.	-	.	Parent=transcript:CCP42733;Name=CCP42733-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42733-1;rank=1;version=1
+Chromosome	ena	CDS	13714	13995	.	-	0	ID=CDS:CCP42733;Parent=transcript:CCP42733;protein_id=CCP42733
+###
+Chromosome	ena	gene	14089	14877	.	+	.	ID=gene:Rv0012;biotype=protein_coding;description=Probable conserved membrane protein;gene_id=Rv0012;logic_name=ena;version=1
+Chromosome	ena	transcript	14089	14877	.	+	.	ID=transcript:CCP42734;Parent=gene:Rv0012;biotype=protein_coding;transcript_id=CCP42734;version=1
+Chromosome	ena	exon	14089	14877	.	+	.	Parent=transcript:CCP42734;Name=CCP42734-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42734-1;rank=1;version=1
+Chromosome	ena	CDS	14089	14877	.	+	0	ID=CDS:CCP42734;Parent=transcript:CCP42734;protein_id=CCP42734
+###
+Chromosome	ena	gene	14914	15612	.	+	.	ID=gene:Rv0013;Name=trpG;biotype=protein_coding;description=Possible anthranilate synthase component II TrpG (glutamine amidotransferase);gene_id=Rv0013;logic_name=ena;version=1
+Chromosome	ena	transcript	14914	15612	.	+	.	ID=transcript:CCP42735;Parent=gene:Rv0013;Name=trpG-1;biotype=protein_coding;transcript_id=CCP42735;version=1
+Chromosome	ena	exon	14914	15612	.	+	.	Parent=transcript:CCP42735;Name=CCP42735-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42735-1;rank=1;version=1
+Chromosome	ena	CDS	14914	15612	.	+	0	ID=CDS:CCP42735;Parent=transcript:CCP42735;protein_id=CCP42735
+###
+Chromosome	ena	gene	15590	17470	.	-	.	ID=gene:Rv0014c;Name=pknB;biotype=protein_coding;description=Transmembrane serine/threonine-protein kinase B PknB (protein kinase B) (STPK B);gene_id=Rv0014c;logic_name=ena;version=1
+Chromosome	ena	transcript	15590	17470	.	-	.	ID=transcript:CCP42736;Parent=gene:Rv0014c;Name=pknB-1;biotype=protein_coding;transcript_id=CCP42736;version=1
+Chromosome	ena	exon	15590	17470	.	-	.	Parent=transcript:CCP42736;Name=CCP42736-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42736-1;rank=1;version=1
+Chromosome	ena	CDS	15590	17470	.	-	0	ID=CDS:CCP42736;Parent=transcript:CCP42736;protein_id=CCP42736
+###
+Chromosome	ena	gene	17467	18762	.	-	.	ID=gene:Rv0015c;Name=pknA;biotype=protein_coding;description=Transmembrane serine/threonine-protein kinase A PknA (protein kinase A) (STPK A);gene_id=Rv0015c;logic_name=ena;version=1
+Chromosome	ena	transcript	17467	18762	.	-	.	ID=transcript:CCP42737;Parent=gene:Rv0015c;Name=pknA-1;biotype=protein_coding;transcript_id=CCP42737;version=1
+Chromosome	ena	exon	17467	18762	.	-	.	Parent=transcript:CCP42737;Name=CCP42737-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42737-1;rank=1;version=1
+Chromosome	ena	CDS	17467	18762	.	-	0	ID=CDS:CCP42737;Parent=transcript:CCP42737;protein_id=CCP42737
+###
+Chromosome	ena	gene	18759	20234	.	-	.	ID=gene:Rv0016c;Name=pbpA;biotype=protein_coding;description=Probable penicillin-binding protein PbpA;gene_id=Rv0016c;logic_name=ena;version=1
+Chromosome	ena	transcript	18759	20234	.	-	.	ID=transcript:CCP42738;Parent=gene:Rv0016c;Name=pbpA-1;biotype=protein_coding;transcript_id=CCP42738;version=1
+Chromosome	ena	exon	18759	20234	.	-	.	Parent=transcript:CCP42738;Name=CCP42738-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42738-1;rank=1;version=1
+Chromosome	ena	CDS	18759	20234	.	-	0	ID=CDS:CCP42738;Parent=transcript:CCP42738;protein_id=CCP42738
+###
+Chromosome	ena	gene	20231	21640	.	-	.	ID=gene:Rv0017c;Name=rodA;biotype=protein_coding;description=Probable cell division protein RodA;gene_id=Rv0017c;logic_name=ena;version=1
+Chromosome	ena	transcript	20231	21640	.	-	.	ID=transcript:CCP42739;Parent=gene:Rv0017c;Name=rodA-1;biotype=protein_coding;transcript_id=CCP42739;version=1
+Chromosome	ena	exon	20231	21640	.	-	.	Parent=transcript:CCP42739;Name=CCP42739-1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=CCP42739-1;rank=1;version=1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/write_db_summary.py	Thu Jun 15 07:41:38 2017 -0400
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+from __future__ import print_function
+
+import argparse
+import os
+import os.path
+
+parser = argparse.ArgumentParser(
+    description="Write HTML summary from neostore datatype")
+parser.add_argument('basepath')
+parser.add_argument('label')
+
+args = parser.parse_args()
+
+output = """<html><head><title>Files for Composite Dataset ({})</title></head>
+    <p/>This composite dataset is composed of
+     the following files:<p/><ul>\n""".format(args.label)
+db_path = args.basepath + '/neo4jdb/databases/graph.db'
+for filename in os.listdir(db_path):
+    if filename.startswith('.'):
+        continue
+    path = db_path + '/' + filename
+    if os.path.isdir(path):
+        continue
+    output += "<li>{}</li>\n".format(filename)
+output += '</ul></html>\b'
+print(output)