Mercurial > repos > devteam > find_diag_hits

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/find_diag_hits.py	Thu Jan 23 12:30:47 2014 -0500
@@ -0,0 +1,170 @@
+#!/usr/bin/env python
+
+"""
+tax_read_grouping.py <file in taxonomy format> <id column> <taxonomic ranks> <output format> <output file>
+    finds reads that only hit one taxonomic group. For example, consider the folliowing example:
+
+    read1   mammalia
+    read1   insecta
+    read2   insecta
+
+    in this case only read2 will be selected becuase it stays within insecta
+
+    This program takes the following options:
+
+    file in taxonomy format - dataset that complies with Galaxy's taxonomy format
+    id column               - integer specifying the number of column containing seq id (starting with 1)
+    taxonomic ranks         - a comma separated list of ranks from this list:
+
+         superkingdom
+         kingdom
+         subkingdom
+         superphylum
+         phylum
+         subphylum
+         superclass
+         class
+         subclass
+         superorder
+         order
+         suborder
+         superfamily
+         family
+         subfamily
+         tribe
+         subtribe
+         genus
+         subgenus
+         species
+         subspecies
+
+    output format           - reads or counts
+
+"""
+
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( 'pysqlite' )
+from pysqlite2 import dbapi2 as sqlite
+import string, sys, tempfile
+
+# This dictionary maps taxonomic ranks to fields of Taxonomy file
+taxRank = {
+        'root'        :2,
+        'superkingdom':3,
+        'kingdom'     :4,
+        'subkingdom'  :5,
+        'superphylum' :6,
+        'phylum'      :7,
+        'subphylum'   :8,
+        'superclass'  :9,
+        'class'       :10,
+        'subclass'    :11,
+        'superorder'  :12,
+        'ord'         :13,
+        'suborder'    :14,
+        'superfamily' :15,
+        'family'      :16,
+        'subfamily'   :17,
+        'tribe'       :18,
+        'subtribe'    :19,
+        'genus'       :20,
+        'subgenus'    :21,
+        'species'     :22,
+        'subspecies'  :23,
+        'order'       :13
+    }
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+
+db = tempfile.NamedTemporaryFile('w')
+
+try:
+    con = sqlite.connect(db.name)
+    cur = con.cursor()
+except:
+    stop_err('Cannot connect to %s\n') % db.name
+
+try:
+    tax_file   = open(sys.argv[1], 'r')
+    id_col     = int(sys.argv[2]) - 1
+    taxa       = string.split(sys.argv[3].rstrip(),',')
+
+    if sys.argv[4] == 'reads':
+        out_format = True
+    elif sys.argv[4] == 'counts':
+        out_format = False
+    else:
+        stop_err('Please specify "reads" or "counts" for output format\n')
+    out_file = open(sys.argv[5], 'w')
+
+except:
+    stop_err('Check arguments\n')
+
+if taxa[0] == 'None': stop_err('Please, use checkboxes to specify taxonomic ranks.\n')
+
+sql = ""
+for i in range(len(taxa)):
+        if taxa[i] == 'order': taxa[i] = 'ord' # SQL does not like fields to be named 'order'
+        sql += '%s text, ' % taxa[i]
+
+sql = sql.strip(', ')
+sql = 'create table tax (name varchar(50) not null, ' + sql + ')'
+
+
+cur.execute(sql)
+
+invalid_line_number = 0
+
+try:
+    for line in tax_file:
+        fields = string.split(line.rstrip(), '\t')
+        if len(fields) < 24:
+            invalid_line_number += 1
+            continue # Skipping malformed taxonomy lines
+
+        val_string = '"' + fields[id_col] + '", '
+
+        for rank in taxa:
+            taxon = fields[taxRank[rank]]
+            val_string += '"%s", ' % taxon
+
+        val_string = val_string.strip(', ')
+        val_string = "insert into tax values(" + val_string + ")"
+        cur.execute(val_string)
+except Exception, e:
+    stop_err('%s\n' % e)
+
+tax_file.close()
+
+try:
+    for rank in taxa:
+        cur.execute('create temporary table %s (name varchar(50), id text, rank text)' % rank  )
+        cur.execute('insert into %s select name, name || %s as id, %s from tax group by id' % ( rank, rank, rank ) )
+        cur.execute('create temporary table %s_count(name varchar(50), id text, rank text, N int)' % rank)
+        cur.execute('insert into %s_count select name, id, rank, count(*) from %s group by name' % ( rank, rank) )
+
+        if rank == 'ord':
+            rankName = 'order'
+        else:
+            rankName = rank
+
+        if out_format:
+            cur.execute('select name,rank from %s_count where N = 1 and length(rank)>1' % rank)
+            for item in cur.fetchall():
+                out_string = '%s\t%s\t' % ( item[0], item[1] )
+                out_string += rankName
+                print >>out_file, out_string
+        else:
+            cur.execute('select rank, count(*) from %s_count where N = 1 and length(rank)>1 group by rank' % rank)
+            for item in cur.fetchall():
+                out_string = '%s\t%s\t' % ( item[0], item[1] )
+                out_string += rankName
+                print >>out_file, out_string
+except Exception, e:
+    stop_err("%s\n" % e)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/find_diag_hits.xml	Thu Jan 23 12:30:47 2014 -0500
@@ -0,0 +1,99 @@
+<tool id="find_diag_hits" name="Find diagnostic hits" version="1.0.0">
+    <description></description>
+    <requirements>
+        <requirement type="package" version="1.0.0">taxonomy</requirement>
+    </requirements>
+    <command interpreter="python">find_diag_hits.py $input1 $id_col $rank_list $out_format $out_file1</command>
+    <inputs>
+        <param format="taxonomy" name="input1" type="data" label="Find diagnostic hits in"/>
+        <param name="id_col" type="data_column" data_ref="input1" numerical="False" label="Select column with sequence id" />
+        <param name="rank_list" type="select" display="checkboxes" multiple="true" label="select taxonomic ranks">
+            <option value="superkingdom">Superkingdom</option>
+            <option value="kingdom">Kingdom</option>
+            <option value="subkingdom">Subkingdom</option>
+            <option value="superphylum">Superphylum</option>
+            <option value="phylum">Phylum</option>
+            <option value="subphylum">Subphylum</option>
+            <option value="superclass">Superclass</option>
+            <option value="class">Class</option>
+            <option value="subclass">Subclass</option>
+            <option value="superorder">Superorder</option>
+            <option value="order">Order</option>
+            <option value="suborder">Suborder</option>
+            <option value="superfamily">Superfamily</option>
+            <option value="family">Family</option>
+            <option value="subfamily">Subfamily</option>
+            <option value="tribe">Tribe</option>
+            <option value="subtribe">Subtribe</option>
+            <option value="genus">Genus</option>
+            <option value="subgenus">Subgenus</option>
+            <option selected="true" value="species">Species</option>
+            <option value="subspecies">Subspecies</option>
+        </param>
+        <param name="out_format" type="select" label="Select output format">
+            <option value="reads">Diagnostic read list</option>
+            <option value="counts">Number of diagnostic reads per taxonomic rank</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="out_file1" />
+    </outputs>
+      <tests>
+    <test>
+      <param name="input1" value="taxonomyGI.taxonomy" ftype="taxonomy"/>
+      <param name="id_col" value="1" />
+      <param name="rank_list" value="order,genus" />
+      <param name="out_format" value="counts" />
+      <output name="out_file1" file="find_diag_hits.tabular" />
+    </test>
+  </tests>
+
+
+<help>
+
+**What it does**
+
+When performing metagenomic analyses it is often necessary to identify sequence reads corresponding to a particular taxonomic group, or, in other words, diagnostic of a particular taxonomic rank. This utility performs this analysis. It takes data generated by *Taxonomy manipulation->Fetch Taxonomic Ranks* as input and outputs either a list of sequence reads unique to a particular taxonomic rank, or a list of taxonomic ranks and the count of unique reads corresponding to each rank.
+
+------
+
+**Example**
+
+Suppose the *Taxonomy manipulation->Fetch Taxonomic Ranks* generated the following taxonomy representation::
+
+    read1 2      root Eukaryota Metazoa n n Chordata   Craniata Gnathostomata Mammalia n        Laurasiatheria   n           Ruminantia  n             Bovidae     Bovinae      n          n          Bos        n Bos taurus        n
+    read2 12585	 root Eukaryota Metazoa n n Chordata   Craniata Gnathostomata Mammalia n        Euarchontoglires Primates	 Haplorrhini Hominoidea    Hominidae   n            n          n          Homo       n Homo sapiens      n
+    read1 58615  root Eukaryota Metazoa n n Arthropoda n        Hexapoda      Insecta  Neoptera Amphiesmenoptera Lepidoptera Glossata    Papilionoidea Nymphalidae Nymphalinae  Melitaeini Phyciodina Anthanassa n Anthanassa otanes n
+    read3 56785	 root Eukaryota Metazoa n n Chordata   Craniata Gnathostomata Mammalia n        Euarchontoglires Primates	 Haplorrhini Hominoidea    Hominidae   n            n          n          Homo       n Homo sapiens      n
+
+Running this tool with the following parameters:
+
+  * *Select column with sequence id* set to **c1**
+  * *Select taxonomic ranks* with **order**, and **genus** checked
+  * *Output format* set to **Diagnostic read list**
+
+will return::
+
+    read2 Primates order
+    read3 Primates order
+    read2 Homo     genus
+    read3 Homo     genus
+
+Changing *Output format* set to **Number of diagnostic reads per taxonomic rank** will produce::
+
+    Primates 2       order
+    Homo     2       genus
+
+.. class:: infomark
+
+Note that **read1** is omitted because it is non-unique: it hits Mammals and Insects at the same time.
+
+--------
+
+.. class:: warningmark
+
+This tool omits "**n**" corresponding to ranks missing from NCBI taxonomy. In the above example *Home sapiens* contains the order name (Primates) while *Bos taurus* does not.
+
+
+</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/find_diag_hits.tabular	Thu Jan 23 12:30:47 2014 -0500
@@ -0,0 +1,2 @@
+Primates	4	order
+Homo	2	genus
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/taxonomyGI.taxonomy	Thu Jan 23 12:30:47 2014 -0500
@@ -0,0 +1,5 @@
+9606	9606	root	Eukaryota	Metazoa	n	n	Chordata	Craniata	Gnathostomata	Mammalia	n	Euarchontoglires	Primates	Haplorrhini	Hominoidea	Hominidae	n	n	n	Homo	n	Homo sapiens	n	12583
+40674	40674	root	Eukaryota	Metazoa	n	n	Chordata	Craniata	Gnathostomata	Mammalia	n	n	n	n	n	n	n	n	n	n	n	n	n	410771
+63221	63221	root	Eukaryota	Metazoa	n	n	Chordata	Craniata	Gnathostomata	Mammalia	n	Euarchontoglires	Primates	Haplorrhini	Hominoidea	Hominidae	n	n	n	Homo	n	Homo sapiens	Homo sapiens neanderthalensis	2286205
+9604	9604	root	Eukaryota	Metazoa	n	n	Chordata	Craniata	Gnathostomata	Mammalia	n	Euarchontoglires	Primates	Haplorrhini	Hominoidea	Hominidae	n	n	n	n	n	n	n	23236241
+9443	9443	root	Eukaryota	Metazoa	n	n	Chordata	Craniata	Gnathostomata	Mammalia	n	Euarchontoglires	Primates	n	n	n	n	n	n	n	n	n	n	33001686
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Thu Jan 23 12:30:47 2014 -0500
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="taxonomy" version="1.0.0">
+      <repository changeset_revision="00dc297ecd07" name="package_taxonomy_1_0_0" owner="devteam" prior_installation_required="False" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>