# HG changeset patch
# User bcclaywell
# Date 1424997080 18000
# Node ID ce6db18f5fd321fa6dbf887add82c7030bb7af92
# Parent c8cc6529038c72119f8d36e62ab1d9edec6a1bd0
Uploaded
diff -r c8cc6529038c -r ce6db18f5fd3 bootstrap-wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bootstrap-wrapper.sh Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+source $(dirname $0)/util.sh
+source $1
+
+python $(dirname $0)/bootstrap.py \
+ --plate ${PLATE_ID} \
+ ${JUNIOR} \
+ --zone ${ZONE_ID} \
+ --barcodes ${BARCODES} \
+ --labels ${LABELS} \
+ --metadata ${METADATA} \
+ - < ${SAMPLE_INFO}
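+
+# The config sourced as $1 is generated from the bootstrap tool's configfile
+# template (see bootstrap.xml); a minimal sketch, values illustrative:
+#
+#   PLATE_ID="1"
+#   JUNIOR=""                  # "--junior" for junior runs
+#   ZONE_ID="1"
+#   SAMPLE_INFO="sample_info.csv"
+#   BARCODES="barcodes.csv"
+#   LABELS="labels.csv"
+#   METADATA="metadata.csv"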
diff -r c8cc6529038c -r ce6db18f5fd3 bootstrap.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bootstrap.py Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import csv
+import sys
+import os
+import argparse
+
+def warning(*objs):
+ print("WARNING: ", *objs, file=sys.stderr)
+
+def main(arguments):
+
+    parser = argparse.ArgumentParser(description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument('infile', help = "CSV input",
+        type = argparse.FileType('r'), default = sys.stdin)
+ parser.add_argument('--junior', help = "use junior run specimen naming convention", action = 'store_true')
+ parser.add_argument('--plate', help = "plate number", type = int, required = True)
+ parser.add_argument('--zone', help = "zone number", type = int, required = True)
+ parser.add_argument('--barcodes', help = "name of barcodes file",
+ type = argparse.FileType('w'), default = 'barcodes.csv')
+ parser.add_argument('--labels', help = "name of labels file",
+ type = argparse.FileType('w'), default = 'labels.csv')
+ parser.add_argument('--metadata', help = "name of metadata template file",
+ type = argparse.FileType('w'), default = 'metadata.csv')
+
+ args = parser.parse_args(arguments)
+
+ label_key = 'sampleid'
+ primer_key = 'reverse'
+ barcode_key = 'barcode'
+ zone_key = 'zone'
+
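+    # e.g. plate 5, zone 2, primer "V1-R7" gives "p5z2v1r7", or "j5v1r7" with --junior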
+ fstr = "j{plate_id}{primer_id}" if args.junior else "p{plate_id}z{zone_id}{primer_id}"
+
+    reader = csv.DictReader(args.infile)
+
+ barcodes = csv.writer(args.barcodes)
+ labels = csv.writer(args.labels)
+ metadata = csv.writer(args.metadata)
+
+ barcodes.writerow(['stub', 'barcode'])
+ labels.writerow(['specimen', 'label'])
+ metadata.writerow(['specimen', 'plate', 'zone', 'label', 'primer'])
+
+ seen_labels = {}
+ seen_primers = {}
+
+    # validate each row: fail fast on missing columns or duplicate
+    # labels/primers, and skip rows with empty fields
+    for i, d in enumerate(reader):
+        if not all(k in d for k in (label_key, primer_key, barcode_key)):
+ return "Expected columns not found"
+
+ if zone_key in d and d[zone_key] != str(args.zone):
+ continue
+
+ label = d[label_key]
+ primer = d[primer_key]
+ barcode = d[barcode_key]
+ zone = args.zone
+
+ if not all((label, primer, barcode)):
+ # only print a warning if at least one of the fields is non-empty
+ if any((label, primer, barcode)):
+ warning("Missing required field on row {}, skipping".format(i+2))
+ continue
+
+ if label in seen_labels:
+ return "Duplicate label '{}' found on rows {} and {}".format(label, seen_labels[label]+2, i+2)
+
+ if primer in seen_primers:
+ return "Duplicate primer '{}' found on rows {} and {}".format(primer, seen_primers[primer]+2, i+2)
+
+ seen_labels[label] = i
+ seen_primers[primer] = i
+
+ specimen = fstr.format(plate_id=args.plate, zone_id=zone, primer_id=primer.strip().lower().replace('-',''))
+ barcodes.writerow([specimen, barcode])
+ labels.writerow([specimen, label])
+ metadata.writerow([specimen, args.plate, zone, label, primer])
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))
diff -r c8cc6529038c -r ce6db18f5fd3 bootstrap.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bootstrap.xml Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,67 @@
+
+ for analysis
+
+ yapp_env
+
+
+ macros.xml
+
+ echo "bootstrap script 1.1.0"
+
+ bootstrap-wrapper.sh $config
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+PLATE_ID="${plate_id}"
+#if $run_type.run_type_select == "senior"
+JUNIOR=""
+ZONE_ID="${run_type.zone_id}"
+#else
+JUNIOR="--junior"
+ZONE_ID="1"
+#end if
+SAMPLE_INFO="${sample_info}"
+
+BARCODES="${barcodes}"
+LABELS="${labels}"
+METADATA="${metadata}"
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool parses sample information and creates a mapping of samples to
+barcodes. The sample information file must contain the columns ``sampleid``,
+``barcode``, and ``reverse``, and may optionally contain a ``zone`` column.
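+
+For example, a minimal sample information file might look like this (values
+illustrative)::
+
+    sampleid,barcode,reverse,zone
+    S001,ACGTACGT,V1-R1,1
+    S002,TGCATGCA,V1-R2,1
+
+Rows whose ``zone`` value does not match the requested zone are skipped.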
+
+
+
diff -r c8cc6529038c -r ce6db18f5fd3 classification-wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/classification-wrapper.sh Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+source $(dirname $0)/util.sh
+source $1
+
+classif_table.py \
+ --specimen-map ${SPLIT_MAP} \
+ --metadata-map ${LABEL_MAP} \
+ --rank ${WANT_RANK} \
+ --tallies-wide ${TALLIES_WIDE} \
+ --by-specimen ${BY_SPECIMEN} \
+ ${CLASS_DB} \
+ ${BY_TAXON}
diff -r c8cc6529038c -r ce6db18f5fd3 classification.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/classification.xml Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,104 @@
+
+ in tabular format
+
+ yapp_env
+
+
+ macros.xml
+
+ echo "guppy $(guppy --version)"
+
+ classification-wrapper.sh ${config}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+SPLIT_MAP="${split_map}"
+LABEL_MAP="${label_map}"
+CLASS_DB="${class_db}"
+WANT_RANK="${want_rank}"
+
+BY_TAXON="${by_taxon}"
+BY_SPECIMEN="${by_specimen}"
+TALLIES_WIDE="${tallies_wide}"
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool outputs the classifications made by ``pplacer`` to a tabular format
+appropriate for use with R.
+
+-----
+
+**Example**
+
+Classification is done by containment. Say clade A is the smallest clade of
+the reference tree containing a given placement. The most specific
+classification for that read is the lowest common ancestor of the taxonomic
+classifications for the leaves of A. If the desired classification is more
+specific than that, there is a disconnect between the desired and the actual
+classification: for example, if we try to classify at the species level and
+the clade's LCA is a genus, we will get a genus name. If there is uncertainty
+in read placement, then there is uncertainty in classification.
+
+For example, here is a classification list made for one read using the tabular
+output. The columns are as follows: read name, attempted rank for
+classification, actual rank for classification, taxonomic identifier, and
+confidence. You can see that in this example, there is some uncertainty at and
+below species, but only one classification at the genus level::
+
+ GLKT0ZE01CQ2BU root root 1 1
+ GLKT0ZE01CQ2BU below_root below_root 131567 1
+ GLKT0ZE01CQ2BU superkingdom superkingdom 2 1
+ GLKT0ZE01CQ2BU below_superkingdom superkingdom 2 1
+ GLKT0ZE01CQ2BU below_below_superkingdom superkingdom 2 1
+ GLKT0ZE01CQ2BU superphylum superkingdom 2 1
+ GLKT0ZE01CQ2BU phylum phylum 1239 1
+ GLKT0ZE01CQ2BU subphylum phylum 1239 1
+ GLKT0ZE01CQ2BU class class 186801 1
+ GLKT0ZE01CQ2BU subclass class 186801 1
+ GLKT0ZE01CQ2BU order order 186802 1
+ GLKT0ZE01CQ2BU below_order order 186802 1
+ GLKT0ZE01CQ2BU below_below_order order 186802 1
+ GLKT0ZE01CQ2BU suborder order 186802 1
+ GLKT0ZE01CQ2BU family family 186804 1
+ GLKT0ZE01CQ2BU below_family family 186804 1
+ GLKT0ZE01CQ2BU genus genus 1257 1
+ GLKT0ZE01CQ2BU species_group genus 1257 1
+ GLKT0ZE01CQ2BU species_subgroup genus 1257 1
+ GLKT0ZE01CQ2BU species genus 1257 0.0732247
+ GLKT0ZE01CQ2BU species species 1261 0.853561
+ GLKT0ZE01CQ2BU species species 341694 0.073214
+ GLKT0ZE01CQ2BU below_species genus 1257 0.0732247
+ GLKT0ZE01CQ2BU below_species species 1261 0.853561
+ GLKT0ZE01CQ2BU below_species species 341694 0.073214
+
+
+
diff -r c8cc6529038c -r ce6db18f5fd3 datatypes_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r c8cc6529038c -r ce6db18f5fd3 decorate-wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/decorate-wrapper.sh Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+source $(dirname $0)/util.sh
+source $1
+
+csvcut -c "specimen,${COLUMNS}" ${METADATA} | \
+ csvjoin -c "specimen" ${GROUP_BY_SPECIMEN} - > ${DECORATED_GROUP_BY_SPECIMEN}
+
+# drop duplicate columns (thanks, Erick!)
+#csvcut -c $(head -n 1 addresses.csv | sed "s/,/\n/g" | sort |uniq | paste -s -d",")
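+
+# sketch of the join (hypothetical columns): with GROUP_BY_SPECIMEN columns
+#   specimen,tax_name,tally
+# and METADATA columns
+#   specimen,visit,arm
+# the pipeline above emits specimen,tax_name,tally,visit,arm rows, joined on
+# the shared "specimen" column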
diff -r c8cc6529038c -r ce6db18f5fd3 decorate.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/decorate.xml Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,46 @@
+
+ classification results with sample metadata
+
+ yapp_env
+
+
+ macros.xml
+
+ echo "decorate script 1.0.0"
+
+ decorate-wrapper.sh ${config}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+GROUP_BY_SPECIMEN="${group_by_specimen}"
+METADATA="${metadata}"
+COLUMNS="${columns}"
+
+DECORATED_GROUP_BY_SPECIMEN="${decorated_group_by_specimen}"
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool joins the classifications made by ``pplacer`` with arbitrary sample
+metadata.
+
+
+
diff -r c8cc6529038c -r ce6db18f5fd3 filter-wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter-wrapper.sh Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+source $(dirname $0)/util.sh
+source $1
+
+INPUT_QUAL=$(extify qual ${INPUT_QUAL})
+BARCODES=$(extify csv ${BARCODES})
+RAW_SEQS=$(extify fasta ${RAW_SEQS})
+
+seqmagick quality-filter \
+ --input-qual ${INPUT_QUAL} \
+ --barcode-file ${BARCODES} \
+ --primer "${PRIMER}" \
+ --report-out ${FILTER_REPORT} \
+ --details-out ${FILTER_DETAILS} \
+ --map-out ${SPLIT_MAP} \
+ --barcode-header \
+ --min-length ${MIN_LENGTH} \
+ --min-mean-quality ${MIN_QUALITY} \
+ --quality-window 30 \
+ --quality-window-prop 0.9 \
+ --quality-window-mean-qual 15 \
+ ${RAW_SEQS} \
+ filtered.fasta
+
+if [[ ${REVERSE_COMPLEMENT} == "TRUE" ]]; then
+ seqmagick mogrify \
+ --reverse-complement \
+ filtered.fasta
+fi
+
+mv filtered.fasta ${FILTERED_SEQS}
diff -r c8cc6529038c -r ce6db18f5fd3 filter.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter.xml Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,87 @@
+
+ sequences
+
+ yapp_env
+
+
+ macros.xml
+
+ seqmagick --version
+
+ filter-wrapper.sh ${config}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+RAW_SEQS="${raw_seqs}"
+INPUT_QUAL="${input_qual}"
+BARCODES="${barcodes}"
+PRIMER="${primer}"
+MIN_LENGTH="${min_length}"
+MIN_QUALITY="${min_quality}"
+REVERSE_COMPLEMENT="${reverse_complement}"
+
+FILTERED_SEQS="${filtered_seqs}"
+FILTER_REPORT="${filter_report}"
+FILTER_DETAILS="${filter_details}"
+SPLIT_MAP="${split_map}"
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool truncates and removes sequences that don’t match a set of quality
+criteria, and maps sequence barcodes to specimens. It takes input sequences
+in FASTA format along with a quality file, and outputs the filtered
+sequences together with a filtering summary.
+
+The default quality filter settings are:
+
++---------------------------+------+
+|parameter |value |
++===========================+======+
+|--min-length |350 |
++---------------------------+------+
+|--min-mean-quality |35 |
++---------------------------+------+
+|--quality-window |30 |
++---------------------------+------+
+|--quality-window-prop |0.9 |
++---------------------------+------+
+|--quality-window-mean-qual |15 |
++---------------------------+------+
+
+See seqmagick's `quality filter documentation`_ for full explanations of these
+parameters.
+
+.. _quality filter documentation: http://fhcrc.github.io/seqmagick/quality_filter.html
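+
+The barcodes input is a headered CSV (hence ``--barcode-header`` above); the
+bootstrap tool in this repository writes a compatible file with ``stub`` and
+``barcode`` columns, e.g. (values illustrative)::
+
+    stub,barcode
+    p1z1v1r1,ACGTACGT
+    p1z1v1r2,TGCATGCA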
+
+
+
diff -r c8cc6529038c -r ce6db18f5fd3 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
diff -r c8cc6529038c -r ce6db18f5fd3 pplacer-wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pplacer-wrapper.sh Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+source $(dirname $0)/util.sh
+source $1
+
+QUERY_SEQS=$(extify fasta ${QUERY_SEQS})
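+# default args: -j parallel workers (GALAXY_SLOTS under Galaxy, else 4), -p for
+# posterior probabilities, plus an informative prior with a lower bound and a
+# per-placement identity to the MAP sequence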
+PPLACER_DEFAULT_ARGS="-j ${GALAXY_SLOTS:-4} -p --inform-prior --prior-lower 0.01 --map-identity"
+
+pplacer \
+ ${PPLACER_DEFAULT_ARGS} \
+ -c ${REFPKG} \
+ -o ${PLACED_SEQS} \
+ ${QUERY_SEQS}
diff -r c8cc6529038c -r ce6db18f5fd3 pplacer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pplacer.py Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,54 @@
+import json
+import os
+
+from galaxy.datatypes.data import Text
+from galaxy.datatypes.images import Html
+
+class Jplace(Text):
+ file_ext = "jplace"
+
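+    # per the jplace format, a placement file is a JSON document with (at
+    # least) these four top-level keys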
+    def sniff(self, filename):
+        try:
+            with open(filename, "r") as f:
+                data = json.load(f)
+                if all(k in data for k in ("version", "tree", "placements", "fields")):
+                    return True
+        except (ValueError, TypeError, IOError):
+            pass
+
+        return False
+
+ def get_mime(self):
+ return "application/json"
+
+class AutoPrimaryComposite(Html):
+ composite_type = "auto_primary_file"
+
+ def __init__(self, **kwd):
+ Html.__init__(self, **kwd)
+
+    def regenerate_primary_file(self, dataset):
+        """
+        cannot do this until we are setting metadata
+        """
+        bn = dataset.metadata.base_name
+        efp = dataset.extra_files_path
+        flist = os.listdir(efp)
+        rval = ['<html><head><title>Files for Composite Dataset %s</title></head><body><p/>Composite %s contains:<p/><ul>' % (dataset.name, dataset.name)]
+        for i, fname in enumerate(flist):
+            sfname = os.path.split(fname)[-1]
+            f, e = os.path.splitext(fname)
+            rval.append('<li><a href="%s">%s</a></li>' % (sfname, sfname))
+        rval.append('</ul></body></html>')
+        f = open(dataset.file_name, 'w')
+        f.write("\n".join(rval))
+        f.write('\n')
+        f.close()
+
+ def set_meta(self, dataset, **kwd):
+ Html.set_meta(self, dataset, **kwd)
+ self.regenerate_primary_file(dataset)
+
+ def get_mime(self):
+ return "text/html"
+
+class BasicHtmlComposite(Html):
+ composite_type = "basic"
diff -r c8cc6529038c -r ce6db18f5fd3 pplacer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pplacer.xml Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,53 @@
+
+ on a reference tree
+
+ yapp_env
+
+
+ macros.xml
+
+ echo "pplacer $(pplacer --version)"
+
+ pplacer-wrapper.sh ${config}
+
+
+
+
+
+
+
+
+
+
+
+
+
+REFPKG="${refpkg.extra_files_path}"
+QUERY_SEQS="${query_seqs}"
+
+PLACED_SEQS="${placed_seqs}"
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool places query sequences on a fixed reference phylogenetic tree
+according to a reference alignment, producing taxonomic annotations which can
+be used for classification and visualization.
+
+-----
+
+**Citation**
+
+Matsen F, Kodner R, Armbrust E V: **pplacer: linear time maximum-likelihood and
+Bayesian phylogenetic placement of sequences onto a fixed reference tree**. BMC
+Bioinformatics 2010, **11**:538.
+
+
+
diff -r c8cc6529038c -r ce6db18f5fd3 preclassification-wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/preclassification-wrapper.sh Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+source $(dirname $0)/util.sh
+source $1
+
+PLACED_SEQS=$(extify jplace ${PLACED_SEQS})
+NBC_SEQS=$(extify fasta ${NBC_SEQS})
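+
+# Roughly: guppy redup restores the duplicate reads collapsed during
+# preprocessing, rppr prep_db initializes the classification database, and
+# guppy classify fills it in; multiclass_concat.py then merges multiple
+# classifications per read.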
+
+guppy redup \
+ -m \
+ -d ${DEDUP_INFO} \
+ -o ${REDUPED_SEQS} \
+ ${PLACED_SEQS}
+
+REDUPED_SEQS=$(extify jplace ${REDUPED_SEQS})
+
+rppr prep_db \
+ -c ${REFPKG} \
+ --sqlite ${CLASS_DB}
+
+guppy classify \
+ -c ${REFPKG} \
+ -j ${GALAXY_SLOTS:-4} \
+ --pp \
+ --sqlite ${CLASS_DB} \
+ --classifier hybrid2 \
+ --nbc-sequences ${NBC_SEQS} \
+ ${REDUPED_SEQS}
+
+multiclass_concat.py --dedup-info ${DEDUP_INFO} ${CLASS_DB}
diff -r c8cc6529038c -r ce6db18f5fd3 preclassification.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/preclassification.xml Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,50 @@
+
+ for classification
+
+ yapp_env
+
+
+ macros.xml
+
+ echo "guppy $(guppy --version)"
+
+ preclassification-wrapper.sh ${config}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+DEDUP_INFO="${dedup_info}"
+REFPKG="${refpkg.extra_files_path}"
+NBC_SEQS="${nbc_seqs}"
+PLACED_SEQS="${placed_seqs}"
+
+REDUPED_SEQS="${reduped_seqs}"
+CLASS_DB="${class_db}"
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool writes the classifications made by ``pplacer`` to a database for
+use by the downstream taxonomic classification step.
+
+
+
diff -r c8cc6529038c -r ce6db18f5fd3 preprocessing-wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/preprocessing-wrapper.sh Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+source $1
+
+deduplicate_sequences.py \
+ --split-map ${SPLIT_MAP} \
+ --deduplicated-sequences-file ${DEDUP_INFO} \
+ ${INPUT_SEQS} \
+ ${DEDUP_SEQS}
+
+# adapted from yapp/bin/refpkg_align
+ref_sto=$(taxit rp ${REFPKG} aln_sto)
+profile=$(taxit rp ${REFPKG} profile)
+
+sto=$(mktemp -u).sto
+
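+# align the deduplicated reads against the reference package's profile; the
+# alignment is written to "$sto" and per-sequence scores to the score file, so
+# only cmalign's '#'-prefixed console summary passes through the grep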
+cmalign --cpu ${GALAXY_SLOTS:-4} -o "$sto" --sfile "${ALIGNED_SCORES}" --noprob --dnaout "$profile" "${DEDUP_SEQS}" | grep -E '^#'
+
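+# merge the reference and query alignments and convert to aligned FASTA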
+esl-alimerge --dna --outformat afa "$ref_sto" "$sto" | \
+ seqmagick convert --output-format fasta --dash-gap - "${ALIGNED_SEQS}"
diff -r c8cc6529038c -r ce6db18f5fd3 preprocessing.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/preprocessing.xml Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,53 @@
+
+ in preparation for phylogenetic placement
+
+ yapp_env
+
+
+ macros.xml
+
+ echo "guppy $(guppy --version)"
+
+ preprocessing-wrapper.sh ${config}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+REFPKG="${refpkg.extra_files_path}"
+INPUT_SEQS="${input_seqs}"
+SPLIT_MAP="${split_map}"
+
+DEDUP_SEQS="${dedup_seqs}"
+DEDUP_INFO="${dedup_info}"
+ALIGNED_SEQS="${aligned_seqs}"
+ALIGNED_SCORES="${aligned_scores}"
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool aligns query sequences with the reference sequences used to make the
+reference tree contained in the reference package and then merges the query and
+reference sequences.
+
+
+
diff -r c8cc6529038c -r ce6db18f5fd3 refpkgzip_to_refpkg.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/refpkgzip_to_refpkg.xml Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,11 @@
+
+ unzip -o -j $input -d $output.files_path
+
+
+
+
+
+
+
+
+
diff -r c8cc6529038c -r ce6db18f5fd3 render_datatable-wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/render_datatable-wrapper.sh Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+source $(dirname $0)/util.sh
+source $1
+
+mkdir -p ${OUTPUT_DIR}
+
+python $(dirname $0)/render_datatable.py \
+ < ${INPUT} \
+ > ${OUTPUT_DIR}/index.html
+
+cat <<EOF > ${OUTPUT}
+<html>
+<head>
+<title>Generated table</title>
+</head>
+<body>
+<p><a href="index.html">Generated table</a></p>
+</body>
+</html>
+EOF
diff -r c8cc6529038c -r ce6db18f5fd3 render_datatable.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/render_datatable.py Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,412 @@
+#!/usr/bin/env python
+
+import csv
+import itertools
+import string
+import sys
+
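+# read a small sample of lines for delimiter/dialect sniffing (readlines'
+# argument is a size hint in bytes), then chain the sample back onto the
+# remaining input so no lines are lost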
+input = sys.stdin
+start_lines = input.readlines(10)
+all_input = itertools.chain(iter(start_lines), input)
+
+def detect_delimiter(iterable, char_set):
+ matches = (c for c in char_set if c in iterable)
+ return next(matches, None)
+
+def detect_csv_dialect(sample):
+ try:
+ return csv.Sniffer().sniff(sample)
+    except csv.Error:
+ return None
+
+delimiter = detect_delimiter(start_lines[0], list('\t, '))
+reader = None
+
+if delimiter in list('\t,'):
+ # try to detect csv dialect, which should neatly handle quoted separators and stuff
+ dialect = detect_csv_dialect(''.join(start_lines))
+ if dialect:
+ reader = csv.reader(all_input, dialect)
+
+if not reader:
+ if delimiter in list(string.whitespace):
+ # use str.split() with no arguments to split on arbitrary whitespace strings
+ reader = (line.strip().split() for line in all_input)
+ else:
+ reader = all_input
+
+print """\
+
+
+
+
+
+
+
+
+
+
+
+
+
+ \
+"""
+
+for i, row in enumerate(reader):
+ if i == 0:
+ print "" + " | ".join(row) + " |
"
+ else:
+ print "" + " | ".join(row) + " |
"
+
+ if i == 0:
+ print ""
+
+print """\
+
+
+
+
+\
+"""
diff -r c8cc6529038c -r ce6db18f5fd3 render_datatable.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/render_datatable.xml Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,63 @@
+
+ as an interactive HTML table
+
+ macros.xml
+
+
+ render_datatable-wrapper.sh $config
+
+
+
+
+
+
+
+
+
+
+
+
+INPUT="${input}"
+
+OUTPUT="${output}"
+OUTPUT_DIR="${output.files_path}"
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool reformats a CSV file, like this::
+
+ "seqname","accession","tax_id","species_name","is_type"
+ "FM872653","FM872653","308994","Dialister propionicifaciens",0.0
+ "AY331416","AY331416","239137","Candidate Division TM7 oral",0.0
+ "DQ666092","DQ666092","95818_1","Candidate Division TM7 vaginal",0.0
+ "S002223913","GQ900631","186802_3","bacterium BVAB3-Strain 1",0.0
+ ...
+
+into an interactive HTML table.
+
+The rendered page displays the same rows interactively, along the lines of:
+
++-------------+-----------+----------+---------------------------------------+----------+
+| seqname | accession | tax_id | species_name | is_type |
++=============+===========+==========+=======================================+==========+
+| FM872653 | FM872653 | 308994 | Dialister propionicifaciens | 0.0 |
++-------------+-----------+----------+---------------------------------------+----------+
+| AY331416 | AY331416 | 239137 | Candidate Division TM7 oral | 0.0 |
++-------------+-----------+----------+---------------------------------------+----------+
+| DQ666092 | DQ666092 | 95818_1 | Candidate Division TM7 vaginal | 0.0 |
++-------------+-----------+----------+---------------------------------------+----------+
+| S002223913 | GQ900631 | 186802_3 | bacterium BVAB3-Strain 1 | 0.0 |
++-------------+-----------+----------+---------------------------------------+----------+
+
+...
+
+
+
diff -r c8cc6529038c -r ce6db18f5fd3 taxtastic.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/taxtastic.py Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,64 @@
+import os
+import zipfile
+from galaxy.datatypes.binary import Binary
+from galaxy.datatypes.data import Text
+
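+# a taxtastic reference package is a directory of files whose manifest is
+# CONTENTS.json; the zipped variant below is sniffed by looking for that
+# manifest inside the archive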
+class Refpkg(Text):
+ composite_type = "basic"
+
+ def __init__(self, **kwd):
+ Text.__init__(self, **kwd)
+ self.add_composite_file("CONTENTS.json")
+
+ def get_mime(self):
+ return "application/json"
+
+class RefpkgZip(Binary):
+ file_ext = "refpkg.zip"
+
+ def __init__(self, **kwd):
+ Binary.__init__(self, **kwd)
+
+ def sniff(self, filename):
+ if not zipfile.is_zipfile(filename):
+ return False
+ contains_contents_file = False
+ zip_file = zipfile.ZipFile(filename, "r")
+ for name in zip_file.namelist():
+ if os.path.basename(name) == "CONTENTS.json":
+ contains_contents_file = True
+ break
+ zip_file.close()
+ if not contains_contents_file:
+ return False
+ return True
+
+ def get_mime(self):
+ return "application/zip"
+
+class OfficeXlsx(Binary):
+ file_ext = "xlsx"
+
+ def __init__(self, **kwd):
+ Binary.__init__(self, **kwd)
+
+ # TODO: this should check for an xl/ directory also
+ def sniff(self, filename):
+ if not zipfile.is_zipfile(filename):
+ return False
+ contains_contents_file = False
+ zip_file = zipfile.ZipFile(filename, "r")
+ for name in zip_file.namelist():
+ if os.path.basename(name) == "[Content_Types].xml":
+ contains_contents_file = True
+ break
+ zip_file.close()
+ if not contains_contents_file:
+ return False
+ return True
+
+ def get_mime(self):
+ return "application/zip"
+
+Binary.register_sniffable_binary_format("refpkg.zip", "refpkg.zip", RefpkgZip)
+Binary.register_sniffable_binary_format("xlsx", "xlsx", OfficeXlsx)
diff -r c8cc6529038c -r ce6db18f5fd3 usearch-wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usearch-wrapper.sh Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+source $(dirname $0)/util.sh
+source $1
+
+RDP_SEQS="/shared/silo_researcher/Matsen_F/MatsenGrp/micro_refset/rdp/10_31/tax_filter/filtered/rdp_10_31.filter.fasta"
+RDP_SEQINFO="/shared/silo_researcher/Matsen_F/MatsenGrp/micro_refset/rdp/10_31/tax_filter/filtered/rdp_10_31.filter.seq_info.csv"
+
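+# pull per-read species-level calls (read name, copy count, taxon) out of the
+# classification database for use as usearch query metadata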
+sqlite3 -csv -header ${CLASS_DB} <<EOF > usearch_meta.csv
+SELECT pn.name, CAST(pn.mass AS INT) count, tax_id, tax_name, taxa.rank
+ FROM multiclass_concat
+ JOIN taxa USING (tax_id)
+ JOIN placement_names pn USING (placement_id, name)
+ WHERE want_rank = 'species';
+EOF
+
+romp -v usearch_clusters \
+ --usearch-quietly \
+ --query-group tax_id \
+ --query-duplication count \
+ --database-name seqname \
+ --database-group tax_id \
+ ${INPUT_SEQS} \
+ usearch_meta.csv \
+ ${RDP_SEQS} \
+ ${RDP_SEQINFO} \
+ ${USEARCH_HITS} \
+ ${USEARCH_GROUPS}
diff -r c8cc6529038c -r ce6db18f5fd3 usearch.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usearch.xml Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,50 @@
+
+ with USEARCH
+
+ macros.xml
+
+ /home/matsengrp/local/bin/usearch6_64 --version
+
+ usearch-wrapper.sh $config
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+INPUT_SEQS="${input_seqs}"
+CLASS_DB="${class_db}"
+
+USEARCH_HITS="${usearch_hits}"
+USEARCH_GROUPS="${usearch_groups}"
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool queries large sequence databases for target sequences and assigns
+those sequences to clusters.
+
+-----
+
+**Citation**
+
+Edgar, R C: **Search and clustering orders of magnitude faster than
+BLAST**. Bioinformatics 2010, **26**(19):2460-2461.
+
+
+
diff -r c8cc6529038c -r ce6db18f5fd3 util.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util.sh Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,52 @@
+#!/bin/bash
+
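+# extify EXT FILE...: echo the given files, symlinking any whose extension is
+# not EXT to a name that has it (downstream tools often infer format from the
+# file extension, which Galaxy dataset paths lack)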
+extify() {
+ local REQ_EXT=$1
+ shift
+
+ local OUTPUT=""
+ local FILE
+ for FILE in $*; do
+ local BASENAME=$(basename ${FILE})
+ local EXT=${BASENAME##*.}
+ if [[ ${EXT} != ${REQ_EXT} ]]; then
+ local LINK="${BASENAME%%.*}.${REQ_EXT}"
+ if [[ ! -f ${LINK} ]]; then
+ ln -s ${FILE} ${LINK}
+ fi
+ FILE="${LINK}"
+ fi
+ OUTPUT="${OUTPUT} ${FILE}"
+ done
+
+ echo ${OUTPUT}
+}
+
+# from http://www.linuxjournal.com/content/use-date-command-measure-elapsed-time
+timer() {
+ if [[ $# -eq 0 ]]; then
+ echo $(date '+%s')
+ else
+ local stime=$1
+ etime=$(date '+%s')
+
+ if [[ -z "$stime" ]]; then stime=$etime; fi
+
+ dt=$((etime - stime))
+ ds=$((dt % 60))
+ dm=$(((dt / 60) % 60))
+ dh=$((dt / 3600))
+ printf '%d:%02d:%02d' $dh $dm $ds
+ fi
+}
+
+on_exit() {
+ echo "Elapsed time: $(timer ${START_TIME})"
+}
+
+set -eux
+
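+# dump the (NUL-separated) environment to env.log for debugging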
+xargs -n 1 -0 < /proc/self/environ > env.log
+
+START_TIME=$(timer)
+trap on_exit EXIT
diff -r c8cc6529038c -r ce6db18f5fd3 xlsx_to_csv.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xlsx_to_csv.xml Thu Feb 26 19:31:20 2015 -0500
@@ -0,0 +1,21 @@
+
+
+ yapp_env
+
+ in2csv -f xlsx $input > $output
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool converts a spreadsheet in Microsoft Excel 2007 (.xlsx) format to CSV.
+
+
+