# HG changeset patch # User bcclaywell # Date 1424997080 18000 # Node ID ce6db18f5fd321fa6dbf887add82c7030bb7af92 # Parent c8cc6529038c72119f8d36e62ab1d9edec6a1bd0 Uploaded diff -r c8cc6529038c -r ce6db18f5fd3 bootstrap-wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bootstrap-wrapper.sh Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,13 @@ +#!/bin/bash + +source $(dirname $0)/util.sh +source $1 + +python $(dirname $0)/bootstrap.py \ + --plate ${PLATE_ID} \ + ${JUNIOR} \ + --zone ${ZONE_ID} \ + --barcodes ${BARCODES} \ + --labels ${LABELS} \ + --metadata ${METADATA} \ + - < ${SAMPLE_INFO} diff -r c8cc6529038c -r ce6db18f5fd3 bootstrap.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bootstrap.py Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,84 @@ +#!/usr/bin/env python + +from __future__ import print_function +import csv +import sys +import os +import argparse + +def warning(*objs): + print("WARNING: ", *objs, file=sys.stderr) + +def main(arguments): + + parser = argparse.ArgumentParser(arguments, description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('infile', help = "CSV input", + type = argparse.FileType('r'), default = sys.stdin) + parser.add_argument('--junior', help = "use junior run specimen naming convention", action = 'store_true') + parser.add_argument('--plate', help = "plate number", type = int, required = True) + parser.add_argument('--zone', help = "zone number", type = int, required = True) + parser.add_argument('--barcodes', help = "name of barcodes file", + type = argparse.FileType('w'), default = 'barcodes.csv') + parser.add_argument('--labels', help = "name of labels file", + type = argparse.FileType('w'), default = 'labels.csv') + parser.add_argument('--metadata', help = "name of metadata template file", + type = argparse.FileType('w'), default = 'metadata.csv') + + args = parser.parse_args(arguments) + + label_key = 'sampleid' + primer_key = 'reverse' + barcode_key = 'barcode' + zone_key = 'zone' + + fstr = "j{plate_id}{primer_id}" if args.junior else "p{plate_id}z{zone_id}{primer_id}" + + reader = csv.DictReader(sys.stdin) + + barcodes = csv.writer(args.barcodes) + labels = csv.writer(args.labels) + metadata = csv.writer(args.metadata) + + barcodes.writerow(['stub', 'barcode']) + labels.writerow(['specimen', 'label']) + metadata.writerow(['specimen', 'plate', 'zone', 'label', 'primer']) + + seen_labels = {} + seen_primers = {} + + # TODO: add checks for duplicates, empty fields, etc., and bail if something goes wrong + for i, d in enumerate(reader): + if not all (k in d for k in (label_key, primer_key, barcode_key)): + return "Expected columns not found" + + if zone_key in d and d[zone_key] != str(args.zone): + continue + + label = d[label_key] + primer = d[primer_key] + barcode = d[barcode_key] + zone = args.zone + + if not all((label, primer, barcode)): + # only print a warning if at least one of the fields is non-empty + if any((label, primer, barcode)): + warning("Missing required field on row {}, skipping".format(i+2)) + continue + + if label in seen_labels: + return "Duplicate label '{}' found on rows {} and {}".format(label, seen_labels[label]+2, i+2) + + if primer in seen_primers: + return "Duplicate primer '{}' found on rows {} and {}".format(primer, seen_primers[primer]+2, i+2) + + seen_labels[label] = i + seen_primers[primer] = i + + specimen = fstr.format(plate_id=args.plate, zone_id=zone, primer_id=primer.strip().lower().replace('-','')) + barcodes.writerow([specimen, barcode]) + labels.writerow([specimen, label]) + metadata.writerow([specimen, args.plate, zone, label, primer]) + +if __name__ == '__main__': + sys.exit(main(sys.argv[1:])) diff -r c8cc6529038c -r ce6db18f5fd3 bootstrap.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bootstrap.xml Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,67 @@ + + for analysis + + yapp_env + + + macros.xml + + echo "bootstrap script 1.1.0" + + bootstrap-wrapper.sh $config + + + + + + + + + + + + + + + + + + + + + + + + + +PLATE_ID="${plate_id}" +#if $run_type.run_type_select == "senior" +JUNIOR="" +ZONE_ID="${run_type.zone_id}" +#else +JUNIOR="--junior" +ZONE_ID="1" +#end if +SAMPLE_INFO="${sample_info}" + +BARCODES="${barcodes}" +LABELS="${labels}" +METADATA="${metadata}" + + + + + +.. class:: infomark + +**What it does** + +This tool parses sample information and creates a mapping of samples to +barcodes. The sample information file must contain the columns ``sampleid``, +``barcode``, and ``reverse``, and can optionally contain a ``zone`` column +also. + + + diff -r c8cc6529038c -r ce6db18f5fd3 classification-wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/classification-wrapper.sh Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,13 @@ +#!/bin/bash + +source $(dirname $0)/util.sh +source $1 + +classif_table.py \ + --specimen-map ${SPLIT_MAP} \ + --metadata-map ${LABEL_MAP} \ + --rank ${WANT_RANK} \ + --tallies-wide ${TALLIES_WIDE} \ + --by-specimen ${BY_SPECIMEN} \ + ${CLASS_DB} \ + ${BY_TAXON} diff -r c8cc6529038c -r ce6db18f5fd3 classification.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/classification.xml Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,104 @@ + + in tabular format + + yapp_env + + + macros.xml + + echo "guppy $(guppy --version)" + + classification-wrapper.sh ${config} + + + + + + + + + + + + + + + + + + + + + + + + +SPLIT_MAP="${split_map}" +LABEL_MAP="${label_map}" +CLASS_DB="${class_db}" +WANT_RANK="${want_rank}" + +BY_TAXON="${by_taxon}" +BY_SPECIMEN="${by_specimen}" +TALLIES_WIDE="${tallies_wide}" + + + + + +.. class:: infomark + +**What it does** + +This tool outputs the classifications made by ``pplacer`` to a tabular format +appropriate for use with R. + +----- + +**Example** + +The classifications are simply done by containment. Say clade A of the +reference tree is the smallest such that contains a given placement. The most +specific classification for that read will be the lowest common ancestor of the +taxonomic classifications for the leaves of A. If the desired classification is +more specific than that, then we get a disconnect between the desired and the +actual classification. For example, if we try to classify at the species level +and the clade LCA is a genus, then we will get a genus name. If there is +uncertainty in read placement, then there is uncertainty in classification. + +For example, here is a classification list made for one read using the tabular +output. The columns are as follows: read name, attempted rank for +classification, actual rank for classification, taxonomic identifier, and +confidence. You can see that in this example, there is some uncertainty at and +below species, but only one classification at the genus level:: + + GLKT0ZE01CQ2BU root root 1 1 + GLKT0ZE01CQ2BU below_root below_root 131567 1 + GLKT0ZE01CQ2BU superkingdom superkingdom 2 1 + GLKT0ZE01CQ2BU below_superkingdom superkingdom 2 1 + GLKT0ZE01CQ2BU below_below_superkingdom superkingdom 2 1 + GLKT0ZE01CQ2BU superphylum superkingdom 2 1 + GLKT0ZE01CQ2BU phylum phylum 1239 1 + GLKT0ZE01CQ2BU subphylum phylum 1239 1 + GLKT0ZE01CQ2BU class class 186801 1 + GLKT0ZE01CQ2BU subclass class 186801 1 + GLKT0ZE01CQ2BU order order 186802 1 + GLKT0ZE01CQ2BU below_order order 186802 1 + GLKT0ZE01CQ2BU below_below_order order 186802 1 + GLKT0ZE01CQ2BU suborder order 186802 1 + GLKT0ZE01CQ2BU family family 186804 1 + GLKT0ZE01CQ2BU below_family family 186804 1 + GLKT0ZE01CQ2BU genus genus 1257 1 + GLKT0ZE01CQ2BU species_group genus 1257 1 + GLKT0ZE01CQ2BU species_subgroup genus 1257 1 + GLKT0ZE01CQ2BU species genus 1257 0.0732247 + GLKT0ZE01CQ2BU species species 1261 0.853561 + GLKT0ZE01CQ2BU species species 341694 0.073214 + GLKT0ZE01CQ2BU below_species genus 1257 0.0732247 + GLKT0ZE01CQ2BU below_species species 1261 0.853561 + GLKT0ZE01CQ2BU below_species species 341694 0.073214 + + + diff -r c8cc6529038c -r ce6db18f5fd3 datatypes_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes_conf.xml Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + + diff -r c8cc6529038c -r ce6db18f5fd3 decorate-wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/decorate-wrapper.sh Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,10 @@ +#!/bin/bash + +source $(dirname $0)/util.sh +source $1 + +csvcut -c "specimen,${COLUMNS}" ${METADATA} | \ + csvjoin -c "specimen" ${GROUP_BY_SPECIMEN} - > ${DECORATED_GROUP_BY_SPECIMEN} + +# drop duplicate columns (thanks, Erick!) +#csvcut -c $(head -n 1 addresses.csv | sed "s/,/\n/g" | sort |uniq | paste -s -d",") diff -r c8cc6529038c -r ce6db18f5fd3 decorate.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/decorate.xml Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,46 @@ + + classification results with sample metadata + + yapp_env + + + macros.xml + + echo "decorate script 1.0.0" + + decorate-wrapper.sh ${config} + + + + + + + + + + + + + + +GROUP_BY_SPECIMEN="${group_by_specimen}" +METADATA="${metadata}" +COLUMNS="${columns}" + +DECORATED_GROUP_BY_SPECIMEN="${decorated_group_by_specimen}" + + + + + +.. class:: infomark + +**What it does** + +This tool joins the classifications made by ``pplacer`` with arbitrary sample +metadata. + + + diff -r c8cc6529038c -r ce6db18f5fd3 filter-wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter-wrapper.sh Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,32 @@ +#!/bin/bash + +source $(dirname $0)/util.sh +source $1 + +INPUT_QUAL=$(extify qual ${INPUT_QUAL}) +BARCODES=$(extify csv ${BARCODES}) +RAW_SEQS=$(extify fasta ${RAW_SEQS}) + +seqmagick quality-filter \ + --input-qual ${INPUT_QUAL} \ + --barcode-file ${BARCODES} \ + --primer "${PRIMER}" \ + --report-out ${FILTER_REPORT} \ + --details-out ${FILTER_DETAILS} \ + --map-out ${SPLIT_MAP} \ + --barcode-header \ + --min-length ${MIN_LENGTH} \ + --min-mean-quality ${MIN_QUALITY} \ + --quality-window 30 \ + --quality-window-prop 0.9 \ + --quality-window-mean-qual 15 \ + ${RAW_SEQS} \ + filtered.fasta + +if [[ ${REVERSE_COMPLEMENT} == "TRUE" ]]; then + seqmagick mogrify \ + --reverse-complement \ + filtered.fasta +fi + +mv filtered.fasta ${FILTERED_SEQS} diff -r c8cc6529038c -r ce6db18f5fd3 filter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter.xml Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,87 @@ + + sequences + + yapp_env + + + macros.xml + + seqmagick --version + + filter-wrapper.sh ${config} + + + + + + + + + + + + + + + + + + + + + + + + + +RAW_SEQS="${raw_seqs}" +INPUT_QUAL="${input_qual}" +BARCODES="${barcodes}" +PRIMER="${primer}" +MIN_LENGTH="${min_length}" +MIN_QUALITY="${min_quality}" +REVERSE_COMPLEMENT="${reverse_complement}" + +FILTERED_SEQS="${filtered_seqs}" +FILTER_REPORT="${filter_report}" +FILTER_DETAILS="${filter_details}" +SPLIT_MAP="${split_map}" + + + + + +.. class:: infomark + +**What it does** + +This tool truncates and removes sequences that don’t match a set of quality +criteria, as well as mapping sequence barcodes to specimens. It takes input +sequences in FASTA format and a quality file, and outputs the filtered +sequences as well as a filtering summary. + +The default quality filter settings are: + ++---------------------------+------+ +|parameter |value | ++===========================+======+ +|--min-length |350 | ++---------------------------+------+ +|--min-mean-quality |35 | ++---------------------------+------+ +|--quality-window |30 | ++---------------------------+------+ +|--quality-window-prop |0.9 | ++---------------------------+------+ +|--quality-window-mean-qual |15 | ++---------------------------+------+ + +See seqmagick's `quality filter documentation`_ for full explanations of these +parameters. + +.. _quality filter documentation: http://fhcrc.github.io/seqmagick/quality_filter.html + + + diff -r c8cc6529038c -r ce6db18f5fd3 macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,8 @@ + + + + + + + + diff -r c8cc6529038c -r ce6db18f5fd3 pplacer-wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pplacer-wrapper.sh Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,13 @@ +#!/bin/bash + +source $(dirname $0)/util.sh +source $1 + +QUERY_SEQS=$(extify fasta ${QUERY_SEQS}) +PPLACER_DEFAULT_ARGS="-j ${GALAXY_SLOTS:-4} -p --inform-prior --prior-lower 0.01 --map-identity" + +pplacer \ + ${PPLACER_DEFAULT_ARGS} \ + -c ${REFPKG} \ + -o ${PLACED_SEQS} \ + ${QUERY_SEQS} diff -r c8cc6529038c -r ce6db18f5fd3 pplacer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pplacer.py Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,54 @@ +import json +from galaxy.datatypes.data import Text +from galaxy.datatypes.images import Html + +class Jplace(Text): + file_ext = "jplace" + + def sniff(self, filename): + try: + with open(filename, "r") as f: + data = json.load(f) + if all (k in data for k in ("version", "tree", "placements", "fields")): + return True + except: + pass + + return False + + def get_mime(self): + return "application/json" + +class AutoPrimaryComposite(Html): + composite_type = "auto_primary_file" + + def __init__(self, **kwd): + Html.__init__(self, **kwd) + + def regenerate_primary_file(self,dataset): + """ + cannot do this until we are setting metadata + """ + bn = dataset.metadata.base_name + efp = dataset.extra_files_path + flist = os.listdir(efp) + rval = ['Files for Composite Dataset %s

Composite %s contains:

' ) + f = file(dataset.file_name,'w') + f.write("\n".join( rval )) + f.write('\n') + f.close() + + def set_meta(self, dataset, **kwd): + Html.set_meta(self, dataset, **kwd) + self.regenerate_primary_file(dataset) + + def get_mime(self): + return "text/html" + +class BasicHtmlComposite(Html): + composite_type = "basic" diff -r c8cc6529038c -r ce6db18f5fd3 pplacer.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pplacer.xml Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,53 @@ + + on a reference tree + + yapp_env + + + macros.xml + + echo "pplacer $(pplacer --version)" + + pplacer-wrapper.sh ${config} + + + + + + + + + + + + + +REFPKG="${refpkg.extra_files_path}" +QUERY_SEQS="${query_seqs}" + +PLACED_SEQS="${placed_seqs}" + + + + + +.. class:: infomark + +**What it does** + +This tool places query sequences on a fixed reference phylogenetic tree +according to a reference alignment, producing taxonomic annotations which can +be used for classification and visualization. + +----- + +**Citation** + +Matsen F, Kodner R, Armbrust E V: **pplacer: linear time maximum-likelihood and +Bayesian phylogenetic placement of sequences onto a fixed reference tree**. BMC +Bioinformatics 2010, **11**:1. + + + diff -r c8cc6529038c -r ce6db18f5fd3 preclassification-wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/preclassification-wrapper.sh Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,30 @@ +#!/bin/bash + +source $(dirname $0)/util.sh +source $1 + +PLACED_SEQS=$(extify jplace ${PLACED_SEQS}) +NBC_SEQS=$(extify fasta ${NBC_SEQS}) + +guppy redup \ + -m \ + -d ${DEDUP_INFO} \ + -o ${REDUPED_SEQS} \ + ${PLACED_SEQS} + +REDUPED_SEQS=$(extify jplace ${REDUPED_SEQS}) + +rppr prep_db \ + -c ${REFPKG} \ + --sqlite ${CLASS_DB} + +guppy classify \ + -c ${REFPKG} \ + -j ${GALAXY_SLOTS:-4} \ + --pp \ + --sqlite ${CLASS_DB} \ + --classifier hybrid2 \ + --nbc-sequences ${NBC_SEQS} \ + ${REDUPED_SEQS} + +multiclass_concat.py --dedup-info ${DEDUP_INFO} ${CLASS_DB} diff -r c8cc6529038c -r ce6db18f5fd3 preclassification.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/preclassification.xml Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,50 @@ + + for classification + + yapp_env + + + macros.xml + + echo "guppy $(guppy --version)" + + preclassification-wrapper.sh ${config} + + + + + + + + + + + + + + + + +DEDUP_INFO="${dedup_info}" +REFPKG="${refpkg.extra_files_path}" +NBC_SEQS="${nbc_seqs}" +PLACED_SEQS="${placed_seqs}" + +REDUPED_SEQS="${reduped_seqs}" +CLASS_DB="${class_db}" + + + + + +.. class:: infomark + +**What it does** + +This tool outputs the classifications made by ``pplacer`` to a database for use +in taxonomic classification. + + + diff -r c8cc6529038c -r ce6db18f5fd3 preprocessing-wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/preprocessing-wrapper.sh Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,20 @@ +#!/bin/bash + +source $1 + +deduplicate_sequences.py \ + --split-map ${SPLIT_MAP} \ + --deduplicated-sequences-file ${DEDUP_INFO} \ + ${INPUT_SEQS} \ + ${DEDUP_SEQS} + +# adapted from yapp/bin/refpkg_align +ref_sto=$(taxit rp ${REFPKG} aln_sto) +profile=$(taxit rp ${REFPKG} profile) + +sto=$(mktemp -u).sto + +cmalign --cpu ${GALAXY_SLOTS:-4} -o "$sto" --sfile "${ALIGNED_SCORES}" --noprob --dnaout "$profile" "${DEDUP_SEQS}" | grep -E '^#' + +esl-alimerge --dna --outformat afa "$ref_sto" "$sto" | \ + seqmagick convert --output-format fasta --dash-gap - "${ALIGNED_SEQS}" diff -r c8cc6529038c -r ce6db18f5fd3 preprocessing.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/preprocessing.xml Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,53 @@ + + in preparation for phylogenetic placement + + yapp_env + + + macros.xml + + echo "guppy $(guppy --version)" + + preprocessing-wrapper.sh ${config} + + + + + + + + + + + + + + + + + +REFPKG="${refpkg.extra_files_path}" +INPUT_SEQS="${input_seqs}" +SPLIT_MAP="${split_map}" + +DEDUP_SEQS="${dedup_seqs}" +DEDUP_INFO="${dedup_info}" +ALIGNED_SEQS="${aligned_seqs}" +ALIGNED_SCORES="${aligned_scores}" + + + + + +.. class:: infomark + +**What it does** + +This tool aligns query sequences with the reference sequences used to make the +reference tree contained in the reference package and then merges the query and +reference sequences. + + + diff -r c8cc6529038c -r ce6db18f5fd3 refpkgzip_to_refpkg.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/refpkgzip_to_refpkg.xml Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,11 @@ + + unzip -o -j $input -d $output.files_path + + + + + + + + + diff -r c8cc6529038c -r ce6db18f5fd3 render_datatable-wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/render_datatable-wrapper.sh Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,20 @@ +#!/bin/bash + +source $(dirname $0)/util.sh +source $1 + +mkdir -p ${OUTPUT_DIR} + +python $(dirname $0)/render_datatable.py \ + < ${INPUT} \ + > ${OUTPUT_DIR}/index.html + +cat < ${OUTPUT} + + + + + Generated table + + +EOF diff -r c8cc6529038c -r ce6db18f5fd3 render_datatable.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/render_datatable.py Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,412 @@ +#!/usr/bin/env python + +import csv +import itertools +import string +import sys + +input = sys.stdin +start_lines = input.readlines(10) +all_input = itertools.chain(iter(start_lines), input) + +def detect_delimiter(iterable, char_set): + matches = (c for c in char_set if c in iterable) + return next(matches, None) + +def detect_csv_dialect(sample): + try: + return csv.Sniffer().sniff(sample) + except: + return None + +delimiter = detect_delimiter(start_lines[0], list('\t, ')) +reader = None + +if delimiter in list('\t,'): + # try to detect csv dialect, which should neatly handle quoted separators and stuff + dialect = detect_csv_dialect(''.join(start_lines)) + if dialect: + reader = csv.reader(all_input, dialect) + +if not reader: + if delimiter in list(string.whitespace): + # use str.split() with no arguments to split on arbitrary whitespace strings + reader = (line.strip().split() for line in all_input) + else: + reader = all_input + +print """\ + + + + + + + + + + + +
+ + \ +""" + +for i, row in enumerate(reader): + if i == 0: + print "" + else: + print "" + + if i == 0: + print "" + +print """\ + +
" + "".join(row) + "
" + "".join(row) + "
+
+ +\ +""" diff -r c8cc6529038c -r ce6db18f5fd3 render_datatable.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/render_datatable.xml Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,63 @@ + + as an interactive HTML table + + macros.xml + + + render_datatable-wrapper.sh $config + + + + + + + + + + + + +INPUT="${input}" + +OUTPUT="${output}" +OUTPUT_DIR="${output.files_path}" + + + + + +.. class:: infomark + +**What it does** + +This tool reformats a CSV file, like this:: + + "seqname","accession","tax_id","species_name","is_type" + "FM872653","FM872653","308994","Dialister propionicifaciens",0.0 + "AY331416","AY331416","239137","Candidate Division TM7 oral",0.0 + "DQ666092","DQ666092","95818_1","Candidate Division TM7 vaginal",0.0 + "S002223913","GQ900631","186802_3","bacterium BVAB3-Strain 1",0.0 + ... + +into an interactive HTML table. + +[placeholder] + ++-------------+-----------+----------+---------------------------------------+----------+ +| seqname | accession | tax_id | species_name | is_type | ++=============+===========+==========+=======================================+==========+ +| FM872653 | FM872653 | 308994 | Dialister propionicifaciens | 0.0 | ++-------------+-----------+----------+---------------------------------------+----------+ +| AY331416 | AY331416 | 239137 | Candidate Division TM7 oral | 0.0 | ++-------------+-----------+----------+---------------------------------------+----------+ +| DQ666092 | DQ666092 | 95818_1 | Candidate Division TM7 vaginal | 0.0 | ++-------------+-----------+----------+---------------------------------------+----------+ +| S002223913 | GQ900631 | 186802_3 | bacterium BVAB3-Strain 1 | 0.0 | ++-------------+-----------+----------+---------------------------------------+----------+ + +... + + + diff -r c8cc6529038c -r ce6db18f5fd3 taxtastic.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/taxtastic.py Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,64 @@ +import os +import zipfile +from galaxy.datatypes.binary import Binary +from galaxy.datatypes.data import Text + +class Refpkg(Text): + composite_type = "basic" + + def __init__(self, **kwd): + Text.__init__(self, **kwd) + self.add_composite_file("CONTENTS.json") + + def get_mime(self): + return "application/json" + +class RefpkgZip(Binary): + file_ext = "refpkg.zip" + + def __init__(self, **kwd): + Binary.__init__(self, **kwd) + + def sniff(self, filename): + if not zipfile.is_zipfile(filename): + return False + contains_contents_file = False + zip_file = zipfile.ZipFile(filename, "r") + for name in zip_file.namelist(): + if os.path.basename(name) == "CONTENTS.json": + contains_contents_file = True + break + zip_file.close() + if not contains_contents_file: + return False + return True + + def get_mime(self): + return "application/zip" + +class OfficeXlsx(Binary): + file_ext = "xlsx" + + def __init__(self, **kwd): + Binary.__init__(self, **kwd) + + # TODO: this should check for an xl/ directory also + def sniff(self, filename): + if not zipfile.is_zipfile(filename): + return False + contains_contents_file = False + zip_file = zipfile.ZipFile(filename, "r") + for name in zip_file.namelist(): + if os.path.basename(name) == "[Content_Types].xml": + contains_contents_file = True + break + zip_file.close() + if not contains_contents_file: + return False + return True + + def get_mime(self): + return "application/zip" + +Binary.register_sniffable_binary_format("refpkg.zip", "refpkg.zip", RefpkgZip) +Binary.register_sniffable_binary_format("xlsx", "xlsx", OfficeXlsx) diff -r c8cc6529038c -r ce6db18f5fd3 usearch-wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usearch-wrapper.sh Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,28 @@ +#!/bin/bash + +source $(dirname $0)/util.sh +source $1 + +RDP_SEQS="/shared/silo_researcher/Matsen_F/MatsenGrp/micro_refset/rdp/10_31/tax_filter/filtered/rdp_10_31.filter.fasta" +RDP_SEQINFO="/shared/silo_researcher/Matsen_F/MatsenGrp/micro_refset/rdp/10_31/tax_filter/filtered/rdp_10_31.filter.seq_info.csv" + +sqlite3 -csv -header ${CLASS_DB} < usearch_meta.csv +SELECT pn.name, CAST(pn.mass AS INT) count, tax_id, tax_name, taxa.rank + FROM multiclass_concat + JOIN taxa USING (tax_id) + JOIN placement_names pn USING (placement_id, name) + WHERE want_rank = 'species'; +EOF + +romp -v usearch_clusters \ + --usearch-quietly \ + --query-group tax_id \ + --query-duplication count \ + --database-name seqname \ + --database-group tax_id \ + ${INPUT_SEQS} \ + usearch_meta.csv \ + ${RDP_SEQS} \ + ${RDP_SEQINFO} \ + ${USEARCH_HITS} \ + ${USEARCH_GROUPS} diff -r c8cc6529038c -r ce6db18f5fd3 usearch.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usearch.xml Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,50 @@ + + with USEARCH + + macros.xml + + /home/matsengrp/local/bin/usearch6_64 --version + + usearch-wrapper.sh $config + + + + + + + + + + + + + + +INPUT_SEQS="${input_seqs}" +CLASS_DB="${class_db}" + +USEARCH_HITS="${usearch_hits}" +USEARCH_GROUPS="${usearch_groups}" + + + + + +.. class:: infomark + +**What it does** + +This tool queries large sequence databases for target sequences and assigns +those sequences to clusters. + +----- + +**Citation** + +Edgar, R C: **Search and clustering orders of magnitude faster than +BLAST**. Bioinformatics 2010, **26**:19. + + + diff -r c8cc6529038c -r ce6db18f5fd3 util.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/util.sh Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,52 @@ +#!/bin/bash + +extify() { + local REQ_EXT=$1 + shift + + local OUTPUT="" + local FILE + for FILE in $*; do + local BASENAME=$(basename ${FILE}) + local EXT=${BASENAME##*.} + if [[ ${EXT} != ${REQ_EXT} ]]; then + local LINK="${BASENAME%%.*}.${REQ_EXT}" + if [[ ! -f ${LINK} ]]; then + ln -s ${FILE} ${LINK} + fi + FILE="${LINK}" + fi + OUTPUT="${OUTPUT} ${FILE}" + done + + echo ${OUTPUT} +} + +# from http://www.linuxjournal.com/content/use-date-command-measure-elapsed-time +timer() { + if [[ $# -eq 0 ]]; then + echo $(date '+%s') + else + local stime=$1 + etime=$(date '+%s') + + if [[ -z "$stime" ]]; then stime=$etime; fi + + dt=$((etime - stime)) + ds=$((dt % 60)) + dm=$(((dt / 60) % 60)) + dh=$((dt / 3600)) + printf '%d:%02d:%02d' $dh $dm $ds + fi +} + +on_exit() { + echo "Elapsed time: $(timer ${START_TIME})" +} + +set -eux + +xargs -n 1 -0 < /proc/self/environ > env.log + +START_TIME=$(timer) +trap on_exit EXIT diff -r c8cc6529038c -r ce6db18f5fd3 xlsx_to_csv.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xlsx_to_csv.xml Thu Feb 26 19:31:20 2015 -0500 @@ -0,0 +1,21 @@ + + + yapp_env + + in2csv -f xlsx $input > $output + + + + + + + + +.. class:: infomark + +**What it does** + +This tool converts a spreadsheet in Microsoft Excel 2007 (.xlsx) format to CSV. + + +