Mercurial > repos > peterjc > seq_primer_clip
changeset 11:5ccb4e31510a draft
planemo upload for repository https://github.com/peterjc/pico_galaxy/tree/master/tools/seq_primer_clip commit 4bd49529e9ca2096cd875e98daf7190d13fa8d0b-dirty
author | peterjc |
---|---|
date | Wed, 01 Feb 2017 13:18:48 -0500 |
parents | 1ea4bc07d303 |
children | 86cfa8eebb73 |
files | tools/seq_primer_clip/README.rst tools/seq_primer_clip/seq_primer_clip.py tools/seq_primer_clip/seq_primer_clip.xml tools/seq_primer_clip/tool_dependencies.xml |
diffstat | 4 files changed, 70 insertions(+), 66 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/seq_primer_clip/README.rst Thu May 21 10:53:06 2015 -0400 +++ b/tools/seq_primer_clip/README.rst Wed Feb 01 13:18:48 2017 -0500 @@ -1,7 +1,7 @@ Galaxy tool to primer clip (trim) FASTA, FASTQ or SFF reads =========================================================== -This tool is copyright 2011-2015 by Peter Cock, The James Hutton Institute +This tool is copyright 2011-2017 by Peter Cock, The James Hutton Institute (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. See the licence text below (MIT licence). @@ -68,6 +68,9 @@ v0.0.13 - Use ``format_source=...`` tag. - Reorder XML elements (internal change only). - Planemo for Tool Shed upload (``.shed.yml``, internal change only). + - Fixed input file help text. +v0.0.14 - Updated to point at Biopython 1.67 (latest version in Tool Shed). + - Explicit dependency on ``galaxy_sequence_utils``. ======= ====================================================================== @@ -86,12 +89,12 @@ Planemo commands (which requires you have set your Tool Shed access details in ``~/.planemo.yml`` and that you have access rights on the Tool Shed):: - $ planemo shed_update --shed_target testtoolshed --check_diff ~/repositories/pico_galaxy/tools/seq_primer_clip/ + $ planemo shed_update -t testtoolshed --check_diff ~/repositories/pico_galaxy/tools/seq_primer_clip/ ... or:: - $ planemo shed_update --shed_target toolshed --check_diff ~/repositories/pico_galaxy/tools/seq_primer_clip/ + $ planemo shed_update -t toolshed --check_diff ~/repositories/pico_galaxy/tools/seq_primer_clip/ ... To just build and check the tar ball, use::
--- a/tools/seq_primer_clip/seq_primer_clip.py Thu May 21 10:53:06 2015 -0400 +++ b/tools/seq_primer_clip/seq_primer_clip.py Wed Feb 01 13:18:48 2017 -0500 @@ -38,49 +38,45 @@ print "v0.0.12" sys.exit(0) -def sys_exit(msg, err=1): - sys.stderr.write(msg) - sys.exit(err) - try: from Bio.Seq import reverse_complement from Bio.SeqIO.SffIO import SffIterator, SffWriter except ImportError: - sys_exit("Requires Biopython 1.54 or later") + sys.exit("Requires Biopython 1.54 or later") try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: - #Prior to Biopython 1.56 this was a private function + # Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest -#Parse Command Line +# Parse Command Line try: in_file, seq_format, primer_fasta, primer_type, mm, min_len, keep_negatives, out_file = sys.argv[1:] except ValueError: - sys_exit("Expected 8 arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) + sys.exit("Expected 8 arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv))) if in_file == primer_fasta: - sys_exit("Same file given as both primer sequences and sequences to clip!") + sys.exit("Same file given as both primer sequences and sequences to clip!") if in_file == out_file: - sys_exit("Same file given as both sequences to clip and output!") + sys.exit("Same file given as both sequences to clip and output!") if primer_fasta == out_file: - sys_exit("Same file given as both primer sequences and output!") + sys.exit("Same file given as both primer sequences and output!") try: mm = int(mm) except ValueError: - sys_exit("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm) + sys.exit("Expected non-negative integer number of mismatches (e.g. 0 or 1), not %r" % mm) if mm < 0: - sys_exit("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm) -if mm not in [0,1,2]: + sys.exit("Expected non-negtive integer number of mismatches (e.g. 0 or 1), not %r" % mm) +if mm not in [0, 1, 2]: raise NotImplementedError try: min_len = int(min_len) except ValueError: - sys_exit("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len) + sys.exit("Expected non-negative integer min_len (e.g. 0 or 1), not %r" % min_len) if min_len < 0: - sys_exit("Expected non-negtive integer min_len (e.g. 0 or 1), not %r" % min_len) + sys.exit("Expected non-negtive integer min_len (e.g. 0 or 1), not %r" % min_len) if keep_negatives.lower() in ["true", "yes", "on"]: @@ -88,7 +84,7 @@ elif keep_negatives.lower() in ["false", "no", "off"]: keep_negatives = False else: - sys_exit("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives) + sys.exit("Expected boolean for keep_negatives (e.g. true or false), not %r" % keep_negatives) if primer_type.lower() == "forward": @@ -101,7 +97,7 @@ forward = False rc = True else: - sys_exit("Expected foward, reverse or reverse-complement not %r" % primer_type) + sys.exit("Expected foward, reverse or reverse-complement not %r" % primer_type) ambiguous_dna_values = { @@ -119,9 +115,9 @@ "H": "ACTMWYH", "D": "AGTRWKD", "B": "CGTSYKB", - "X": ".", #faster than [GATCMRWSYKVVHDBXN] or even [GATC] + "X": ".", # faster than [GATCMRWSYKVVHDBXN] or even [GATC] "N": ".", - } +} ambiguous_dna_re = {} for letter, values in ambiguous_dna_values.iteritems(): @@ -134,39 +130,41 @@ def make_reg_ex(seq): return "".join(ambiguous_dna_re[letter] for letter in seq) + def make_reg_ex_mm(seq, mm): if mm > 2: raise NotImplementedError("At most 2 mismatches allowed!") seq = seq.upper() yield make_reg_ex(seq) - for i in range(1,mm+1): - #Missing first/last i bases at very start/end of sequence - for reg in make_reg_ex_mm(seq[i:], mm-i): + for i in range(1, mm + 1): + # Missing first/last i bases at very start/end of sequence + for reg in make_reg_ex_mm(seq[i:], mm - i): yield "^" + reg - for reg in make_reg_ex_mm(seq[:-i], mm-i): + for reg in make_reg_ex_mm(seq[:-i], mm - i): yield "$" + reg if mm >= 1: - for i,letter in enumerate(seq): - #We'll use a set to remove any duplicate patterns - #if letter not in "NX": - pattern = seq[:i] + "N" + seq[i+1:] + for i, letter in enumerate(seq): + # We'll use a set to remove any duplicate patterns + # if letter not in "NX": + pattern = seq[:i] + "N" + seq[i + 1:] assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \ % (pattern, len(pattern), seq, len(seq)) yield make_reg_ex(pattern) - if mm >=2: - for i,letter in enumerate(seq): - #We'll use a set to remove any duplicate patterns - #if letter not in "NX": - for k,letter in enumerate(seq[i+1:]): - #We'll use a set to remove any duplicate patterns - #if letter not in "NX": - pattern = seq[:i] + "N" + seq[i+1:i+1+k] + "N" + seq[i+k+2:] + if mm >= 2: + for i, letter in enumerate(seq): + # We'll use a set to remove any duplicate patterns + # if letter not in "NX": + for k, letter in enumerate(seq[i + 1:]): + # We'll use a set to remove any duplicate patterns + # if letter not in "NX": + pattern = seq[:i] + "N" + seq[i + 1:i + 1 + k] + "N" + seq[i + k + 2:] assert len(pattern) == len(seq), "Len %s is %i, len %s is %i" \ % (pattern, len(pattern), seq, len(seq)) yield make_reg_ex(pattern) + def load_primers_as_re(primer_fasta, mm, rc=False): - #Read primer file and record all specified sequences + # Read primer file and record all specified sequences primers = set() in_handle = open(primer_fasta, "rU") reader = fastaReader(in_handle) @@ -176,19 +174,18 @@ seq = reverse_complement(record.sequence) else: seq = record.sequence - #primers.add(re.compile(make_reg_ex(seq))) + # primers.add(re.compile(make_reg_ex(seq))) count += 1 for pattern in make_reg_ex_mm(seq, mm): primers.add(pattern) in_handle.close() - #Use set to avoid duplicates, sort to have longest first - #(so more specific primers found before less specific ones) + # Use set to avoid duplicates, sort to have longest first + # (so more specific primers found before less specific ones) primers = sorted(set(primers), key=lambda p: -len(p)) - return count, re.compile("|".join(primers)) #make one monster re! + return count, re.compile("|".join(primers)) # make one monster re! - -#Read primer file and record all specified sequences +# Read primer file and record all specified sequences count, primer = load_primers_as_re(primer_fasta, mm, rc) print "%i primer sequences" % count @@ -197,8 +194,8 @@ clipped = 0 negs = 0 -if seq_format.lower()=="sff": - #SFF is different because we just change the trim points +if seq_format.lower() == "sff": + # SFF is different because we just change the trim points if forward: def process(records): global short_clipped, short_neg, clipped, negs @@ -208,8 +205,8 @@ seq = str(record.seq)[left_clip:right_clip].upper() result = primer.search(seq) if result: - #Forward primer, take everything after it - #so move the left clip along + # Forward primer, take everything after it + # so move the left clip along if len(seq) - result.end() >= min_len: record.annotations["clip_qual_left"] = left_clip + result.end() clipped += 1 @@ -231,8 +228,8 @@ seq = str(record.seq)[left_clip:right_clip].upper() result = primer.search(seq) if result: - #Reverse primer, take everything before it - #so move the right clip back + # Reverse primer, take everything before it + # so move the right clip back new_len = result.start() if new_len >= min_len: record.annotations["clip_qual_right"] = left_clip + new_len @@ -246,7 +243,7 @@ yield record else: short_neg += 1 - + in_handle = open(in_file, "rb") try: manifest = ReadRocheXmlManifest(in_handle) @@ -256,7 +253,7 @@ out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) writer.write_file(process(SffIterator(in_handle))) - #End of SFF code + # End of SFF code elif seq_format.lower().startswith("fastq"): in_handle = open(in_file, "rU") out_handle = open(out_file, "w") @@ -267,7 +264,7 @@ seq = record.sequence.upper() result = primer.search(seq) if result: - #Forward primer, take everything after it + # Forward primer, take everything after it cut = result.end() record.sequence = seq[cut:] if len(record.sequence) >= min_len: @@ -287,7 +284,7 @@ seq = record.sequence.upper() result = primer.search(seq) if result: - #Reverse primer, take everything before it + # Reverse primer, take everything before it cut = result.start() record.sequence = seq[:cut] if len(record.sequence) >= min_len: @@ -302,18 +299,18 @@ writer.write(record) else: short_neg += 1 -elif seq_format.lower()=="fasta": +elif seq_format.lower() == "fasta": in_handle = open(in_file, "rU") out_handle = open(out_file, "w") reader = fastaReader(in_handle) writer = fastaWriter(out_handle) - #Following code is identical to that for FASTQ but without editing qualities + # Following code is identical to that for FASTQ but without editing qualities if forward: for record in reader: seq = record.sequence.upper() result = primer.search(seq) if result: - #Forward primer, take everything after it + # Forward primer, take everything after it cut = result.end() record.sequence = seq[cut:] if len(record.sequence) >= min_len: @@ -332,7 +329,7 @@ seq = record.sequence.upper() result = primer.search(seq) if result: - #Reverse primer, take everything before it + # Reverse primer, take everything before it cut = result.start() record.sequence = seq[:cut] if len(record.sequence) >= min_len: @@ -347,7 +344,7 @@ else: short_neg += 1 else: - sys_exit("Unsupported file type %r" % seq_format) + sys.exit("Unsupported file type %r" % seq_format) in_handle.close() out_handle.close()
--- a/tools/seq_primer_clip/seq_primer_clip.xml Thu May 21 10:53:06 2015 -0400 +++ b/tools/seq_primer_clip/seq_primer_clip.xml Wed Feb 01 13:18:48 2017 -0500 @@ -1,7 +1,8 @@ -<tool id="seq_primer_clip" name="Primer clip sequences" version="0.0.13"> +<tool id="seq_primer_clip" name="Primer clip sequences" version="0.0.14"> <description>Trim off 5' or 3' primers</description> <requirements> - <requirement type="package" version="1.62">biopython</requirement> + <requirement type="package" version="1.0.1">galaxy_sequence_utils</requirement> + <requirement type="package" version="1.67">biopython</requirement> <requirement type="python-module">Bio</requirement> </requirements> <stdio> @@ -14,7 +15,7 @@ seq_primer_clip.py $input_file $input_file.ext $primer_fasta $primer_type $mm $min_len $keep_negatives $output_file </command> <inputs> - <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to clip" description="FASTA, FASTQ, or SFF format."/> + <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to clip" help="FASTA, FASTQ, or SFF format."/> <param name="primer_fasta" type="data" format="fasta" label="FASTA file containing primer(s)"/> <param name="primer_type" type="select" label="Type of primers"> <option value="Forward">Forward (5') primers</option>
--- a/tools/seq_primer_clip/tool_dependencies.xml Thu May 21 10:53:06 2015 -0400 +++ b/tools/seq_primer_clip/tool_dependencies.xml Wed Feb 01 13:18:48 2017 -0500 @@ -1,6 +1,9 @@ <?xml version="1.0"?> <tool_dependency> - <package name="biopython" version="1.62"> - <repository changeset_revision="ac9cc2992b69" name="package_biopython_1_62" owner="biopython" toolshed="https://testtoolshed.g2.bx.psu.edu" /> + <package name="biopython" version="1.67"> + <repository changeset_revision="fc45a61abc2f" name="package_biopython_1_67" owner="biopython" toolshed="https://testtoolshed.g2.bx.psu.edu" /> + </package> + <package name="galaxy_sequence_utils" version="1.0.1"> + <repository changeset_revision="c38bd3fe9da6" name="package_galaxy_sequence_utils_1_0_1" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" /> </package> </tool_dependency>