Mercurial > repos > peterjc > seq_filter_by_mapping
changeset 3:9bb9b7164c12 draft
Uploaded v0.0.2, fixed some error messages
author | peterjc |
---|---|
date | Tue, 27 Jan 2015 08:29:54 -0500 |
parents | 575e59833ef7 |
children | b4acae21c11e |
files | tools/seq_filter_by_mapping/README.rst tools/seq_filter_by_mapping/seq_filter_by_mapping.py tools/seq_filter_by_mapping/seq_filter_by_mapping.py~ tools/seq_filter_by_mapping/seq_filter_by_mapping.xml tools/seq_filter_by_mapping/seq_filter_by_mapping.xml~ |
diffstat | 5 files changed, 16 insertions(+), 443 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/seq_filter_by_mapping/README.rst Wed Nov 19 10:00:34 2014 -0500 +++ b/tools/seq_filter_by_mapping/README.rst Tue Jan 27 08:29:54 2015 -0500 @@ -61,6 +61,7 @@ Version Changes ------- ---------------------------------------------------------------------- v0.0.1 - Initial version. +v0.0.2 - Fixed some error messages. ======= ====================================================================== @@ -76,7 +77,7 @@ For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use the following command from the Galaxy root folder:: - $ tar -czf seq_filter_by_mapping.tar.gz tools/seq_filter_by_mapping/README.rst tools/seq_filter_by_mapping/seq_filter_by_mapping.* tools/seq_filter_by_mapping/tool_dependencies.xml test-data/SRR639755_mito_pairs.fastq.gz test-data/SRR639755_sample_by_coord.sam test-data/SRR639755_sample_strict.fastq test-data/SRR639755_sample_lax.fastq + $ tar -czf seq_filter_by_mapping.tar.gz tools/seq_filter_by_mapping/README.rst tools/seq_filter_by_mapping/seq_filter_by_mapping.py tools/seq_filter_by_mapping/seq_filter_by_mapping.xml tools/seq_filter_by_mapping/tool_dependencies.xml test-data/SRR639755_mito_pairs.fastq.gz test-data/SRR639755_sample_by_coord.sam test-data/SRR639755_sample_strict.fastq test-data/SRR639755_sample_lax.fastq Check this worked::
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.py Wed Nov 19 10:00:34 2014 -0500 +++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.py Tue Jan 27 08:29:54 2015 -0500 @@ -24,7 +24,7 @@ import subprocess from optparse import OptionParser -def stop_err(msg, err=1): +def sys_exit(msg, err=1): sys.stderr.write(msg.rstrip() + "\n") sys.exit(err) @@ -64,7 +64,7 @@ options, args = parser.parse_args() if options.version: - print "v0.0.1" + print "v0.0.2" sys.exit(0) in_file = options.input @@ -74,18 +74,18 @@ pair_mode = options.pair_mode if in_file is None or not os.path.isfile(in_file): - stop_err("Missing input file: %r" % in_file) + sys_exit("Missing input file: %r" % in_file) if out_positive_file is None and out_negative_file is None: - stop_err("Neither output file requested") + sys_exit("Neither output file requested") if seq_format is None: - stop_err("Missing sequence format") + sys_exit("Missing sequence format") if pair_mode not in ["lax", "strict"]: - stop_err("Pair mode argument should be 'lax' or 'strict', not %r" % logic) + sys_exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode) for mapping in args: if not os.path.isfile(mapping): - stop_err("Mapping file %r not found") + sys_exit("Mapping file %r not found" % mapping) if not args: - stop_err("At least one SAM/BAM mapping file is required") + sys_exit("At least one SAM/BAM mapping file is required") #Cope with three widely used suffix naming convensions, @@ -187,8 +187,9 @@ stdout, stderr = child.communicate() assert child.returncode is not None if child.returncode: - msg = "Error %i from 'samtools view %s'\n%s" % (filename, stderr) - stop_err(msg.strip(), child.returncode) + msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode, + filename, stderr) + sys_exit(msg.strip(), child.returncode) else: handle.close() @@ -317,7 +318,7 @@ try: from Bio.SeqIO.SffIO import SffIterator, SffWriter except ImportError: - stop_err("SFF filtering requires Biopython 1.54 or later") + sys_exit("SFF filtering requires Biopython 1.54 or later") try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest @@ -366,4 +367,4 @@ fastq_filter(in_file, out_positive_file, out_negative_file, ids) # This does not currently track the counts else: - stop_err("Unsupported file type %r" % seq_format) + sys_exit("Unsupported file type %r" % seq_format)
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.py~ Wed Nov 19 10:00:34 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,307 +0,0 @@ -#!/usr/bin/env python -"""Filter a FASTA, FASTQ or SSF file with SAM/BAM mapping information. - -When filtering an SFF file, any Roche XML manifest in the input file is -preserved in both output files. - -This tool is a short Python script which requires Biopython 1.54 or later -for SFF file support. If you use this tool in scientific work leading to a -publication, please cite the Biopython application note: - -Cock et al 2009. Biopython: freely available Python tools for computational -molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. - -This script is copyright 2014 by Peter Cock, The James Hutton Institute -(formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. -See accompanying text file for licence details (MIT license). - -Use -v or --version to get the version, -h or --help for help. -""" -import os -import sys -import re -from optparse import OptionParser - -def stop_err(msg, err=1): - sys.stderr.write(msg.rstrip() + "\n") - sys.exit(err) - -#Parse Command Line -usage = """Use as follows: - -$ python seq_filter_by_mapping.py [options] mapping.sam/bam [more mappings] - -e.g. Positive matches using column one from a single BAM file: - -$ seq_filter_by_mapping.py -i my_seqs.fastq -f fastq -p matches.fastq mapping.bam - -Multiple SAM/BAM mapping files may be given. -""" -parser = OptionParser(usage=usage) -parser.add_option('-i', '--input', dest='input', - default=None, help='Input sequences filename', - metavar="FILE") -parser.add_option('-f', '--format', dest='format', - default=None, - help='Input sequence format (e.g. fasta, fastq, sff)') -parser.add_option('-p', '--positive', dest='output_positive', - default=None, - help='Output filename for mapping reads', - metavar="FILE") -parser.add_option('-n', '--negative', dest='output_negative', - default=None, - help='Output filename for non-mapping reads', - metavar="FILE") -parser.add_option("-m", "--pair-mode", dest="pair_mode", - default="lax", - help="How to treat paired reads (lax or strict)") -parser.add_option("-v", "--version", dest="version", - default=False, action="store_true", - help="Show version and quit") - -options, args = parser.parse_args() - -if options.version: - print "v0.0.1" - sys.exit(0) - -in_file = options.input -seq_format = options.format -out_positive_file = options.output_positive -out_negative_file = options.output_negative -pair_mode = options.pair_mode - -if in_file is None or not os.path.isfile(in_file): - stop_err("Missing input file: %r" % in_file) -if out_positive_file is None and out_negative_file is None: - stop_err("Neither output file requested") -if seq_format is None: - stop_err("Missing sequence format") -if pair_mode not in ["lax", "strict"]: - stop_err("Pair mode argument should be 'lax' or 'strict', not %r" % logic) -for mapping in args: - if not os.path.isfile(mapping): - stop_err("Mapping file %r not found") -if not args: - stop_err("At least one SAM/BAM mapping file is required") - - -#Cope with three widely used suffix naming convensions, -#Illumina: /1 or /2 -#Forward/revered: .f or .r -#Sanger, e.g. .p1k and .q1k -#See http://staden.sourceforge.net/manual/pregap4_unix_50.html -#re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") -#re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") -re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$") -assert re_suffix.search("demo.f") -assert re_suffix.search("demo.s1") -assert re_suffix.search("demo.f1k") -assert re_suffix.search("demo.p1") -assert re_suffix.search("demo.p1k") -assert re_suffix.search("demo.p1lk") -assert re_suffix.search("demo/2") -assert re_suffix.search("demo.r") -assert re_suffix.search("demo.q1") -assert re_suffix.search("demo.q1lk") - -def clean_name(name): - """Remove suffix.""" - match = re_suffix.search(name) - if match: - # Use the fact this is a suffix, and regular expression will be - # anchored to the end of the name: - return name[:match.start()] - else: - # Nothing to do - return name -assert clean_name("foo/1") == "foo" -assert clean_name("foo/2") == "foo" -assert clean_name("bar.f") == "bar" -assert clean_name("bar.r") == "bar" -assert clean_name("baz.p1") == "baz" -assert clean_name("baz.q2") == "baz" - -mapped_chars = { '>' :'__gt__', - '<' :'__lt__', - "'" :'__sq__', - '"' :'__dq__', - '[' :'__ob__', - ']' :'__cb__', - '{' :'__oc__', - '}' :'__cc__', - '@' : '__at__', - '\n' : '__cn__', - '\r' : '__cr__', - '\t' : '__tc__', - '#' : '__pd__' - } - -# Read mapping file(s) and record all mapped identifiers -ids = set() -for mapping in args: - handle = open(mapping) - # Hope this is SAM format... - for line in handle: - # Ignore header lines - if line[0] != "@": - qname, flag, rest = line.split("\t", 2) - flag = int(flag) - # TODO - pair mode - if flag & 0x4: - ids.add(qname) -# TODO - If want to support naive paired mode, have to record -# more than just qname (need /1 or /2 indicator) -print("Loaded %i mapped IDs" % (len(ids))) - - -def crude_fasta_iterator(handle): - """Yields tuples, record ID and the full record as a string.""" - while True: - line = handle.readline() - if line == "": - return # Premature end of file, or just empty? - if line[0] == ">": - break - - no_id_warned = False - while True: - if line[0] != ">": - raise ValueError( - "Records in Fasta files should start with '>' character") - try: - id = line[1:].split(None, 1)[0] - except IndexError: - if not no_id_warned: - sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n") - no_id_warned = True - id = None - lines = [line] - line = handle.readline() - while True: - if not line: - break - if line[0] == ">": - break - lines.append(line) - line = handle.readline() - yield id, "".join(lines) - if not line: - return # StopIteration - - -def fasta_filter(in_file, pos_file, neg_file, wanted): - """FASTA filter producing 60 character line wrapped outout.""" - pos_count = neg_count = 0 - #Galaxy now requires Python 2.5+ so can use with statements, - with open(in_file) as in_handle: - #Doing the if statement outside the loop for speed - #(with the downside of three very similar loops). - if pos_file is not None and neg_file is not None: - print "Generating two FASTA files" - with open(pos_file, "w") as pos_handle: - with open(neg_file, "w") as neg_handle: - for identifier, record in crude_fasta_iterator(in_handle): - if clean_name(identifier) in wanted: - pos_handle.write(record) - pos_count += 1 - else: - neg_handle.write(record) - neg_count += 1 - elif pos_file is not None: - print "Generating matching FASTA file" - with open(pos_file, "w") as pos_handle: - for identifier, record in crude_fasta_iterator(in_handle): - if clean_name(identifier) in wanted: - pos_handle.write(record) - pos_count += 1 - else: - neg_count += 1 - else: - print "Generating non-matching FASTA file" - assert neg_file is not None - with open(neg_file, "w") as neg_handle: - for identifier, record in crude_fasta_iterator(in_handle): - if clean_name(identifier) in wanted: - pos_count += 1 - else: - neg_handle.write(record) - neg_count += 1 - return pos_count, neg_count - - -if seq_format.lower()=="sff": - #Now write filtered SFF file based on IDs from BLAST file - try: - from Bio.SeqIO.SffIO import SffIterator, SffWriter - except ImportError: - stop_err("SFF filtering requires Biopython 1.54 or later") - - try: - from Bio.SeqIO.SffIO import ReadRocheXmlManifest - except ImportError: - #Prior to Biopython 1.56 this was a private function - from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest - in_handle = open(in_file, "rb") #must be binary mode! - try: - manifest = ReadRocheXmlManifest(in_handle) - except ValueError: - manifest = None - #This makes two passes though the SFF file with isn't so efficient, - #but this makes the code simple. - pos_count = neg_count = 0 - if out_positive_file is not None: - out_handle = open(out_positive_file, "wb") - writer = SffWriter(out_handle, xml=manifest) - in_handle.seek(0) #start again after getting manifest - pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids) - out_handle.close() - if out_negative_file is not None: - out_handle = open(out_negative_file, "wb") - writer = SffWriter(out_handle, xml=manifest) - in_handle.seek(0) #start again - neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids) - out_handle.close() - #And we're done - in_handle.close() - #At the time of writing, Galaxy doesn't show SFF file read counts, - #so it is useful to put them in stdout and thus shown in job info. - print "%i with and %i without specified IDs" % (pos_count, neg_count) -elif seq_format.lower()=="fasta": - #Write filtered FASTA file based on IDs from tabular file - pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) - print "%i with and %i without specified IDs" % (pos_count, neg_count) -elif seq_format.lower().startswith("fastq"): - #Write filtered FASTQ file based on IDs from mapping file - from Bio.SeqIO.QualityIO import FastqGeneralIterator - handle = open(in_file, "r") - if out_positive_file is not None and out_negative_file is not None: - print "Generating two FASTQ files" - positive_handle = open(out_positive_file, "w") - negative_handle = open(out_negative_file, "w") - for title, seq, qual in FastqGeneralIterator(handle): - print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) - if clean_name(title.split(None, 1)[0]) in ids: - positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) - else: - negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) - positive_handle.close() - negative_handle.close() - elif out_positive_file is not None: - print "Generating matching FASTQ file" - positive_handle = open(out_positive_file, "w") - for title, seq, qual in FastqGeneralIterator(handle): - if clean_name(title.split(None, 1)[0]) in ids: - positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) - positive_handle.close() - elif out_negative_file is not None: - print "Generating non-matching FASTQ file" - negative_handle = open(out_negative_file, "w") - for title, seq, qual in FastqGeneralIterator(handle): - if clean_name(title.split(None, 1)[0]) not in ids: - negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) - negative_handle.close() - handle.close() -else: - stop_err("Unsupported file type %r" % seq_format)
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml Wed Nov 19 10:00:34 2014 -0500 +++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml Tue Jan 27 08:29:54 2015 -0500 @@ -1,4 +1,4 @@ -<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.1"> +<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.2"> <description>from SAM/BAM file</description> <requirements> <requirement type="package" version="1.64">biopython</requirement>
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml~ Wed Nov 19 10:00:34 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,122 +0,0 @@ -<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.1"> - <description>from SAM/BAM file</description> - <requirements> - <requirement type="package" version="1.64">biopython</requirement> - <requirement type="python-module">Bio</requirement> - </requirements> - <version_command interpreter="python">seq_filter_by_mapping.py --version</version_command> - <command interpreter="python"> -seq_filter_by_mapping.py -i "$input_file" -f "$input_file.ext" -m $pair_mode -#if $output_choice_cond.output_choice=="both" - -p $output_pos -n $output_neg -#elif $output_choice_cond.output_choice=="pos" - -p $output_pos -#elif $output_choice_cond.output_choice=="neg" - -n $output_neg -#end if - </command> - <stdio> - <!-- Anything other than zero is an error --> - <exit_code range="1:" /> - <exit_code range=":-1" /> - </stdio> - <inputs> - <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to be filtered" help="FASTA, FASTQ, or SFF format." /> - <param name="mapping_file" type="data" format="sam,bam" label="SAM/BAM mapping of those sequences" help="SAM or BAM format." /> - <conditional name="output_choice_cond"> - <param name="output_choice" type="select" label="Output mapped reads, unmapped reads, or both?"> - <option value="both">Both mapped and unmapped reads, as two files</option> - <option value="pos">Just mapped reads, as a single file</option> - <option value="neg">Just unmapped reads, as a single file</option> - </param> - <!-- Seems need these dummy entries here, compare this to indels/indel_sam2interval.xml --> - <when value="both" /> - <when value="pos" /> - <when value="neg" /> - </conditional> - <param name="pair_mode" type="select" label="Paired read treatment"> - <option value="strict">Treat as a pair, require both reads to be mapped</option> - <option value="lax" selected="true">Treat as a pair, allow either read to be mapped</option> - <option value="strict">Treat as a pair, require both reads to be mapped</option> - <!-- The following would actually be more work as have to store qname/1 and qname/2 separately for filter... - <option value="solo">Treat independently (will split partners when only one maps)</option> - --> - </param> - </inputs> - <outputs> - <data name="output_pos" format="input" metadata_source="input_file" label="With matched ID"> - <filter>output_choice_cond["output_choice"] != "neg"</filter> - </data> - <data name="output_neg" format="input" metadata_source="input_file" label="Without matched ID"> - <filter>output_choice_cond["output_choice"] != "pos"</filter> - </data> - </outputs> - <tests> - <test> - <param name="input_file" value="k12_ten_proteins.fasta" ftype="fasta" /> - <param name="input_tabular" value="k12_hypothetical.tabular" ftype="tabular" /> - <param name="output_choice" value="pos" /> - <output name="output_pos" file="k12_hypothetical.fasta" ftype="fasta" /> - </test> - <test> - <param name="input_file" value="k12_ten_proteins.fasta" ftype="fasta" /> - <param name="input_tabular" value="k12_hypothetical.tabular" ftype="tabular" /> - <output name="output_pos" file="k12_hypothetical.fasta" ftype="fasta" /> - </test> - <test> - <param name="input_file" value="k12_ten_proteins.fasta" ftype="fasta" /> - <output name="output_pos" file="k12_hypothetical.fasta" ftype="fasta" /> - </test> - <test> - <param name="input_file" value="sanger-pairs-mixed.fastq" ftype="fastq" /> - <output name="output_pos" file="sanger-sample.fastq" ftype="fastq" /> - </test> - <test> - <param name="input_file" value="sanger-pairs-mixed.fastq" ftype="fastq" /> - <output name="output_pos" file="sanger-pairs-mixed.fastq" ftype="fastq" /> - <output name="output_neg" file="empty_file.dat" ftype="fastq" /> - </test> - <test> - <param name="input_file" value="sanger-pairs-mixed.fastq" ftype="fastq" /> - <output name="output_pos" file="empty_file.dat" ftype="fastq" /> - <output name="output_neg" file="sanger-pairs-mixed.fastq" ftype="fastq" /> - </test> - </tests> - <help> -**What it does** - -By default it divides a FASTA, FASTQ or Standard Flowgram Format (SFF) file in -two, those sequences (or read pairs) which do or don't map in the provided -SAM/BAM file. You can opt to have a single output file of just the mapping reads, -or just the non-mapping ones. - -**Example Usage** - -You might wish to perform a contamination screan by mapping your reads against -known contaminant reference sequences, then use this tool to select only the -unmapped reads for further analysis (e.g. *de novo* assembly). - -Similarly you might wish to map your reads against a known bacterial reference, -then take the non-mapping sequences forward for analysis if looking for novel -plasmids. - - -**References** - -If you use this Galaxy tool in work leading to a scientific publication please -cite: - -Peter J.A. Cock (2014), Galaxy tool for filtering reads by mapping -http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_mapping - -This tool uses Biopython to read and write SFF files, so you may also wish to -cite the Biopython application note (and Galaxy too of course): - -Cock et al (2009). Biopython: freely available Python tools for computational -molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. - -This tool is available to install into other Galaxy Instances via the Galaxy -Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_mapping - </help> -</tool>