Mercurial > repos > bgruening > find_subsequences

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/find_subsequences.py	Thu Mar 19 09:40:43 2015 -0400
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+import re
+import sys
+import argparse
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqUtils import nt_search
+from Bio.Alphabet import generic_dna
+
+choices = ['embl', 'fasta', 'fastq-sanger', 'fastq', 'fastq-solexa', 'fastq-illumina', 'genbank', 'gb']
+
+def find_pattern(seqs, pattern, outfile_path):
+    """
+    Finds all occurrences of a pattern in the a given sequence.
+    Outputs sequence ID, start and end postion of the pattern.
+    """
+    pattern = pattern.upper()
+    rev_compl = Seq(pattern, generic_dna).complement()
+    search_func = simple_pattern_search
+    if set(pattern).difference(set('ATCG')):
+        search_func = complex_pattern_search
+
+    with open(outfile_path, 'w+') as outfile:
+        for seq in seqs:
+            search_func(seq, pattern, outfile)
+            search_func(seq, rev_compl, outfile, '-')
+
+
+def simple_pattern_search(sequence, pattern, outfile, strand='+'):
+    """
+    Simple regular expression search. This is way faster than the complex search.
+    """
+    bed_template = '%s\t%s\t%s\t%s\t%s\t%s\n'
+    for match in re.finditer( str(pattern), str(sequence.seq) ):
+        outfile.write(bed_template % (sequence.id,  match.start(), match.end(), sequence.name, '', strand))
+
+
+def complex_pattern_search(sequence, pattern, outfile, strand='+'):
+    """
+    Searching for pattern with biopyhon's nt_search().
+    This allows for ambiguous values, like N = A or T or C or G, R = A or G ...
+    """
+    l = len(pattern)
+    matches = nt_search(str(sequence.seq), pattern)
+    bed_template = '%s\t%s\t%s\t%s\t%s\t%s\n'
+    for match in matches[1:]:
+        outfile.write(bed_template % (sequence.id, match, match+l, sequence.name, '', strand) )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i', '--input' , required=True)
+    parser.add_argument('-o', '--output' , required=True)
+    parser.add_argument('-p', '--pattern' , required=True)
+    parser.add_argument('-f', '--format', default="fasta", choices=choices)
+    args = parser.parse_args()
+
+    with open(args.input) as handle:
+        find_pattern( SeqIO.parse(handle, args.format), args.pattern, args.output )
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/find_subsequences.xml	Thu Mar 19 09:40:43 2015 -0400
@@ -0,0 +1,58 @@
+<tool id="bg_find_subsequences" name="Subsequence search" version="0.1">
+    <description>providing regions in BED format</description>
+    <requirements>
+        <requirement type="package" version="1.6.5">biopyhon</requirement>
+    </requirements>
+    <command interpreter="python">
+    <![CDATA[
+        find_subsequences.py
+            --input $input
+            --output $output
+            --pattern "$pattern"
+            #if $input.ext == 'fasta':
+                --format 'fasta'
+            #else:
+                --format 'fastq'
+            #end if
+    ]]>
+    </command>
+    <inputs>
+        <param format="fasta,fastq" name="input" type="data" label="Sequence to be searched"
+            help="This can be in FASTA or FASTQ format." />
+        <param name="pattern" type="text" value="" label="Search pattern" help="For example GGATCC for BamH1.">
+            <validator type="regex" message="Only letters are allowed.">^[a-zA-Z]+$</validator>
+        </param>
+    </inputs>
+    <outputs>
+        <data format="bed" name="output" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="find_subsequences_input1.fasta" ftype="fasta"/>
+            <param name="pattern" value="atcg"/>
+            <output name="output" file="find_subsequences_simple_result1.bed" ftype="bed"/>
+        </test>
+        <test>
+            <param name="input" value="find_subsequences_input1.fasta" ftype="fasta"/>
+            <param name="pattern" value="atnncg"/>
+            <output name="output" file="find_subsequences_advanced_result1.bed" ftype="bed"/>
+        </test>
+    </tests>
+    <help>
+<![CDATA[
+
+**What it does**
+
+Searches for a subsequence in a larger sequence. For example to get all restriction enzymes for BamH1.
+
+If you are searching in a DNA sequence you can use ambiguous values using the standard
+`nucleotide ambiguity code <http://www.dnabaser.com/articles/IUPAC%20ambiguity%20codes.html>`_.
+
+This tool is searching on both strands.
+
+]]>
+    </help>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btp163</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/find_subsequences_advanced_result1.bed	Thu Mar 19 09:40:43 2015 -0400
@@ -0,0 +1,2 @@
+forward_advanced	9	15			+
+reverse_advanced	9	15			-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/find_subsequences_input1.fasta	Thu Mar 19 09:40:43 2015 -0400
@@ -0,0 +1,11 @@
+>forward_simple
+ACCTGTAGCATCGGGGCTCGTA
+
+>reverse_simple
+AAAATTTAGCGATTGTAGTGTAGATAGTA
+
+>forward_advanced
+AAAAAAAAAATGGCGTTTTTTTTTT
+
+>reverse_advanced
+AAAAAAAAATAGGGCTTTTTTTTTTT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/find_subsequences_simple_result1.bed	Thu Mar 19 09:40:43 2015 -0400
@@ -0,0 +1,3 @@
+forward_simple	9	13			+
+forward_simple	5	9			-
+reverse_simple	6	10			-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Thu Mar 19 09:40:43 2015 -0400
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="biopython" version="1.6.5">
+        <repository changeset_revision="f8d72690eeae" name="package_biopython_1_65" owner="biopython" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>