Mercurial > repos > artbio > cherry_pick_fasta

--- a/cherry_pick_fasta.py	Fri Apr 08 16:56:08 2022 +0000
+++ b/cherry_pick_fasta.py	Thu Dec 29 11:52:53 2022 +0000
@@ -1,4 +1,5 @@
 import argparse
+from collections import defaultdict


 def Parser():
@@ -28,23 +29,49 @@


 def parse_fasta_dict(query, fasta_dict, mode):
+
     if not isinstance(query, list):
         query = [query]
+
+    def kmers(string, ksize, index):
+        if ksize > len(string):
+            return
+        for i in range(len(string) - ksize + 1):
+            kmer = string[i:i+ksize]
+            index[kmer].append(string)
+
+    def consult_index(word, index):
+        accumulator = []
+        print(len(index[word]))
+        for title in index[word]:
+            accumulator.append(title)
+        print(len(accumulator))
+        for title in set(accumulator):
+            print(title)
+
     accumulator = []
     if mode == 'includes':
-        for seq_id in fasta_dict:
-            for string in query:
-                if string in seq_id:
-                    accumulator.append(seq_id)
-                    continue
+        kmersizes = set([len(word) for word in query])
+        index = defaultdict(list)
+        for size in kmersizes:
+            for header in fasta_dict:
+                kmers(header, size, index)
+        for keyword in query:
+            for header in index[keyword]:
+                accumulator.append(header)
+        accumulator = set(accumulator)
+        res_dict = {k: fasta_dict[k] for k in fasta_dict if k in accumulator}
+        return res_dict
     elif mode == 'exact':
-        for seq_id in fasta_dict:
-            for string in query:
-                if string == seq_id:
-                    accumulator.append(seq_id)
-                    continue
-    res_dict = {k: fasta_dict[k] for k in fasta_dict if k in accumulator}
-    return res_dict
+        for keyword in query:
+            try:
+                len(fasta_dict[keyword])
+                accumulator.append(keyword)
+            except KeyError:
+                pass
+        accumulator = set(accumulator)
+        res_dict = {k: fasta_dict[k] for k in fasta_dict if k in accumulator}
+        return res_dict


 def complement_fasta_dict(fasta_dict, subfasta_dict):
--- a/cherry_pick_fasta.xml	Fri Apr 08 16:56:08 2022 +0000
+++ b/cherry_pick_fasta.xml	Thu Dec 29 11:52:53 2022 +0000
@@ -1,4 +1,4 @@
-<tool id="cherry_pick_fasta" name="Pick Fasta sequences" version="3.3">
+<tool id="cherry_pick_fasta" name="Pick Fasta sequences" version="4.0">
   <description>with header satisfying a string query</description>
   <requirements>
         <requirement type="package" version="3.8.0">python</requirement>
@@ -93,8 +93,6 @@
         <param name="match" value="exact" />
         <output name="output" ftype="fasta" file="output_exact.fa" />
     </test>
-
-
     <test>
         <param ftype="fasta" name="input" value="input.fa" />
         <param name="options_selector" value="textdataset" />
@@ -111,8 +109,6 @@
         <param name="match" value="exact" />
         <output name="output" ftype="fasta" file="output_alt_termlist.fa" />
     </test>
-
-
     <!-- partial matches -->
     <test>
         <param ftype="fasta" name="input" value="input.fa" />
@@ -147,5 +143,7 @@
 This tool retrieves nucleotide/peptide sequences from a fasta file whose headers match
 or do not match a given string, or a list of strings.

+Note that version 4 of this tool is dramatically faster than earlier versions.
+
   </help>
 </tool>