Mercurial > repos > galaxyp > fasta_merge_files_and_filter_unique_sequences

--- a/fasta_merge_files_and_filter_unique_sequences.py	Wed Feb 01 13:24:03 2017 -0500
+++ b/fasta_merge_files_and_filter_unique_sequences.py	Fri Feb 03 14:27:44 2017 -0500
@@ -1,19 +1,22 @@
 #!/usr/bin/env python
 import os
 import sys
+import re

 class Sequence:
     ''' Holds protein sequence information '''
     def __init__(self):
         self.header = ""
+        self.accession = ""
         self.sequence = ""

 class FASTAReader:
     """
         FASTA db iterator. Returns a single FASTA sequence object.
     """
-    def __init__(self, fasta_name):
+    def __init__(self, fasta_name, accession_parser):
         self.fasta_file = open(fasta_name)
+        self.accession_parser = accession_parser

     def __iter__(self):
         return self
@@ -30,6 +33,11 @@
         seq = Sequence()
         seq.header = line.rstrip().replace('\n','').replace('\r','')

+        m = re.search(self.accession_parser, seq.header)
+        if not m or len(m.groups()) < 1 or len(m.group(1)) == 0:
+          sys.exit("Could not parse accession from '%s'" % seq.header)
+        seq.accession = m.group(1)
+
         while True:
             tail = self.fasta_file.tell()
             line = self.fasta_file.readline()
@@ -47,7 +55,7 @@

 def main():
     seen_sequences = dict([])
-    seen_headers = set([])
+    seen_accessions = set([])

     out_file = open(sys.argv[1], 'w')
     if sys.argv[2] == "sequence":
@@ -57,26 +65,30 @@
     else:
         sys.exit("2nd argument must be 'sequence' or 'accession'")

-    for fasta_file in sys.argv[3:]:
+    accession_parser = sys.argv[3]
+    for key, value in { '\'' :'__sq__', '\\' : '__backslash__' }.items():
+      accession_parser = accession_parser.replace(value, key)
+
+    for fasta_file in sys.argv[4:]:
         print("Reading entries from '%s'" % fasta_file)
-        fa_reader = FASTAReader(fasta_file)
+        fa_reader = FASTAReader(fasta_file, accession_parser)
         for protein in fa_reader:
             if unique_sequences:
-                if protein.header in seen_headers:
-                    print("Skipping protein '%s' with duplicate header" % protein.header)
+                if protein.accession in seen_accessions:
+                    print("Skipping protein '%s' with duplicate accession" % protein.header)
                     continue
-                elif protein.sequence in seen_sequences:
-                    print("Skipping protein '%s' with duplicate sequence (first seen as '%s')" % (protein.header, seen_sequences[protein.sequence]))
+                elif hash(protein.sequence) in seen_sequences:
+                    print("Skipping protein '%s' with duplicate sequence (first seen as '%s')" % (protein.header, seen_sequences[hash(protein.sequence)]))
                     continue
                 else:
-                    seen_sequences[protein.sequence] = protein.header
-                    seen_headers.add(protein.header)
+                    seen_sequences[hash(protein.sequence)] = protein.header
+                    seen_accessions.add(protein.accession)
             else:
-                if protein.header in seen_headers:
-                    print("Skipping protein '%s' with duplicate header" % protein.header)
+                if protein.accession in seen_accessions:
+                    print("Skipping protein '%s' with duplicate accession" % protein.header)
                     continue
                 else:
-                    seen_headers.add(protein.header)
+                    seen_accessions.add(protein.accession)
             out_file.write(protein.header)
             out_file.write(os.linesep)
             out_file.write(protein.sequence)
--- a/fasta_merge_files_and_filter_unique_sequences.xml	Wed Feb 01 13:24:03 2017 -0500
+++ b/fasta_merge_files_and_filter_unique_sequences.xml	Fri Feb 03 14:27:44 2017 -0500
@@ -5,7 +5,7 @@
     </requirements>
     <command>
         python '$__tool_directory__/fasta_merge_files_and_filter_unique_sequences.py'
-        '$output' $uniqueness_criterion
+        '$output' $uniqueness_criterion '$accession_parser'
         #for $input in $inputs:
             '$input'
         #end for
@@ -16,6 +16,19 @@
             <option value="sequence" selected="true">Accession and Sequence</option>
             <option value="accession">Accession Only</option>
         </param>
+        <param name="accession_parser" type="text" label="Accession Parsing Regex" value="^&gt;([^ ]+).*$" help="Regular expression with 1 capture group; the capture group is the accession (which must be unique)">
+          <sanitizer>
+            <valid>
+              <add preset="string.printable"/>
+              <remove value="&#92;" />
+              <remove value="&apos;" />
+            </valid>
+            <mapping initial="none">
+              <add source="&#92;" target="__backslash__" />
+              <add source="&apos;" target="__sq__"/>
+            </mapping>
+          </sanitizer>
+        </param>
     </inputs>
     <outputs>
         <data format="fasta" name="output" label="Merged and Filtered FASTA from ${on_string}"/>
@@ -24,19 +37,23 @@
         <test>
           <param name="inputs" value="1.fa,2.fa" ftype="fasta" />
           <param name="uniqueness_criterion" value="sequence" />
+          <param name="accession_parser" value="^&gt;([^ |]+).*$" />
           <output name="output" file="res-sequence.fa" ftype="fasta" />
           <assert_stdout>
             <has_line line="Skipping protein '&gt;one_2' with duplicate sequence (first seen as '&gt;one')" />
             <has_line line="Skipping protein '&gt;two_2' with duplicate sequence (first seen as '&gt;two')" />
-            <has_line line="Skipping protein '&gt;three_2' with duplicate header" />
+            <has_line line="Skipping protein '&gt;three_2|456' with duplicate accession" />
+            <has_line line="Skipping protein '&gt;three_2 789' with duplicate accession" />
           </assert_stdout>
         </test>
         <test>
           <param name="inputs" value="1.fa,2.fa" ftype="fasta" />
           <param name="uniqueness_criterion" value="accession" />
+          <param name="accession_parser" value="^&gt;([^ |]+).*$" />
           <output name="output" file="res-accession.fa" ftype="fasta" />
           <assert_stdout>
-            <has_line line="Skipping protein '&gt;three_2' with duplicate header" />
+            <has_line line="Skipping protein '&gt;three_2|456' with duplicate accession" />
+            <has_line line="Skipping protein '&gt;three_2 789' with duplicate accession" />
           </assert_stdout>
         </test>
     </tests>
@@ -49,7 +66,7 @@
 If the uniqueness criterion is "Accession and Sequence", only the first appearence of each unique sequence will appear in the output.
 Otherwise, duplicate sequences are allowed, but only the first appearance of each accession will appear in the output.

-In the context of this script, the accession is the entire header line.
+The default accession parser will treat everything in the header before the first space as the accession.

 ------
--- a/test-data/2.fa	Wed Feb 01 13:24:03 2017 -0500
+++ b/test-data/2.fa	Fri Feb 03 14:27:44 2017 -0500
@@ -2,7 +2,9 @@
 ACGTACGT
 >two_2
 GGTGTGTACGT
->three_2
+>three_2|123
 ACGTACGACTTTGGTTGTGT
->three_2
+>three_2|456
+ACGTACGACTTTGGTTGTGT
+>three_2 789
 ACGTACGACTTTGGTTGTGTT
--- a/test-data/res-accession.fa	Wed Feb 01 13:24:03 2017 -0500
+++ b/test-data/res-accession.fa	Fri Feb 03 14:27:44 2017 -0500
@@ -8,5 +8,5 @@
 ACGTACGT
 >two_2
 GGTGTGTACGT
->three_2
+>three_2|123
 ACGTACGACTTTGGTTGTGT
\ No newline at end of file
--- a/test-data/res-sequence.fa	Wed Feb 01 13:24:03 2017 -0500
+++ b/test-data/res-sequence.fa	Fri Feb 03 14:27:44 2017 -0500
@@ -4,5 +4,5 @@
 GGTGTGTACGT
 >three
 ACGTACG
->three_2
+>three_2|123
 ACGTACGACTTTGGTTGTGT
\ No newline at end of file