Mercurial > repos > galaxyp > fasta_merge_files_and_filter_unique_sequences

--- a/README.md	Fri Dec 16 05:19:02 2016 -0500
+++ b/README.md	Wed Feb 01 13:24:03 2017 -0500
@@ -9,7 +9,7 @@
 Description
 -----------

-Merge FASTA files, keeping only unique sequences.
+Merge FASTA files, keeping either only unique sequences or only unique header/accession lines.


 GalaxyP Community
@@ -44,4 +44,5 @@
 Authors and contributors:

 * John Chilton <jmchilton@gmail.com>
+* Matt Chambers <matt.chambers42@gmail.com>
 * Minnesota Supercomputing Institute, Univeristy of Minnesota
--- a/fasta_merge_files_and_filter_unique_sequences.py	Fri Dec 16 05:19:02 2016 -0500
+++ b/fasta_merge_files_and_filter_unique_sequences.py	Wed Feb 01 13:24:03 2017 -0500
@@ -46,20 +46,41 @@


 def main():
-    seen_sequences = set([])
+    seen_sequences = dict([])
+    seen_headers = set([])

     out_file = open(sys.argv[1], 'w')
-    for fasta_file in sys.argv[2:]:
+    if sys.argv[2] == "sequence":
+        unique_sequences = True
+    elif sys.argv[2] == "accession":
+        unique_sequences = False
+    else:
+        sys.exit("2nd argument must be 'sequence' or 'accession'")
+
+    for fasta_file in sys.argv[3:]:
+        print("Reading entries from '%s'" % fasta_file)
         fa_reader = FASTAReader(fasta_file)
         for protein in fa_reader:
-            if protein.sequence in seen_sequences:
-                pass
+            if unique_sequences:
+                if protein.header in seen_headers:
+                    print("Skipping protein '%s' with duplicate header" % protein.header)
+                    continue
+                elif protein.sequence in seen_sequences:
+                    print("Skipping protein '%s' with duplicate sequence (first seen as '%s')" % (protein.header, seen_sequences[protein.sequence]))
+                    continue
+                else:
+                    seen_sequences[protein.sequence] = protein.header
+                    seen_headers.add(protein.header)
             else:
-                seen_sequences.add(protein.sequence)
-                out_file.write(protein.header)
-                out_file.write(os.linesep)
-                out_file.write(protein.sequence)
-                out_file.write(os.linesep)
+                if protein.header in seen_headers:
+                    print("Skipping protein '%s' with duplicate header" % protein.header)
+                    continue
+                else:
+                    seen_headers.add(protein.header)
+            out_file.write(protein.header)
+            out_file.write(os.linesep)
+            out_file.write(protein.sequence)
+            out_file.write(os.linesep)
     out_file.close()

 if __name__ == "__main__":
--- a/fasta_merge_files_and_filter_unique_sequences.xml	Fri Dec 16 05:19:02 2016 -0500
+++ b/fasta_merge_files_and_filter_unique_sequences.xml	Wed Feb 01 13:24:03 2017 -0500
@@ -5,13 +5,17 @@
     </requirements>
     <command>
         python '$__tool_directory__/fasta_merge_files_and_filter_unique_sequences.py'
-        '$output'
+        '$output' $uniqueness_criterion
         #for $input in $inputs:
             '$input'
         #end for
     </command>
     <inputs>
         <param name="inputs" format="fasta" multiple="True" type="data" label="Input FASTA files"/>
+        <param name="uniqueness_criterion" type="select" label="How are sequences judged to be unique?">
+            <option value="sequence" selected="true">Accession and Sequence</option>
+            <option value="accession">Accession Only</option>
+        </param>
     </inputs>
     <outputs>
         <data format="fasta" name="output" label="Merged and Filtered FASTA from ${on_string}"/>
@@ -19,7 +23,21 @@
     <tests>
         <test>
           <param name="inputs" value="1.fa,2.fa" ftype="fasta" />
-          <output name="output" file="res.fa" ftype="fasta" />
+          <param name="uniqueness_criterion" value="sequence" />
+          <output name="output" file="res-sequence.fa" ftype="fasta" />
+          <assert_stdout>
+            <has_line line="Skipping protein '&gt;one_2' with duplicate sequence (first seen as '&gt;one')" />
+            <has_line line="Skipping protein '&gt;two_2' with duplicate sequence (first seen as '&gt;two')" />
+            <has_line line="Skipping protein '&gt;three_2' with duplicate header" />
+          </assert_stdout>
+        </test>
+        <test>
+          <param name="inputs" value="1.fa,2.fa" ftype="fasta" />
+          <param name="uniqueness_criterion" value="accession" />
+          <output name="output" file="res-accession.fa" ftype="fasta" />
+          <assert_stdout>
+            <has_line line="Skipping protein '&gt;three_2' with duplicate header" />
+          </assert_stdout>
         </test>
     </tests>
     <help>
@@ -27,7 +45,11 @@
 **What it does**

 Concatenate FASTA database files together.
-Only first appearence of each unique sequence will appear in output.
+
+If the uniqueness criterion is "Accession and Sequence", only the first appearence of each unique sequence will appear in the output.
+Otherwise, duplicate sequences are allowed, but only the first appearance of each accession will appear in the output.
+
+In the context of this script, the accession is the entire header line.

 ------
--- a/test-data/2.fa	Fri Dec 16 05:19:02 2016 -0500
+++ b/test-data/2.fa	Wed Feb 01 13:24:03 2017 -0500
@@ -4,3 +4,5 @@
 GGTGTGTACGT
 >three_2
 ACGTACGACTTTGGTTGTGT
+>three_2
+ACGTACGACTTTGGTTGTGTT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/res-accession.fa	Wed Feb 01 13:24:03 2017 -0500
@@ -0,0 +1,12 @@
+>one
+ACGTACGT
+>two
+GGTGTGTACGT
+>three
+ACGTACG
+>one_2
+ACGTACGT
+>two_2
+GGTGTGTACGT
+>three_2
+ACGTACGACTTTGGTTGTGT
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/res-sequence.fa	Wed Feb 01 13:24:03 2017 -0500
@@ -0,0 +1,8 @@
+>one
+ACGTACGT
+>two
+GGTGTGTACGT
+>three
+ACGTACG
+>three_2
+ACGTACGACTTTGGTTGTGT
\ No newline at end of file
--- a/test-data/res.fa	Fri Dec 16 05:19:02 2016 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
->one
-ACGTACGT
->two
-GGTGTGTACGT
->three
-ACGTACG
->three_2
-ACGTACGACTTTGGTTGTGT