diff relabel_fasta.py @ 5:bbfc9638ba84 draft

First version with (partial) bioconda deps.
author pjbriggs
date Wed, 13 Jun 2018 08:39:26 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/relabel_fasta.py	Wed Jun 13 08:39:26 2018 -0400
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+"""
+Replace FASTA labels with new labels <PREFIX>1, <PREFIX>2,
+<PREFIX>3 etc (where <PREFIX> is a user-provided argument).
+
+Can be used to label OTUs as OTU_1, OTU_2 etc.
+
+This is a reimplementation of the fasta_number.py script from
+https://drive5.com/python/fasta_number_py.html
+"""
+
+import argparse
+
+def relabel_fasta(fp,prefix,include_size=False):
+    """
+    Relabel the records of a FASTA file with numbered labels
+
+    Header lines (starting with '>') are replaced with
+    '><prefix><n>', where <n> counts the headers seen so far
+    (starting from 1). All other non-blank lines are passed
+    through unchanged; blank lines are dropped.
+
+    Arguments:
+      fp: iterable yielding the lines of the FASTA file
+        (e.g. a file object opened for reading)
+      prefix: string to put in front of the record number
+      include_size: if True then the first ';'-separated
+        field starting with 'size=' in the old label is
+        appended to the new one (e.g. '>OTU_1;size=10')
+
+    Yields:
+      Output lines, without trailing newlines.
+
+    Raises:
+      Exception: if 'include_size' is True and a label has
+        no 'size=' field.
+    """
+    # Number of header lines seen so far
+    nlabel = 0
+    for line in fp:
+        # Strip trailing newlines
+        line = line.rstrip('\n')
+        if not line:
+            # Skip blank lines
+            continue
+        if not line.startswith('>'):
+            # Sequence data: echo the line to output
+            yield line
+            continue
+        # Header line: start of a new record
+        nlabel += 1
+        if not include_size:
+            yield ">%s%d" % (prefix,nlabel)
+            continue
+        # Extract size from the old label; a list is built
+        # (rather than indexing the result of filter()) so
+        # this works on both Python 2 and Python 3
+        label = line[1:].strip()
+        sizes = [field for field in label.split(';')
+                 if field.startswith("size=")]
+        if not sizes:
+            raise Exception("Couldn't locate 'size' in "
+                            "label: %s" % label)
+        yield ">%s%d;%s" % (prefix,nlabel,sizes[0])
+
+if __name__ == "__main__":
+
+    # Only needed by the script entry point (interpreter
+    # version check below)
+    import sys
+
+    # Set up command line parser
+    p = argparse.ArgumentParser(
+        description="Replace FASTA labels with <PREFIX>1, "
+        "<PREFIX>2, <PREFIX>3 etc")
+    p.add_argument("--needsize",action="store_true",
+                   help="copy the 'size=...' field from each "
+                   "original label onto the new label")
+    # --nosize is parsed but never read below, so it has no
+    # effect (presumably kept so that fasta_number.py-style
+    # command lines are still accepted)
+    p.add_argument("--nosize",action="store_true",
+                   help="accepted but ignored (new labels only "
+                   "carry a size when --needsize is given)")
+    p.add_argument("fasta",help="input FASTA file")
+    p.add_argument("prefix",help="prefix for the new labels")
+
+    # Process command line
+    args = p.parse_args()
+
+    # Universal newlines: Python 2 needs the 'U' flag, whereas
+    # Python 3 text mode does this by default and rejects 'U'
+    # from version 3.11 onwards
+    mode = 'rU' if sys.version_info[0] < 3 else 'r'
+
+    # Relabel FASTA and write the result to stdout
+    with open(args.fasta,mode) as fasta:
+        for line in relabel_fasta(fasta,
+                                  args.prefix,
+                                  include_size=args.needsize):
+            # Parenthesised single argument prints identically
+            # under the Python 2 statement and Python 3 function
+            print(line)
+
+