annotate relabel_fasta.py @ 5:bbfc9638ba84 draft

First version with (partial) bioconda deps.
author pjbriggs
date Wed, 13 Jun 2018 08:39:26 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
1 #!/usr/bin/env python
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
2
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
3 """
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
4 Replace FASTA labels with new labels <PREFIX>1, <PREFIX>2,
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
5 <PREFIX>3 etc (where <PREFIX> is a user-provided argument).
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
6
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
7 Can be used to label OTUs as OTU_1, OTU_2 etc.
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
8
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
9 This is a reimplementation of the fasta_number.py script from
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
10 https://drive5.com/python/fasta_number_py.html
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
11 """
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
12
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
13 import argparse
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
14
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
def relabel_fasta(fp, prefix, include_size=False):
    """
    Relabel the sequences in a FASTA file

    Generator which reads lines from 'fp' and yields the lines
    of a new FASTA where each header line (i.e. a line starting
    with '>') is replaced with '><PREFIX><N>', with N counting
    the headers from 1. Blank lines are dropped; all other
    lines are yielded unchanged (minus the trailing newline).

    Arguments:
      fp: iterable returning the lines of the input FASTA
        (for example a file-like object opened for reading)
      prefix: string to put in front of the sequence number
        in each new label
      include_size: if True then the first ';'-separated
        field starting with 'size=' is copied from the old
        label and appended to the new one, giving
        '><PREFIX><N>;size=...' (default is to discard the
        whole of the old label)

    Yields:
      String: the next line of the relabelled FASTA, without
        a trailing newline.

    Raises:
      ValueError: if 'include_size' is set and a header has
        no 'size=' field.
    """
    nlabel = 0
    for line in fp:
        # Strip trailing newlines
        line = line.rstrip('\n')
        if not line:
            # Skip blank lines
            continue
        if not line.startswith('>'):
            # Sequence data: echo the line to output
            yield line
            continue
        # Header line: always counted, even if the old label
        # is discarded
        nlabel += 1
        if not include_size:
            yield ">%s%d" % (prefix, nlabel)
            continue
        # Extract size from the old label; an explicit loop is
        # used (rather than indexing the result of 'filter')
        # so this behaves the same under Python 2 and 3
        label = line[1:].strip()
        size = None
        for field in label.split(';'):
            if field.startswith("size="):
                size = field
                break
        if size is None:
            raise ValueError("Couldn't locate 'size' in "
                             "label: %s" % label)
        yield ">%s%d;%s" % (prefix, nlabel, size)
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
48
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
if __name__ == "__main__":
    import sys

    # Set up command line parser
    p = argparse.ArgumentParser()
    p.add_argument("--needsize", action="store_true",
                   help="copy the 'size=...' field from each "
                   "original label onto the new label")
    # NB --nosize is parsed but its value is never read below
    p.add_argument("--nosize", action="store_true",
                   help="accepted but currently has no effect")
    p.add_argument("fasta",
                   help="input FASTA file")
    p.add_argument("prefix",
                   help="prefix for the new labels")

    # Process command line
    args = p.parse_args()

    # Universal newline handling needs an explicit 'U' under
    # Python 2; it is the default for text mode under Python 3
    # (where the 'U' flag was removed in 3.11)
    if sys.version_info[0] < 3:
        mode = 'rU'
    else:
        mode = 'r'

    # Relabel FASTA
    with open(args.fasta, mode) as fasta:
        for line in relabel_fasta(fasta,
                                  args.prefix,
                                  include_size=args.needsize):
            # Function-call form with a single argument works
            # under both Python 2 and Python 3
            print(line)
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
67
bbfc9638ba84 First version with (partial) bioconda deps.
pjbriggs
parents:
diff changeset
68