annotate fasta_tabular_converter.py @ 3:403f0769fc1c draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 030207144f0811822dbdda9a10e036ff8e794d7c
author drosofff
date Fri, 25 Mar 2016 19:29:59 -0400
parents 87e27aa012d7
children 87c99e4af616
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
1 #!/usr/bin/python
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
2 #
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
3 import sys
2
87e27aa012d7 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
4 import string
87e27aa012d7 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
5 import argparse
0
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
6 from collections import defaultdict
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
7
2
87e27aa012d7 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
def Parser():
    """Parse the command line and return the resulting namespace.

    Three optional string options are declared: ``--input``, ``--output``
    and ``--type``.  Each one defaults to ``None`` when it is not supplied.

    Returns:
        argparse.Namespace: namespace with ``input``, ``output`` and
        ``type`` attributes.
    """
    # (flag, help text) pairs; every option is a plain stored string.
    option_help = (
        ('--input', "input file"),
        ('--output', "output converted file"),
        ('--type', "type of convertion"),
    )
    cli = argparse.ArgumentParser()
    for flag, description in option_help:
        cli.add_argument(flag, action="store", type=str, help=description)
    return cli.parse_args()
87e27aa012d7 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
18
87e27aa012d7 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
def readfasta_writetabular(fasta, tabular, mode="oneline"):
    """Collapse a FASTA file into a ``sequence<TAB>count`` table.

    Every record's sequence lines are concatenated, identical sequences are
    counted, and the table is written with the most frequent sequence first.

    Args:
        fasta: path of the FASTA file to read.
        tabular: path of the tabular file to write.
        mode: accepted for backward compatibility; it is not used.

    Raises:
        ValueError: if sequence data appears before the first ``>`` header.

    Notes:
        * Counts are kept in a local table, so the function does not depend
          on a module-level ``seqdic`` and repeated calls do not accumulate.
        * Line endings are stripped explicitly, so a last line without a
          trailing newline keeps its final character.
        * Blank lines are ignored and records without any sequence are
          skipped, because an empty sequence cannot be represented in the
          two-column output.
    """
    counts = defaultdict(int)
    chunks = None  # stays None until the first header has been seen
    with open(fasta, "r") as handle:
        for line in handle:
            line = line.rstrip("\r\n")
            if line.startswith(">"):
                # A new header closes the previous record, if it had data.
                if chunks:
                    counts["".join(chunks)] += 1
                chunks = []
            elif line:
                if chunks is None:
                    raise ValueError(
                        "sequence data found before the first FASTA header "
                        "in %s" % fasta)
                chunks.append(line)
    # The last record has no following header to flush it.
    if chunks:
        counts["".join(chunks)] += 1
    with open(tabular, "w") as out:
        for seq in sorted(counts, key=counts.get, reverse=True):
            out.write("%s\t%s\n" % (seq, counts[seq]))
0
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
37
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
38
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
def readtabular_writefasta(tabular, fasta):
    """Expand a ``sequence count`` table into a FASTA file.

    Each input line is split on whitespace; the first field is the sequence
    and the second field is how many times it is written.  Records are
    numbered consecutively from 1 and that number is used as the header.

    Args:
        tabular: path of the whitespace-separated table to read.
        fasta: path of the FASTA file to write.

    Raises:
        IndexError: if a non-blank line has fewer than two fields.
        ValueError: if the second field is not an integer.

    Blank lines are skipped instead of raising ``IndexError``.
    """
    counter = 0
    with open(tabular, "r") as source, open(fasta, "w") as target:
        for line in source:
            fields = line.split()
            if not fields:
                continue
            sequence = fields[0]
            for _ in range(int(fields[1])):
                counter += 1
                target.write(">%s\n%s\n" % (counter, sequence))
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
50
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
def readtabular_writefastaweighted(tabular, fasta):
    """Convert a ``sequence count`` table into a weighted FASTA file.

    Each table row becomes one record whose header is ``>N_COUNT``, where
    ``N`` is the 1-based rank of the row among the rows written and
    ``COUNT`` is the second field copied verbatim.

    Args:
        tabular: path of the whitespace-separated table to read.
        fasta: path of the weighted FASTA file to write.

    Raises:
        IndexError: if a non-blank line has fewer than two fields.

    Blank lines are skipped (and not numbered) instead of raising
    ``IndexError``.
    """
    counter = 0
    with open(tabular, "r") as source, open(fasta, "w") as target:
        for line in source:
            fields = line.split()
            if not fields:
                continue
            counter += 1
            target.write(">%s_%s\n%s\n" % (counter, fields[1], fields[0]))
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
61
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
def readfastaeighted_writefastaweighted(fastaweigthed_input, fastaweigthed_reparsed):
    """Re-collapse a weighted FASTA file, merging identical sequences.

    Headers are expected to end in ``_WEIGHT``, where ``WEIGHT`` is an
    integer.  Each sequence line adds the weight of the most recent header
    to that sequence's total.  The output is written as ``>RANK_TOTAL``
    records, highest total first, and the sum of all header weights is
    printed to standard output.

    Args:
        fastaweigthed_input: path of the weighted FASTA file to read.
        fastaweigthed_reparsed: path of the weighted FASTA file to write.

    Raises:
        ValueError: if a header does not end in an integer weight, or if a
            sequence line appears before the first header.

    Notes:
        * Every sequence line is counted on its own, as in the original
          code, so each record is expected to hold its sequence on a single
          line.
        * Totals are kept in a local table rather than a module-level one.
        * Line endings are stripped explicitly, so an unterminated last line
          keeps its final character; blank lines are ignored.
    """
    totals = defaultdict(int)
    number_reads = 0
    weight = None  # weight of the most recent header, None before the first
    with open(fastaweigthed_input, "r") as handle:
        for line in handle:
            line = line.rstrip("\r\n")
            if line.startswith(">"):
                weight = int(line[1:].split("_")[-1])
                number_reads += weight
            elif line:
                if weight is None:
                    raise ValueError(
                        "sequence data found before the first FASTA header "
                        "in %s" % fastaweigthed_input)
                totals[line] += weight
    with open(fastaweigthed_reparsed, "w") as out:
        ranked = sorted(totals, key=totals.get, reverse=True)
        for rank, seq in enumerate(ranked, 1):
            out.write(">%s_%s\n%s\n" % (rank, totals[seq], seq))
    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print("%s reads collapsed" % number_reads)
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
79
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
def readfastaeighted_writefasta(fastaweigthed, fasta):
    """Expand a weighted FASTA file into a plain FASTA file.

    Headers are expected to end in ``_WEIGHT``, where ``WEIGHT`` is an
    integer.  Each sequence line is written ``WEIGHT`` times, using the
    weight of the most recent header, and the output records are numbered
    consecutively from 1.

    Args:
        fastaweigthed: path of the weighted FASTA file to read.
        fasta: path of the FASTA file to write.

    Raises:
        ValueError: if a header does not end in an integer weight, or if a
            sequence line appears before the first header.

    Line endings are stripped explicitly, so an unterminated last line keeps
    its final character; blank lines are ignored.
    """
    counter = 0
    weight = None  # weight of the most recent header, None before the first
    with open(fastaweigthed, "r") as source, open(fasta, "w") as target:
        for line in source:
            line = line.rstrip("\r\n")
            if line.startswith(">"):
                weight = int(line[1:].split("_")[-1])
            elif line:
                if weight is None:
                    raise ValueError(
                        "sequence data found before the first FASTA header "
                        "in %s" % fastaweigthed)
                for _ in range(weight):
                    counter += 1
                    target.write(">%s\n%s\n" % (counter, line))
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
94
2
87e27aa012d7 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
def main(input, output, type):
    """Run the conversion selected by ``type`` from ``input`` to ``output``.

    Args:
        input: path of the file to convert.
        output: path of the converted file to write.
        type: one of ``"fasta2tabular"``, ``"tabular2fasta"``,
            ``"tabular2fastaweight"``, ``"fastaweight2fastaweight"`` or
            ``"fastaweight2fasta"``.

    Raises:
        ValueError: if ``type`` is not one of the values above.  Previously
            an unknown or missing type did nothing and wrote no output.
    """
    # The parameter names shadow builtins but are part of the interface.
    if type == "fasta2tabular":
        readfasta_writetabular(input, output)
    elif type == "tabular2fasta":
        readtabular_writefasta(input, output)
    elif type == "tabular2fastaweight":
        readtabular_writefastaweighted(input, output)
    elif type == "fastaweight2fastaweight":
        readfastaeighted_writefastaweighted(input, output)
    elif type == "fastaweight2fasta":
        readfastaeighted_writefasta(input, output)
    else:
        raise ValueError("unknown conversion type: %r" % (type,))
0
70f2654ad7e2 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
106
2
87e27aa012d7 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
if __name__ == "__main__":
    # Kept at module level: converter functions may look up `seqdic` as a
    # global, so it has to exist before main() runs.
    seqdic = defaultdict(int)
    args = Parser()
    main(input=args.input, output=args.output, type=args.type)