comparison g_frequency.py @ 2:a47980ef2b96 draft

planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author stevecassidy
date Wed, 01 Nov 2017 01:19:55 -0400
parents fb617586f4b2
children 0df72a8ab095
comparison
equal deleted inserted replaced
1:fb617586f4b2 2:a47980ef2b96
1 import nltk 1 import nltk
2 from nltk import FreqDist 2 from nltk import FreqDist
3 import argparse 3 import argparse
4 4
# Fetch the 'punkt' data package when the module is loaded, with download
# output suppressed (quiet=True).
# NOTE(review): presumably required by nltk.word_tokenize used in
# frequency() - confirm; this runs on every import, not only in script mode.
nltk.download('punkt', quiet=True)
6
7
def arguments(argv=None):
    """Parse the command-line options of the word frequency tool.

    Args:
        argv: optional list of argument strings to parse. When None
            (the default) argparse reads the process arguments, which
            matches the previous behaviour of this function.

    Returns:
        The argparse namespace, with string attributes ``input`` (input
        text file) and ``output`` (output file path). Both options are
        required; argparse exits with a usage error if one is missing.
    """
    parser = argparse.ArgumentParser(description="generate a word frequency table from a text")
    parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
    # Passing argv through lets callers and tests supply their own options.
    return parser.parse_args(argv)
11 13
12 14
def frequency(in_file, out_file):
    """Write a tab-separated word frequency table for a text file.

    The whole of in_file is read and split into tokens with
    nltk.word_tokenize. out_file receives a header row
    (Word, Count, Percent) followed by one row per distinct token: the
    token, how often it occurs, and that count as a percentage of all
    tokens, formatted with two decimal places.
    """
    with open(in_file, 'r') as source:
        raw_text = source.read()

    counts = FreqDist(nltk.word_tokenize(raw_text))
    # A float total keeps the percentage a true division even where
    # dividing two ints would truncate.
    token_total = float(counts.N())

    with open(out_file, 'w') as table:
        table.write("Word\tCount\tPercent\n")
        table.writelines(
            "{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=entry, pc=100 * entry[1] / token_total)
            for entry in counts.items()
        )
28 30
29 31
if __name__ == '__main__':
    # Script entry point: read --input/--output from the command line and
    # write the frequency table for that input.
    args = arguments()
    frequency(in_file=args.input, out_file=args.output)