Mercurial > repos > stevecassidy > nltktools
comparison g_frequency.py @ 2:a47980ef2b96 draft
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author | stevecassidy |
---|---|
date | Wed, 01 Nov 2017 01:19:55 -0400 |
parents | fb617586f4b2 |
children | 0df72a8ab095 |
comparison
equal
deleted
inserted
replaced
1:fb617586f4b2 | 2:a47980ef2b96 |
---|---|
1 import nltk | 1 import nltk |
2 from nltk import FreqDist | 2 from nltk import FreqDist |
3 import argparse | 3 import argparse |
4 | 4 |
5 nltk.download('punkt', quiet=True) | |
6 | |
7 | |
5 def arguments(): | 8 def arguments(): |
6 parser = argparse.ArgumentParser(description="generate a word frequency table from a text") | 9 parser = argparse.ArgumentParser(description="generate a word frequency table from a text") |
7 parser.add_argument('--input', required=True, action="store", type=str, help="input text file") | 10 parser.add_argument('--input', required=True, action="store", type=str, help="input text file") |
8 parser.add_argument('--output', required=True, action="store", type=str, help="output file path") | 11 parser.add_argument('--output', required=True, action="store", type=str, help="output file path") |
9 args = parser.parse_args() | 12 return parser.parse_args() |
10 return args | |
11 | 13 |
12 | 14 |
13 def frequency(in_file, out_file): | 15 def frequency(in_file, out_file): |
14 """Input: a text file | 16 """Input: a text file |
15 Output: a table of word frequency with three columns for Word, Count and Percent frequency | 17 Output: a table of word frequency with three columns for Word, Count and Percent frequency |
16 """ | 18 """ |
17 with open(in_file, 'r') as fd: | 19 with open(in_file, 'r') as fd: |
18 text = fd.read() | 20 text = fd.read() |
19 | 21 |
20 words = nltk.word_tokenize(text) | 22 words = nltk.word_tokenize(text) |
21 frequency = FreqDist(words) | 23 fdist = FreqDist(words) |
22 total = float(frequency.N()) | 24 total = float(fdist.N()) |
23 | 25 |
24 with open(out_file, 'w') as output: | 26 with open(out_file, 'w') as output: |
25 output.write("Word\tCount\tPercent\n") | 27 output.write("Word\tCount\tPercent\n") |
26 for pair in frequency.items(): | 28 for pair in fdist.items(): |
27 output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100*pair[1]/total)) | 29 output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100 * pair[1] / total)) |
28 | 30 |
29 | 31 |
30 if __name__ == '__main__': | 32 if __name__ == '__main__': |
31 args = arguments() | 33 args = arguments() |
32 frequency(args.input, args.output) | 34 frequency(args.input, args.output) |