Mercurial > repos > stevecassidy > nltktools
comparison g_frequency.py @ 1:fb617586f4b2 draft
planemo upload commit a81826fe44f09a3710a35c183aa88b745aeec064-dirty
author | stevecassidy |
---|---|
date | Mon, 05 Dec 2016 05:22:05 -0500 |
parents | e991d4e60c17 |
children | a47980ef2b96 |
comparison
equal
deleted
inserted
replaced
0:e991d4e60c17 | 1:fb617586f4b2 |
---|---|
12 | 12 |
13 def frequency(in_file, out_file): | 13 def frequency(in_file, out_file): |
14 """Input: a text file | 14 """Input: a text file |
15 Output: a table of word frequency with three columns for Word, Count and Percent frequency | 15 Output: a table of word frequency with three columns for Word, Count and Percent frequency |
16 """ | 16 """ |
17 text = unicode(open(in_file, 'r').read(), errors='ignore') | 17 with open(in_file, 'r') as fd: |
| | 18 text = fd.read() |
| | 19 |
18 words = nltk.word_tokenize(text) | 20 words = nltk.word_tokenize(text) |
19 frequency = FreqDist(words) | 21 frequency = FreqDist(words) |
20 total = float(frequency.N()) | 22 total = float(frequency.N()) |
21 output = open(out_file, 'w') | 23 |
22 output.write("Word\tCount\tPercent\n") | 24 with open(out_file, 'w') as output: |
23 for pair in frequency.items(): | 25 output.write("Word\tCount\tPercent\n") |
24 output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100*pair[1]/total)) | 26 for pair in frequency.items(): |
25 output.close() | 27 output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100*pair[1]/total)) |
26 | 28 |
27 | 29 |
28 if __name__ == '__main__': | 30 if __name__ == '__main__': |
29 args = arguments() | 31 args = arguments() |
30 frequency(args.input, args.output) | 32 frequency(args.input, args.output) |