Mercurial > repos > stevecassidy > nltktools
view g_frequency.py @ 1:fb617586f4b2 draft
planemo upload commit a81826fe44f09a3710a35c183aa88b745aeec064-dirty
author | stevecassidy |
---|---|
date | Mon, 05 Dec 2016 05:22:05 -0500 |
parents | e991d4e60c17 |
children | a47980ef2b96 |
line wrap: on
line source
"""Generate a word frequency table from a text file using NLTK."""

import argparse
import io

import nltk
from nltk import FreqDist


def arguments():
    """Parse the command-line options of the script.

    Returns:
        The parsed ``argparse`` namespace with two required string
        attributes: ``input`` (path of the text file to read) and
        ``output`` (path of the table file to write).
    """
    parser = argparse.ArgumentParser(description="generate a word frequency table from a text")
    parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
    return parser.parse_args()


def frequency(in_file, out_file):
    """Write a tab-separated word frequency table for a text file.

    The text in ``in_file`` is tokenized with ``nltk.word_tokenize`` and the
    tokens are counted with ``FreqDist``.  ``out_file`` receives a header
    line followed by one row per distinct token with three columns: Word,
    Count and Percent (the token's share of all tokens, two decimals).

    Args:
        in_file: path of the text file to read (decoded as UTF-8).
        out_file: path of the table to write (encoded as UTF-8); an
            existing file is overwritten.
    """
    # Explicit UTF-8 keeps the result independent of the platform locale.
    # io.open (rather than the builtin) accepts ``encoding`` on Python 2 too.
    with io.open(in_file, 'r', encoding='utf-8') as fd:
        text = fd.read()

    words = nltk.word_tokenize(text)
    freq_dist = FreqDist(words)
    # float() forces true division below even under Python 2.
    total = float(freq_dist.N())

    with io.open(out_file, 'w', encoding='utf-8') as output:
        # Unicode literals: io.open text streams reject byte strings on
        # Python 2; on Python 3 the prefix is a no-op.
        output.write(u"Word\tCount\tPercent\n")
        # An empty input yields no rows, so the division is never reached
        # with total == 0.
        for word, count in freq_dist.items():
            output.write(u"{0}\t{1}\t{2:.2f}\n".format(word, count, 100 * count / total))


if __name__ == '__main__':
    args = arguments()
    frequency(args.input, args.output)