view g_frequency.py @ 1:fb617586f4b2 draft

planemo upload commit a81826fe44f09a3710a35c183aa88b745aeec064-dirty
author stevecassidy
date Mon, 05 Dec 2016 05:22:05 -0500
parents e991d4e60c17
children a47980ef2b96
line wrap: on
line source

import nltk
from nltk import FreqDist
import argparse

def arguments():
    """Parse the command line and return the resulting namespace.

    Two options are registered, ``--input`` and ``--output``; both are
    required and are stored as strings.
    """
    cli = argparse.ArgumentParser(
        description="generate a word frequency table from a text")
    # Both options share every setting except their flag and help text.
    option_table = (
        ('--input', "input text file"),
        ('--output', "output file path"),
    )
    for flag, help_text in option_table:
        cli.add_argument(flag, required=True, action="store", type=str,
                         help=help_text)
    return cli.parse_args()


def frequency(in_file, out_file):
    """Write a tab-separated word frequency table for a text file.

    The contents of ``in_file`` are tokenized with ``nltk.word_tokenize``
    and the tokens are counted with ``FreqDist``.  ``out_file`` receives a
    ``Word``/``Count``/``Percent`` header row followed by one row per
    distinct token: the token, its count, and its count as a percentage of
    all tokens formatted with two decimal places.
    """
    with open(in_file, 'r') as source:
        raw_text = source.read()

    counts = FreqDist(nltk.word_tokenize(raw_text))
    # A float denominator makes ``/`` a true division on any Python version.
    token_total = float(counts.N())
    row_template = "{pair[0]}\t{pair[1]}\t{pc:.2f}\n"

    with open(out_file, 'w') as sink:
        sink.write("Word\tCount\tPercent\n")
        # Each entry is a (token, count) pair taken from the distribution.
        sink.writelines(
            row_template.format(pair=entry, pc=100 * entry[1] / token_total)
            for entry in counts.items()
        )


if __name__ == '__main__':
    # Script entry point: read the paths from the command line, then build
    # the frequency table.
    cli_args = arguments()
    frequency(in_file=cli_args.input, out_file=cli_args.output)