view g_frequency.py @ 2:a47980ef2b96 draft

planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author stevecassidy
date Wed, 01 Nov 2017 01:19:55 -0400
parents fb617586f4b2
children 0df72a8ab095
line wrap: on
line source

import nltk
from nltk import FreqDist
import argparse

# Fetch the 'punkt' NLTK resource when this module is loaded; quiet=True is
# passed to suppress the downloader's console output.
# NOTE(review): presumably this is the tokenizer data nltk.word_tokenize()
# relies on in frequency(); newer NLTK releases may expect 'punkt_tab'
# instead - confirm against the NLTK version this tool is deployed with.
nltk.download('punkt', quiet=True)


def arguments():
    """Parse the command line of the word-frequency tool.

    Two string options are declared, both mandatory:

    --input   input text file
    --output  output file path

    Returns the namespace produced by ``parse_args()``, exposing the values
    as ``.input`` and ``.output``.
    """
    cli = argparse.ArgumentParser(description="generate a word frequency table from a text")
    # Both options share the same configuration and differ only in flag
    # name and help text, so declare them from a small table.
    option_specs = (
        ('--input', "input text file"),
        ('--output', "output file path"),
    )
    for flag, help_text in option_specs:
        cli.add_argument(flag, required=True, action="store", type=str, help=help_text)
    return cli.parse_args()


def frequency(in_file, out_file):
    """Write a tab-separated word frequency table for a text file.

    in_file: path of the text file to read and tokenize
    out_file: path of the table to write. It receives a header line followed
        by one row per distinct token with the columns Word, Count and
        Percent, where Percent is the token's count as a share of the total
        token count, formatted to two decimal places.
    """
    with open(in_file, 'r') as source:
        contents = source.read()

    counts = FreqDist(nltk.word_tokenize(contents))
    # Convert the total to float up front so the percentage below is always
    # computed with floating-point division.
    token_total = float(counts.N())
    row_template = "{pair[0]}\t{pair[1]}\t{pc:.2f}\n"

    with open(out_file, 'w') as table:
        table.write("Word\tCount\tPercent\n")
        # Each entry is a (token, count) pair from the frequency distribution.
        table.writelines(
            row_template.format(pair=entry, pc=100 * entry[1] / token_total)
            for entry in counts.items()
        )

if __name__ == '__main__':
    # Script entry point: read the two paths from the command line, then
    # generate the frequency table.
    options = arguments()
    frequency(in_file=options.input, out_file=options.output)