Mercurial > repos > stevecassidy > nltktools
diff g_collocation.py @ 2:a47980ef2b96 draft
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author | stevecassidy |
---|---|
date | Wed, 01 Nov 2017 01:19:55 -0400 |
parents | fb617586f4b2 |
children |
line wrap: on
line diff
--- a/g_collocation.py Mon Dec 05 05:22:05 2016 -0500
+++ b/g_collocation.py Wed Nov 01 01:19:55 2017 -0400
@@ -1,9 +1,11 @@
-import sys
-import os
 import nltk
-from nltk.collocations import *
+from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
+from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures
 import argparse
 
+nltk.download('punkt', quiet=True)
+
+
 def Parser():
     the_parser = argparse.ArgumentParser(description="Parse the sentence using Chart Parser and a supplied grammar")
     the_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
@@ -13,8 +15,8 @@
     the_parser.add_argument('--coll_type', required=True, action="store", type=str, help="Type of collocations to find")
     the_parser.add_argument('--pos', required=True, action="store", type=str, help="Data input is a set of POS tags")
 
-    args = the_parser.parse_args()
-    return args
+    return the_parser.parse_args()
+
 
 def collocation(inp, outp, freq_filter, results, coll_type, pos):
     pos = bool(pos == 'true')
@@ -31,17 +33,19 @@
         for sent in sents:
             all_words += nltk.word_tokenize(sent)
     if coll_type == 'bigram':
-        measures = nltk.collocations.BigramAssocMeasures()
+        measures = BigramAssocMeasures()
         finder = BigramCollocationFinder.from_words(all_words)
     else:
-        measures = nltk.collocations.TrigramAssocMeasures()
+        measures = TrigramAssocMeasures()
         finder = TrigramCollocationFinder.from_words(all_words)
     finder.apply_freq_filter(int(freq_filter))
-    colls = finder.nbest(measures.pmi, int(results))
-    with open(outp, 'w') as output:
+    # score the ngrams and get the first N
+    colls = finder.score_ngrams(measures.pmi)[:int(results)]
+    with open(outp, 'w') as output:
         for coll in colls:
-            output.write("%s\t%s" % coll)
-            output.write('\n')
+            (a, b), score = coll
+            output.write("%s\t%s\n" % (a, b))
+
 
 if __name__ == '__main__':
     args = Parser()