Mercurial > repos > stevecassidy > nltktools
comparison g_collocation.py @ 0:e991d4e60c17 draft
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
author | stevecassidy |
---|---|
date | Wed, 12 Oct 2016 22:17:53 -0400 |
parents | |
children | fb617586f4b2 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e991d4e60c17 |
---|---|
1 import sys | |
2 import os | |
3 import nltk | |
4 from nltk.collocations import * | |
5 import argparse | |
6 | |
def Parser():
    """Parse the command-line options of the collocation tool.

    All six options are required. ``--freq_filter`` and ``--results`` are
    converted to integers by argparse, so a non-numeric value is reported
    as a usage error instead of failing later during processing.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``, with the
        attributes ``input``, ``output``, ``freq_filter``, ``results``,
        ``coll_type`` and ``pos``.
    """
    the_parser = argparse.ArgumentParser(
        description="Find bigram or trigram collocations in a text file")
    the_parser.add_argument('--input', required=True, action="store", type=str,
                            help="input text file")
    the_parser.add_argument('--output', required=True, action="store", type=str,
                            help="output file path")
    the_parser.add_argument('--freq_filter', required=True, action="store", type=int,
                            help="The minimum number of required occurrences in the corpus")
    the_parser.add_argument('--results', required=True, action="store", type=int,
                            help="The maximum number of collocations to show in the results")
    the_parser.add_argument('--coll_type', required=True, action="store", type=str,
                            help="Type of collocations to find")
    the_parser.add_argument('--pos', required=True, action="store", type=str,
                            help="Data input is a set of POS tags")

    return the_parser.parse_args()
18 | |
def collocation(inp, outp, freq_filter, results, coll_type, pos):
    """Write the best PMI-ranked collocations found in a text file.

    Args:
        inp: Path of the input text file.
        outp: Path of the output file. One collocation is written per
            line, with its words separated by tabs.
        freq_filter: Minimum number of occurrences an n-gram needs in
            order to be kept; passed through ``int()``.
        results: Maximum number of collocations to write; passed through
            ``int()``.
        coll_type: ``'bigram'`` selects bigrams; any other value selects
            trigrams.
        pos: The string ``'true'`` when the input consists of
            whitespace-separated ``word/TAG`` tokens; any other value
            means plain text, which is sentence- and word-tokenized
            with NLTK.
    """
    # Function-scope import so this block does not rely on a new
    # file-level import.
    import io

    tagged_input = (pos == 'true')

    # Decode as ASCII and drop undecodable bytes. io.open accepts the
    # encoding/errors arguments on both Python 2 and Python 3, and the
    # with-statement closes the handle.
    with io.open(inp, 'r', encoding='ascii', errors='ignore') as source:
        text = source.read()

    all_words = []
    if tagged_input:
        for token in text.split():
            # Split at the LAST slash so a word that itself contains '/'
            # is kept whole; a token with no slash is used unchanged
            # rather than raising.
            word, slash, _tag = token.rpartition('/')
            all_words.append(word if slash else token)
    else:
        for sent in nltk.sent_tokenize(text):
            all_words.extend(nltk.word_tokenize(sent))

    if coll_type == 'bigram':
        measures = nltk.collocations.BigramAssocMeasures()
        finder = nltk.collocations.BigramCollocationFinder.from_words(all_words)
    else:
        measures = nltk.collocations.TrigramAssocMeasures()
        finder = nltk.collocations.TrigramCollocationFinder.from_words(all_words)

    finder.apply_freq_filter(int(freq_filter))
    best = finder.nbest(measures.pmi, int(results))

    with open(outp, 'w') as out:
        for ngram in best:
            # join() handles both 2-word and 3-word tuples; a fixed
            # two-placeholder format string fails on trigrams.
            out.write('\t'.join(ngram))
            out.write('\n')
44 | |
if __name__ == '__main__':
    # Script entry point: read the command-line options and hand them to
    # the collocation search.
    options = Parser()
    collocation(
        options.input,
        options.output,
        options.freq_filter,
        options.results,
        options.coll_type,
        options.pos,
    )