Mercurial > repos > stevecassidy > nltktools
comparison g_collocation.py @ 1:fb617586f4b2 draft
planemo upload commit a81826fe44f09a3710a35c183aa88b745aeec064-dirty
author | stevecassidy |
---|---|
date | Mon, 05 Dec 2016 05:22:05 -0500 |
parents | e991d4e60c17 |
children | a47980ef2b96 |
comparison
equal
deleted
inserted
replaced
0:e991d4e60c17 | 1:fb617586f4b2 |
---|---|
16 args = the_parser.parse_args() | 16 args = the_parser.parse_args() |
17 return args | 17 return args |
18 | 18 |
19 def collocation(inp, outp, freq_filter, results, coll_type, pos): | 19 def collocation(inp, outp, freq_filter, results, coll_type, pos): |
20 pos = bool(pos == 'true') | 20 pos = bool(pos == 'true') |
21 i = str(unicode(open(inp, 'r').read(), errors='ignore')) | 21 with open(inp, 'r') as fd: |
22 o = open(outp, 'w') | 22 i = fd.read() |
23 | |
23 all_words = [] | 24 all_words = [] |
24 if pos: | 25 if pos: |
25 text = i.split(' ')[:-1] | 26 text = i.split(' ')[:-1] |
26 all_words = [x[0:x.index('/')] if x != '\n' else x for x in text] | 27 all_words = [x[0:x.index('/')] if x != '\n' else x for x in text] |
27 all_words = [x.strip(' ').strip('\n') for x in all_words] | 28 all_words = [x.strip(' ').strip('\n') for x in all_words] |
35 else: | 36 else: |
36 measures = nltk.collocations.TrigramAssocMeasures() | 37 measures = nltk.collocations.TrigramAssocMeasures() |
37 finder = TrigramCollocationFinder.from_words(all_words) | 38 finder = TrigramCollocationFinder.from_words(all_words) |
38 finder.apply_freq_filter(int(freq_filter)) | 39 finder.apply_freq_filter(int(freq_filter)) |
39 colls = finder.nbest(measures.pmi, int(results)) | 40 colls = finder.nbest(measures.pmi, int(results)) |
40 for coll in colls: | 41 with open(outp, 'w') as output: |
41 o.write("%s\t%s" % coll) | 42 for coll in colls: |
42 o.write('\n') | 43 output.write("%s\t%s" % coll) |
43 o.close() | 44 output.write('\n') |
44 | 45 |
45 if __name__ == '__main__': | 46 if __name__ == '__main__': |
46 args = Parser() | 47 args = Parser() |
47 | 48 |
48 collocation(args.input, args.output, args.freq_filter, args.results, args.coll_type, args.pos) | 49 collocation(args.input, args.output, args.freq_filter, args.results, args.coll_type, args.pos) |