comparison g_collocation.py @ 1:fb617586f4b2 draft

planemo upload commit a81826fe44f09a3710a35c183aa88b745aeec064-dirty
author stevecassidy
date Mon, 05 Dec 2016 05:22:05 -0500
parents e991d4e60c17
children a47980ef2b96
comparison
equal deleted inserted replaced
0:e991d4e60c17 1:fb617586f4b2
16 args = the_parser.parse_args() 16 args = the_parser.parse_args()
17 return args 17 return args
18 18
19 def collocation(inp, outp, freq_filter, results, coll_type, pos): 19 def collocation(inp, outp, freq_filter, results, coll_type, pos):
20 pos = bool(pos == 'true') 20 pos = bool(pos == 'true')
21 i = str(unicode(open(inp, 'r').read(), errors='ignore')) 21 with open(inp, 'r') as fd:
22 o = open(outp, 'w') 22 i = fd.read()
23
23 all_words = [] 24 all_words = []
24 if pos: 25 if pos:
25 text = i.split(' ')[:-1] 26 text = i.split(' ')[:-1]
26 all_words = [x[0:x.index('/')] if x != '\n' else x for x in text] 27 all_words = [x[0:x.index('/')] if x != '\n' else x for x in text]
27 all_words = [x.strip(' ').strip('\n') for x in all_words] 28 all_words = [x.strip(' ').strip('\n') for x in all_words]
35 else: 36 else:
36 measures = nltk.collocations.TrigramAssocMeasures() 37 measures = nltk.collocations.TrigramAssocMeasures()
37 finder = TrigramCollocationFinder.from_words(all_words) 38 finder = TrigramCollocationFinder.from_words(all_words)
38 finder.apply_freq_filter(int(freq_filter)) 39 finder.apply_freq_filter(int(freq_filter))
39 colls = finder.nbest(measures.pmi, int(results)) 40 colls = finder.nbest(measures.pmi, int(results))
40 for coll in colls: 41 with open(outp, 'w') as output:
41 o.write("%s\t%s" % coll) 42 for coll in colls:
42 o.write('\n') 43 output.write("%s\t%s" % coll)
43 o.close() 44 output.write('\n')
44 45
45 if __name__ == '__main__': 46 if __name__ == '__main__':
46 args = Parser() 47 args = Parser()
47 48
48 collocation(args.input, args.output, args.freq_filter, args.results, args.coll_type, args.pos) 49 collocation(args.input, args.output, args.freq_filter, args.results, args.coll_type, args.pos)