Mercurial > repos > stevecassidy > nltktools
comparison g_collocation.py @ 2:a47980ef2b96 draft
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author | stevecassidy |
---|---|
date | Wed, 01 Nov 2017 01:19:55 -0400 |
parents | fb617586f4b2 |
children |
comparison
equal
deleted
inserted
replaced
1:fb617586f4b2 | 2:a47980ef2b96 |
---|---|
1 import sys | |
2 import os | |
3 import nltk | 1 import nltk |
4 from nltk.collocations import * | 2 from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures |
3 from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures | |
5 import argparse | 4 import argparse |
5 | |
6 nltk.download('punkt', quiet=True) | |
7 | |
6 | 8 |
7 def Parser(): | 9 def Parser(): |
8 the_parser = argparse.ArgumentParser(description="Parse the sentence using Chart Parser and a supplied grammar") | 10 the_parser = argparse.ArgumentParser(description="Parse the sentence using Chart Parser and a supplied grammar") |
9 the_parser.add_argument('--input', required=True, action="store", type=str, help="input text file") | 11 the_parser.add_argument('--input', required=True, action="store", type=str, help="input text file") |
10 the_parser.add_argument('--output', required=True, action="store", type=str, help="output file path") | 12 the_parser.add_argument('--output', required=True, action="store", type=str, help="output file path") |
11 the_parser.add_argument('--freq_filter', required=True, action="store", type=str, help="The minimum number of required occurrences in the corpus") | 13 the_parser.add_argument('--freq_filter', required=True, action="store", type=str, help="The minimum number of required occurrences in the corpus") |
12 the_parser.add_argument('--results', required=True, action="store", type=str, help="The maximum number of collocations to show in the results") | 14 the_parser.add_argument('--results', required=True, action="store", type=str, help="The maximum number of collocations to show in the results") |
13 the_parser.add_argument('--coll_type', required=True, action="store", type=str, help="Type of collocations to find") | 15 the_parser.add_argument('--coll_type', required=True, action="store", type=str, help="Type of collocations to find") |
14 the_parser.add_argument('--pos', required=True, action="store", type=str, help="Data input is a set of POS tags") | 16 the_parser.add_argument('--pos', required=True, action="store", type=str, help="Data input is a set of POS tags") |
15 | 17 |
16 args = the_parser.parse_args() | 18 return the_parser.parse_args() |
17 return args | 19 |
18 | 20 |
19 def collocation(inp, outp, freq_filter, results, coll_type, pos): | 21 def collocation(inp, outp, freq_filter, results, coll_type, pos): |
20 pos = bool(pos == 'true') | 22 pos = bool(pos == 'true') |
21 with open(inp, 'r') as fd: | 23 with open(inp, 'r') as fd: |
22 i = fd.read() | 24 i = fd.read() |
29 else: | 31 else: |
30 sents = nltk.sent_tokenize(i) | 32 sents = nltk.sent_tokenize(i) |
31 for sent in sents: | 33 for sent in sents: |
32 all_words += nltk.word_tokenize(sent) | 34 all_words += nltk.word_tokenize(sent) |
33 if coll_type == 'bigram': | 35 if coll_type == 'bigram': |
34 measures = nltk.collocations.BigramAssocMeasures() | 36 measures = BigramAssocMeasures() |
35 finder = BigramCollocationFinder.from_words(all_words) | 37 finder = BigramCollocationFinder.from_words(all_words) |
36 else: | 38 else: |
37 measures = nltk.collocations.TrigramAssocMeasures() | 39 measures = TrigramAssocMeasures() |
38 finder = TrigramCollocationFinder.from_words(all_words) | 40 finder = TrigramCollocationFinder.from_words(all_words) |
39 finder.apply_freq_filter(int(freq_filter)) | 41 finder.apply_freq_filter(int(freq_filter)) |
40 colls = finder.nbest(measures.pmi, int(results)) | 42 # score the ngrams and get the first N |
41 with open(outp, 'w') as output: | 43 colls = finder.score_ngrams(measures.pmi)[:int(results)] |
44 with open(outp, 'w') as output: | |
42 for coll in colls: | 45 for coll in colls: |
43 output.write("%s\t%s" % coll) | 46 (a, b), score = coll |
44 output.write('\n') | 47 output.write("%s\t%s\n" % (a, b)) |
48 | |
45 | 49 |
46 if __name__ == '__main__': | 50 if __name__ == '__main__': |
47 args = Parser() | 51 args = Parser() |
48 | 52 |
49 collocation(args.input, args.output, args.freq_filter, args.results, args.coll_type, args.pos) | 53 collocation(args.input, args.output, args.freq_filter, args.results, args.coll_type, args.pos) |