comparison g_collocation.py @ 2:a47980ef2b96 draft

planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author stevecassidy
date Wed, 01 Nov 2017 01:19:55 -0400
parents fb617586f4b2
children
comparison
equal deleted inserted replaced
1:fb617586f4b2 2:a47980ef2b96
1 import sys
2 import os
3 import nltk 1 import nltk
4 from nltk.collocations import * 2 from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
3 from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures
5 import argparse 4 import argparse
5
6 nltk.download('punkt', quiet=True)
7
6 8
7 def Parser(): 9 def Parser():
8 the_parser = argparse.ArgumentParser(description="Parse the sentence using Chart Parser and a supplied grammar") 10 the_parser = argparse.ArgumentParser(description="Parse the sentence using Chart Parser and a supplied grammar")
9 the_parser.add_argument('--input', required=True, action="store", type=str, help="input text file") 11 the_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
10 the_parser.add_argument('--output', required=True, action="store", type=str, help="output file path") 12 the_parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
11 the_parser.add_argument('--freq_filter', required=True, action="store", type=str, help="The minimum number of required occurrences in the corpus") 13 the_parser.add_argument('--freq_filter', required=True, action="store", type=str, help="The minimum number of required occurrences in the corpus")
12 the_parser.add_argument('--results', required=True, action="store", type=str, help="The maximum number of collocations to show in the results") 14 the_parser.add_argument('--results', required=True, action="store", type=str, help="The maximum number of collocations to show in the results")
13 the_parser.add_argument('--coll_type', required=True, action="store", type=str, help="Type of collocations to find") 15 the_parser.add_argument('--coll_type', required=True, action="store", type=str, help="Type of collocations to find")
14 the_parser.add_argument('--pos', required=True, action="store", type=str, help="Data input is a set of POS tags") 16 the_parser.add_argument('--pos', required=True, action="store", type=str, help="Data input is a set of POS tags")
15 17
16 args = the_parser.parse_args() 18 return the_parser.parse_args()
17 return args 19
18 20
19 def collocation(inp, outp, freq_filter, results, coll_type, pos): 21 def collocation(inp, outp, freq_filter, results, coll_type, pos):
20 pos = bool(pos == 'true') 22 pos = bool(pos == 'true')
21 with open(inp, 'r') as fd: 23 with open(inp, 'r') as fd:
22 i = fd.read() 24 i = fd.read()
29 else: 31 else:
30 sents = nltk.sent_tokenize(i) 32 sents = nltk.sent_tokenize(i)
31 for sent in sents: 33 for sent in sents:
32 all_words += nltk.word_tokenize(sent) 34 all_words += nltk.word_tokenize(sent)
33 if coll_type == 'bigram': 35 if coll_type == 'bigram':
34 measures = nltk.collocations.BigramAssocMeasures() 36 measures = BigramAssocMeasures()
35 finder = BigramCollocationFinder.from_words(all_words) 37 finder = BigramCollocationFinder.from_words(all_words)
36 else: 38 else:
37 measures = nltk.collocations.TrigramAssocMeasures() 39 measures = TrigramAssocMeasures()
38 finder = TrigramCollocationFinder.from_words(all_words) 40 finder = TrigramCollocationFinder.from_words(all_words)
39 finder.apply_freq_filter(int(freq_filter)) 41 finder.apply_freq_filter(int(freq_filter))
40 colls = finder.nbest(measures.pmi, int(results)) 42 # score the ngrams and get the first N
41 with open(outp, 'w') as output: 43 colls = finder.score_ngrams(measures.pmi)[:int(results)]
44 with open(outp, 'w') as output:
42 for coll in colls: 45 for coll in colls:
43 output.write("%s\t%s" % coll) 46 (a, b), score = coll
44 output.write('\n') 47 output.write("%s\t%s\n" % (a, b))
48
45 49
46 if __name__ == '__main__': 50 if __name__ == '__main__':
47 args = Parser() 51 args = Parser()
48 52
49 collocation(args.input, args.output, args.freq_filter, args.results, args.coll_type, args.pos) 53 collocation(args.input, args.output, args.freq_filter, args.results, args.coll_type, args.pos)