diff g_collocation.py @ 2:a47980ef2b96 draft

planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author stevecassidy
date Wed, 01 Nov 2017 01:19:55 -0400
parents fb617586f4b2
children
line wrap: on
line diff
--- a/g_collocation.py	Mon Dec 05 05:22:05 2016 -0500
+++ b/g_collocation.py	Wed Nov 01 01:19:55 2017 -0400
@@ -1,9 +1,11 @@
-import sys
-import os
 import nltk
-from nltk.collocations import *
+from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
+from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures
 import argparse
 
+nltk.download('punkt', quiet=True)
+
+
 def Parser():
     the_parser = argparse.ArgumentParser(description="Parse the sentence using Chart Parser and a supplied grammar")
     the_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
@@ -13,8 +15,8 @@
     the_parser.add_argument('--coll_type', required=True, action="store", type=str, help="Type of collocations to find")
     the_parser.add_argument('--pos', required=True, action="store", type=str, help="Data input is a set of POS tags")
 
-    args = the_parser.parse_args()
-    return args
+    return the_parser.parse_args()
+
 
 def collocation(inp, outp, freq_filter, results, coll_type, pos):
     pos = bool(pos == 'true')
@@ -31,17 +33,19 @@
         for sent in sents:
             all_words += nltk.word_tokenize(sent)
     if coll_type == 'bigram':
-        measures = nltk.collocations.BigramAssocMeasures()
+        measures = BigramAssocMeasures()
         finder = BigramCollocationFinder.from_words(all_words)
     else:
-        measures = nltk.collocations.TrigramAssocMeasures()
+        measures = TrigramAssocMeasures()
         finder = TrigramCollocationFinder.from_words(all_words)
     finder.apply_freq_filter(int(freq_filter))
-    colls = finder.nbest(measures.pmi, int(results))
-    with  open(outp, 'w') as output:
+    # score the ngrams and get the first N
+    colls = finder.score_ngrams(measures.pmi)[:int(results)]
+    with open(outp, 'w') as output:
         for coll in colls:
-            output.write("%s\t%s" % coll)
-            output.write('\n')
+            (a, b), score = coll
+            output.write("%s\t%s\n" % (a, b))
+
 
 if __name__ == '__main__':
     args = Parser()