diff g_chart_parser.py @ 0:e991d4e60c17 draft

planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
author stevecassidy
date Wed, 12 Oct 2016 22:17:53 -0400
parents
children fb617586f4b2
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_chart_parser.py	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,68 @@
+import sys
+import nltk
+import argparse
+from nltk.corpus import PlaintextCorpusReader
+
+
+def arguments():
+    """Parse the command-line options for the chart parser tool.
+
+    Returns:
+        argparse.Namespace: has ``input``, ``grammar`` and ``output``
+        attributes, each a required file path given as a string.
+    """
+    parser = argparse.ArgumentParser(description="run a chart parser on a text")
+    parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
+    parser.add_argument('--grammar', required=True, action="store", type=str, help="grammar file")
+    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
+    return parser.parse_args()
+
+
+def _read_ascii(path):
+    """Return the contents of *path* as text, dropping non-ASCII bytes."""
+    # Decode explicitly so the lossy ASCII handling of the inputs is visible,
+    # and use ``with`` so the file handle is closed as soon as it is read.
+    with open(path, 'rb') as handle:
+        return handle.read().decode('ascii', 'ignore')
+
+
+def chart_parse(in_file, grammar_file, out_file):
+    """Chart-parse each sentence of a text and write the parse trees to a file.
+
+    Args:
+        in_file: path of the text file to parse.
+        grammar_file: path of the file holding the grammar.
+        out_file: path the parse trees are written to, one per sentence.
+
+    Returns:
+        None. On any parsing failure an explanatory message is written to
+        stderr and the process exits with status 1 (``SystemExit``).
+    """
+    text = _read_ascii(in_file)
+    grammar_string = _read_ascii(grammar_file)
+    # The ``with`` guarantees the output is flushed and closed even when the
+    # error path below exits the process.
+    with open(out_file, 'w') as output:
+        try:
+            # NOTE(review): parse_cfg and a string-returning Tree.pprint() look
+            # like the NLTK 2 API - confirm against the installed NLTK version.
+            grammar = nltk.parse_cfg(grammar_string)
+            parser = nltk.ChartParser(grammar)
+            for sentence in nltk.sent_tokenize(text):
+                words = nltk.word_tokenize(sentence)
+                tree = parser.parse(words)
+                if tree is None:
+                    # Presumably the parser signals "no parse" with None; name
+                    # the sentence instead of failing on None.pprint().
+                    raise ValueError("no parse found for sentence: " + sentence)
+                output.write(tree.pprint())
+                output.write('\n')
+        except Exception as e:
+            message = "Error with parsing. Check the input files are correct and the grammar contains every word in the input sequence. \n----\n" + str(e)
+            sys.stderr.write(message)
+            # Non-zero status so the caller can detect the failure.
+            sys.exit(1)
+
+
+if __name__ == '__main__':
+    args = arguments()
+    chart_parse(args.input, args.grammar, args.output)