diff g_read_sents.py @ 0:e991d4e60c17 draft
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
| author | stevecassidy |
|---|---|
| date | Wed, 12 Oct 2016 22:17:53 -0400 |
| parents | |
| children | fb617586f4b2 |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_read_sents.py	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,48 @@
+import sys
+import os
+import nltk
+from nltk.corpus import PlaintextCorpusReader
+import argparse
+
+def Parser():
+    the_parser = argparse.ArgumentParser(description="Segments the text input into separate sentences")
+    the_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
+    the_parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
+
+    args = the_parser.parse_args()
+    return args
+
+def print_out(outp, text, sentences):
+    o = open(outp, 'w')
+    curr = 0
+    for sent in sentences:
+        times = count_occurences(sent, sent[-1])
+        curr = text.find(sent[0], curr)
+        end = find_nth(text, sent[-1], times, curr) + len(sent[-1])
+        o.write(text[curr:end] + '\n')
+        curr = end
+    o.close()
+
+def find_nth(string, sub, n, offset):
+    start = string.find(sub, offset)
+    while start >= 0 and n > 1:
+        start = string.find(sub, start + len(sub))
+        n -= 1
+    return start
+
+def count_occurences(lst, string):
+    count = 0
+    for item in lst:
+        if string in item:
+            count += 1
+    return count
+
+def read_sents(inp, outp):
+    i = open(inp, 'r').read()
+    corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
+    sents = corpus.sents()
+    print_out(outp, i, sents)
+
+if __name__ == '__main__':
+    args = Parser()
+    read_sents(args.input, args.output)
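For orientation: the committed script is a command-line tool (invoked as, e.g., `python g_read_sents.py --input some.txt --output sents.txt`, with placeholder file names) that segments a plain-text file into sentences using NLTK's `PlaintextCorpusReader` and writes one sentence per line. Because `corpus.sents()` returns each sentence as a list of tokens rather than a raw string, the helpers `print_out`, `find_nth` and `count_occurences` re-locate each sentence in the original text so the output keeps the source spelling and punctuation. A minimal sketch of the same idea using `nltk.sent_tokenize` directly is shown below; it is not the committed code, the file names are placeholders, and the Punkt tokenizer data is assumed to be installed:

```python
import nltk

# nltk.download('punkt')  # uncomment if the Punkt sentence model is not yet installed

def sentences_per_line(input_path, output_path):
    """Sketch: write each sentence of the input file on its own line."""
    with open(input_path, 'r') as f:
        text = f.read()
    # sent_tokenize returns sentences as raw strings, so no re-alignment
    # against the source text is needed as in the committed script.
    sents = nltk.sent_tokenize(text)
    with open(output_path, 'w') as out:
        for sent in sents:
            # collapse internal line breaks so each output line is one sentence
            out.write(sent.replace('\n', ' ') + '\n')

if __name__ == '__main__':
    sentences_per_line('input.txt', 'output.txt')  # placeholder file names
```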