Mercurial > repos > stevecassidy > nltktools

diff g_read_sents.py @ 2:a47980ef2b96 draft
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author: stevecassidy
date: Wed, 01 Nov 2017 01:19:55 -0400
parents: fb617586f4b2
--- a/g_read_sents.py	Mon Dec 05 05:22:05 2016 -0500
+++ b/g_read_sents.py	Wed Nov 01 01:19:55 2017 -0400
@@ -1,9 +1,12 @@
-import sys
+
 import os
 import nltk
 from nltk.corpus import PlaintextCorpusReader
 import argparse
 
+nltk.download('punkt', quiet=True)
+
+
 def Parser():
     the_parser = argparse.ArgumentParser(description="Segments the text input into separate sentences")
     the_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
@@ -12,15 +15,15 @@
     args = the_parser.parse_args()
     return args
 
-def print_out(outp, text, sentences):
+
+def print_out(outp, sentences):
     with open(outp, 'w') as output:
-        curr = 0
         for sent in sentences:
-            times = count_occurences(sent, sent[-1])
-            curr = text.find(sent[0], curr)
-            end = find_nth(text, sent[-1], times, curr) + len(sent[-1])
-            output.write(text[curr:end] + '\n')
-            curr = end
+            for tok in sent:
+                output.write(tok)
+                output.write(' ')
+            output.write('\n')
+
 
 def find_nth(string, sub, n, offset):
     start = string.find(sub, offset)
@@ -29,6 +32,7 @@
         n -= 1
     return start
 
+
 def count_occurences(lst, string):
     count = 0
     for item in lst:
@@ -36,12 +40,13 @@
             count += 1
     return count
 
+
 def read_sents(inp, outp):
-    with open(inp, 'r') as fd:
-        i = fd.read()
+
     corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
     sents = corpus.sents()
-    print_out(outp, i, sents)
+    print_out(outp, sents)
+
 
 if __name__ == '__main__':
     args = Parser()
author	stevecassidy
date	Wed, 01 Nov 2017 01:19:55 -0400
parents	fb617586f4b2
children