nltktools: g_read_sents.py comparison

comparison g_read_sents.py @ 2:a47980ef2b96 draft

planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty

author	stevecassidy
date	Wed, 01 Nov 2017 01:19:55 -0400
parents	fb617586f4b2
children

comparison

equal deleted inserted replaced

-:fb617586f4b2
+:a47980ef2b96
-import sys
 import os
 import nltk
 from nltk.corpus import PlaintextCorpusReader
 import argparse
+nltk.download('punkt', quiet=True)
 def Parser():
     """Parse the command-line flags of the sentence segmenter.

     Two string flags are declared, ``--input`` and ``--output``, and both
     are required.

     Returns:
         The namespace produced by ``parse_args()``, exposing the values as
         ``input`` and ``output``.
     """
     cli = argparse.ArgumentParser(
         description="Segments the text input into separate sentences")
     # The two flags differ only in their name and help text.
     flag_help = (
         ('--input', "input text file"),
         ('--output', "output file path"),
     )
     for flag, help_text in flag_help:
         cli.add_argument(flag, required=True, action="store", type=str, help=help_text)
     return cli.parse_args()
-def print_out(outp, text, sentences):
+def print_out(outp, sentences):
 with open(outp, 'w') as output:
-curr = 0
 for sent in sentences:
-times = count_occurences(sent, sent[-1])
+for tok in sent:
-curr = text.find(sent[0], curr)
+output.write(tok)
-end = find_nth(text, sent[-1], times, curr) + len(sent[-1])
+output.write(' ')
-output.write(text[curr:end] + '\n')
+output.write('\n')
-curr = end
 def find_nth(string, sub, n, offset):
     """Return the index of the n-th match of ``sub`` in ``string``.

     The first match is searched for from ``offset``; every further match is
     searched for starting just past the end of the previous one.

     Returns:
         The index of the n-th match, or -1 when fewer than ``n`` matches
         exist. An ``n`` below 1 gives the same result as ``n == 1``.
     """
     step = len(sub)
     hit = string.find(sub, offset)
     remaining = n - 1  # matches still to skip after the first one
     while remaining > 0 and hit >= 0:
         hit = string.find(sub, hit + step)
         remaining -= 1
     return hit
 def count_occurences(lst, string):
     """Count the items of ``lst`` that contain ``string``.

     An item adds one to the total however many times ``string`` appears
     inside it.
     """
     return sum(1 for item in lst if string in item)
 def read_sents(inp, outp):
-with open(inp, 'r') as fd:
-i = fd.read()
 corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
 sents = corpus.sents()
-print_out(outp, i, sents)
+print_out(outp, sents)
 if __name__ == '__main__':
     # Runs only when the file is executed as a script: read the CLI flags
     # and hand the two paths to the segmenter.
     args = Parser()
     read_sents(inp=args.input, outp=args.output)

Mercurial > repos > stevecassidy > nltktools

comparison g_read_sents.py @ 2:a47980ef2b96 draft