Mercurial > repos > stevecassidy > nltktools
comparison g_read_sents.py @ 0:e991d4e60c17 draft
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
author | stevecassidy |
---|---|
date | Wed, 12 Oct 2016 22:17:53 -0400 |
parents | |
children | fb617586f4b2 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e991d4e60c17 |
---|---|
1 import sys | |
2 import os | |
3 import nltk | |
4 from nltk.corpus import PlaintextCorpusReader | |
5 import argparse | |
6 | |
def Parser(argv=None):
    """Parse the command-line options for the sentence segmenter.

    Args:
        argv: optional list of argument strings to parse. When left as
            None, argparse falls back to the process arguments
            (sys.argv[1:]), which is what the script entry point relies on.

    Returns:
        The argparse namespace, exposing the required ``input`` and
        ``output`` string attributes.
    """
    the_parser = argparse.ArgumentParser(
        description="Segments the text input into separate sentences")
    the_parser.add_argument('--input', required=True, action="store",
                            type=str, help="input text file")
    the_parser.add_argument('--output', required=True, action="store",
                            type=str, help="output file path")

    return the_parser.parse_args(argv)
14 | |
def print_out(outp, text, sentences):
    """Write each sentence to ``outp`` on its own line, using the raw text.

    Every sentence is a sequence of token strings. The tokens are located
    in ``text`` one after another, starting where the previous sentence
    ended, and the slice from the first located token to the end of the
    last located token is written out. This keeps the original spacing and
    punctuation of ``text`` rather than re-joining the tokens.

    Args:
        outp: path of the file to write (opened in 'w' mode).
        text: the raw text the sentences were extracted from.
        sentences: iterable of token sequences, in document order.
    """
    # The context manager closes the file even if a write fails part-way.
    with open(outp, 'w') as out:
        curr = 0
        for sent in sentences:
            start = None
            pos = curr
            # Walk the tokens in order so that repeated or nested
            # punctuation (e.g. "..." before a final ".") cannot end the
            # sentence early.
            for token in sent:
                found = text.find(token, pos)
                if found < 0:
                    # Token is not present in the remaining raw text;
                    # skip it instead of slicing with a -1 index.
                    continue
                if start is None:
                    start = found
                pos = found + len(token)
            if start is None:
                # Nothing could be aligned (this includes an empty
                # sentence): fall back to the tokens themselves so no
                # content is silently dropped.
                if sent:
                    out.write(' '.join(sent) + '\n')
                continue
            out.write(text[start:pos] + '\n')
            curr = pos
25 | |
def find_nth(string, sub, n, offset):
    """Return the index of the n-th non-overlapping ``sub`` in ``string``.

    The search begins at ``offset``. When fewer than ``n`` occurrences
    exist, -1 is returned. For ``n`` of 1 or less the first occurrence at
    or after ``offset`` is returned.
    """
    position = string.find(sub, offset)
    # One extra lookup for each occurrence beyond the first.
    for _ in range(n - 1):
        if position < 0:
            break
        position = string.find(sub, position + len(sub))
    return position
32 | |
def count_occurences(lst, string):
    """Count the items of ``lst`` that contain ``string``.

    Each item is counted at most once, no matter how many times
    ``string`` appears inside it.
    """
    return sum(1 for item in lst if string in item)
39 | |
def read_sents(inp, outp):
    """Segment the text file ``inp`` into sentences and write them to ``outp``.

    The raw file content is read once, a PlaintextCorpusReader is built
    over the file's directory and base name, and the reader's sentences
    are passed to print_out together with the raw text.

    Args:
        inp: path of the input text file.
        outp: path of the output file.
    """
    # Close the input promptly instead of leaving the handle to the
    # garbage collector.
    with open(inp, 'r') as source:
        raw_text = source.read()
    corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
    print_out(outp, raw_text, corpus.sents())
45 | |
if __name__ == '__main__':
    # Script entry point: parse the command line, then run the segmenter
    # on the requested input and output paths.
    args = Parser()
    input_path, output_path = args.input, args.output
    read_sents(input_path, output_path)