comparison g_read_sents.py @ 0:e991d4e60c17 draft

planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
author stevecassidy
date Wed, 12 Oct 2016 22:17:53 -0400
parents
children fb617586f4b2
comparison
equal deleted inserted replaced
-1:000000000000 0:e991d4e60c17
1 import sys
2 import os
3 import nltk
4 from nltk.corpus import PlaintextCorpusReader
5 import argparse
6
def Parser(argv=None):
    """Parse the command-line options of the sentence segmenter.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which
            is what the script entry point relies on.

    Returns:
        The ``argparse.Namespace`` holding the ``input`` and ``output``
        option values. Both options are required; argparse exits with a
        usage message when either one is missing.
    """
    the_parser = argparse.ArgumentParser(
        description="Segments the text input into separate sentences")
    the_parser.add_argument('--input', required=True, action="store",
                            type=str, help="input text file")
    the_parser.add_argument('--output', required=True, action="store",
                            type=str, help="output file path")

    return the_parser.parse_args(argv)
14
def print_out(outp, text, sentences):
    """Write each sentence to ``outp`` on its own line, using the original text.

    Every sentence is a sequence of token strings. Instead of re-joining
    the tokens, the matching span of ``text`` is copied so the original
    spacing and punctuation inside a sentence are preserved.

    Args:
        outp: Path of the output file; it is created or overwritten.
        text: The raw text the sentences were tokenized from.
        sentences: Iterable of token sequences, in the order in which they
            occur in ``text``.
    """
    # The context manager closes the file even if a write or lookup fails.
    with open(outp, 'w') as out:
        curr = 0
        for sent in sentences:
            if not sent:
                # Nothing to locate for an empty sentence.
                continue
            start = text.find(sent[0], curr)
            if start < 0:
                # The first token cannot be located after the previous
                # sentence; skip it rather than slice from a bogus offset.
                continue
            # Walk the tokens in order so the end offset is exact even when
            # the final token (e.g. ".") also occurs inside earlier tokens
            # such as "...".
            end = start
            for token in sent:
                pos = text.find(token, end)
                if pos < 0:
                    # Stop at the last token that could be matched.
                    break
                end = pos + len(token)
            out.write(text[start:end] + '\n')
            curr = end
25
def find_nth(string, sub, n, offset):
    """Return the index of the n-th occurrence of ``sub`` in ``string``.

    The search starts at ``offset`` and counts non-overlapping
    occurrences, because each follow-up search resumes ``len(sub)``
    characters after the previous hit.

    Args:
        string: Text to search in.
        sub: Substring to look for.
        n: 1-based occurrence number; any value of 1 or less yields the
            first occurrence.
        offset: Index in ``string`` at which the search begins.

    Returns:
        The start index of the requested occurrence, or -1 when
        ``string`` holds fewer than ``n`` occurrences from ``offset`` on.
    """
    start = string.find(sub, offset)
    while start >= 0 and n > 1:
        # Resume just past the previous hit so it is not counted twice.
        start = string.find(sub, start + len(sub))
        n -= 1
    return start
32
def count_occurences(lst, string):
    """Count the items of ``lst`` that contain ``string``.

    An item is counted once no matter how many times ``string`` occurs
    inside it.

    Args:
        lst: Iterable of items supporting the ``in`` operator (for
            example token strings).
        string: Value to look for inside each item.

    Returns:
        The number of items for which ``string in item`` is true.
    """
    return sum(1 for item in lst if string in item)
39
def read_sents(inp, outp):
    """Segment the text file ``inp`` into sentences and write them to ``outp``.

    The file is read once as raw text and once through NLTK's
    ``PlaintextCorpusReader`` (rooted at the file's directory, with the
    file name as the only file id). The reader's sentences are then passed
    to ``print_out`` together with the raw text.

    Args:
        inp: Path of the input text file.
        outp: Path of the output file.
    """
    # Read inside a context manager so the input handle is always closed.
    with open(inp, 'r') as handle:
        text = handle.read()
    corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
    sents = corpus.sents()
    print_out(outp, text, sents)
45
if __name__ == '__main__':
    # Script entry point: parse the command-line options, then segment the
    # input file into the output file.
    args = Parser()
    read_sents(args.input, args.output)