Mercurial > repos > stevecassidy > nltktools
annotate g_read_sents.py @ 0:e991d4e60c17 draft
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
author | stevecassidy |
---|---|
date | Wed, 12 Oct 2016 22:17:53 -0400 |
parents | |
children | fb617586f4b2 |
rev | line source |
---|---|
0
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
1 import sys |
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
2 import os |
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
3 import nltk |
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
4 from nltk.corpus import PlaintextCorpusReader |
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
5 import argparse |
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
6 |
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
def Parser(argv=None):
    """Parse the command-line options of the sentence-segmentation tool.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            what a plain ``Parser()`` call has always done.

    Returns:
        The ``argparse.Namespace`` produced by the parser, carrying the two
        string attributes ``input`` and ``output``.

    Raises:
        SystemExit: raised by argparse itself when a required option is
            missing or an unknown option is supplied.
    """
    the_parser = argparse.ArgumentParser(description="Segments the text input into separate sentences")
    the_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
    the_parser.add_argument('--output', required=True, action="store", type=str, help="output file path")

    # Passing argv through lets callers (and tests) supply their own argument
    # list instead of depending on the process-wide sys.argv.
    return the_parser.parse_args(argv)
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
14 |
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
def print_out(outp, text, sentences):
    """Write the sentences of ``text`` to the file ``outp``, one per line.

    Each sentence is a sequence of token strings. Instead of joining the
    tokens, the span from the sentence's first token to its last token is
    sliced out of ``text``, so the characters between tokens are written
    exactly as they appear in ``text``.

    Args:
        outp: Path of the output file; it is opened in ``'w'`` mode.
        text: The full text the sentences were taken from.
        sentences: Iterable of sentences, each an indexable sequence of
            token strings.
    """
    # The with-block guarantees the file is closed even if a write or a
    # lookup below raises.
    with open(outp, 'w') as o:
        curr = 0
        for sent in sentences:
            if not sent:
                # No first/last token to anchor on; nothing to write.
                continue
            first, last = sent[0], sent[-1]
            # How many times the last token shows up inside the sentence
            # tells find_nth which occurrence closes the sentence.
            times = count_occurences(sent, last)
            start = text.find(first, curr)
            last_pos = find_nth(text, last, times, start) if start >= 0 else -1
            if last_pos < 0:
                # The tokens could not be located in text. Using -1 as a
                # slice index would write garbage and derail every later
                # sentence, so write the tokens themselves and leave the
                # cursor where it was.
                o.write(' '.join(sent) + '\n')
                continue
            end = last_pos + len(last)
            o.write(text[start:end] + '\n')
            curr = end
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
25 |
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
def find_nth(string, sub, n, offset=0):
    """Return the index of the n-th occurrence of ``sub`` in ``string``.

    The search starts at index ``offset`` and occurrences are counted without
    overlap: after a match, searching resumes just past the end of that
    match.

    Args:
        string: The text to search.
        sub: The substring to look for.
        n: Which occurrence to return, counted from 1. Values below 1 behave
            like 1 because the first match is always looked up.
        offset: Index at which the search begins. Defaults to 0 so the
            function can be called without an explicit starting point.

    Returns:
        The index of the n-th occurrence, or -1 when there are fewer than
        ``n`` occurrences at or after ``offset``.
    """
    pos = string.find(sub, offset)
    remaining = n
    # Step from match to match until the requested one is reached or the
    # text runs out of matches (find returns -1).
    while pos >= 0 and remaining > 1:
        pos = string.find(sub, pos + len(sub))
        remaining -= 1
    return pos
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
32 |
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
def count_occurences(lst, string):
    """Count how many times ``string`` occurs across the strings in ``lst``.

    Every non-overlapping occurrence inside every item is counted, so an item
    that contains ``string`` several times contributes several to the total.
    Counting only the items that contain ``string`` under-counts in that
    case: for the tokens ``["Wait", "...", "what", "."]`` and the needle
    ``"."`` there are four dots, not two.

    Args:
        lst: Iterable of strings to search in.
        string: The substring to count; expected to be non-empty.

    Returns:
        The total number of non-overlapping occurrences, as an int.
    """
    return sum(item.count(string) for item in lst)
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
39 |
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
def read_sents(inp, outp):
    """Split the text file ``inp`` into sentences and write them to ``outp``.

    The file is read once as raw text and once through NLTK's
    ``PlaintextCorpusReader`` to obtain its sentences; both are handed to
    ``print_out``, which writes the result.

    Args:
        inp: Path of the input text file.
        outp: Path of the output file.
    """
    # Read inside a with-block so the input handle is closed promptly
    # instead of being left to the garbage collector.
    with open(inp, 'r') as source:
        raw_text = source.read()

    # The file name is passed as a one-element list: a bare string would be
    # interpreted by the corpus reader as a regular expression over file
    # names, so characters such as '.', '+' or '(' in the name could select
    # the wrong files or fail to match at all.
    corpus = PlaintextCorpusReader(os.path.dirname(inp), [os.path.basename(inp)])
    print_out(outp, raw_text, corpus.sents())
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
45 |
e991d4e60c17
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
stevecassidy
parents:
diff
changeset
|
# Script entry point: parse the command line, then segment the input file
# into sentences and write them to the output path.
if __name__ == '__main__':
    args = Parser()
    read_sents(inp=args.input, outp=args.output)