Mercurial > repos > stevecassidy > nltktools
comparison g_read_sents.py @ 2:a47980ef2b96 draft
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author | stevecassidy |
---|---|
date | Wed, 01 Nov 2017 01:19:55 -0400 |
parents | fb617586f4b2 |
children |
comparison
equal
deleted
inserted
replaced
1:fb617586f4b2 | 2:a47980ef2b96 |
---|---|
1 import sys | 1 |
2 import os | 2 import os |
3 import nltk | 3 import nltk |
4 from nltk.corpus import PlaintextCorpusReader | 4 from nltk.corpus import PlaintextCorpusReader |
5 import argparse | 5 import argparse |
6 | |
7 nltk.download('punkt', quiet=True) | |
8 | |
6 | 9 |
def Parser():
    """Build the command-line parser and parse sys.argv.

    Returns:
        argparse.Namespace with `input` (input text file path) and
        `output` (output file path) attributes; both flags are required.
    """
    arg_parser = argparse.ArgumentParser(
        description="Segments the text input into separate sentences")
    for flag, help_text in (('--input', "input text file"),
                            ('--output', "output file path")):
        arg_parser.add_argument(flag, required=True, action="store",
                                type=str, help=help_text)
    return arg_parser.parse_args()
14 | 17 |
def print_out(outp, sentences):
    """Write tokenised sentences to the file `outp`, one sentence per line.

    Every token is followed by a single space — including the last token
    on each line — reproducing the original output format exactly.
    An empty sentence yields a bare newline.
    """
    with open(outp, 'w') as sink:
        for sentence in sentences:
            sink.write(''.join(token + ' ' for token in sentence))
            sink.write('\n')
24 | 27 |
def find_nth(string, sub, n, offset):
    """Return the index of the n-th occurrence of `sub` in `string`.

    Searching starts at `offset`. Returns -1 when fewer than n
    occurrences exist. For n <= 1 this is the first occurrence at or
    after `offset`. (Currently unused by the new output path; kept for
    compatibility.)
    """
    position = string.find(sub, offset)
    for _ in range(n - 1):
        if position < 0:
            break
        # Jump past the match just found before searching again.
        position = string.find(sub, position + len(sub))
    return position
31 | 34 |
35 | |
def count_occurences(lst, string):
    """Return how many items of `lst` contain `string` as a substring.

    NOTE(review): despite the name, this counts matching *items*, not
    total occurrences of `string` across items. (Currently unused by
    the new output path; kept for compatibility.)
    """
    return sum(1 for item in lst if string in item)
38 | 42 |
43 | |
def read_sents(inp, outp):
    """Sentence-segment the text file `inp` and write the result to `outp`.

    Uses NLTK's PlaintextCorpusReader to tokenise the file into
    sentences of word tokens, then hands them to print_out.
    NOTE(review): the explicit nltk.download('punkt') call was removed
    in this revision — confirm the punkt model is preinstalled in the
    runtime environment.
    """
    directory = os.path.dirname(inp)
    filename = os.path.basename(inp)
    reader = PlaintextCorpusReader(directory, filename)
    print_out(outp, reader.sents())
49 | |
45 | 50 |
if __name__ == '__main__':
    # Script entry point: parse CLI flags, then segment input into
    # sentences written one-per-line to the output file.
    cli_args = Parser()
    read_sents(cli_args.input, cli_args.output)