Mercurial > repos > stevecassidy > nltktools
changeset 1:fb617586f4b2 draft
planemo upload commit a81826fe44f09a3710a35c183aa88b745aeec064-dirty
author      stevecassidy
date        Mon, 05 Dec 2016 05:22:05 -0500
parents     e991d4e60c17
children    a47980ef2b96
files       colloc.dat g_chart_parser.py g_collocation.py g_frequency.py g_pos.py g_read_sents.py g_stemmer.py g_tokenize.py test-data/elephant.txt test-data/grammar.dat test-data/sample_text.txt tmp.dat
diffstat    12 files changed, 104 insertions(+), 769 deletions(-)
--- a/colloc.dat	Wed Oct 12 22:17:53 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-will be
-that will
-. The
-is to
-of the
-and the
--- a/g_chart_parser.py	Wed Oct 12 22:17:53 2016 -0400
+++ b/g_chart_parser.py	Mon Dec 05 05:22:05 2016 -0500
@@ -13,20 +13,26 @@

 def chart_parse(in_file, grammar_file, out_file):
-    text = unicode(open(in_file, 'r').read(), errors='ignore')
-    output = open(out_file, 'w')
-    grammar_string = unicode(open(grammar_file, 'r').read(), errors='ignore')
+    with open(in_file, 'r') as fd:
+        text = fd.read()
+
+    with open(grammar_file, 'r') as fd:
+        grammar_string = fd.read()
+
     try:
-        grammar = nltk.parse_cfg(grammar_string)
+        grammar = nltk.CFG.fromstring(grammar_string)
         parser = nltk.ChartParser(grammar)
         sentences = nltk.sent_tokenize(text)
-        for sentence in sentences:
-            words = nltk.word_tokenize(sentence)
-            tree = parser.parse(words)
-            output.write(tree.pprint())
-            output.write('\n')
-    except Exception, e:
-        message = "Error with parsing. Check the input files are correct and the grammar contains every word in the input sequence. \n----\n" + str(e)
+        with open(out_file, 'w') as output:
+            for sentence in sentences:
+                words = nltk.word_tokenize(sentence)
+                trees = parser.parse(words)
+                for t in trees:
+                    output.write(t.pformat())
+                    output.write('\n')
+
+    except Exception as e:
+        message = "Error with parsing. Check the input files are correct and the grammar contains every word in the input sequence. \n----\n" + str(e) + "\n"
         sys.stderr.write(message)
         sys.exit()
     output.close()

@@ -34,5 +40,3 @@
 if __name__ == '__main__':
     args = arguments()
     chart_parse(args.input, args.grammar, args.output)
-
-
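The substantive changes here track the NLTK 3 API: nltk.parse_cfg becomes nltk.CFG.fromstring, parse() now returns an iterator of trees rather than a single tree, and pprint() is replaced by pformat(). A minimal sketch of the updated calls, using the grammar and sentence added under test-data in this changeset:

    import nltk

    # grammar from test-data/grammar.dat (added below)
    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)
    parser = nltk.ChartParser(grammar)
    words = nltk.word_tokenize("I shot an elephant in my pajamas")
    for tree in parser.parse(words):   # parse() yields trees in NLTK 3
        print(tree.pformat())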
--- a/g_collocation.py	Wed Oct 12 22:17:53 2016 -0400
+++ b/g_collocation.py	Mon Dec 05 05:22:05 2016 -0500
@@ -18,8 +18,9 @@

 def collocation(inp, outp, freq_filter, results, coll_type, pos):
     pos = bool(pos == 'true')
-    i = str(unicode(open(inp, 'r').read(), errors='ignore'))
-    o = open(outp, 'w')
+    with open(inp, 'r') as fd:
+        i = fd.read()
+
     all_words = []
     if pos:
         text = i.split(' ')[:-1]
@@ -37,10 +38,10 @@
     finder = TrigramCollocationFinder.from_words(all_words)
     finder.apply_freq_filter(int(freq_filter))
     colls = finder.nbest(measures.pmi, int(results))
-    for coll in colls:
-        o.write("%s\t%s" % coll)
-        o.write('\n')
-    o.close()
+    with open(outp, 'w') as output:
+        for coll in colls:
+            output.write("%s\t%s" % coll)
+            output.write('\n')

 if __name__ == '__main__':
     args = Parser()
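For reference, a sketch of the collocation-finder pipeline this function wraps. The `measures` object is defined outside this hunk; it is assumed here to be a TrigramAssocMeasures instance, and the input text is a toy example:

    import nltk
    from nltk.collocations import TrigramCollocationFinder

    measures = nltk.collocations.TrigramAssocMeasures()  # assumed; defined elsewhere in the script
    words = nltk.word_tokenize("more is said than done more is said than done")
    finder = TrigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(2)                # drop trigrams seen fewer than 2 times
    for coll in finder.nbest(measures.pmi, 10):
        print(coll)                            # each coll is a (w1, w2, w3) tuple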
--- a/g_frequency.py	Wed Oct 12 22:17:53 2016 -0400
+++ b/g_frequency.py	Mon Dec 05 05:22:05 2016 -0500
@@ -14,15 +14,17 @@
     """Input: a text file
     Output: a table of word frequency with three columns for Word, Count and Percent frequency
     """
-    text = unicode(open(in_file, 'r').read(), errors='ignore')
+    with open(in_file, 'r') as fd:
+        text = fd.read()
+
     words = nltk.word_tokenize(text)
     frequency = FreqDist(words)
     total = float(frequency.N())
-    output = open(out_file, 'w')
-    output.write("Word\tCount\tPercent\n")
-    for pair in frequency.items():
-        output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100*pair[1]/total))
-    output.close()
+
+    with open(out_file, 'w') as output:
+        output.write("Word\tCount\tPercent\n")
+        for pair in frequency.items():
+            output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100*pair[1]/total))

 if __name__ == '__main__':
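The output format is unchanged: one row per word type with its count and its percentage of the total token count, frequency.N(). A small self-contained sketch of the same logic on a toy sentence:

    import nltk
    from nltk import FreqDist

    words = nltk.word_tokenize("the cat sat on the mat")
    frequency = FreqDist(words)
    total = float(frequency.N())      # total number of tokens
    print("Word\tCount\tPercent")
    for word, count in frequency.items():
        print("%s\t%d\t%.2f" % (word, count, 100 * count / total))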
--- a/g_pos.py	Wed Oct 12 22:17:53 2016 -0400
+++ b/g_pos.py	Mon Dec 05 05:22:05 2016 -0500
@@ -8,27 +8,27 @@
     parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
     args = parser.parse_args()
     return args
-
+
 def postag(in_file, out_file):
     """Input: a text file with one token per line
     Output: a version of the text with Part of Speech tags written as word/TAG
     """
-    text = unicode(open(in_file, 'r').read(), errors='ignore')
+    with open(in_file, 'r') as fd:
+        text = fd.read()
+
     sentences = nltk.sent_tokenize(text)
-    output = open(out_file, 'w')
-    for sentence in sentences:
-        tokens = nltk.word_tokenize(sentence)
-        postags = nltk.pos_tag(tokens)
-        for postag in postags:
-            # print postag
-            output.write("%s/%s " % postag)
-        output.write('\n')
-    output.close()
+
+    with open(out_file, 'w') as output:
+        for sentence in sentences:
+            tokens = nltk.word_tokenize(sentence)
+            postags = nltk.pos_tag(tokens)
+            for postag in postags:
+                # print postag
+                output.write("%s/%s " % postag)
+            output.write('\n')

 if __name__ == '__main__':
     args = arguments()
     postag(args.input, args.output)
-
-
\ No newline at end of file
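The tagging loop writes space-separated word/TAG pairs, one sentence per line. A minimal sketch of that format on a toy sentence:

    import nltk

    text = "I shot an elephant in my pajamas."
    for sentence in nltk.sent_tokenize(text):
        tokens = nltk.word_tokenize(sentence)
        # pos_tag returns (word, tag) pairs, rendered here as word/TAG
        print(" ".join("%s/%s" % pair for pair in nltk.pos_tag(tokens)))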
--- a/g_read_sents.py	Wed Oct 12 22:17:53 2016 -0400
+++ b/g_read_sents.py	Mon Dec 05 05:22:05 2016 -0500
@@ -13,15 +13,14 @@
     return args

 def print_out(outp, text, sentences):
-    o = open(outp, 'w')
-    curr = 0
-    for sent in sentences:
-        times = count_occurences(sent, sent[-1])
-        curr = text.find(sent[0], curr)
-        end = find_nth(text, sent[-1], times, curr) + len(sent[-1])
-        o.write(text[curr:end] + '\n')
-        curr = end
-    o.close()
+    with open(outp, 'w') as output:
+        curr = 0
+        for sent in sentences:
+            times = count_occurences(sent, sent[-1])
+            curr = text.find(sent[0], curr)
+            end = find_nth(text, sent[-1], times, curr) + len(sent[-1])
+            output.write(text[curr:end] + '\n')
+            curr = end

 def find_nth(string, sub, n, offset):
     start = string.find(sub, offset)
@@ -38,7 +37,8 @@
     return count

 def read_sents(inp, outp):
-    i = open(inp, 'r').read()
+    with open(inp, 'r') as fd:
+        i = fd.read()
     corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
     sents = corpus.sents()
     print_out(outp, i, sents)
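print_out re-aligns NLTK's tokenised sentences with the raw text so each output line is a span of the original file. The corpus-reader half of this is straightforward; a sketch, using one of the repository's own test files as input:

    import os
    from nltk.corpus import PlaintextCorpusReader

    inp = 'test-data/sample_text.txt'
    corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
    for sent in corpus.sents():       # each sentence is a list of word tokens
        print(sent)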
--- a/g_stemmer.py	Wed Oct 12 22:17:53 2016 -0400
+++ b/g_stemmer.py	Mon Dec 05 05:22:05 2016 -0500
@@ -14,17 +14,18 @@
     return args

 def stem_file(in_file, out_file, stemmer_type):
-    unsegmented = unicode(open(in_file, 'r').read(), errors='ignore')
-    output = open(out_file, 'w')
-    sentences = nltk.sent_tokenize(unsegmented)
-    stemmer = get_stemmer(stemmer_type)
-    for sentence in sentences:
-        words = nltk.word_tokenize(sentence)
-        for word in words:
-            stemmed_word = stemmer.stem(word)
-            output.write(stemmed_word)
-            output.write('\n')
-    output.close()
+    with open(in_file, 'r') as fd:
+        unsegmented = fd.read()
+
+    with open(out_file, 'w') as output:
+        sentences = nltk.sent_tokenize(unsegmented)
+        stemmer = get_stemmer(stemmer_type)
+        for sentence in sentences:
+            words = nltk.word_tokenize(sentence)
+            for word in words:
+                stemmed_word = stemmer.stem(word)
+                output.write(stemmed_word)
+                output.write('\n')

 def get_stemmer(stemmer_type):
     if stemmer_type == 'lancaster':
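Only the 'lancaster' branch of get_stemmer is visible in this hunk. A sketch of how the stemmers are used, assuming Porter as the fallback (the other branch is not shown in the diff):

    import nltk

    def get_stemmer(stemmer_type):
        if stemmer_type == 'lancaster':
            return nltk.LancasterStemmer()
        return nltk.PorterStemmer()    # assumed fallback; not shown in the hunk

    stemmer = get_stemmer('lancaster')
    for word in nltk.word_tokenize("the elephants were running"):
        print(stemmer.stem(word))      # one stem per line, as the tool writes it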
--- a/g_tokenize.py	Wed Oct 12 22:17:53 2016 -0400
+++ b/g_tokenize.py	Mon Dec 05 05:22:05 2016 -0500
@@ -18,23 +18,24 @@

 def tokenize(in_file, out_file, lower=False, nopunct=False):
-    text = open(in_file, 'r').read()
+    with open(in_file, 'r') as fd:
+        text = fd.read()
+
     if lower:
         text = text.lower()
     if nopunct:
         text = strip_punct(text)
     result = []
-    text = unicode(text, errors='ignore')
+    #text = unicode(text, errors='ignore')
     sentences = nltk.sent_tokenize(text)
     for sentence in sentences:
         tokens = nltk.word_tokenize(sentence)
         result.append(tokens)
-    output = open(out_file, 'w')
-    # write one token per line
-    for sentence in result:
-        for token in sentence:
-            output.write(token + "\n")
-    output.close()
+    with open(out_file, 'w') as output:
+        # write one token per line
+        for sentence in result:
+            for token in sentence:
+                output.write(token + "\n")

 if __name__ == '__main__':
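With the unicode() call commented out, the tool presumably relies on the default text decoding as part of the Python 3 migration. The tokenisation itself is a two-level split, sentences then words, written one token per line; a sketch on a toy input:

    import nltk

    text = "I shot an elephant. The elephant forgave me."
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            print(token)               # one token per line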
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/elephant.txt	Mon Dec 05 05:22:05 2016 -0500
@@ -0,0 +1,1 @@
+I shot an elephant in my pajamas
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/grammar.dat	Mon Dec 05 05:22:05 2016 -0500
@@ -0,0 +1,8 @@
+ S -> NP VP
+ PP -> P NP
+ NP -> Det N | Det N PP | 'I'
+ VP -> V NP | VP PP
+ Det -> 'an' | 'my'
+ N -> 'elephant' | 'pajamas'
+ V -> 'shot'
+ P -> 'in'
--- a/test-data/sample_text.txt	Wed Oct 12 22:17:53 2016 -0400
+++ b/test-data/sample_text.txt	Mon Dec 05 05:22:05 2016 -0500
@@ -1,696 +1,9 @@
-DADA
-Project
-Update
-The
-DADA
-project
-is
-developing
-software
-for
-managing
-language
-resources
-and
-exposing
-them
-on
-the
-web
-.
-Language
-resources
-are
-digital
-collections
-of
-language
-as
-audio
-,
-video
-and
-text
-used
-to
-study
-language
-and
-build
-technoogy
-systems
-.
-The
-project
-has
-been
-going
-for
-a
-while
-with
-some
-initial
-funding
-from
-the
-ARC
-to
-build
-the
-basic
-infrastructure
-and
-later
-from
-Macquarie
-University
-for
-some
-work
-on
-the
-Auslan
-corpus
-of
-Australian
-Sign
-Language
-collected
-by
-Trevor
-Johnston
-.
-Recently
-we
-have
-two
-projects
-which
-DADA
-will
-be
-part
-of
-,
-and
-so
-the
-pace
-of
-development
-has
-picked
-up
-a
-little
-.
-The
-Australian
-National
-Corpus
-(
-AusNC
-)
-is
-an
-effort
-to
-build
-a
-centralised
-collection
-of
-resources
-of
-language
-in
-Australia
-.
-The
-core
-idea
-is
-to
-take
-whatever
-existing
-collections
-we
-can
-get
-permission
-to
-publish
-and
-make
-them
-available
-under
-a
-common
-technical
-infrastructure
-.
-Using
-some
-funding
-from
-HCSNet
-we
-build
-a
-small
-demonstration
-site
-that
-allowed
-free
-text
-search
-on
-two
-collections
-:
-the
-Australian
-Corpus
-of
-English
-and
-the
-Corpus
-of
-Oz
-Early
-English
-.
-We
-now
-have
-some
-funding
-to
-continue
-this
-work
-and
-expand
-both
-the
-size
-of
-the
-collection
-and
-the
-capability
-of
-the
-infrastructure
-that
-will
-support
-it
-.
-What
-we
-'ve
-already
-done
-is
-to
-separate
-the
-text
-in
-these
-corpora
-from
-their
-meta-data
-(
-descriptions
-of
-each
-text
-)
-and
-the
-annotation
-(
-denoting
-things
-within
-the
-texts
-)
-.
-While
-the
-pilot
-allows
-searching
-on
-the
-text
-the
-next
-steps
-will
-allow
-search
-using
-the
-meta-data
-(
-look
-for
-this
-in
-texts
-written
-after
-1900
-)
-and
-the
-annotation
-(
-find
-this
-in
-the
-titles
-of
-articles
-)
-.
-This
-project
-is
-funded
-by
-the
-Australian
-National
-Data
-Service
-(
-ANDS
-)
-and
-is
-a
-collaboration
-with
-Michael
-Haugh
-at
-Griffith
-.
-The
-Big
-Australian
-Speech
-Corpus
-,
-more
-recently
-renamed
-AusTalk
-,
-is
-an
-ARC
-funded
-project
-to
-collect
-speech
-and
-video
-from
-1000
-Australian
-speakers
-for
-a
-new
-freely
-available
-corpus
-.
-The
-project
-involves
-many
-partners
-around
-the
-country
-each
-of
-who
-will
-have
-a
-'black
-box
-'
-recording
-station
-to
-collect
-audio
-and
-stereo
-video
-of
-subjects
-reading
-words
-and
-sentences
-,
-being
-interviewed
-and
-doing
-the
-Map
-task
--
-a
-game
-designed
-to
-elicit
-natural
-speech
-between
-two
-people
-.
-Our
-part
-of
-the
-project
-is
-to
-provide
-the
-server
-infrastructure
-that
-will
-store
-the
-audio
-,
-video
-and
-annotation
-data
-that
-will
-make
-up
-the
-corpus
-.
-DADA
-will
-be
-part
-of
-this
-solution
-but
-the
-main
-driver
-is
-to
-be
-able
-to
-provide
-a
-secure
-and
-reliable
-store
-for
-the
-primary
-data
-as
-it
-comes
-in
-from
-the
-collection
-sites
-.
-An
-important
-feature
-of
-the
-collection
-is
-the
-meta-data
-that
-will
-describe
-the
-subjects
-in
-the
-recording
-.
-Some
-annotation
-of
-the
-data
-will
-be
-done
-automatically
-,
-for
-example
-some
-forced
-alignment
-of
-the
-read
-words
-and
-sentences
-.
-Later
-,
-we
-will
-move
-on
-to
-support
-manual
-annotation
-of
-some
-of
-the
-data
--
-for
-example
-transcripts
-of
-the
-interviews
-and
-map
-task
-sessions
-.
-All
-of
-this
-will
-be
-published
-via
-the
-DADA
-server
-infrastructure
-to
-create
-a
-large
-,
-freely
-available
-research
-collection
-for
-Australian
-English
-.
-Since
-the
-development
-of
-DADA
-now
-involves
-people
-outside
-Macquarie
-,
-I
-have
-started
-using
-a
-public
-bitbucket
-repository
-for
-the
-code
-.
-As
-of
-this
-writing
-the
-code
-still
-needs
-some
-tidying
-and
-documentation
-to
-enable
-third
-parties
-to
-be
-able
-to
-install
-and
-work
-on
-it
-,
-but
-we
-hope
-to
-have
-that
-done
-within
-a
-month
-.
-The
-public
-DADA
-demo
-site
-is
-down
-at
-the
-moment
-due
-to
-network
-upgrades
-at
-Macquarie
-(
-it
-'s
-only
-visible
-inside
-MQ
-)
--
-I
-hope
-to
-have
-that
-fixed
-soon
-with
-some
-new
-sample
-data
-sets
-loaded
-up
-for
-testing
-.
-2011
-looks
-like
-it
-will
-be
-a
-significant
-year
-for
-DADA
-.
-We
-hope
-to
-end
-this
-year
-with
-a
-number
-of
-significant
-text
-,
-audio
-and
-video
-corpora
-hosted
-on
-DADA
-infrastructure
-and
-providing
-useful
-services
-to
-the
-linguistics
-and
-language
-technology
-communities
-.
+Some text that is nøt øß ascii
+
+ DADA project is developing software for managing language resources and exposing them on the web. Language resources are digital collections of language as audio, video and text used to study language and build technology systems. The project has been going for a while with some initial funding from the ARC to build the basic infrastructure and later from Macquarie University for some work on the Auslan corpus of Australian Sign Language collected by Trevor Johnston. Recently we have two projects which DADA will be part of, and so the pace of development has picked up a little.
+
+The Australian National Corpus (AusNC) is an effort to build a centralised collection of resources of language in Australia. The core idea is to take whatever existing collections we can get permission to publish and make them available under a common technical infrastructure. Using some funding from HCSNet we build a small demonstration site that allowed free text search on two collections: the Australian Corpus of English and the Corpus of Oz Early English. We now have some funding to continue this work and expand both the size of the collection and the capability of the infrastructure that will support it. What we’ve already done is to separate the text in these corpora from their meta-data (descriptions of each text) and the annotation (denoting things within the texts). While the pilot allows searching on the text the next steps will allow search using the meta-data (look for this in texts written after 1900) and the annotation (find this in the titles of articles). This project is funded by the Australian National Data Service (ANDS) and is a collaboration with Michael Haugh at Griffith.
+
+The Big Australian Speech Corpus, more recently renamed AusTalk, is an ARC funded project to collect speech and video from 1000 Australian speakers for a new freely available corpus. The project involves many partners around the country each of who will have a ‘black box’ recording station to collect audio and stereo video of subjects reading words and sentences, being interviewed and doing the Map task – a game designed to elicit natural speech between two people. Our part of the project is to provide the server infrastructure that will store the audio, video and annotation data that will make up the corpus. DADA will be part of this solution but the main driver is to be able to provide a secure and reliable store for the primary data as it comes in from the collection sites. An important feature of the collection is the meta-data that will describe the subjects in the recording. Some annotation of the data will be done automatically, for example some forced alignment of the read words and sentences. Later, we will move on to support manual annotation of some of the data – for example transcripts of the interviews and map task sessions. All of this will be published via the DADA server infrastructure to create a large, freely available research collection for Australian English.
+
+Since the development of DADA now involves people outside Macquarie, we have started using a public bitbucket repository for the code. As of this writing the code still needs some tidying and documentation to enable third parties to be able to install and work on it, but we hope to have that done within a month. The public DADA demo site is down at the moment due to network upgrades at Macquarie (it’s only visible inside MQ) – I hope to have that fixed soon with some new sample data sets loaded up for testing. 2011 looks like it will be a significant year for DADA. We hope to end this year with a number of significant text, audio and video corpora hosted on DADA infrastructure and providing useful services to the linguistics and language technology communities.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tmp.dat	Mon Dec 05 05:22:05 2016 -0500
@@ -0,0 +1,10 @@
+(S
+  (NP I)
+  (VP
+    (VP (V shot) (NP (Det an) (N elephant)))
+    (PP (P in) (NP (Det my) (N pajamas)))))
+(S
+  (NP I)
+  (VP
+    (V shot)
+    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))
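tmp.dat holds the two parses the chart parser finds for the elephant sentence: the classic PP-attachment ambiguity, with "in my pajamas" attaching either to the verb phrase or to the noun phrase. Assuming the usual argparse flag names (the script reads args.input, args.grammar and args.output; the exact flags are not shown in the diff), it could be regenerated with:

    python g_chart_parser.py --input test-data/elephant.txt --grammar test-data/grammar.dat --output tmp.dat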