# HG changeset patch
# User stevecassidy
# Date 1476325073 14400
# Node ID e991d4e60c17697cd5ce38e404024408844f4c1c
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
diff -r 000000000000 -r e991d4e60c17 colloc.dat
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/colloc.dat Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,6 @@
+will be
+that will
+. The
+is to
+of the
+and the
diff -r 000000000000 -r e991d4e60c17 g_chart_parser.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_chart_parser.py Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,38 @@
+import sys
+import nltk
+import argparse
+from nltk.corpus import PlaintextCorpusReader
+
def arguments():
    """Parse the command line: --input, --grammar and --output paths."""
    arg_parser = argparse.ArgumentParser(description="run NER on a text")
    arg_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
    arg_parser.add_argument('--grammar', required=True, action="store", type=str, help="grammar file")
    arg_parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
    return arg_parser.parse_args()
+
+
def chart_parse(in_file, grammar_file, out_file):
    """Chart-parse every sentence of in_file with the CFG in grammar_file.

    Writes one pretty-printed parse tree per sentence to out_file.  On any
    parsing error a diagnostic is written to stderr and the process exits.
    """
    # errors='ignore' drops undecodable bytes instead of raising, matching
    # the original unicode(..., errors='ignore') behaviour.
    with open(in_file, 'r', errors='ignore') as fh:
        text = fh.read()
    with open(grammar_file, 'r', errors='ignore') as fh:
        grammar_string = fh.read()
    try:
        # nltk.parse_cfg was removed in NLTK 3; CFG.fromstring is the replacement.
        grammar = nltk.CFG.fromstring(grammar_string)
        parser = nltk.ChartParser(grammar)
        sentences = nltk.sent_tokenize(text)
        with open(out_file, 'w') as output:
            for sentence in sentences:
                words = nltk.word_tokenize(sentence)
                # ChartParser.parse() yields trees in NLTK 3; keep the
                # original one-tree-per-sentence output by taking the first.
                tree = next(iter(parser.parse(words)))
                output.write(tree.pprint())
                output.write('\n')
    except Exception as e:
        message = "Error with parsing. Check the input files are correct and the grammar contains every word in the input sequence. \n----\n" + str(e)
        sys.stderr.write(message)
        sys.exit()
+
if __name__ == '__main__':
    # Entry point: read CLI options and run the parser.
    cli = arguments()
    chart_parse(cli.input, cli.grammar, cli.output)
+
+
diff -r 000000000000 -r e991d4e60c17 g_chart_parser.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_chart_parser.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,28 @@
+
+ Parse the sentence using Chart Parser and a supplied grammar
+
+ nltk
+
+
+
+ g_chart_parser.py --input $input1 --grammar $grammar --output $tab_file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r e991d4e60c17 g_collocation.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_collocation.py Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,48 @@
+import sys
+import os
+import nltk
+from nltk.collocations import *
+import argparse
+
def Parser():
    """Build and evaluate the command line for the collocation tool."""
    p = argparse.ArgumentParser(description="Parse the sentence using Chart Parser and a supplied grammar")
    p.add_argument('--input', required=True, action="store", type=str, help="input text file")
    p.add_argument('--output', required=True, action="store", type=str, help="output file path")
    p.add_argument('--freq_filter', required=True, action="store", type=str, help="The minimum number of required occurrences in the corpus")
    p.add_argument('--results', required=True, action="store", type=str, help="The maximum number of collocations to show in the results")
    p.add_argument('--coll_type', required=True, action="store", type=str, help="Type of collocations to find")
    p.add_argument('--pos', required=True, action="store", type=str, help="Data input is a set of POS tags")
    return p.parse_args()
+
def collocation(inp, outp, freq_filter, results, coll_type, pos):
    """Find the top collocations in the text file inp and write them to outp.

    freq_filter -- minimum occurrence count (string, converted to int)
    results     -- maximum number of collocations to report (string)
    coll_type   -- 'bigram' for bigrams, anything else gives trigrams
    pos         -- 'true' if the input is word/TAG pairs from the POS tool
    """
    pos = (pos == 'true')
    with open(inp, 'r', errors='ignore') as fh:
        i = fh.read()
    all_words = []
    if pos:
        # Input lines look like "word/TAG "; strip the tag off each token.
        text = i.split(' ')[:-1]
        all_words = [x[0:x.index('/')] if x != '\n' else x for x in text]
        all_words = [x.strip(' ').strip('\n') for x in all_words]
    else:
        for sent in nltk.sent_tokenize(i):
            all_words += nltk.word_tokenize(sent)
    if coll_type == 'bigram':
        measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(all_words)
    else:
        measures = nltk.collocations.TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(all_words)
    finder.apply_freq_filter(int(freq_filter))
    colls = finder.nbest(measures.pmi, int(results))
    with open(outp, 'w') as o:
        for coll in colls:
            # BUG FIX: the original "%s\t%s" % coll raised TypeError for
            # trigrams (a 3-tuple against 2 format specifiers); join works
            # for any n-gram length.
            o.write('\t'.join(coll))
            o.write('\n')
+
if __name__ == '__main__':
    # Entry point: read CLI options and run the collocation finder.
    cli = Parser()
    collocation(cli.input, cli.output, cli.freq_filter, cli.results, cli.coll_type, cli.pos)
diff -r 000000000000 -r e991d4e60c17 g_collocation.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_collocation.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,38 @@
+
+ Generates a list of the most frequent collocations from an input sequence
+
+ nltk
+
+
+
+ g_collocation.py --input $input1 --output $tab_file --freq_filter $freq_filter --results $results --coll_type $collocation_type --pos $pos
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r e991d4e60c17 g_frequency.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_frequency.py Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,30 @@
+import nltk
+from nltk import FreqDist
+import argparse
+
def arguments():
    """Parse the command line: --input and --output paths."""
    arg_parser = argparse.ArgumentParser(description="generate a word frequency table from a text")
    arg_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
    arg_parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
    return arg_parser.parse_args()
+
+
def frequency(in_file, out_file):
    """Input: a text file
    Output: a table of word frequency with three columns for Word, Count and Percent frequency
    """
    with open(in_file, 'r', errors='ignore') as fh:
        text = fh.read()
    words = nltk.word_tokenize(text)
    freq = FreqDist(words)
    total = float(freq.N())
    with open(out_file, 'w') as output:
        output.write("Word\tCount\tPercent\n")
        # most_common() guarantees descending-count order; plain .items()
        # order is undefined in NLTK 3 (FreqDist is a Counter subclass).
        for word, count in freq.most_common():
            output.write("{0}\t{1}\t{2:.2f}\n".format(word, count, 100 * count / total))
+
+
if __name__ == '__main__':
    # Entry point: read CLI options and build the frequency table.
    cli = arguments()
    frequency(cli.input, cli.output)
diff -r 000000000000 -r e991d4e60c17 g_frequency.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_frequency.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,34 @@
+
+ Takes a text input and generates a frequency list
+
+
+ nltk
+
+
+
+ g_frequency.py --input $input1 --output $frequency_table
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Generate a frequency list from a text ordered by word frequency.
+
+
diff -r 000000000000 -r e991d4e60c17 g_pos.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_pos.py Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,34 @@
+import nltk
+import argparse
+import json
+
def arguments():
    """Parse the command line: --input and --output paths."""
    arg_parser = argparse.ArgumentParser(description="tokenize a text")
    arg_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
    arg_parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
    return arg_parser.parse_args()
+
+
def postag(in_file, out_file):
    """Input: a text file with one token per line
    Output: a version of the text with Part of Speech tags written as word/TAG
    """
    with open(in_file, 'r', errors='ignore') as fh:
        text = fh.read()
    sentences = nltk.sent_tokenize(text)
    with open(out_file, 'w') as output:
        for sentence in sentences:
            tokens = nltk.word_tokenize(sentence)
            # pos_tag returns (word, tag) pairs; one sentence per output line
            for tagged in nltk.pos_tag(tokens):
                output.write("%s/%s " % tagged)
            output.write('\n')
+
+
if __name__ == '__main__':
    # Entry point: read CLI options and run the tagger.
    cli = arguments()
    postag(cli.input, cli.output)
+
+
\ No newline at end of file
diff -r 000000000000 -r e991d4e60c17 g_pos.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_pos.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,35 @@
+
+ Part of Speech tagging
+
+
+ nltk
+
+
+
+ g_pos.py --input $input1 --output $postags
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Apply a Part of Speech (POS) tagger to a list of sentences.
+
+
diff -r 000000000000 -r e991d4e60c17 g_read_sents.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_read_sents.py Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,48 @@
+import sys
+import os
+import nltk
+from nltk.corpus import PlaintextCorpusReader
+import argparse
+
def Parser():
    """Build and evaluate the command line for the sentence splitter."""
    p = argparse.ArgumentParser(description="Segments the text input into separate sentences")
    p.add_argument('--input', required=True, action="store", type=str, help="input text file")
    p.add_argument('--output', required=True, action="store", type=str, help="output file path")
    return p.parse_args()
+
def print_out(outp, text, sentences):
    """Write the span of *text* covered by each tokenised sentence to the
    file *outp*, one sentence per line.

    Each sentence span runs from the first occurrence of its first token
    (searching from the end of the previous sentence) to the end of the
    matching occurrence of its last token.
    """
    out = open(outp, 'w')
    position = 0
    for tokens in sentences:
        last = tokens[-1]
        # how many tokens of this sentence contain the final token as a
        # substring -- that occurrence count locates the sentence's end
        occurrences = count_occurences(tokens, last)
        position = text.find(tokens[0], position)
        stop = find_nth(text, last, occurrences, position) + len(last)
        out.write(text[position:stop] + '\n')
        position = stop
    out.close()
+
def find_nth(string, sub, n, offset):
    """Return the index of the n-th occurrence of *sub* in *string* at or
    after *offset*, or -1 if there are fewer than n occurrences."""
    idx = string.find(sub, offset)
    remaining = n
    while remaining > 1 and idx >= 0:
        idx = string.find(sub, idx + len(sub))
        remaining -= 1
    return idx
+
def count_occurences(lst, string):
    """Count how many items of *lst* contain *string* as a substring."""
    return sum(1 for item in lst if string in item)
+
def read_sents(inp, outp):
    """Segment the text file *inp* into sentences (via NLTK's
    PlaintextCorpusReader) and write them one per line to *outp*."""
    # use a context manager so the input handle is closed promptly
    with open(inp, 'r') as fh:
        raw = fh.read()
    corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
    print_out(outp, raw, corpus.sents())
+
if __name__ == '__main__':
    # Entry point: read CLI options and run the sentence splitter.
    cli = Parser()
    read_sents(cli.input, cli.output)
diff -r 000000000000 -r e991d4e60c17 g_read_sents.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_read_sents.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,31 @@
+
+ Segments the text input into separate sentences
+
+
+ nltk
+
+
+
+ g_read_sents.py --input $input1 --output $tab_file
+
+\
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r e991d4e60c17 g_stemmer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_stemmer.py Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,40 @@
+import sys
+import os
+import nltk
+from nltk.stem import *
+import argparse
+
+
def arguments():
    """Parse the command line: --input, --output and optional --stemmer."""
    arg_parser = argparse.ArgumentParser(description="Segments the text input into separate sentences")
    arg_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
    arg_parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
    arg_parser.add_argument('--stemmer', required=False, action="store", type=str, help="output file path")
    return arg_parser.parse_args()
+
def stem_file(in_file, out_file, stemmer_type):
    """Tokenise *in_file* and write the stem of every word to *out_file*,
    one stem per line, using the algorithm named by *stemmer_type*
    ('lancaster', 'porter', or English Snowball by default)."""
    with open(in_file, 'r', errors='ignore') as fh:
        unsegmented = fh.read()
    sentences = nltk.sent_tokenize(unsegmented)
    stemmer = get_stemmer(stemmer_type)
    with open(out_file, 'w') as output:
        for sentence in sentences:
            for word in nltk.word_tokenize(sentence):
                output.write(stemmer.stem(word))
                output.write('\n')
+
def get_stemmer(stemmer_type):
    """Return the NLTK stemmer instance for *stemmer_type*: 'lancaster',
    'porter', or anything else for the English Snowball stemmer."""
    if stemmer_type == 'lancaster':
        return LancasterStemmer()
    if stemmer_type == 'porter':
        return PorterStemmer()
    return snowball.EnglishStemmer()
+
if __name__ == '__main__':
    # Entry point: read CLI options and run the stemmer.
    cli = arguments()
    stem_file(cli.input, cli.output, cli.stemmer)
diff -r 000000000000 -r e991d4e60c17 g_stemmer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_stemmer.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,36 @@
+
+ Takes a list of tokens and generates a list of word stems using one of the stemming algorithms
+
+
+ nltk
+
+
+
+ g_stemmer.py --input $input1 --output $tab_file --stemmer $stemming_algorithm
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r e991d4e60c17 g_tokenize.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_tokenize.py Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,42 @@
+import nltk
+import string
+import argparse
+
def arguments():
    """Parse the command line: --input/--output paths plus the optional
    --lower and --nopunct flags."""
    arg_parser = argparse.ArgumentParser(description="tokenize a text")
    arg_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
    arg_parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
    arg_parser.add_argument('--lower', required=False, action="store_true", help="lowercase all words")
    arg_parser.add_argument('--nopunct', required=False, action="store_true", help="remove all punctuation characters")
    return arg_parser.parse_args()
+
+
def strip_punct(text):
    """Return *text* with all ASCII punctuation characters removed."""
    # BUG FIX: the old two-argument string.maketrans("","") plus
    # str.translate(table, deletechars) form is Python-2-only and raises
    # TypeError on Python 3.  str.maketrans's third argument maps each
    # punctuation character to None (deletion).
    return text.translate(str.maketrans('', '', string.punctuation))
+
+
def tokenize(in_file, out_file, lower=False, nopunct=False):
    """Tokenise *in_file* and write one token per line to *out_file*.

    lower   -- lowercase the whole text before tokenising
    nopunct -- strip punctuation characters before tokenising
    """
    # errors='ignore' drops undecodable bytes, matching the original
    # unicode(..., errors='ignore') behaviour
    with open(in_file, 'r', errors='ignore') as fh:
        text = fh.read()
    if lower:
        text = text.lower()
    if nopunct:
        text = strip_punct(text)
    with open(out_file, 'w') as output:
        for sentence in nltk.sent_tokenize(text):
            for token in nltk.word_tokenize(sentence):
                output.write(token + "\n")
+
+
if __name__ == '__main__':
    # Entry point: read CLI options and run the tokenizer.
    cli = arguments()
    tokenize(cli.input, cli.output, lower=cli.lower, nopunct=cli.nopunct)
diff -r 000000000000 -r e991d4e60c17 g_tokenize.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_tokenize.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,51 @@
+
+ Split a text into words
+
+
+ nltk
+
+
+
+ g_tokenize.py --input $input1 --output $tokens $lower $nopunct
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tokenize a text into separate words, optionally remove punctuation and convert to lower case.
+
+
+
diff -r 000000000000 -r e991d4e60c17 test-data/dependency_resolvers_config.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/dependency_resolvers_config.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff -r 000000000000 -r e991d4e60c17 test-data/sample_text.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_text.txt Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,696 @@
+DADA
+Project
+Update
+The
+DADA
+project
+is
+developing
+software
+for
+managing
+language
+resources
+and
+exposing
+them
+on
+the
+web
+.
+Language
+resources
+are
+digital
+collections
+of
+language
+as
+audio
+,
+video
+and
+text
+used
+to
+study
+language
+and
+build
+technoogy
+systems
+.
+The
+project
+has
+been
+going
+for
+a
+while
+with
+some
+initial
+funding
+from
+the
+ARC
+to
+build
+the
+basic
+infrastructure
+and
+later
+from
+Macquarie
+University
+for
+some
+work
+on
+the
+Auslan
+corpus
+of
+Australian
+Sign
+Language
+collected
+by
+Trevor
+Johnston
+.
+Recently
+we
+have
+two
+projects
+which
+DADA
+will
+be
+part
+of
+,
+and
+so
+the
+pace
+of
+development
+has
+picked
+up
+a
+little
+.
+The
+Australian
+National
+Corpus
+(
+AusNC
+)
+is
+an
+effort
+to
+build
+a
+centralised
+collection
+of
+resources
+of
+language
+in
+Australia
+.
+The
+core
+idea
+is
+to
+take
+whatever
+existing
+collections
+we
+can
+get
+permission
+to
+publish
+and
+make
+them
+available
+under
+a
+common
+technical
+infrastructure
+.
+Using
+some
+funding
+from
+HCSNet
+we
+build
+a
+small
+demonstration
+site
+that
+allowed
+free
+text
+search
+on
+two
+collections
+:
+the
+Australian
+Corpus
+of
+English
+and
+the
+Corpus
+of
+Oz
+Early
+English
+.
+We
+now
+have
+some
+funding
+to
+continue
+this
+work
+and
+expand
+both
+the
+size
+of
+the
+collection
+and
+the
+capability
+of
+the
+infrastructure
+that
+will
+support
+it
+.
+What
+we
+'ve
+already
+done
+is
+to
+separate
+the
+text
+in
+these
+corpora
+from
+their
+meta-data
+(
+descriptions
+of
+each
+text
+)
+and
+the
+annotation
+(
+denoting
+things
+within
+the
+texts
+)
+.
+While
+the
+pilot
+allows
+searching
+on
+the
+text
+the
+next
+steps
+will
+allow
+search
+using
+the
+meta-data
+(
+look
+for
+this
+in
+texts
+written
+after
+1900
+)
+and
+the
+annotation
+(
+find
+this
+in
+the
+titles
+of
+articles
+)
+.
+This
+project
+is
+funded
+by
+the
+Australian
+National
+Data
+Service
+(
+ANDS
+)
+and
+is
+a
+collaboration
+with
+Michael
+Haugh
+at
+Griffith
+.
+The
+Big
+Australian
+Speech
+Corpus
+,
+more
+recently
+renamed
+AusTalk
+,
+is
+an
+ARC
+funded
+project
+to
+collect
+speech
+and
+video
+from
+1000
+Australian
+speakers
+for
+a
+new
+freely
+available
+corpus
+.
+The
+project
+involves
+many
+partners
+around
+the
+country
+each
+of
+who
+will
+have
+a
+'black
+box
+'
+recording
+station
+to
+collect
+audio
+and
+stereo
+video
+of
+subjects
+reading
+words
+and
+sentences
+,
+being
+interviewed
+and
+doing
+the
+Map
+task
+-
+a
+game
+designed
+to
+elicit
+natural
+speech
+between
+two
+people
+.
+Our
+part
+of
+the
+project
+is
+to
+provide
+the
+server
+infrastructure
+that
+will
+store
+the
+audio
+,
+video
+and
+annotation
+data
+that
+will
+make
+up
+the
+corpus
+.
+DADA
+will
+be
+part
+of
+this
+solution
+but
+the
+main
+driver
+is
+to
+be
+able
+to
+provide
+a
+secure
+and
+reliable
+store
+for
+the
+primary
+data
+as
+it
+comes
+in
+from
+the
+collection
+sites
+.
+An
+important
+feature
+of
+the
+collection
+is
+the
+meta-data
+that
+will
+describe
+the
+subjects
+in
+the
+recording
+.
+Some
+annotation
+of
+the
+data
+will
+be
+done
+automatically
+,
+for
+example
+some
+forced
+alignment
+of
+the
+read
+words
+and
+sentences
+.
+Later
+,
+we
+will
+move
+on
+to
+support
+manual
+annotation
+of
+some
+of
+the
+data
+-
+for
+example
+transcripts
+of
+the
+interviews
+and
+map
+task
+sessions
+.
+All
+of
+this
+will
+be
+published
+via
+the
+DADA
+server
+infrastructure
+to
+create
+a
+large
+,
+freely
+available
+research
+collection
+for
+Australian
+English
+.
+Since
+the
+development
+of
+DADA
+now
+involves
+people
+outside
+Macquarie
+,
+I
+have
+started
+using
+a
+public
+bitbucket
+repository
+for
+the
+code
+.
+As
+of
+this
+writing
+the
+code
+still
+needs
+some
+tidying
+and
+documentation
+to
+enable
+third
+parties
+to
+be
+able
+to
+install
+and
+work
+on
+it
+,
+but
+we
+hope
+to
+have
+that
+done
+within
+a
+month
+.
+The
+public
+DADA
+demo
+site
+is
+down
+at
+the
+moment
+due
+to
+network
+upgrades
+at
+Macquarie
+(
+it
+'s
+only
+visible
+inside
+MQ
+)
+-
+I
+hope
+to
+have
+that
+fixed
+soon
+with
+some
+new
+sample
+data
+sets
+loaded
+up
+for
+testing
+.
+2011
+looks
+like
+it
+will
+be
+a
+significant
+year
+for
+DADA
+.
+We
+hope
+to
+end
+this
+year
+with
+a
+number
+of
+significant
+text
+,
+audio
+and
+video
+corpora
+hosted
+on
+DADA
+infrastructure
+and
+providing
+useful
+services
+to
+the
+linguistics
+and
+language
+technology
+communities
+.
diff -r 000000000000 -r e991d4e60c17 test-data/sample_text_frequency.dat
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_text_frequency.dat Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,294 @@
+Word Count Percent
+the 44 6.32
+of 26 3.74
+and 25 3.59
+. 24 3.45
+to 23 3.30
+a 15 2.16
+, 12 1.72
+for 12 1.72
+will 12 1.72
+is 11 1.58
+DADA 9 1.29
+some 8 1.15
+( 7 1.01
+be 7 1.01
+on 7 1.01
+that 7 1.01
+this 7 1.01
+Australian 7 1.01
+) 7 1.01
+The 7 1.01
+text 6 0.86
+project 6 0.86
+we 6 0.86
+infrastructure 6 0.86
+from 6 0.86
+have 6 0.86
+in 6 0.86
+video 5 0.72
+language 5 0.72
+data 5 0.72
+it 5 0.72
+collection 5 0.72
+annotation 5 0.72
+Corpus 4 0.57
+with 4 0.57
+build 4 0.57
+audio 4 0.57
+hope 3 0.43
+collections 3 0.43
+resources 3 0.43
+funding 3 0.43
+available 3 0.43
+English 3 0.43
+meta-data 3 0.43
+Macquarie 3 0.43
+done 3 0.43
+two 3 0.43
+corpus 3 0.43
+part 3 0.43
+work 3 0.43
+up 3 0.43
+at 3 0.43
+- 3 0.43
+code 2 0.29
+people 2 0.29
+We 2 0.29
+but 2 0.29
+has 2 0.29
+them 2 0.29
+example 2 0.29
+words 2 0.29
+using 2 0.29
+now 2 0.29
+collect 2 0.29
+each 2 0.29
+corpora 2 0.29
+year 2 0.29
+server 2 0.29
+new 2 0.29
+public 2 0.29
+by 2 0.29
+search 2 0.29
+store 2 0.29
+involves 2 0.29
+within 2 0.29
+texts 2 0.29
+support 2 0.29
+Language 2 0.29
+sentences 2 0.29
+freely 2 0.29
+National 2 0.29
+funded 2 0.29
+site 2 0.29
+an 2 0.29
+as 2 0.29
+able 2 0.29
+make 2 0.29
+subjects 2 0.29
+speech 2 0.29
+development 2 0.29
+recording 2 0.29
+I 2 0.29
+significant 2 0.29
+task 2 0.29
+provide 2 0.29
+ARC 2 0.29
+demo 1 0.14
+automatically 1 0.14
+What 1 0.14
+Service 1 0.14
+being 1 0.14
+both 1 0.14
+soon 1 0.14
+existing 1 0.14
+large 1 0.14
+via 1 0.14
+looks 1 0.14
+Haugh 1 0.14
+still 1 0.14
+find 1 0.14
+alignment 1 0.14
+web 1 0.14
+Recently 1 0.14
+writing 1 0.14
+linguistics 1 0.14
+only 1 0.14
+going 1 0.14
+systems 1 0.14
+under 1 0.14
+Using 1 0.14
+2011 1 0.14
+take 1 0.14
+move 1 0.14
+around 1 0.14
+get 1 0.14
+read 1 0.14
+providing 1 0.14
+Michael 1 0.14
+number 1 0.14
+Project 1 0.14
+next 1 0.14
+While 1 0.14
+Oz 1 0.14
+communities 1 0.14
+comes 1 0.14
+projects 1 0.14
+articles 1 0.14
+like 1 0.14
+visible 1 0.14
+manual 1 0.14
+solution 1 0.14
+'ve 1 0.14
+capability 1 0.14
+these 1 0.14
+continue 1 0.14
+steps 1 0.14
+common 1 0.14
+small 1 0.14
+Speech 1 0.14
+fixed 1 0.14
+Griffith 1 0.14
+searching 1 0.14
+core 1 0.14
+doing 1 0.14
+Since 1 0.14
+idea 1 0.14
+All 1 0.14
+titles 1 0.14
+are 1 0.14
+picked 1 0.14
+Some 1 0.14
+network 1 0.14
+renamed 1 0.14
+managing 1 0.14
+sites 1 0.14
+publish 1 0.14
+research 1 0.14
+Later 1 0.14
+AusNC 1 0.14
+written 1 0.14
+between 1 0.14
+technology 1 0.14
+reading 1 0.14
+can 1 0.14
+recently 1 0.14
+repository 1 0.14
+partners 1 0.14
+This 1 0.14
+University 1 0.14
+hosted 1 0.14
+free 1 0.14
+box 1 0.14
+exposing 1 0.14
+technical 1 0.14
+study 1 0.14
+allows 1 0.14
+forced 1 0.14
+Sign 1 0.14
+published 1 0.14
+map 1 0.14
+MQ 1 0.14
+month 1 0.14
+interviews 1 0.14
+software 1 0.14
+already 1 0.14
+useful 1 0.14
+secure 1 0.14
+'black 1 0.14
+primary 1 0.14
+whatever 1 0.14
+Update 1 0.14
+1000 1 0.14
+parties 1 0.14
+loaded 1 0.14
+centralised 1 0.14
+Auslan 1 0.14
+1900 1 0.14
+size 1 0.14
+little 1 0.14
+Australia 1 0.14
+initial 1 0.14
+been 1 0.14
+Early 1 0.14
+their 1 0.14
+station 1 0.14
+down 1 0.14
+basic 1 0.14
+collected 1 0.14
+: 1 0.14
+Data 1 0.14
+ANDS 1 0.14
+more 1 0.14
+describe 1 0.14
+HCSNet 1 0.14
+denoting 1 0.14
+interviewed 1 0.14
+Trevor 1 0.14
+bitbucket 1 0.14
+testing 1 0.14
+Johnston 1 0.14
+effort 1 0.14
+pilot 1 0.14
+upgrades 1 0.14
+main 1 0.14
+look 1 0.14
+developing 1 0.14
+reliable 1 0.14
+pace 1 0.14
+while 1 0.14
+technoogy 1 0.14
+install 1 0.14
+Our 1 0.14
+transcripts 1 0.14
+country 1 0.14
+descriptions 1 0.14
+due 1 0.14
+documentation 1 0.14
+allowed 1 0.14
+sample 1 0.14
+enable 1 0.14
+create 1 0.14
+demonstration 1 0.14
+Map 1 0.14
+speakers 1 0.14
+inside 1 0.14
+end 1 0.14
+sessions 1 0.14
+things 1 0.14
+permission 1 0.14
+feature 1 0.14
+who 1 0.14
+started 1 0.14
+which 1 0.14
+digital 1 0.14
+many 1 0.14
+outside 1 0.14
+used 1 0.14
+'s 1 0.14
+separate 1 0.14
+collaboration 1 0.14
+after 1 0.14
+driver 1 0.14
+needs 1 0.14
+moment 1 0.14
+important 1 0.14
+designed 1 0.14
+tidying 1 0.14
+services 1 0.14
+elicit 1 0.14
+AusTalk 1 0.14
+expand 1 0.14
+stereo 1 0.14
+natural 1 0.14
+' 1 0.14
+third 1 0.14
+later 1 0.14
+game 1 0.14
+An 1 0.14
+As 1 0.14
+so 1 0.14
+Big 1 0.14
+allow 1 0.14
+sets 1 0.14
diff -r 000000000000 -r e991d4e60c17 test-data/sample_text_lower.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_text_lower.txt Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,696 @@
+dada
+project
+update
+the
+dada
+project
+is
+developing
+software
+for
+managing
+language
+resources
+and
+exposing
+them
+on
+the
+web
+.
+language
+resources
+are
+digital
+collections
+of
+language
+as
+audio
+,
+video
+and
+text
+used
+to
+study
+language
+and
+build
+technoogy
+systems
+.
+the
+project
+has
+been
+going
+for
+a
+while
+with
+some
+initial
+funding
+from
+the
+arc
+to
+build
+the
+basic
+infrastructure
+and
+later
+from
+macquarie
+university
+for
+some
+work
+on
+the
+auslan
+corpus
+of
+australian
+sign
+language
+collected
+by
+trevor
+johnston
+.
+recently
+we
+have
+two
+projects
+which
+dada
+will
+be
+part
+of
+,
+and
+so
+the
+pace
+of
+development
+has
+picked
+up
+a
+little
+.
+the
+australian
+national
+corpus
+(
+ausnc
+)
+is
+an
+effort
+to
+build
+a
+centralised
+collection
+of
+resources
+of
+language
+in
+australia
+.
+the
+core
+idea
+is
+to
+take
+whatever
+existing
+collections
+we
+can
+get
+permission
+to
+publish
+and
+make
+them
+available
+under
+a
+common
+technical
+infrastructure
+.
+using
+some
+funding
+from
+hcsnet
+we
+build
+a
+small
+demonstration
+site
+that
+allowed
+free
+text
+search
+on
+two
+collections
+:
+the
+australian
+corpus
+of
+english
+and
+the
+corpus
+of
+oz
+early
+english
+.
+we
+now
+have
+some
+funding
+to
+continue
+this
+work
+and
+expand
+both
+the
+size
+of
+the
+collection
+and
+the
+capability
+of
+the
+infrastructure
+that
+will
+support
+it
+.
+what
+we
+'ve
+already
+done
+is
+to
+separate
+the
+text
+in
+these
+corpora
+from
+their
+meta-data
+(
+descriptions
+of
+each
+text
+)
+and
+the
+annotation
+(
+denoting
+things
+within
+the
+texts
+)
+.
+while
+the
+pilot
+allows
+searching
+on
+the
+text
+the
+next
+steps
+will
+allow
+search
+using
+the
+meta-data
+(
+look
+for
+this
+in
+texts
+written
+after
+1900
+)
+and
+the
+annotation
+(
+find
+this
+in
+the
+titles
+of
+articles
+)
+.
+this
+project
+is
+funded
+by
+the
+australian
+national
+data
+service
+(
+ands
+)
+and
+is
+a
+collaboration
+with
+michael
+haugh
+at
+griffith
+.
+the
+big
+australian
+speech
+corpus
+,
+more
+recently
+renamed
+austalk
+,
+is
+an
+arc
+funded
+project
+to
+collect
+speech
+and
+video
+from
+1000
+australian
+speakers
+for
+a
+new
+freely
+available
+corpus
+.
+the
+project
+involves
+many
+partners
+around
+the
+country
+each
+of
+who
+will
+have
+a
+'black
+box
+'
+recording
+station
+to
+collect
+audio
+and
+stereo
+video
+of
+subjects
+reading
+words
+and
+sentences
+,
+being
+interviewed
+and
+doing
+the
+map
+task
+-
+a
+game
+designed
+to
+elicit
+natural
+speech
+between
+two
+people
+.
+our
+part
+of
+the
+project
+is
+to
+provide
+the
+server
+infrastructure
+that
+will
+store
+the
+audio
+,
+video
+and
+annotation
+data
+that
+will
+make
+up
+the
+corpus
+.
+dada
+will
+be
+part
+of
+this
+solution
+but
+the
+main
+driver
+is
+to
+be
+able
+to
+provide
+a
+secure
+and
+reliable
+store
+for
+the
+primary
+data
+as
+it
+comes
+in
+from
+the
+collection
+sites
+.
+an
+important
+feature
+of
+the
+collection
+is
+the
+meta-data
+that
+will
+describe
+the
+subjects
+in
+the
+recording
+.
+some
+annotation
+of
+the
+data
+will
+be
+done
+automatically
+,
+for
+example
+some
+forced
+alignment
+of
+the
+read
+words
+and
+sentences
+.
+later
+,
+we
+will
+move
+on
+to
+support
+manual
+annotation
+of
+some
+of
+the
+data
+-
+for
+example
+transcripts
+of
+the
+interviews
+and
+map
+task
+sessions
+.
+all
+of
+this
+will
+be
+published
+via
+the
+dada
+server
+infrastructure
+to
+create
+a
+large
+,
+freely
+available
+research
+collection
+for
+australian
+english
+.
+since
+the
+development
+of
+dada
+now
+involves
+people
+outside
+macquarie
+,
+i
+have
+started
+using
+a
+public
+bitbucket
+repository
+for
+the
+code
+.
+as
+of
+this
+writing
+the
+code
+still
+needs
+some
+tidying
+and
+documentation
+to
+enable
+third
+parties
+to
+be
+able
+to
+install
+and
+work
+on
+it
+,
+but
+we
+hope
+to
+have
+that
+done
+within
+a
+month
+.
+the
+public
+dada
+demo
+site
+is
+down
+at
+the
+moment
+due
+to
+network
+upgrades
+at
+macquarie
+(
+it
+'s
+only
+visible
+inside
+mq
+)
+-
+i
+hope
+to
+have
+that
+fixed
+soon
+with
+some
+new
+sample
+data
+sets
+loaded
+up
+for
+testing
+.
+2011
+looks
+like
+it
+will
+be
+a
+significant
+year
+for
+dada
+.
+we
+hope
+to
+end
+this
+year
+with
+a
+number
+of
+significant
+text
+,
+audio
+and
+video
+corpora
+hosted
+on
+dada
+infrastructure
+and
+providing
+useful
+services
+to
+the
+linguistics
+and
+language
+technology
+communities
+.
diff -r 000000000000 -r e991d4e60c17 test-data/sample_text_lower_nopunct.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_text_lower_nopunct.txt Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,641 @@
+dada
+project
+update
+the
+dada
+project
+is
+developing
+software
+for
+managing
+language
+resources
+and
+exposing
+them
+on
+the
+web
+language
+resources
+are
+digital
+collections
+of
+language
+as
+audio
+video
+and
+text
+used
+to
+study
+language
+and
+build
+technoogy
+systems
+the
+project
+has
+been
+going
+for
+a
+while
+with
+some
+initial
+funding
+from
+the
+arc
+to
+build
+the
+basic
+infrastructure
+and
+later
+from
+macquarie
+university
+for
+some
+work
+on
+the
+auslan
+corpus
+of
+australian
+sign
+language
+collected
+by
+trevor
+johnston
+recently
+we
+have
+two
+projects
+which
+dada
+will
+be
+part
+of
+and
+so
+the
+pace
+of
+development
+has
+picked
+up
+a
+little
+the
+australian
+national
+corpus
+ausnc
+is
+an
+effort
+to
+build
+a
+centralised
+collection
+of
+resources
+of
+language
+in
+australia
+the
+core
+idea
+is
+to
+take
+whatever
+existing
+collections
+we
+can
+get
+permission
+to
+publish
+and
+make
+them
+available
+under
+a
+common
+technical
+infrastructure
+using
+some
+funding
+from
+hcsnet
+we
+build
+a
+small
+demonstration
+site
+that
+allowed
+free
+text
+search
+on
+two
+collections
+the
+australian
+corpus
+of
+english
+and
+the
+corpus
+of
+oz
+early
+english
+we
+now
+have
+some
+funding
+to
+continue
+this
+work
+and
+expand
+both
+the
+size
+of
+the
+collection
+and
+the
+capability
+of
+the
+infrastructure
+that
+will
+support
+it
+what
+we
+'ve
+already
+done
+is
+to
+separate
+the
+text
+in
+these
+corpora
+from
+their
+meta-data
+descriptions
+of
+each
+text
+and
+the
+annotation
+denoting
+things
+within
+the
+texts
+while
+the
+pilot
+allows
+searching
+on
+the
+text
+the
+next
+steps
+will
+allow
+search
+using
+the
+meta-data
+look
+for
+this
+in
+texts
+written
+after
+1900
+and
+the
+annotation
+find
+this
+in
+the
+titles
+of
+articles
+this
+project
+is
+funded
+by
+the
+australian
+national
+data
+service
+ands
+and
+is
+a
+collaboration
+with
+michael
+haugh
+at
+griffith
+the
+big
+australian
+speech
+corpus
+more
+recently
+renamed
+austalk
+is
+an
+arc
+funded
+project
+to
+collect
+speech
+and
+video
+from
+1000
+australian
+speakers
+for
+a
+new
+freely
+available
+corpus
+the
+project
+involves
+many
+partners
+around
+the
+country
+each
+of
+who
+will
+have
+a
+'black
+box
+recording
+station
+to
+collect
+audio
+and
+stereo
+video
+of
+subjects
+reading
+words
+and
+sentences
+being
+interviewed
+and
+doing
+the
+map
+task
+a
+game
+designed
+to
+elicit
+natural
+speech
+between
+two
+people
+our
+part
+of
+the
+project
+is
+to
+provide
+the
+server
+infrastructure
+that
+will
+store
+the
+audio
+video
+and
+annotation
+data
+that
+will
+make
+up
+the
+corpus
+dada
+will
+be
+part
+of
+this
+solution
+but
+the
+main
+driver
+is
+to
+be
+able
+to
+provide
+a
+secure
+and
+reliable
+store
+for
+the
+primary
+data
+as
+it
+comes
+in
+from
+the
+collection
+sites
+an
+important
+feature
+of
+the
+collection
+is
+the
+meta-data
+that
+will
+describe
+the
+subjects
+in
+the
+recording
+some
+annotation
+of
+the
+data
+will
+be
+done
+automatically
+for
+example
+some
+forced
+alignment
+of
+the
+read
+words
+and
+sentences
+later
+we
+will
+move
+on
+to
+support
+manual
+annotation
+of
+some
+of
+the
+data
+for
+example
+transcripts
+of
+the
+interviews
+and
+map
+task
+sessions
+all
+of
+this
+will
+be
+published
+via
+the
+dada
+server
+infrastructure
+to
+create
+a
+large
+freely
+available
+research
+collection
+for
+australian
+english
+since
+the
+development
+of
+dada
+now
+involves
+people
+outside
+macquarie
+i
+have
+started
+using
+a
+public
+bitbucket
+repository
+for
+the
+code
+as
+of
+this
+writing
+the
+code
+still
+needs
+some
+tidying
+and
+documentation
+to
+enable
+third
+parties
+to
+be
+able
+to
+install
+and
+work
+on
+it
+but
+we
+hope
+to
+have
+that
+done
+within
+a
+month
+the
+public
+dada
+demo
+site
+is
+down
+at
+the
+moment
+due
+to
+network
+upgrades
+at
+macquarie
+it
+'s
+only
+visible
+inside
+mq
+i
+hope
+to
+have
+that
+fixed
+soon
+with
+some
+new
+sample
+data
+sets
+loaded
+up
+for
+testing
+2011
+looks
+like
+it
+will
+be
+a
+significant
+year
+for
+dada
+we
+hope
+to
+end
+this
+year
+with
+a
+number
+of
+significant
+text
+audio
+and
+video
+corpora
+hosted
+on
+dada
+infrastructure
+and
+providing
+useful
+services
+to
+the
+linguistics
+and
+language
+technology
+communities
diff -r 000000000000 -r e991d4e60c17 test-data/sample_text_tok.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_text_tok.txt Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,696 @@
+DADA
+Project
+Update
+The
+DADA
+project
+is
+developing
+software
+for
+managing
+language
+resources
+and
+exposing
+them
+on
+the
+web
+.
+Language
+resources
+are
+digital
+collections
+of
+language
+as
+audio
+,
+video
+and
+text
+used
+to
+study
+language
+and
+build
+technoogy
+systems
+.
+The
+project
+has
+been
+going
+for
+a
+while
+with
+some
+initial
+funding
+from
+the
+ARC
+to
+build
+the
+basic
+infrastructure
+and
+later
+from
+Macquarie
+University
+for
+some
+work
+on
+the
+Auslan
+corpus
+of
+Australian
+Sign
+Language
+collected
+by
+Trevor
+Johnston
+.
+Recently
+we
+have
+two
+projects
+which
+DADA
+will
+be
+part
+of
+,
+and
+so
+the
+pace
+of
+development
+has
+picked
+up
+a
+little
+.
+The
+Australian
+National
+Corpus
+(
+AusNC
+)
+is
+an
+effort
+to
+build
+a
+centralised
+collection
+of
+resources
+of
+language
+in
+Australia
+.
+The
+core
+idea
+is
+to
+take
+whatever
+existing
+collections
+we
+can
+get
+permission
+to
+publish
+and
+make
+them
+available
+under
+a
+common
+technical
+infrastructure
+.
+Using
+some
+funding
+from
+HCSNet
+we
+build
+a
+small
+demonstration
+site
+that
+allowed
+free
+text
+search
+on
+two
+collections
+:
+the
+Australian
+Corpus
+of
+English
+and
+the
+Corpus
+of
+Oz
+Early
+English
+.
+We
+now
+have
+some
+funding
+to
+continue
+this
+work
+and
+expand
+both
+the
+size
+of
+the
+collection
+and
+the
+capability
+of
+the
+infrastructure
+that
+will
+support
+it
+.
+What
+we
+'ve
+already
+done
+is
+to
+separate
+the
+text
+in
+these
+corpora
+from
+their
+meta-data
+(
+descriptions
+of
+each
+text
+)
+and
+the
+annotation
+(
+denoting
+things
+within
+the
+texts
+)
+.
+While
+the
+pilot
+allows
+searching
+on
+the
+text
+the
+next
+steps
+will
+allow
+search
+using
+the
+meta-data
+(
+look
+for
+this
+in
+texts
+written
+after
+1900
+)
+and
+the
+annotation
+(
+find
+this
+in
+the
+titles
+of
+articles
+)
+.
+This
+project
+is
+funded
+by
+the
+Australian
+National
+Data
+Service
+(
+ANDS
+)
+and
+is
+a
+collaboration
+with
+Michael
+Haugh
+at
+Griffith
+.
+The
+Big
+Australian
+Speech
+Corpus
+,
+more
+recently
+renamed
+AusTalk
+,
+is
+an
+ARC
+funded
+project
+to
+collect
+speech
+and
+video
+from
+1000
+Australian
+speakers
+for
+a
+new
+freely
+available
+corpus
+.
+The
+project
+involves
+many
+partners
+around
+the
+country
+each
+of
+who
+will
+have
+a
+'black
+box
+'
+recording
+station
+to
+collect
+audio
+and
+stereo
+video
+of
+subjects
+reading
+words
+and
+sentences
+,
+being
+interviewed
+and
+doing
+the
+Map
+task
+-
+a
+game
+designed
+to
+elicit
+natural
+speech
+between
+two
+people
+.
+Our
+part
+of
+the
+project
+is
+to
+provide
+the
+server
+infrastructure
+that
+will
+store
+the
+audio
+,
+video
+and
+annotation
+data
+that
+will
+make
+up
+the
+corpus
+.
+DADA
+will
+be
+part
+of
+this
+solution
+but
+the
+main
+driver
+is
+to
+be
+able
+to
+provide
+a
+secure
+and
+reliable
+store
+for
+the
+primary
+data
+as
+it
+comes
+in
+from
+the
+collection
+sites
+.
+An
+important
+feature
+of
+the
+collection
+is
+the
+meta-data
+that
+will
+describe
+the
+subjects
+in
+the
+recording
+.
+Some
+annotation
+of
+the
+data
+will
+be
+done
+automatically
+,
+for
+example
+some
+forced
+alignment
+of
+the
+read
+words
+and
+sentences
+.
+Later
+,
+we
+will
+move
+on
+to
+support
+manual
+annotation
+of
+some
+of
+the
+data
+-
+for
+example
+transcripts
+of
+the
+interviews
+and
+map
+task
+sessions
+.
+All
+of
+this
+will
+be
+published
+via
+the
+DADA
+server
+infrastructure
+to
+create
+a
+large
+,
+freely
+available
+research
+collection
+for
+Australian
+English
+.
+Since
+the
+development
+of
+DADA
+now
+involves
+people
+outside
+Macquarie
+,
+I
+have
+started
+using
+a
+public
+bitbucket
+repository
+for
+the
+code
+.
+As
+of
+this
+writing
+the
+code
+still
+needs
+some
+tidying
+and
+documentation
+to
+enable
+third
+parties
+to
+be
+able
+to
+install
+and
+work
+on
+it
+,
+but
+we
+hope
+to
+have
+that
+done
+within
+a
+month
+.
+The
+public
+DADA
+demo
+site
+is
+down
+at
+the
+moment
+due
+to
+network
+upgrades
+at
+Macquarie
+(
+it
+'s
+only
+visible
+inside
+MQ
+)
+-
+I
+hope
+to
+have
+that
+fixed
+soon
+with
+some
+new
+sample
+data
+sets
+loaded
+up
+for
+testing
+.
+2011
+looks
+like
+it
+will
+be
+a
+significant
+year
+for
+DADA
+.
+We
+hope
+to
+end
+this
+year
+with
+a
+number
+of
+significant
+text
+,
+audio
+and
+video
+corpora
+hosted
+on
+DADA
+infrastructure
+and
+providing
+useful
+services
+to
+the
+linguistics
+and
+language
+technology
+communities
+.