mirror of
https://github.com/documize/community.git
synced 2025-07-23 07:09:43 +02:00
initial commit
This commit is contained in:
commit
18933c6767
1841 changed files with 810642 additions and 0 deletions
1
vendor/github.com/rookii/paicehusk/.gitignore
generated
vendored
Normal file
1
vendor/github.com/rookii/paicehusk/.gitignore
generated
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
.DS_Store
|
22
vendor/github.com/rookii/paicehusk/LICENSE
generated
vendored
Normal file
22
vendor/github.com/rookii/paicehusk/LICENSE
generated
vendored
Normal file
|
@ -0,0 +1,22 @@
|
|||
Copyright (c) 2012, Aaron Groves
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
8
vendor/github.com/rookii/paicehusk/README.md
generated
vendored
Normal file
8
vendor/github.com/rookii/paicehusk/README.md
generated
vendored
Normal file
|
@ -0,0 +1,8 @@
|
|||
## Golang Implementation of the Paice/Husk stemming algorithm
|
||||
This project was created for the [QUT](http://www.qut.edu.au/ "Queensland University of Technology") course [INB344](http://www.qut.edu.au/study/unit-search/unit?unitCode=INB344&idunit=43393). Details on the algorithm can be found [here](http://www.comp.lancs.ac.uk/computing/research/stemming/Links/paice.htm "Lancaster School of Computing"). This implementation is primarily based on the [ANSI C Implementation](http://www.comp.lancs.ac.uk/computing/research/stemming/Links/implementations.htm) by Andy Stark. Effort has been put into the correctness of the algorithm, but this is hampered by many of the existing implementations giving differing results. Any comments/assistance/pull-requests are welcome.
|
||||
|
||||
## TODO
|
||||
* Benchmarks
|
||||
|
||||
## Demo
|
||||
A demo App Engine project utilizing this package exists [here](http://paicehusk.appspot.com/).
|
385
vendor/github.com/rookii/paicehusk/stemmer.go
generated
vendored
Normal file
385
vendor/github.com/rookii/paicehusk/stemmer.go
generated
vendored
Normal file
|
@ -0,0 +1,385 @@
|
|||
// Go implementation of the Paice/Husk Stemming algorithm:
|
||||
// http://www.comp.lancs.ac.uk/computing/research/stemming/Links/paice.htm
|
||||
// Copyright (c) 2012, Aaron Groves. All rights reserved.
|
||||
|
||||
// Package paicehusk provides an implementation of the Paice / Husk stemmer,
|
||||
// along with a default ruleset for the English Language
|
||||
package paicehusk
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// rule is a single parsed stemming rule, as produced by ParseRule.
type rule struct {

	// The suffix the rule acts on, kept in the reversed spelling used by
	// the rule text (e.g. "ht" for the ending "th"). Stem reverses it
	// again before comparing it with the end of the word.
	suf string

	// True if the rule may only fire while the word is still intact,
	// i.e. no other rule has modified it yet.
	intact bool

	// Number of letters to strip off the end of the stem. Stem treats a
	// value of 0 as "protect": the word is left alone and stemming stops.
	num int

	// Letters to append to the stem after stripping; may be empty.
	app string

	// True if the stem should be stemmed further after this rule fires
	// (the rule text ended in '>' rather than '.').
	cont bool
}
|
||||
|
||||
// DefaultRules is a default ruleset for the English language. It is built
// from the newline-separated rule text held in defaultRules.
var DefaultRules = NewRuleTable(strings.Split(defaultRules, "\n"))
|
||||
|
||||
// RuleTable stores rules based on the final letter of the suffix they
// act on, allowing for easy lookup. Because suffixes are stored reversed,
// that letter is the first character of each rule's suf field, and it is
// the key under which NewRuleTable files the rule. Rules sharing a key keep
// the order in which they were supplied.
type RuleTable struct {
	Table map[string][]*rule
}
|
||||
|
||||
// NewRuleTable returns a new RuleTable instance
|
||||
func NewRuleTable(f []string) (table *RuleTable) {
|
||||
table = &RuleTable{Table: make(map[string][]*rule)}
|
||||
for _, s := range f {
|
||||
if r, ok := ParseRule(s); ok {
|
||||
table.Table[r.suf[:1]] = append(table.Table[r.suf[:1]], r)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// reg matches one rule-shaped token: optional suffix letters, an optional
// '*' intact flag, a single digit, optional letters to append, and a
// terminating '.' or '>'.
//
// Both letter classes are [a-zA-Z]. The second one used to be [a-zA-z],
// a range that also covers '[', '\', ']', '^', '_' and '`'.
var reg = regexp.MustCompile(`[a-zA-Z]*\*?[0-9][a-zA-Z]*[.>]`)

// ValidRule looks for a rule inside s. It returns the first (leftmost)
// rule-shaped token found and ok == true, or "" and ok == false when s
// contains none. Text surrounding the rule is ignored.
func ValidRule(s string) (rule string, ok bool) {
	// A match always contains at least a digit and a terminator, so an
	// empty result can only mean "no match".
	rule = reg.FindString(s)
	return rule, rule != ""
}
|
||||
|
||||
// Regexes for ParseRule
|
||||
var suf = regexp.MustCompile("[a-zA-Z]+")
|
||||
var intact = regexp.MustCompile("[*]")
|
||||
var num = regexp.MustCompile("[0-9]")
|
||||
var app = regexp.MustCompile("[0-9][a-zA-Z]+")
|
||||
|
||||
// ParseRule parses a rule in the form:
|
||||
// |suffix|intact flag|number to strip|Append|Continue flag
|
||||
//
|
||||
// Eg, a rule: ht*2. Means if the stem is still intact, strip the
|
||||
// suffix th and make no further attempts to stem the word.
|
||||
//
|
||||
// Rule nois4j> Means strip the sion suffix, append a j and check
|
||||
// for any more rules to follow
|
||||
func ParseRule(s string) (r *rule, ok bool) {
|
||||
s, ok = ValidRule(s)
|
||||
if !ok {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
r = new(rule)
|
||||
|
||||
r.suf = suf.FindString(s)
|
||||
if intact.FindString(s) == "" {
|
||||
r.intact = false
|
||||
} else {
|
||||
r.intact = true
|
||||
}
|
||||
if i, err := strconv.ParseInt(num.FindString(s), 0, 0); err != nil {
|
||||
panic(err)
|
||||
} else {
|
||||
r.num = int(i)
|
||||
}
|
||||
if append := app.FindString(s); len(append) > 0 {
|
||||
r.app = app.FindString(s)[1:]
|
||||
} else {
|
||||
r.app = ""
|
||||
}
|
||||
|
||||
if s[len(s)-1:] == ">" {
|
||||
r.cont = true
|
||||
} else {
|
||||
r.cont = false
|
||||
}
|
||||
return r, true
|
||||
}
|
||||
|
||||
// Stem a string, word, based on the rules in *RuleTable r, by following
|
||||
// the algorithm described at:
|
||||
// http://www.comp.lancs.ac.uk/computing/research/stemming/Links/paice.htm
|
||||
func (r *RuleTable) Stem(word string) string {
|
||||
stem := []rune(strings.ToLower(word))
|
||||
current := stem
|
||||
|
||||
// Intact Flag
|
||||
intact := true
|
||||
|
||||
// If the stem is less than 3 chars, there's nothing to do, so return
|
||||
if len(stem) <= 3 {
|
||||
return string(stem)
|
||||
}
|
||||
|
||||
// Main Control Loop
|
||||
cont := true
|
||||
for cont {
|
||||
// Lookup the map to see if a rule is available for the
|
||||
// given stems last letter
|
||||
rules, ok := r.Table[string(stem[len(stem)-1:])]
|
||||
if !ok {
|
||||
// Stop the loop if a matching rule is not found
|
||||
break
|
||||
}
|
||||
// Loop through the applicable rules
|
||||
for _, rule := range rules {
|
||||
|
||||
// the length of the rule is greater than
|
||||
// the stem, so don't bother.
|
||||
if len(stem) <= len(rule.suf) {
|
||||
continue
|
||||
}
|
||||
|
||||
// The rule does not match.
|
||||
if !strings.HasSuffix(string(stem), reverse(rule.suf)) {
|
||||
continue
|
||||
}
|
||||
|
||||
// The stem is protected and should be left alone
|
||||
if rule.num == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
// The intact flag is set and the stem
|
||||
// has been operated on already.
|
||||
if rule.intact && !intact {
|
||||
continue
|
||||
}
|
||||
|
||||
s := stem[:len(stem)-rule.num]
|
||||
// The result of the rule is invalid, so do nothing.
|
||||
if !validStem(string(s) + rule.app) {
|
||||
continue
|
||||
}
|
||||
|
||||
// All criteria passed, the word should be stemmed
|
||||
cont = rule.cont
|
||||
current = []rune(string(s) + rule.app)
|
||||
|
||||
// Set the intact flag
|
||||
intact = false
|
||||
|
||||
// Break and repeat the process for the new stem
|
||||
break
|
||||
}
|
||||
|
||||
// No rule matched
|
||||
if string(current) == string(stem) {
|
||||
break
|
||||
}
|
||||
|
||||
// Set the new stem
|
||||
stem = current
|
||||
}
|
||||
return string(stem)
|
||||
}
|
||||
|
||||
// Acceptability condition: if the stem begins with a vowel, then it
|
||||
// must contain at least 2 letters, one of which must be a consonant
|
||||
//
|
||||
// If however, it begins with a consonant then it must contain three
|
||||
// letters and at least one of these must be a vowel or 'y'
|
||||
func validStem(word string) bool {
|
||||
runes := []rune(word)
|
||||
// If there's no vowel left in the stem, stem is invalid
|
||||
if !hasVowel(runes) {
|
||||
return false
|
||||
}
|
||||
|
||||
// If the word has a vowel and is longer than 3 letters, stem is valid
|
||||
if len(runes) >= 3 {
|
||||
return true
|
||||
}
|
||||
|
||||
// If the first letter is a vowel
|
||||
if vowel(runes, 0) {
|
||||
if len(runes) > 1 && consonant(runes, 1) {
|
||||
return true
|
||||
} else {
|
||||
return false
|
||||
}
|
||||
|
||||
} else {
|
||||
// If the first letter is a consonant
|
||||
// The stem must contain 3 letters, one of which we allready know
|
||||
// to be a vowel
|
||||
if len(runes) > 2 {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// consonant reports whether the letter at word[offset] is a consonant.
//
// A, E, I, O and U (in either case) are never consonants. Y is a consonant
// at the start of the word or directly after a vowel, and a vowel otherwise.
// Every other rune, non-Latin ones included, counts as a consonant.
func consonant(word []rune, offset int) bool {
	letter := word[offset]
	if strings.ContainsRune("AEIOUaeiou", letter) {
		return false
	}
	if letter != 'Y' && letter != 'y' {
		return true
	}
	// Y takes the opposite role of the letter before it; a leading Y has
	// no predecessor and is treated as a consonant.
	return offset == 0 || !consonant(word, offset-1)
}
|
||||
|
||||
// vowel returns whether the letter at offset is a vowel
|
||||
func vowel(word []rune, offset int) bool {
|
||||
return !consonant(word, offset)
|
||||
}
|
||||
|
||||
// hasVowel returns whether the word contains a vowel
|
||||
func hasVowel(word []rune) bool {
|
||||
for i := 0; i < len(word); i++ {
|
||||
if vowel(word, i) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// reverse returns s with its runes in the opposite order. It works on
// runes rather than bytes, so multi-byte characters stay intact.
func reverse(s string) string {
	forward := []rune(s)
	backward := make([]rune, len(forward))
	last := len(forward) - 1
	for idx, r := range forward {
		backward[last-idx] = r
	}
	return string(backward)
}
|
||||
|
||||
// Default Paice/Husk Rules
//
// One rule per line, in the syntax described on ParseRule: the suffix is
// spelled backwards, '*' marks an intact-only rule, the digit is the number
// of letters to strip, any letters after the digit are appended, and the
// line's rule ends in '.' (stop) or '>' (continue stemming).
//
// The text in braces is a human-readable explanation of the rule. Rule
// parsing only uses the first rule-shaped token it finds on a line, and
// every line here starts with its rule, so the explanations are not parsed.
var defaultRules = `
ai*2. { -ia > - if intact }
a*1. { -a > - if intact }
bb1. { -bb > -b }
city3s. { -ytic > -ys }
ci2> { -ic > - }
cn1t> { -nc > -nt }
dd1. { -dd > -d }
dei3y> { -ied > -y }
deec2ss. { -ceed > -cess }
dee1. { -eed > -ee }
de2> { -ed > - }
dooh4> { -hood > - }
e1> { -e > - }
feil1v. { -lief > -liev }
fi2> { -if > - }
gni3> { -ing > - }
gai3y. { -iag > -y }
ga2> { -ag > - }
gg1. { -gg > -g }
ht*2. { -th > - if intact }
hsiug5ct. { -guish > -ct }
hsi3> { -ish > - }
i*1. { -i > - if intact }
i1y> { -i > -y }
ji1d. { -ij > -id -- see nois4j> & vis3j> }
juf1s. { -fuj > -fus }
ju1d. { -uj > -ud }
jo1d. { -oj > -od }
jeh1r. { -hej > -her }
jrev1t. { -verj > -vert }
jsim2t. { -misj > -mit }
jn1d. { -nj > -nd }
j1s. { -j > -s }
lbaifi6. { -ifiabl > - }
lbai4y. { -iabl > -y }
lba3> { -abl > - }
lbi3. { -ibl > - }
lib2l> { -bil > -bl }
lc1. { -cl > c }
lufi4y. { -iful > -y }
luf3> { -ful > - }
lu2. { -ul > - }
lai3> { -ial > - }
lau3> { -ual > - }
la2> { -al > - }
ll1. { -ll > -l }
mui3. { -ium > - }
mu*2. { -um > - if intact }
msi3> { -ism > - }
mm1. { -mm > -m }
nois4j> { -sion > -j }
noix4ct. { -xion > -ct }
noi3> { -ion > - }
nai3> { -ian > - }
na2> { -an > - }
nee0. { protect -een }
ne2> { -en > - }
nn1. { -nn > -n }
pihs4> { -ship > - }
pp1. { -pp > -p }
re2> { -er > - }
rae0. { protect -ear }
ra2. { -ar > - }
ro2> { -or > - }
ru2> { -ur > - }
rr1. { -rr > -r }
rt1> { -tr > -t }
rei3y> { -ier > -y }
sei3y> { -ies > -y }
sis2. { -sis > -s }
si2> { -is > - }
ssen4> { -ness > - }
ss0. { protect -ss }
suo3> { -ous > - }
su*2. { -us > - if intact }
s*1> { -s > - if intact }
s0. { -s > -s }
tacilp4y. { -plicat > -ply }
ta2> { -at > - }
tnem4> { -ment > - }
tne3> { -ent > - }
tna3> { -ant > - }
tpir2b. { -ript > -rib }
tpro2b. { -orpt > -orb }
tcud1. { -duct > -duc }
tpmus2. { -sumpt > -sum }
tpec2iv. { -cept > -ceiv }
tulo2v. { -olut > -olv }
tsis0. { protect -sist }
tsi3> { -ist > - }
tt1. { -tt > -t }
uqi3. { -iqu > - }
ugo1. { -ogu > -og }
vis3j> { -siv > -j }
vie0. { protect -eiv }
vi2> { -iv > - }
ylb1> { -bly > -bl }
yli3y> { -ily > -y }
ylp0. { protect -ply }
yl2> { -ly > - }
ygo1. { -ogy > -og }
yhp1. { -phy > -ph }
ymo1. { -omy > -om }
ypo1. { -opy > -op }
yti3> { -ity > - }
yte3> { -ety > - }
ytl2. { -lty > -l }
yrtsi5. { -istry > - }
yra3> { -ary > - }
yro3> { -ory > - }
yfi3. { -ify > - }
ycn2t> { -ncy > -nt }
yca3> { -acy > - }
zi2> { -iz > - }
zy1s. { -yz > -ys }
end0.
`
|
180
vendor/github.com/rookii/paicehusk/stemmer_test.go
generated
vendored
Normal file
180
vendor/github.com/rookii/paicehusk/stemmer_test.go
generated
vendored
Normal file
|
@ -0,0 +1,180 @@
|
|||
// Test file for a Go implementation of the Paice/Husk Stemming algorithm:
|
||||
// http://www.comp.lancs.ac.uk/computing/research/stemming/Links/paice.htm
|
||||
// Copyright (c) 2012, Aaron Groves. All rights reserved.
|
||||
|
||||
package paicehusk
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// consonanttests lists (word, offset) pairs with the expected result of
// consonant for that position. The cases mostly exercise the special
// handling of Y; TestVowel reuses the table with the expectation inverted.
var consonanttests = []struct {
	word     string
	offset   int
	expected bool
}{
	{"THEY", 0, true},
	{"THEY", 1, true},
	{"THEY", 2, false},
	{"THEY", 3, true},
	{"YOKE", 0, true},
	{"synergy", 0, true},
	{"synergy", 1, false},
	{"synergy", 2, true},
	{"synergy", 3, false},
	{"synergy", 4, true},
	{"synergy", 5, true},
	{"synergy", 6, false},
	{"男孩boy", 2, true}, // Multi-byte prefix: offsets count runes, not bytes
	{"男孩boy", 3, false},
	{"男孩boy", 4, true},
}
|
||||
|
||||
func TestConsonant(t *testing.T) {
|
||||
for i, tt := range consonanttests {
|
||||
s := consonant([]rune(tt.word), tt.offset)
|
||||
if s != tt.expected {
|
||||
t.Errorf("%v. consonant([]rune(\"%v\"), %v) should be %v, got %v", i, tt.word, tt.offset, tt.expected, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestVowel(t *testing.T) {
|
||||
for i, tt := range consonanttests {
|
||||
s := vowel([]rune(tt.word), tt.offset)
|
||||
if s != !tt.expected {
|
||||
t.Errorf("%v. vowel([]rune(\"%v\"), %v) should be %v, got %v", i, tt.word, tt.offset, !tt.expected, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure strings are reversed properly
var reversetests = []struct {
	in       string
	expected string
}{
	{"Hello", "olleH"},
	{"Here's a more complicated string to reverse.", ".esrever ot gnirts detacilpmoc erom a s'ereH"},
}
|
||||
|
||||
func TestReverse(t *testing.T) {
|
||||
for i, tt := range reversetests {
|
||||
s := reverse(tt.in)
|
||||
if s != tt.expected {
|
||||
t.Errorf("%v. reverse(\"%v\") should be %v, got %v", i, tt.in, tt.expected, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ruletests pairs raw rule strings with the expected outcome of validating
// and parsing them. For entries with valid == false the remaining fields
// are not checked by the tests that use this table.
var ruletests = []struct {
	rule   string
	valid  bool
	suf    string
	intact bool
	num    int
	app    string
	cont   bool
}{
	{"ai*2.", true, "ai", true, 2, "", false},
	{"lib3j>", true, "lib", false, 3, "j", true},
	{"There's a rule here somewhere: afab*4fla>", true, "afab", true, 4, "fla", true},
	{"ab*2 .", false, "", false, 0, "", false},
	{"fire", false, "", false, 0, "", false},
	{"asfa __ falkjlk ?!@|..", false, "", false, 0, "", false},
}
|
||||
|
||||
// Ensure rules are validated correctly
|
||||
func TestValidRule(t *testing.T) {
|
||||
for i, tt := range ruletests {
|
||||
_, ok := ValidRule(tt.rule)
|
||||
if ok != tt.valid {
|
||||
t.Errorf("%v. ValidRule(\"%v\") should be %v, got %v", i, tt.rule, tt.valid, ok)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseRule(t *testing.T) {
|
||||
for i, tt := range ruletests {
|
||||
r, ok := ParseRule(tt.rule)
|
||||
if ok != tt.valid {
|
||||
t.Errorf("%v. ParseRule(\"%v\") err should be %v, got %v", i, tt.rule, tt.valid, ok)
|
||||
} else if ok {
|
||||
if r.suf != tt.suf {
|
||||
t.Errorf("%v. r.suf should be \"%v\", got \"%v\"", i, tt.suf, r.suf)
|
||||
}
|
||||
if r.intact != tt.intact {
|
||||
t.Errorf("%v. r.intact should be \"%v\", got \"%v\"", i, tt.intact, r.intact)
|
||||
}
|
||||
if r.num != tt.num {
|
||||
t.Errorf("%v. r.num should be \"%v\", got \"%v\"", i, tt.num, r.num)
|
||||
}
|
||||
if r.app != tt.app {
|
||||
t.Errorf("%v. r.app should be \"%v\", got \"%v\"", i, tt.app, r.app)
|
||||
}
|
||||
if r.cont != tt.cont {
|
||||
t.Errorf("%v. r.cont should be \"%v\", got \"%v\"", i, tt.cont, r.cont)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewRuleTable(t *testing.T) {
|
||||
f := []string{ruletests[0].rule, ruletests[1].rule, ruletests[2].rule, ruletests[3].rule, ruletests[4].rule, ruletests[5].rule}
|
||||
table := NewRuleTable(f)
|
||||
if len(table.Table) != 2 {
|
||||
t.Errorf("Error: len(table.Table) should be %v, got %v", 2, len(table.Table))
|
||||
}
|
||||
if len(table.Table["a"]) != 2 {
|
||||
t.Errorf("Error: len(table.Table[\"a\"]) should be %v, got %v", 2, len(table.Table))
|
||||
}
|
||||
}
|
||||
|
||||
// validstemtests lists candidate stems with the expected result of
// validStem.
var validstemtests = []struct {
	stem  string
	valid bool
}{
	{"xvzf", false}, // No vowels
	{"fire", true},
	{"aa", false}, // No consonant
	{"ab", true},
	{"a", false},  // No consonant
	{"ba", false}, // A first-letter consonant requires a 3 letter stem
	{"baa", true},
	{"bba", true},
}
|
||||
|
||||
func TestValidStem(t *testing.T) {
|
||||
for i, tt := range validstemtests {
|
||||
ok := validStem(tt.stem)
|
||||
if ok != tt.valid {
|
||||
t.Errorf("%v. validStem(\"%v\") should be %v, got %v", i, tt.stem, tt.valid, ok)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// stemtests lists input words with the stem expected from the default
// ruleset.
var stemtests = []struct {
	in        string
	expecting string
}{
	{"at", "at"},             // Too short
	{"rack", "rack"},         // No 'k' rules exist
	{"aaron", "aaron"},       // 'N' rules exist but no 'n', or 'no' rule
	{"splat", "splat"},       // Resulting stem has no vowels
	{"doat", "doat"},         // Resulting stem starts with a consonant but only has 2 letters
	{"eat", "eat"},           // Resulting stem starts with a vowel but has only 1 letter
	{"ikat", "ik"},           // Resulting stem starts with a vowel and has 2 letters
	{"foreseen", "foreseen"}, // Check Protect Rule
	{"Ariaan", "aria"},       // Check intact rule
	{"explosion", "explod"},  // Check replace rule
	{"complicate", "comply"}, // Check partial replacement
	{"EXPLOSION", "explod"},  // Check all caps
}
|
||||
|
||||
func TestStem(t *testing.T) {
|
||||
for i, tt := range stemtests {
|
||||
if test := DefaultRules.Stem(tt.in); test != tt.expecting {
|
||||
t.Errorf("%v. Error: stemming \"%v\", expected %v, got %v", i, tt.in, tt.expecting, test)
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue