// Go implementation of the Paice/Husk Stemming algorithm: // http://www.comp.lancs.ac.uk/computing/research/stemming/Links/paice.htm // Copyright (c) 2012, Aaron Groves. All rights reserved. // Package paicehusk provides an implementation of the Paice / Husk stemmer, // along with a default ruleset for the English Language package paicehusk import ( "regexp" "strconv" "strings" ) // A representation of a stemming rule type rule struct { // The suffix the rule is to act on suf string // True if the stem is required intact for the rule to operate intact bool // Number of letters to strip off the stem num int // A suffix to append to the stem app string // True if the stem should be stemmed further cont bool } // DefaultRules is a default ruleset for the english language. var DefaultRules = NewRuleTable(strings.Split(defaultRules, "\n")) // RuleTable stores rules based on the final letter of the suffix they // act on allowing for easy lookup. type RuleTable struct { Table map[string][]*rule } // NewRuleTable returns a new RuleTable instance func NewRuleTable(f []string) (table *RuleTable) { table = &RuleTable{Table: make(map[string][]*rule)} for _, s := range f { if r, ok := ParseRule(s); ok { table.Table[r.suf[:1]] = append(table.Table[r.suf[:1]], r) } } return } // Regex for ValidRule var reg = regexp.MustCompile("[a-zA-Z]*\\*?[0-9][a-zA-z]*[.>]") // Validates a rule func ValidRule(s string) (rule string, ok bool) { ok = true // Find the first instance of a rule in the provided string rule = reg.FindString(s) if rule == "" { ok = false } return } // Regexes for ParseRule var suf = regexp.MustCompile("[a-zA-Z]+") var intact = regexp.MustCompile("[*]") var num = regexp.MustCompile("[0-9]") var app = regexp.MustCompile("[0-9][a-zA-Z]+") // ParseRule parses a rule in the form: // |suffix|intact flag|number to strip|Append|Continue flag // // Eg, a rule: ht*2. Means if the stem is still intact, strip the // suffix th and make no further attempts to stem the word. // // Rule nois4j> Means strip the sion suffix, append a j and check // for any more rules to follow func ParseRule(s string) (r *rule, ok bool) { s, ok = ValidRule(s) if !ok { return nil, false } r = new(rule) r.suf = suf.FindString(s) if intact.FindString(s) == "" { r.intact = false } else { r.intact = true } if i, err := strconv.ParseInt(num.FindString(s), 0, 0); err != nil { panic(err) } else { r.num = int(i) } if append := app.FindString(s); len(append) > 0 { r.app = app.FindString(s)[1:] } else { r.app = "" } if s[len(s)-1:] == ">" { r.cont = true } else { r.cont = false } return r, true } // Stem a string, word, based on the rules in *RuleTable r, by following // the algorithm described at: // http://www.comp.lancs.ac.uk/computing/research/stemming/Links/paice.htm func (r *RuleTable) Stem(word string) string { stem := []rune(strings.ToLower(word)) current := stem // Intact Flag intact := true // If the stem is less than 3 chars, there's nothing to do, so return if len(stem) <= 3 { return string(stem) } // Main Control Loop cont := true for cont { // Lookup the map to see if a rule is available for the // given stems last letter rules, ok := r.Table[string(stem[len(stem)-1:])] if !ok { // Stop the loop if a matching rule is not found break } // Loop through the applicable rules for _, rule := range rules { // the length of the rule is greater than // the stem, so don't bother. if len(stem) <= len(rule.suf) { continue } // The rule does not match. if !strings.HasSuffix(string(stem), reverse(rule.suf)) { continue } // The stem is protected and should be left alone if rule.num == 0 { break } // The intact flag is set and the stem // has been operated on already. if rule.intact && !intact { continue } s := stem[:len(stem)-rule.num] // The result of the rule is invalid, so do nothing. if !validStem(string(s) + rule.app) { continue } // All criteria passed, the word should be stemmed cont = rule.cont current = []rune(string(s) + rule.app) // Set the intact flag intact = false // Break and repeat the process for the new stem break } // No rule matched if string(current) == string(stem) { break } // Set the new stem stem = current } return string(stem) } // Acceptability condition: if the stem begins with a vowel, then it // must contain at least 2 letters, one of which must be a consonant // // If however, it begins with a consonant then it must contain three // letters and at least one of these must be a vowel or 'y' func validStem(word string) bool { runes := []rune(word) // If there's no vowel left in the stem, stem is invalid if !hasVowel(runes) { return false } // If the word has a vowel and is longer than 3 letters, stem is valid if len(runes) >= 3 { return true } // If the first letter is a vowel if vowel(runes, 0) { if len(runes) > 1 && consonant(runes, 1) { return true } else { return false } } else { // If the first letter is a consonant // The stem must contain 3 letters, one of which we allready know // to be a vowel if len(runes) > 2 { return true } } return false } // consonant returns whether the letter at offset is a consonant func consonant(word []rune, offset int) bool { switch word[offset] { case 'A', 'E', 'I', 'O', 'U', 'a', 'e', 'i', 'o', 'u': return false case 'Y', 'y': if offset == 0 { return true } return offset > 0 && !consonant(word, offset-1) } return true } // vowel returns whether the letter at offset is a vowel func vowel(word []rune, offset int) bool { return !consonant(word, offset) } // hasVowel returns whether the word contains a vowel func hasVowel(word []rune) bool { for i := 0; i < len(word); i++ { if vowel(word, i) { return true } } return false } // Reverses a string func reverse(s string) string { runes := []rune(s) for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 { runes[i], runes[j] = runes[j], runes[i] } return string(runes) } // Default Paice/Husk Rules var defaultRules = ` ai*2. { -ia > - if intact } a*1. { -a > - if intact } bb1. { -bb > -b } city3s. { -ytic > -ys } ci2> { -ic > - } cn1t> { -nc > -nt } dd1. { -dd > -d } dei3y> { -ied > -y } deec2ss. { -ceed > -cess } dee1. { -eed > -ee } de2> { -ed > - } dooh4> { -hood > - } e1> { -e > - } feil1v. { -lief > -liev } fi2> { -if > - } gni3> { -ing > - } gai3y. { -iag > -y } ga2> { -ag > - } gg1. { -gg > -g } ht*2. { -th > - if intact } hsiug5ct. { -guish > -ct } hsi3> { -ish > - } i*1. { -i > - if intact } i1y> { -i > -y } ji1d. { -ij > -id -- see nois4j> & vis3j> } juf1s. { -fuj > -fus } ju1d. { -uj > -ud } jo1d. { -oj > -od } jeh1r. { -hej > -her } jrev1t. { -verj > -vert } jsim2t. { -misj > -mit } jn1d. { -nj > -nd } j1s. { -j > -s } lbaifi6. { -ifiabl > - } lbai4y. { -iabl > -y } lba3> { -abl > - } lbi3. { -ibl > - } lib2l> { -bil > -bl } lc1. { -cl > c } lufi4y. { -iful > -y } luf3> { -ful > - } lu2. { -ul > - } lai3> { -ial > - } lau3> { -ual > - } la2> { -al > - } ll1. { -ll > -l } mui3. { -ium > - } mu*2. { -um > - if intact } msi3> { -ism > - } mm1. { -mm > -m } nois4j> { -sion > -j } noix4ct. { -xion > -ct } noi3> { -ion > - } nai3> { -ian > - } na2> { -an > - } nee0. { protect -een } ne2> { -en > - } nn1. { -nn > -n } pihs4> { -ship > - } pp1. { -pp > -p } re2> { -er > - } rae0. { protect -ear } ra2. { -ar > - } ro2> { -or > - } ru2> { -ur > - } rr1. { -rr > -r } rt1> { -tr > -t } rei3y> { -ier > -y } sei3y> { -ies > -y } sis2. { -sis > -s } si2> { -is > - } ssen4> { -ness > - } ss0. { protect -ss } suo3> { -ous > - } su*2. { -us > - if intact } s*1> { -s > - if intact } s0. { -s > -s } tacilp4y. { -plicat > -ply } ta2> { -at > - } tnem4> { -ment > - } tne3> { -ent > - } tna3> { -ant > - } tpir2b. { -ript > -rib } tpro2b. { -orpt > -orb } tcud1. { -duct > -duc } tpmus2. { -sumpt > -sum } tpec2iv. { -cept > -ceiv } tulo2v. { -olut > -olv } tsis0. { protect -sist } tsi3> { -ist > - } tt1. { -tt > -t } uqi3. { -iqu > - } ugo1. { -ogu > -og } vis3j> { -siv > -j } vie0. { protect -eiv } vi2> { -iv > - } ylb1> { -bly > -bl } yli3y> { -ily > -y } ylp0. { protect -ply } yl2> { -ly > - } ygo1. { -ogy > -og } yhp1. { -phy > -ph } ymo1. { -omy > -om } ypo1. { -opy > -op } yti3> { -ity > - } yte3> { -ety > - } ytl2. { -lty > -l } yrtsi5. { -istry > - } yra3> { -ary > - } yro3> { -ory > - } yfi3. { -ify > - } ycn2t> { -ncy > -nt } yca3> { -acy > - } zi2> { -iz > - } zy1s. { -yz > -ys } end0. `