1
0
Fork 0
mirror of https://github.com/documize/community.git synced 2025-07-19 13:19:43 +02:00
documize/core/stringutil/words.go

79 lines
2.2 KiB
Go
Raw Normal View History

2016-07-07 18:54:16 -07:00
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
2016-10-17 14:00:06 -07:00
// This software (Documize Community Edition) is licensed under
2016-07-07 18:54:16 -07:00
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
2016-10-17 14:00:06 -07:00
// by contacting <sales@documize.com>.
2016-07-07 18:54:16 -07:00
//
// https://documize.com
2017-07-18 21:55:17 +01:00
package stringutil
2016-07-07 18:54:16 -07:00
2017-07-18 21:55:17 +01:00
import (
"unicode"
nethtml "golang.org/x/net/html"
)
2016-07-07 18:54:16 -07:00
// Words returns a slice of words, where each word contains no whitespace, and each item of punctuation is its own word.
2017-07-18 21:55:17 +01:00
// This functionality is provided to enable verification of the text extraction algorithm across different implementations.
2016-07-07 18:54:16 -07:00
//
// inSqBr carries the square-bracket nesting depth into the call: while it is positive, characters
// are skipped (only '[' and ']' adjust the depth). The depth left over at the end of the text is
// returned alongside the words. If ch.Text fails, its error is returned with a nil slice and the
// unmodified inSqBr.
func Words(ch HTML, inSqBr int, testMode bool) ([]string, int, error) {
	const zeroWidthSpace = '\u200B' // http://en.wikipedia.org/wiki/Zero-width_space

	raw, err := ch.Text(testMode)
	if err != nil {
		return nil, inSqBr, err
	}

	// The last element of words is always the word currently being built ("" if none yet).
	words := []string{""}
	for _, r := range nethtml.UnescapeString(raw) {
		// Inside square brackets: emit nothing, just keep track of the nesting depth.
		if inSqBr > 0 {
			if r == ']' {
				inSqBr--
			} else if r == '[' {
				inSqBr++
			}
			continue
		}

		// A zero-width space is dropped entirely, except when testing, where it acts as a space.
		if r == zeroWidthSpace {
			if !testMode {
				continue
			}
			r = ' '
		}

		// An opening square bracket means potentially elided text: record the bracket
		// as its own word and start skipping from the next character.
		if r == '[' {
			inSqBr = 1
			words = append(words, "[", "")
			continue
		}

		inSqBr = 0
		last := len(words) - 1
		switch {
		case unicode.IsPunct(r) || unicode.IsSymbol(r) || unicode.IsDigit(r):
			// Punctuation, symbols and digits each form a word of their own.
			if words[last] == "" {
				words[last] = string(r)
			} else {
				words = append(words, string(r))
			}
			words = append(words, "")
		case unicode.IsSpace(r):
			// Whitespace terminates the current word, if there is one.
			if words[last] != "" {
				words = append(words, "")
			}
		case unicode.IsGraphic(r):
			words[last] += string(r)
		}
		// Anything else (neither graphic nor whitespace) is ignored.
	}

	// Outside test mode, add dummy punctuation to avoid incorrect sentence concatenation.
	if !testMode {
		words = append(words, ".")
	}
	// Make sure there is always a blank entry at the end.
	return append(words, ""), inSqBr, nil
}