mirror of
https://github.com/documize/community.git
synced 2025-07-24 15:49:44 +02:00
restructure directories
This commit is contained in:
parent
7e4ed6545b
commit
a2ce777762
159 changed files with 320 additions and 323 deletions
228
core/api/convert/excerpt/excerpt.go
Normal file
228
core/api/convert/excerpt/excerpt.go
Normal file
|
@ -0,0 +1,228 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
// Package excerpt provides basic functionality to create excerpts of text in English.
|
||||
package excerpt
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
words "github.com/documize/community/core/wordlists/en-2012"
|
||||
|
||||
"github.com/rookii/paicehusk"
|
||||
)
|
||||
|
||||
// extractItem describes one candidate sentence considered for the excerpt.
type extractItem struct {
	sequence int     // position of the sentence in the order it was encountered
	score    float64 // accumulated significance score for the sentence
	count    int     // word count used to normalise score; callers keep it >= 1
	sentance string  // the assembled sentence text
}

// extractList is a list of candidate sentences that sorts by descending
// score-per-word, so the most significant sentences come first.
type extractList []extractItem

// Len returns the number of candidate sentences (sort.Interface).
func (l extractList) Len() int {
	return len(l)
}

// Less reports whether element i has a strictly higher score-per-word than
// element j, and so should sort before it (sort.Interface).
func (l extractList) Less(i, j int) bool {
	left := l[i].score / float64(l[i].count)
	right := l[j].score / float64(l[j].count)
	return left > right
}

// Swap exchanges elements i and j (sort.Interface).
func (l extractList) Swap(i, j int) {
	tmp := l[i]
	l[i] = l[j]
	l[j] = tmp
}
|
||||
|
||||
// presentItem is a sentence selected for the excerpt, tagged with its
// original position so the selection can be put back into reading order.
type presentItem struct {
	sequence int    // position of the sentence in the order it was encountered
	text     string // the sentence text
}

// presentList is a list of selected sentences that sorts by ascending sequence.
type presentList []presentItem

// Len returns the number of selected sentences (sort.Interface).
func (l presentList) Len() int {
	return len(l)
}

// Less reports whether element i appeared earlier than element j
// (sort.Interface).
func (l presentList) Less(i, j int) bool {
	earlier, later := l[i].sequence, l[j].sequence
	return earlier < later
}

// Swap exchanges elements i and j (sort.Interface).
func (l presentList) Swap(i, j int) {
	tmp := l[i]
	l[i] = l[j]
	l[j] = tmp
}
|
||||
|
||||
func addWd(sentance, wd string) (string, bool) {
|
||||
var isStop bool
|
||||
if len(sentance) == 0 {
|
||||
if wd != "[" {
|
||||
sentance = wd
|
||||
}
|
||||
} else {
|
||||
switch wd {
|
||||
case "[": //NoOp
|
||||
case "0", "1", "2", "3", "4", "5", "6", "7", "8", "9":
|
||||
if unicode.IsDigit(rune(sentance[len(sentance)-1])) {
|
||||
sentance += wd
|
||||
} else {
|
||||
sentance += " " + wd
|
||||
}
|
||||
case ".", "!", "?":
|
||||
isStop = true
|
||||
fallthrough
|
||||
default:
|
||||
if isPunct(wd) {
|
||||
sentance += wd
|
||||
} else {
|
||||
sentance += " " + wd
|
||||
}
|
||||
}
|
||||
}
|
||||
return sentance, isStop
|
||||
}
|
||||
|
||||
// isPunct reports whether every rune in s counts as punctuation for the
// purpose of sentence assembly. The empty string yields true.
//
// A rune counts as punctuation when unicode.IsPunct accepts it, or when it is
// a backtick. The backtick needs the explicit exception because Unicode
// classifies it as a modifier symbol (Sk) rather than punctuation. The quote,
// double-quote, open-parenthesis and slash characters are already covered by
// unicode.IsPunct and need no special handling.
func isPunct(s string) bool {
	for _, r := range s {
		if r != '`' && !unicode.IsPunct(r) {
			return false
		}
	}
	return true
}
|
||||
|
||||
// Excerpt returns the most statically significant 100 or so words of text for use in the Excerpt field
|
||||
func Excerpt(titleWords, bodyWords []string) string {
|
||||
var el extractList
|
||||
|
||||
//fmt.Println("DEBUG Excerpt ", len(titleWords), len(bodyWords))
|
||||
|
||||
// populate stemMap
|
||||
stemMap := make(map[string]uint64)
|
||||
for _, wd := range bodyWords {
|
||||
stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
|
||||
stemMap[stem]++
|
||||
}
|
||||
for _, wd := range titleWords {
|
||||
stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
|
||||
stemMap[stem]++ // TODO are words in titles more important?
|
||||
}
|
||||
|
||||
wds := append(titleWords, bodyWords...)
|
||||
|
||||
sentance := ""
|
||||
score := 0.0
|
||||
count := 0
|
||||
seq := 0
|
||||
for _, wd := range wds {
|
||||
var isStop bool
|
||||
|
||||
sentance, isStop = addWd(sentance, wd)
|
||||
|
||||
if isStop {
|
||||
//fmt.Printf(" DEBUG sentance: %3d %3.2f %s\n",
|
||||
// seq, score*10000/float64(count), sentance)
|
||||
var ei extractItem
|
||||
ei.count = count + 1 // must be at least 1
|
||||
ei.score = score
|
||||
ei.sentance = sentance
|
||||
ei.sequence = seq
|
||||
el = append(el, ei)
|
||||
sentance = ""
|
||||
score = 0.0
|
||||
seq++
|
||||
} else {
|
||||
uncommon := true
|
||||
// TODO Discuss correct level or maybe find a better algorithem for this
|
||||
ent, ok := words.Words[wd]
|
||||
if ok {
|
||||
if ent.Rank <= 100 {
|
||||
// do not score very common words
|
||||
uncommon = false
|
||||
}
|
||||
}
|
||||
if uncommon {
|
||||
stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
|
||||
usage, used := stemMap[stem]
|
||||
if used {
|
||||
relativeStemFreq := (float64(usage) / float64(len(wds))) - words.Stems[stem]
|
||||
if relativeStemFreq > 0.0 {
|
||||
score += relativeStemFreq
|
||||
}
|
||||
}
|
||||
count++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sort.Sort(el)
|
||||
|
||||
return present(el)
|
||||
}
|
||||
|
||||
func present(el extractList) (ret string) {
|
||||
var pl presentList
|
||||
words := 0
|
||||
|
||||
const excerptWords = 50
|
||||
|
||||
for s, e := range el {
|
||||
if (words < excerptWords || s == 0) && len(e.sentance) > 1 &&
|
||||
notEmpty(e.sentance) {
|
||||
words += e.count
|
||||
pl = append(pl, presentItem{sequence: e.sequence, text: e.sentance})
|
||||
//fmt.Printf("DEBUG With score %3.2f on page %d // %s \n",
|
||||
// 1000*e.score/float64(e.count), e.sequence, e.sentance)
|
||||
}
|
||||
}
|
||||
sort.Sort(pl)
|
||||
|
||||
var lastSeq int
|
||||
for p := range pl {
|
||||
txt := strings.TrimPrefix(pl[p].text, ". ")
|
||||
if p == 0 {
|
||||
ret = txt
|
||||
lastSeq = pl[0].sequence
|
||||
} else {
|
||||
thisSeq := pl[p].sequence
|
||||
if lastSeq+1 != thisSeq {
|
||||
ret += " …" // Horizontal elipsis character
|
||||
}
|
||||
ret += " " + txt
|
||||
lastSeq = thisSeq
|
||||
}
|
||||
}
|
||||
if len(ret) > 250 { // make sure the excerpt is not too long, shorten it if required
|
||||
for len(ret) > 250 {
|
||||
_, size := utf8.DecodeLastRuneInString(ret)
|
||||
ret = ret[:len(ret)-size]
|
||||
}
|
||||
return ret + "…" // Horizontal elipsis character added after truncation
|
||||
}
|
||||
return ret
|
||||
}
|
||||
|
||||
// notEmpty reports whether wds contains at least one rune that is neither
// punctuation nor white space.
func notEmpty(wds string) bool {
	for _, r := range wds {
		if unicode.IsPunct(r) || unicode.IsSpace(r) {
			continue // filler only, keep looking
		}
		return true
	}
	return false
}
|
130
core/api/convert/excerpt/excerpt_test.go
Normal file
130
core/api/convert/excerpt/excerpt_test.go
Normal file
|
@ -0,0 +1,130 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package excerpt_test
|
||||
|
||||
import "testing"
|
||||
import "github.com/documize/community/core/api/convert/excerpt"
|
||||
import "strings"
|
||||
import "fmt"
|
||||
|
||||
func TestExerpt(t *testing.T) {
|
||||
if excerpt.Excerpt(nil, nil) != "" ||
|
||||
excerpt.Excerpt([]string{}, []string{}) != "" {
|
||||
t.Error("empty lists do not return empty string")
|
||||
}
|
||||
qbf := strings.Split("The quick brown fox jumps over the lazy dog .", " ")
|
||||
qbf2 := qbf
|
||||
for i := 0; i < 200; i++ {
|
||||
qbf2 = append(qbf2, qbf...)
|
||||
}
|
||||
tst := excerpt.Excerpt(qbf, qbf2)
|
||||
if tst !=
|
||||
"The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog." {
|
||||
t.Error("'quick brown fox' did not work:", tst)
|
||||
}
|
||||
|
||||
tt123 := strings.Split("Testing , testing ; 1 2 3 is fun ! Bracket [ anyone ? .", " ")
|
||||
tt123a := tt123
|
||||
for i := 0; i < 200; i++ {
|
||||
tt123a = append(tt123a, fmt.Sprintf("%d", i))
|
||||
tt123a = append(tt123a, tt123...)
|
||||
}
|
||||
tst2 := excerpt.Excerpt(tt123, tt123a)
|
||||
if tst2 !=
|
||||
"Testing, testing; 123 is fun! … Testing, testing; 123 is fun! … 0 Testing, testing; 123 is fun!" {
|
||||
t.Error("'Testing testing 123' did not work:", tst2)
|
||||
}
|
||||
|
||||
s := strings.Split(strings.Replace(`
|
||||
It's supercalifragilisticexpialidocious
|
||||
Even though the sound of it is something quite atrocious
|
||||
If you say it loud enough, you'll always sound precocious
|
||||
Supercalifragilisticexpialidocious
|
||||
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
|
||||
Because I was afraid to speak
|
||||
When I was just a lad
|
||||
My father gave me nose a tweak
|
||||
And told me I was bad
|
||||
|
||||
But then one day I learned a word
|
||||
That saved me achin' nose
|
||||
The biggest word I ever heard
|
||||
And this is how it goes, oh
|
||||
|
||||
Supercalifragilisticexpialidocious
|
||||
Even though the sound of it is something quite atrocious
|
||||
If you say it loud enough, you'll always sound precocious
|
||||
Supercalifragilisticexpialidocious
|
||||
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
|
||||
He traveled all around the world
|
||||
And everywhere he went
|
||||
He'd use his word and all would say
|
||||
There goes a clever gent
|
||||
|
||||
When Dukes and Maharajahs
|
||||
Pass the time of day with me
|
||||
I say me special word
|
||||
And then they ask me out to tea
|
||||
|
||||
Oh, supercalifragilisticexpialidocious
|
||||
Even though the sound of it is something quite atrocious
|
||||
If you say it loud enough, you'll always sound precocious
|
||||
Supercalifragilisticexpialidocious
|
||||
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
|
||||
No, you can say it backwards, which is dociousaliexpilisticfragicalirupus
|
||||
But that's going a bit too far, don't you think?
|
||||
|
||||
So when the cat has got your tongue
|
||||
There's no need for dismay
|
||||
Just summon up this word
|
||||
And then you've got a lot to say
|
||||
|
||||
But better use it carefully
|
||||
Or it could change your life
|
||||
For example, yes, one night I said it to me girl
|
||||
And now me girl's my wife, oh, and a lovely thing she's too
|
||||
|
||||
She's, supercalifragilisticexpialidocious
|
||||
Supercalifragilisticexpialidocious
|
||||
Supercalifragilisticexpialidocious
|
||||
Supercalifragilisticexpialidocious
|
||||
. `, "\n", " . ", -1), " ")
|
||||
ts := []string{"Supercalifragilisticexpialidocious", "song", "lyrics"}
|
||||
st := excerpt.Excerpt(ts, s)
|
||||
if st != "Supercalifragilisticexpialidocious song lyrics. … Um diddle, diddle diddle, um diddle ay. Um diddle, diddle diddle, um diddle ay." {
|
||||
t.Error("'Supercalifragilisticexpialidocious song lyrics' did not work:", st)
|
||||
}
|
||||
|
||||
ss := []string{"Supercalifragilisticexpialidocious", "!"}
|
||||
ssa := ss
|
||||
for i := 0; i < 100; i++ {
|
||||
ssa = append(ssa, ss...)
|
||||
}
|
||||
sst := excerpt.Excerpt(ss, ssa)
|
||||
if sst !=
|
||||
"Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious…" {
|
||||
t.Error("'Supercalifragilisticexpialidocious' did not work:", sst)
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue