1
0
Fork 0
mirror of https://github.com/documize/community.git synced 2025-07-24 15:49:44 +02:00

restructure directories

This commit is contained in:
Elliott Stoneham 2016-07-20 15:58:37 +01:00
parent 7e4ed6545b
commit a2ce777762
159 changed files with 320 additions and 323 deletions

View file

@ -0,0 +1,228 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
// Package excerpt provides basic functionality to create excerpts of text in English.
package excerpt
import (
"sort"
"strings"
"unicode"
"unicode/utf8"
words "github.com/documize/community/core/wordlists/en-2012"
"github.com/rookii/paicehusk"
)
// extractItem is one candidate sentence together with the figures used to rank it.
type extractItem struct {
	sequence int     // index of the sentence in the order it was read
	score    float64 // score accumulated for the sentence
	count    int     // divisor applied to score when ranking
	sentance string  // text of the sentence
}

// extractList is a slice of extractItem that implements sort.Interface,
// ordering items from the highest score-per-count ratio to the lowest.
type extractList []extractItem

// Len returns how many items the list holds.
func (el extractList) Len() int {
	return len(el)
}

// Less reports whether item i has a strictly higher score/count ratio than
// item j, so that sorting puts the best-ranked sentences first.
func (el extractList) Less(i, j int) bool {
	left, right := el[i], el[j]
	leftRatio := left.score / float64(left.count)
	rightRatio := right.score / float64(right.count)
	return leftRatio > rightRatio
}

// Swap exchanges the items at positions i and j.
func (el extractList) Swap(i, j int) {
	el[i], el[j] = el[j], el[i]
}
// presentItem is a sentence selected for output, tagged with its original position.
type presentItem struct {
	sequence int    // index of the sentence in the order it was read
	text     string // text of the sentence
}

// presentList is a slice of presentItem that implements sort.Interface,
// ordering items by ascending sequence.
type presentList []presentItem

// Len returns how many items the list holds.
func (pl presentList) Len() int {
	return len(pl)
}

// Less reports whether item i has a lower sequence than item j.
func (pl presentList) Less(i, j int) bool {
	first, second := pl[i].sequence, pl[j].sequence
	return first < second
}

// Swap exchanges the items at positions i and j.
func (pl presentList) Swap(i, j int) {
	pl[i], pl[j] = pl[j], pl[i]
}
// addWd appends the token wd to sentance and returns the extended sentence,
// plus a flag that is true when wd ended the sentence.
//
// Rules:
//   - "[" is always dropped.
//   - The first token of a sentence is taken as-is and never counts as a stop.
//   - A single digit is joined directly onto a sentence whose last byte is a
//     digit, otherwise it is preceded by a space.
//   - ".", "!" and "?" mark the end of the sentence.
//   - Tokens made up only of punctuation (per isPunct) are joined without a
//     space; everything else is preceded by a space.
func addWd(sentance, wd string) (string, bool) {
	if wd == "[" {
		return sentance, false
	}
	if sentance == "" {
		return wd, false
	}

	var attach, stop bool
	switch wd {
	case "0", "1", "2", "3", "4", "5", "6", "7", "8", "9":
		// Consecutive digits are glued back together into a single number.
		lastByte := sentance[len(sentance)-1]
		attach = unicode.IsDigit(rune(lastByte))
	case ".", "!", "?":
		stop = true
		attach = isPunct(wd)
	default:
		attach = isPunct(wd)
	}

	if attach {
		return sentance + wd, stop
	}
	return sentance + " " + wd, stop
}
// isPunct reports whether every rune in s is punctuation. A rune counts as
// punctuation when unicode.IsPunct accepts it or when it is one of
// ` ' " ( /. The empty string yields true.
func isPunct(s string) bool {
	for _, ch := range s {
		if unicode.IsPunct(ch) {
			continue
		}
		// Extra characters that are treated as punctuation as well.
		if ch == '`' || ch == '\'' || ch == '"' || ch == '(' || ch == '/' {
			continue
		}
		return false
	}
	return true
}
// Excerpt returns a short excerpt built from the most statistically
// significant sentences of the text, for use in the Excerpt field.
//
// titleWords and bodyWords are token lists; the title tokens are processed
// before the body tokens. Tokens are assembled into sentences with addWd, and
// a sentence is recorded each time addWd reports a stop. Tokens after the last
// stop are never recorded, so input without any completed sentence (including
// nil or empty input) yields the empty string.
//
// Each token that is not among the very common words (rank <= 100 in the word
// list) contributes to the current sentence: the frequency of its stem in this
// text, minus the baseline frequency from words.Stems, is added to the score
// when positive. The recorded sentences are sorted by score and handed to
// present, which builds the final string.
//
// Neither input slice is modified.
func Excerpt(titleWords, bodyWords []string) string {
	var el extractList

	// Count how often every stem occurs across the body and the title.
	stemMap := make(map[string]uint64)
	for _, wd := range bodyWords {
		stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
		stemMap[stem]++
	}
	for _, wd := range titleWords {
		stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
		stemMap[stem]++                         // TODO are words in titles more important?
	}

	// Combine title and body into a freshly allocated slice. Appending
	// directly to titleWords could write into the caller's backing array
	// whenever titleWords has spare capacity.
	wds := make([]string, 0, len(titleWords)+len(bodyWords))
	wds = append(wds, titleWords...)
	wds = append(wds, bodyWords...)

	sentance := ""
	score := 0.0
	count := 0
	seq := 0
	for _, wd := range wds {
		var isStop bool
		sentance, isStop = addWd(sentance, wd)
		if isStop {
			el = append(el, extractItem{
				sequence: seq,
				score:    score,
				count:    count + 1, // must be at least 1
				sentance: sentance,
			})
			sentance = ""
			score = 0.0
			// NOTE(review): count is not reset here, unlike score, so it keeps
			// growing across sentences — confirm whether that is intended.
			seq++
			continue
		}

		// TODO Discuss correct level or maybe find a better algorithm for this
		if ent, ok := words.Words[wd]; ok && ent.Rank <= 100 {
			// do not score very common words
			continue
		}

		stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
		if usage, used := stemMap[stem]; used {
			relativeStemFreq := (float64(usage) / float64(len(wds))) - words.Stems[stem]
			if relativeStemFreq > 0.0 {
				score += relativeStemFreq
			}
		}
		count++
	}

	sort.Sort(el)
	return present(el)
}
// present turns a ranked extractList into the final excerpt string.
//
// It walks el in its given order and picks sentences that are longer than one
// byte and contain something other than punctuation and whitespace. Picking
// continues until the running total of the picked items' count fields reaches
// excerptWords; the first element of el is exempt from that limit. The picked
// sentences are then restored to their original order and joined with single
// spaces. A leading ". " is stripped from each sentence, and " …" is inserted
// wherever two neighbouring sentences were not adjacent in the source.
//
// If the result is longer than 250 bytes, whole runes are removed from the end
// until it fits and a horizontal ellipsis is appended.
func present(el extractList) (ret string) {
	const (
		excerptWords = 50
		maxBytes     = 250
	)

	var (
		chosen presentList
		total  int
	)
	for idx := range el {
		item := el[idx]
		if idx != 0 && total >= excerptWords {
			continue
		}
		if len(item.sentance) <= 1 || !notEmpty(item.sentance) {
			continue
		}
		total += item.count
		chosen = append(chosen, presentItem{sequence: item.sequence, text: item.sentance})
	}
	sort.Sort(chosen)

	prevSeq := 0
	for i, item := range chosen {
		piece := strings.TrimPrefix(item.text, ". ")
		if i > 0 {
			if prevSeq+1 != item.sequence {
				ret += " …" // Horizontal ellipsis character marks a gap
			}
			ret += " "
		}
		ret += piece
		prevSeq = item.sequence
	}

	if len(ret) <= maxBytes {
		return ret
	}
	// Too long: drop whole runes from the end so the cut never splits a character.
	for len(ret) > maxBytes {
		_, width := utf8.DecodeLastRuneInString(ret)
		ret = ret[:len(ret)-width]
	}
	return ret + "…" // Horizontal ellipsis character added after truncation
}
// notEmpty reports whether wds contains at least one rune that is neither
// punctuation nor whitespace.
func notEmpty(wds string) bool {
	hasContent := func(r rune) bool {
		return !unicode.IsPunct(r) && !unicode.IsSpace(r)
	}
	return strings.IndexFunc(wds, hasContent) >= 0
}

View file

@ -0,0 +1,130 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package excerpt_test
import "testing"
import "github.com/documize/community/core/api/convert/excerpt"
import "strings"
import "fmt"
// TestExerpt checks excerpt.Excerpt against fixed expected strings for empty
// input, a repeated plain sentence, a sentence mixing punctuation and digits,
// a set of song lyrics, and a single long word repeated many times.
func TestExerpt(t *testing.T) {
// Nil and empty word lists must both give an empty excerpt.
if excerpt.Excerpt(nil, nil) != "" ||
excerpt.Excerpt([]string{}, []string{}) != "" {
t.Error("empty lists do not return empty string")
}
// A plain sentence used as the title, and repeated 201 times as the body.
qbf := strings.Split("The quick brown fox jumps over the lazy dog .", " ")
qbf2 := qbf
for i := 0; i < 200; i++ {
qbf2 = append(qbf2, qbf...)
}
tst := excerpt.Excerpt(qbf, qbf2)
if tst !=
"The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog." {
t.Error("'quick brown fox' did not work:", tst)
}
// Punctuation, single digits and a "[" token; the body interleaves the
// numbers 0..199 with repeats of the same token sequence.
tt123 := strings.Split("Testing , testing ; 1 2 3 is fun ! Bracket [ anyone ? .", " ")
tt123a := tt123
for i := 0; i < 200; i++ {
tt123a = append(tt123a, fmt.Sprintf("%d", i))
tt123a = append(tt123a, tt123...)
}
tst2 := excerpt.Excerpt(tt123, tt123a)
if tst2 !=
"Testing, testing; 123 is fun! … Testing, testing; 123 is fun! … 0 Testing, testing; 123 is fun!" {
t.Error("'Testing testing 123' did not work:", tst2)
}
// Song lyrics: every newline is replaced by " . " so that each line becomes
// its own sentence, then the text is split on single spaces.
s := strings.Split(strings.Replace(`
It's supercalifragilisticexpialidocious
Even though the sound of it is something quite atrocious
If you say it loud enough, you'll always sound precocious
Supercalifragilisticexpialidocious
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
Because I was afraid to speak
When I was just a lad
My father gave me nose a tweak
And told me I was bad
But then one day I learned a word
That saved me achin' nose
The biggest word I ever heard
And this is how it goes, oh
Supercalifragilisticexpialidocious
Even though the sound of it is something quite atrocious
If you say it loud enough, you'll always sound precocious
Supercalifragilisticexpialidocious
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
He traveled all around the world
And everywhere he went
He'd use his word and all would say
There goes a clever gent
When Dukes and Maharajahs
Pass the time of day with me
I say me special word
And then they ask me out to tea
Oh, supercalifragilisticexpialidocious
Even though the sound of it is something quite atrocious
If you say it loud enough, you'll always sound precocious
Supercalifragilisticexpialidocious
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
No, you can say it backwards, which is dociousaliexpilisticfragicalirupus
But that's going a bit too far, don't you think?
So when the cat has got your tongue
There's no need for dismay
Just summon up this word
And then you've got a lot to say
But better use it carefully
Or it could change your life
For example, yes, one night I said it to me girl
And now me girl's my wife, oh, and a lovely thing she's too
She's, supercalifragilisticexpialidocious
Supercalifragilisticexpialidocious
Supercalifragilisticexpialidocious
Supercalifragilisticexpialidocious
. `, "\n", " . ", -1), " ")
ts := []string{"Supercalifragilisticexpialidocious", "song", "lyrics"}
st := excerpt.Excerpt(ts, s)
if st != "Supercalifragilisticexpialidocious song lyrics. … Um diddle, diddle diddle, um diddle ay. Um diddle, diddle diddle, um diddle ay." {
t.Error("'Supercalifragilisticexpialidocious song lyrics' did not work:", st)
}
// One long word plus "!" repeated 101 times; the expected excerpt is cut
// short and ends with a horizontal ellipsis.
ss := []string{"Supercalifragilisticexpialidocious", "!"}
ssa := ss
for i := 0; i < 100; i++ {
ssa = append(ssa, ss...)
}
sst := excerpt.Excerpt(ss, ssa)
if sst !=
"Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious…" {
t.Error("'Supercalifragilisticexpialidocious' did not work:", sst)
}
}