mirror of
https://github.com/documize/community.git
synced 2025-08-05 05:25:27 +02:00
removed experimental code
This commit is contained in:
parent
fddaf9effe
commit
7455a027fc
38 changed files with 1 additions and 499578 deletions
|
@ -14,11 +14,9 @@ package convert
|
|||
|
||||
import (
|
||||
"errors"
|
||||
"github.com/documize/community/core/api/convert/excerpt"
|
||||
"github.com/documize/community/core/api/convert/html"
|
||||
"github.com/documize/community/core/api/plugins"
|
||||
api "github.com/documize/community/core/convapi"
|
||||
"github.com/documize/community/core/utility"
|
||||
|
||||
"golang.org/x/net/context"
|
||||
)
|
||||
|
@ -49,32 +47,6 @@ func Convert(ctx context.Context, xtn string, fileRequest *api.DocumentConversio
|
|||
}
|
||||
*/
|
||||
|
||||
if fileResult.Excerpt != "" {
|
||||
//fmt.Println("DEBUG supplied excerpt: " + fileResult.Excerpt)
|
||||
} else {
|
||||
titleWds := []string{}
|
||||
bodyWds := []string{}
|
||||
for p := range fileResult.Pages {
|
||||
var wds []string
|
||||
var err error
|
||||
if p > 0 { // title 0 is already the title of the document
|
||||
wds, _, err = utility.Words(utility.HTML(fileResult.Pages[p].Title), 0, false)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
titleWds = append(titleWds, wds...)
|
||||
titleWds = append(titleWds, ".")
|
||||
}
|
||||
wds, _, err = utility.Words(utility.HTML(string(fileResult.Pages[p].Body)), 0, false)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
bodyWds = append(bodyWds, wds...)
|
||||
bodyWds = append(bodyWds, ".")
|
||||
}
|
||||
fileResult.Excerpt = excerpt.Excerpt(titleWds, bodyWds)
|
||||
}
|
||||
|
||||
return fileResult, nil
|
||||
}
|
||||
|
||||
|
|
|
@ -1,228 +0,0 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
// Package excerpt provides basic functionality to create excerpts of text in English.
|
||||
package excerpt
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
words "github.com/documize/community/core/wordlists/en-2012"
|
||||
|
||||
"github.com/rookii/paicehusk"
|
||||
)
|
||||
|
||||
// extractItem is one candidate sentence together with the figures used to
// rank it against the other sentences.
type extractItem struct {
	sequence int     // index of the sentence, in the order sentences were read
	score    float64 // accumulated score of the words in the sentence
	count    int     // divisor applied to score when ranking
	sentance string  // text of the sentence
}

// extractList is a list of candidate sentences. It implements sort.Interface
// so that the sentence with the highest score per count sorts first.
type extractList []extractItem

// Len is the number of elements in the collection.
func (a extractList) Len() int {
	return len(a)
}

// Less reports whether the element with index i should sort before the
// element with index j, i.e. whether i has the higher score/count ratio.
func (a extractList) Less(i, j int) bool {
	left := a[i].score / float64(a[i].count)
	right := a[j].score / float64(a[j].count)
	return left > right
}

// Swap swaps the elements with indexes i and j.
func (a extractList) Swap(i, j int) {
	a[j], a[i] = a[i], a[j]
}
|
||||
|
||||
// presentItem is a sentence selected for the excerpt, tagged with the
// sequence number it carried in the ranked list.
type presentItem struct {
	sequence int    // sequence number copied from the ranked sentence
	text     string // text of the sentence
}

// presentList is a list of selected sentences. It implements sort.Interface,
// ordering by ascending sequence number.
type presentList []presentItem

// Len is the number of elements in the collection.
func (a presentList) Len() int {
	return len(a)
}

// Less reports whether the element with index i should sort before the
// element with index j, i.e. whether i has the smaller sequence number.
func (a presentList) Less(i, j int) bool {
	first, second := a[i], a[j]
	return first.sequence < second.sequence
}

// Swap swaps the elements with indexes i and j.
func (a presentList) Swap(i, j int) {
	a[j], a[i] = a[i], a[j]
}
|
||||
|
||||
func addWd(sentance, wd string) (string, bool) {
|
||||
var isStop bool
|
||||
if len(sentance) == 0 {
|
||||
if wd != "[" {
|
||||
sentance = wd
|
||||
}
|
||||
} else {
|
||||
switch wd {
|
||||
case "[": //NoOp
|
||||
case "0", "1", "2", "3", "4", "5", "6", "7", "8", "9":
|
||||
if unicode.IsDigit(rune(sentance[len(sentance)-1])) {
|
||||
sentance += wd
|
||||
} else {
|
||||
sentance += " " + wd
|
||||
}
|
||||
case ".", "!", "?":
|
||||
isStop = true
|
||||
fallthrough
|
||||
default:
|
||||
if isPunct(wd) {
|
||||
sentance += wd
|
||||
} else {
|
||||
sentance += " " + wd
|
||||
}
|
||||
}
|
||||
}
|
||||
return sentance, isStop
|
||||
}
|
||||
|
||||
// isPunct reports whether every rune of s counts as punctuation. A rune
// counts when unicode.IsPunct accepts it or when it is one of the extra
// characters ` ' " ( / that are listed explicitly. The empty string has no
// offending rune and therefore yields true.
func isPunct(s string) bool {
	for _, r := range s {
		if unicode.IsPunct(r) {
			continue
		}
		// still punct
		listed := r == '`' || r == '\'' || r == '"' || r == '(' || r == '/'
		if !listed {
			return false
		}
	}
	return true
}
|
||||
|
||||
// Excerpt returns the most statically significant 100 or so words of text for use in the Excerpt field
|
||||
func Excerpt(titleWords, bodyWords []string) string {
|
||||
var el extractList
|
||||
|
||||
//fmt.Println("DEBUG Excerpt ", len(titleWords), len(bodyWords))
|
||||
|
||||
// populate stemMap
|
||||
stemMap := make(map[string]uint64)
|
||||
for _, wd := range bodyWords {
|
||||
stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
|
||||
stemMap[stem]++
|
||||
}
|
||||
for _, wd := range titleWords {
|
||||
stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
|
||||
stemMap[stem]++ // TODO are words in titles more important?
|
||||
}
|
||||
|
||||
wds := append(titleWords, bodyWords...)
|
||||
|
||||
sentance := ""
|
||||
score := 0.0
|
||||
count := 0
|
||||
seq := 0
|
||||
for _, wd := range wds {
|
||||
var isStop bool
|
||||
|
||||
sentance, isStop = addWd(sentance, wd)
|
||||
|
||||
if isStop {
|
||||
//fmt.Printf(" DEBUG sentance: %3d %3.2f %s\n",
|
||||
// seq, score*10000/float64(count), sentance)
|
||||
var ei extractItem
|
||||
ei.count = count + 1 // must be at least 1
|
||||
ei.score = score
|
||||
ei.sentance = sentance
|
||||
ei.sequence = seq
|
||||
el = append(el, ei)
|
||||
sentance = ""
|
||||
score = 0.0
|
||||
seq++
|
||||
} else {
|
||||
uncommon := true
|
||||
// TODO Discuss correct level or maybe find a better algorithem for this
|
||||
ent, ok := words.Words[wd]
|
||||
if ok {
|
||||
if ent.Rank <= 100 {
|
||||
// do not score very common words
|
||||
uncommon = false
|
||||
}
|
||||
}
|
||||
if uncommon {
|
||||
stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
|
||||
usage, used := stemMap[stem]
|
||||
if used {
|
||||
relativeStemFreq := (float64(usage) / float64(len(wds))) - words.Stems[stem]
|
||||
if relativeStemFreq > 0.0 {
|
||||
score += relativeStemFreq
|
||||
}
|
||||
}
|
||||
count++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sort.Sort(el)
|
||||
|
||||
return present(el)
|
||||
}
|
||||
|
||||
func present(el extractList) (ret string) {
|
||||
var pl presentList
|
||||
words := 0
|
||||
|
||||
const excerptWords = 50
|
||||
|
||||
for s, e := range el {
|
||||
if (words < excerptWords || s == 0) && len(e.sentance) > 1 &&
|
||||
notEmpty(e.sentance) {
|
||||
words += e.count
|
||||
pl = append(pl, presentItem{sequence: e.sequence, text: e.sentance})
|
||||
//fmt.Printf("DEBUG With score %3.2f on page %d // %s \n",
|
||||
// 1000*e.score/float64(e.count), e.sequence, e.sentance)
|
||||
}
|
||||
}
|
||||
sort.Sort(pl)
|
||||
|
||||
var lastSeq int
|
||||
for p := range pl {
|
||||
txt := strings.TrimPrefix(pl[p].text, ". ")
|
||||
if p == 0 {
|
||||
ret = txt
|
||||
lastSeq = pl[0].sequence
|
||||
} else {
|
||||
thisSeq := pl[p].sequence
|
||||
if lastSeq+1 != thisSeq {
|
||||
ret += " …" // Horizontal elipsis character
|
||||
}
|
||||
ret += " " + txt
|
||||
lastSeq = thisSeq
|
||||
}
|
||||
}
|
||||
if len(ret) > 250 { // make sure the excerpt is not too long, shorten it if required
|
||||
for len(ret) > 250 {
|
||||
_, size := utf8.DecodeLastRuneInString(ret)
|
||||
ret = ret[:len(ret)-size]
|
||||
}
|
||||
return ret + "…" // Horizontal elipsis character added after truncation
|
||||
}
|
||||
return ret
|
||||
}
|
||||
|
||||
// notEmpty reports whether wds holds at least one rune that is neither
// punctuation nor white space.
func notEmpty(wds string) bool {
	for _, r := range wds {
		if unicode.IsPunct(r) || unicode.IsSpace(r) {
			continue
		}
		return true
	}
	return false
}
|
|
@ -1,130 +0,0 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package excerpt_test
|
||||
|
||||
import "testing"
|
||||
import "github.com/documize/community/core/api/convert/excerpt"
|
||||
import "strings"
|
||||
import "fmt"
|
||||
|
||||
func TestExerpt(t *testing.T) {
|
||||
if excerpt.Excerpt(nil, nil) != "" ||
|
||||
excerpt.Excerpt([]string{}, []string{}) != "" {
|
||||
t.Error("empty lists do not return empty string")
|
||||
}
|
||||
qbf := strings.Split("The quick brown fox jumps over the lazy dog .", " ")
|
||||
qbf2 := qbf
|
||||
for i := 0; i < 200; i++ {
|
||||
qbf2 = append(qbf2, qbf...)
|
||||
}
|
||||
tst := excerpt.Excerpt(qbf, qbf2)
|
||||
if tst !=
|
||||
"The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog." {
|
||||
t.Error("'quick brown fox' did not work:", tst)
|
||||
}
|
||||
|
||||
tt123 := strings.Split("Testing , testing ; 1 2 3 is fun ! Bracket [ anyone ? .", " ")
|
||||
tt123a := tt123
|
||||
for i := 0; i < 200; i++ {
|
||||
tt123a = append(tt123a, fmt.Sprintf("%d", i))
|
||||
tt123a = append(tt123a, tt123...)
|
||||
}
|
||||
tst2 := excerpt.Excerpt(tt123, tt123a)
|
||||
if tst2 !=
|
||||
"Testing, testing; 123 is fun! … Testing, testing; 123 is fun! … 0 Testing, testing; 123 is fun!" {
|
||||
t.Error("'Testing testing 123' did not work:", tst2)
|
||||
}
|
||||
|
||||
s := strings.Split(strings.Replace(`
|
||||
It's supercalifragilisticexpialidocious
|
||||
Even though the sound of it is something quite atrocious
|
||||
If you say it loud enough, you'll always sound precocious
|
||||
Supercalifragilisticexpialidocious
|
||||
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
|
||||
Because I was afraid to speak
|
||||
When I was just a lad
|
||||
My father gave me nose a tweak
|
||||
And told me I was bad
|
||||
|
||||
But then one day I learned a word
|
||||
That saved me achin' nose
|
||||
The biggest word I ever heard
|
||||
And this is how it goes, oh
|
||||
|
||||
Supercalifragilisticexpialidocious
|
||||
Even though the sound of it is something quite atrocious
|
||||
If you say it loud enough, you'll always sound precocious
|
||||
Supercalifragilisticexpialidocious
|
||||
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
|
||||
He traveled all around the world
|
||||
And everywhere he went
|
||||
He'd use his word and all would say
|
||||
There goes a clever gent
|
||||
|
||||
When Dukes and Maharajahs
|
||||
Pass the time of day with me
|
||||
I say me special word
|
||||
And then they ask me out to tea
|
||||
|
||||
Oh, supercalifragilisticexpialidocious
|
||||
Even though the sound of it is something quite atrocious
|
||||
If you say it loud enough, you'll always sound precocious
|
||||
Supercalifragilisticexpialidocious
|
||||
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
Um diddle, diddle diddle, um diddle ay
|
||||
|
||||
No, you can say it backwards, which is dociousaliexpilisticfragicalirupus
|
||||
But that's going a bit too far, don't you think?
|
||||
|
||||
So when the cat has got your tongue
|
||||
There's no need for dismay
|
||||
Just summon up this word
|
||||
And then you've got a lot to say
|
||||
|
||||
But better use it carefully
|
||||
Or it could change your life
|
||||
For example, yes, one night I said it to me girl
|
||||
And now me girl's my wife, oh, and a lovely thing she's too
|
||||
|
||||
She's, supercalifragilisticexpialidocious
|
||||
Supercalifragilisticexpialidocious
|
||||
Supercalifragilisticexpialidocious
|
||||
Supercalifragilisticexpialidocious
|
||||
. `, "\n", " . ", -1), " ")
|
||||
ts := []string{"Supercalifragilisticexpialidocious", "song", "lyrics"}
|
||||
st := excerpt.Excerpt(ts, s)
|
||||
if st != "Supercalifragilisticexpialidocious song lyrics. … Um diddle, diddle diddle, um diddle ay. Um diddle, diddle diddle, um diddle ay." {
|
||||
t.Error("'Supercalifragilisticexpialidocious song lyrics' did not work:", st)
|
||||
}
|
||||
|
||||
ss := []string{"Supercalifragilisticexpialidocious", "!"}
|
||||
ssa := ss
|
||||
for i := 0; i < 100; i++ {
|
||||
ssa = append(ssa, ss...)
|
||||
}
|
||||
sst := excerpt.Excerpt(ss, ssa)
|
||||
if sst !=
|
||||
"Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious…" {
|
||||
t.Error("'Supercalifragilisticexpialidocious' did not work:", sst)
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load diff
|
@ -1,4 +0,0 @@
|
|||
Total files: 23406
|
||||
Unique word count: 521426
|
||||
Total word count: 145376051
|
||||
Overall word count: 193225723
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load diff
|
@ -1,149 +0,0 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
// Package main creates ordered lists of english words and their stems,
|
||||
// based on their frequency.
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"sort"
|
||||
|
||||
"github.com/rookii/paicehusk"
|
||||
)
|
||||
|
||||
type (
	// wordFreqEntry holds the frequency figures gathered for one word.
	wordFreqEntry struct {
		rawFreq int     // summed count read from the word list
		Freq    float64 // rawFreq divided by the total of all counts read
	}

	// wordFreqMap maps a word to its frequency figures.
	wordFreqMap map[string]wordFreqEntry
)
|
||||
|
||||
// wordFreqSortEntry pairs a word with its frequency so it can be sorted.
type wordFreqSortEntry struct {
	Name string  // the word
	Freq float64 // frequency of the word
}

// wordFreqSort is a list of words that implements sort.Interface, ordering
// by frequency with the most frequent word first.
type wordFreqSort []wordFreqSortEntry

// Len is the number of elements in the collection.
func (wfs wordFreqSort) Len() int {
	return len(wfs)
}

// Less reports whether the element with index i should sort before the
// element with index j, i.e. whether i is the more frequent word.
func (wfs wordFreqSort) Less(i, j int) bool {
	first, second := wfs[i], wfs[j]
	return first.Freq > second.Freq
}

// Swap swaps the elements with indexes i and j.
func (wfs wordFreqSort) Swap(i, j int) {
	wfs[i], wfs[j] = wfs[j], wfs[i]
}
|
||||
|
||||
func main() {
|
||||
|
||||
txt, err := ioutil.ReadFile("./en-2012/en.txt")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
lines := bytes.Split(txt, []byte("\n"))
|
||||
|
||||
wfm := make(wordFreqMap)
|
||||
rfTot := 0
|
||||
for r, l := range lines {
|
||||
words := bytes.Split(l, []byte(" "))
|
||||
if len(words) >= 2 {
|
||||
var rf int
|
||||
_, err = fmt.Sscanf(string(words[1]), "%d", &rf)
|
||||
if err == nil && len(words[0]) > 0 {
|
||||
if r < 10000 { // only look at the most common 10k words, 100k makes go compile/link unworkable
|
||||
stem := string(words[0]) // NOTE not stemming at present
|
||||
entry, alredythere := wfm[stem]
|
||||
if alredythere {
|
||||
entry.rawFreq += rf
|
||||
wfm[stem] = entry
|
||||
} else {
|
||||
wfm[stem] = wordFreqEntry{rawFreq: rf, Freq: 0.0}
|
||||
}
|
||||
}
|
||||
rfTot += rf
|
||||
}
|
||||
}
|
||||
}
|
||||
for k, v := range wfm {
|
||||
v.Freq = float64(v.rawFreq) / float64(rfTot)
|
||||
wfm[k] = v
|
||||
}
|
||||
|
||||
wfs := make(wordFreqSort, len(wfm))
|
||||
idx := 0
|
||||
for k, v := range wfm {
|
||||
wfs[idx].Name = k
|
||||
wfs[idx].Freq = v.Freq
|
||||
idx++
|
||||
}
|
||||
sort.Sort(wfs)
|
||||
writeWords(wfs, wfm)
|
||||
}
|
||||
|
||||
// writeWords generates the Go source file ./en-2012/englishwords.go.
//
// The file declares package words with two tables: Words, which gives each
// word in wfs its rank (position in wfs, starting at 1) and frequency, and
// Stems, which gives for every word stem the summed frequency of the words in
// wfm that share that stem. Stems are written in sorted order so that
// regenerating from the same input lists them in the same order.
//
// Map keys are written with %q so that a word containing a quote, a
// backslash or bytes that are not valid UTF-8 still yields compilable Go.
//
// writeWords panics if the file cannot be written.
func writeWords(wfs wordFreqSort, wfm wordFreqMap) {
	const header = `
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com

// Package words was auto-generated !
// From base data at http://invokeit.wordpress.com/frequency-word-lists/ .
// The word stems were produced using github.com/rookii/paicehusk .
// DO NOT EDIT BY HAND.
package words

// Entry type describes the rank and frequency of a prarticular word.
type Entry struct {
	Rank int // Word Rank order, 1 most frequent.
	Freq float64 // Word Frequency, a fraction, larger is more frequent.
}

// Map type provides the Entry information for each word.
type Map map[string]Entry

// Words gives the Entry information on the most frequent words.
var Words = Map{
`

	var goprog bytes.Buffer

	// The header holds no formatting verbs, so it is written verbatim.
	goprog.WriteString(header)
	for i, v := range wfs {
		fmt.Fprintf(&goprog, "\t%q: Entry{Rank:%d,Freq:%g},\n", v.Name, i+1, v.Freq)
	}
	goprog.WriteString("}\n\n")

	// Sum the word frequencies per stem.
	sfm := make(map[string]float64)
	for k, v := range wfm {
		sfm[paicehusk.DefaultRules.Stem(k)] += v.Freq
	}

	// Map iteration order is random; sort the stems for a stable file.
	stems := make([]string, 0, len(sfm))
	for stem := range sfm {
		stems = append(stems, stem)
	}
	sort.Strings(stems)

	goprog.WriteString("// Stems gives the frequency of word-stems.\nvar Stems = map[string]float64{\n")
	for _, stem := range stems {
		fmt.Fprintf(&goprog, "\t%q: %g,\n", stem, sfm[stem])
	}
	goprog.WriteString("}\n\n")

	if err := ioutil.WriteFile("./en-2012/englishwords.go", goprog.Bytes(), 0666); err != nil {
		panic(err)
	}
}
|
Loading…
Add table
Add a link
Reference in a new issue