removed experimental code

2025-08-05 05:25:27 +02:00 · 2017-07-18 13:10:52 +01:00 · 2017-07-18 13:10:52 +01:00 · 7455a027fc
commit 7455a027fc
parent fddaf9effe
38 changed files with 1 additions and 499578 deletions
--- a/core/api/convert/convert.go
+++ b/core/api/convert/convert.go
@@ -14,11 +14,9 @@ package convert

 import (
 	"errors"
-	"github.com/documize/community/core/api/convert/excerpt"
 	"github.com/documize/community/core/api/convert/html"
 	"github.com/documize/community/core/api/plugins"
 	api "github.com/documize/community/core/convapi"
-	"github.com/documize/community/core/utility"

 	"golang.org/x/net/context"
 )
@@ -49,32 +47,6 @@ func Convert(ctx context.Context, xtn string, fileRequest *api.DocumentConversio
 	}
 	*/

-	if fileResult.Excerpt != "" {
-		//fmt.Println("DEBUG supplied excerpt: " + fileResult.Excerpt)
-	} else {
-		titleWds := []string{}
-		bodyWds := []string{}
-		for p := range fileResult.Pages {
-			var wds []string
-			var err error
-			if p > 0 { // title 0 is already the title of the document
-				wds, _, err = utility.Words(utility.HTML(fileResult.Pages[p].Title), 0, false)
-				if err != nil {
-					return nil, err
-				}
-				titleWds = append(titleWds, wds...)
-				titleWds = append(titleWds, ".")
-			}
-			wds, _, err = utility.Words(utility.HTML(string(fileResult.Pages[p].Body)), 0, false)
-			if err != nil {
-				return nil, err
-			}
-			bodyWds = append(bodyWds, wds...)
-			bodyWds = append(bodyWds, ".")
-		}
-		fileResult.Excerpt = excerpt.Excerpt(titleWds, bodyWds)
-	}
-
 	return fileResult, nil
 }

--- a/core/api/convert/excerpt/excerpt.go
+++ b/core/api/convert/excerpt/excerpt.go
@@ -1,228 +0,0 @@
-// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
-//
-// This software (Documize Community Edition) is licensed under
-// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
-//
-// You can operate outside the AGPL restrictions by purchasing
-// Documize Enterprise Edition and obtaining a commercial license
-// by contacting <sales@documize.com>.
-//
-// https://documize.com
-
-// Package excerpt provides basic functionality to create excerpts of text in English.
-package excerpt
-
-import (
-	"sort"
-	"strings"
-	"unicode"
-	"unicode/utf8"
-
-	words "github.com/documize/community/core/wordlists/en-2012"
-
-	"github.com/rookii/paicehusk"
-)
-
-type extractItem struct {
-	sequence int
-	score    float64
-	count    int
-	sentance string
-}
-
-type extractList []extractItem
-
-// the Sort interface
-// Len is the number of elements in the collection.
-func (a extractList) Len() int { return len(a) }
-
-// Less reports whether the element with
-// index i should sort before the element with index j.
-func (a extractList) Less(i, j int) bool {
-	return (a[i].score / float64(a[i].count)) > (a[j].score / float64(a[j].count))
-}
-
-// Swap swaps the elements with indexes i and j.
-func (a extractList) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
-
-type presentItem struct {
-	sequence int
-	text     string
-}
-
-type presentList []presentItem
-
-// the Sort interface
-// Len is the number of elements in the collection.
-func (a presentList) Len() int { return len(a) }
-
-// Less reports whether the element with
-// index i should sort before the element with index j.
-func (a presentList) Less(i, j int) bool {
-	return a[i].sequence < a[j].sequence
-}
-
-// Swap swaps the elements with indexes i and j.
-func (a presentList) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
-
-func addWd(sentance, wd string) (string, bool) {
-	var isStop bool
-	if len(sentance) == 0 {
-		if wd != "[" {
-			sentance = wd
-		}
-	} else {
-		switch wd {
-		case "[": //NoOp
-		case "0", "1", "2", "3", "4", "5", "6", "7", "8", "9":
-			if unicode.IsDigit(rune(sentance[len(sentance)-1])) {
-				sentance += wd
-			} else {
-				sentance += " " + wd
-			}
-		case ".", "!", "?":
-			isStop = true
-			fallthrough
-		default:
-			if isPunct(wd) {
-				sentance += wd
-			} else {
-				sentance += " " + wd
-			}
-		}
-	}
-	return sentance, isStop
-}
-
-func isPunct(s string) bool {
-	for _, r := range s {
-		if !unicode.IsPunct(r) {
-			switch r {
-			case '`', '\'', '"', '(', '/': // still punct
-			default:
-				return false
-			}
-		}
-	}
-	return true
-}
-
-// Excerpt returns the most statically significant 100 or so words of text for use in the Excerpt field
-func Excerpt(titleWords, bodyWords []string) string {
-	var el extractList
-
-	//fmt.Println("DEBUG Excerpt ", len(titleWords), len(bodyWords))
-
-	// populate stemMap
-	stemMap := make(map[string]uint64)
-	for _, wd := range bodyWords {
-		stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
-		stemMap[stem]++
-	}
-	for _, wd := range titleWords {
-		stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
-		stemMap[stem]++                         // TODO are words in titles more important?
-	}
-
-	wds := append(titleWords, bodyWords...)
-
-	sentance := ""
-	score := 0.0
-	count := 0
-	seq := 0
-	for _, wd := range wds {
-		var isStop bool
-
-		sentance, isStop = addWd(sentance, wd)
-
-		if isStop {
-			//fmt.Printf(" DEBUG sentance: %3d %3.2f %s\n",
-			//	seq, score*10000/float64(count), sentance)
-			var ei extractItem
-			ei.count = count + 1 // must be at least 1
-			ei.score = score
-			ei.sentance = sentance
-			ei.sequence = seq
-			el = append(el, ei)
-			sentance = ""
-			score = 0.0
-			seq++
-		} else {
-			uncommon := true
-			// TODO Discuss correct level or maybe find a better algorithem for this
-			ent, ok := words.Words[wd]
-			if ok {
-				if ent.Rank <= 100 {
-					// do not score very common words
-					uncommon = false
-				}
-			}
-			if uncommon {
-				stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
-				usage, used := stemMap[stem]
-				if used {
-					relativeStemFreq := (float64(usage) / float64(len(wds))) - words.Stems[stem]
-					if relativeStemFreq > 0.0 {
-						score += relativeStemFreq
-					}
-				}
-				count++
-			}
-		}
-	}
-
-	sort.Sort(el)
-
-	return present(el)
-}
-
-func present(el extractList) (ret string) {
-	var pl presentList
-	words := 0
-
-	const excerptWords = 50
-
-	for s, e := range el {
-		if (words < excerptWords || s == 0) && len(e.sentance) > 1 &&
-			notEmpty(e.sentance) {
-			words += e.count
-			pl = append(pl, presentItem{sequence: e.sequence, text: e.sentance})
-			//fmt.Printf("DEBUG With score %3.2f on page %d // %s \n",
-			//	1000*e.score/float64(e.count), e.sequence, e.sentance)
-		}
-	}
-	sort.Sort(pl)
-
-	var lastSeq int
-	for p := range pl {
-		txt := strings.TrimPrefix(pl[p].text, ". ")
-		if p == 0 {
-			ret = txt
-			lastSeq = pl[0].sequence
-		} else {
-			thisSeq := pl[p].sequence
-			if lastSeq+1 != thisSeq {
-				ret += " …" // Horizontal elipsis character
-			}
-			ret += " " + txt
-			lastSeq = thisSeq
-		}
-	}
-	if len(ret) > 250 { // make sure the excerpt is not too long, shorten it if required
-		for len(ret) > 250 {
-			_, size := utf8.DecodeLastRuneInString(ret)
-			ret = ret[:len(ret)-size]
-		}
-		return ret + "…" // Horizontal elipsis character added after truncation
-	}
-	return ret
-}
-
-func notEmpty(wds string) bool {
-	for _, r := range wds {
-		if !unicode.IsPunct(r) && !unicode.IsSpace(r) {
-			return true
-		}
-	}
-	return false
-}
--- a/core/api/convert/excerpt/excerpt_test.go
+++ b/core/api/convert/excerpt/excerpt_test.go
@@ -1,130 +0,0 @@
-// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
-//
-// This software (Documize Community Edition) is licensed under
-// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
-//
-// You can operate outside the AGPL restrictions by purchasing
-// Documize Enterprise Edition and obtaining a commercial license
-// by contacting <sales@documize.com>.
-//
-// https://documize.com
-
-package excerpt_test
-
-import "testing"
-import "github.com/documize/community/core/api/convert/excerpt"
-import "strings"
-import "fmt"
-
-func TestExerpt(t *testing.T) {
-	if excerpt.Excerpt(nil, nil) != "" ||
-		excerpt.Excerpt([]string{}, []string{}) != "" {
-		t.Error("empty lists do not return empty string")
-	}
-	qbf := strings.Split("The quick brown fox jumps over the lazy dog .", " ")
-	qbf2 := qbf
-	for i := 0; i < 200; i++ {
-		qbf2 = append(qbf2, qbf...)
-	}
-	tst := excerpt.Excerpt(qbf, qbf2)
-	if tst !=
-		"The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog." {
-		t.Error("'quick brown fox' did not work:", tst)
-	}
-
-	tt123 := strings.Split("Testing , testing ; 1 2 3 is fun ! Bracket [ anyone ? .", " ")
-	tt123a := tt123
-	for i := 0; i < 200; i++ {
-		tt123a = append(tt123a, fmt.Sprintf("%d", i))
-		tt123a = append(tt123a, tt123...)
-	}
-	tst2 := excerpt.Excerpt(tt123, tt123a)
-	if tst2 !=
-		"Testing, testing; 123 is fun! … Testing, testing; 123 is fun! … 0 Testing, testing; 123 is fun!" {
-		t.Error("'Testing testing 123' did not work:", tst2)
-	}
-
-	s := strings.Split(strings.Replace(`
-It's supercalifragilisticexpialidocious
-Even though the sound of it is something quite atrocious
-If you say it loud enough, you'll always sound precocious
-Supercalifragilisticexpialidocious
-
-Um diddle, diddle diddle, um diddle ay
-Um diddle, diddle diddle, um diddle ay
-Um diddle, diddle diddle, um diddle ay
-Um diddle, diddle diddle, um diddle ay
-
-Because I was afraid to speak
-When I was just a lad
-My father gave me nose a tweak
-And told me I was bad
-
-But then one day I learned a word
-That saved me achin' nose
-The biggest word I ever heard
-And this is how it goes, oh
-
-Supercalifragilisticexpialidocious
-Even though the sound of it is something quite atrocious
-If you say it loud enough, you'll always sound precocious
-Supercalifragilisticexpialidocious
-
-Um diddle, diddle diddle, um diddle ay
-Um diddle, diddle diddle, um diddle ay
-Um diddle, diddle diddle, um diddle ay
-Um diddle, diddle diddle, um diddle ay
-
-He traveled all around the world
-And everywhere he went
-He'd use his word and all would say
-There goes a clever gent
-
-When Dukes and Maharajahs
-Pass the time of day with me
-I say me special word
-And then they ask me out to tea
-
-Oh, supercalifragilisticexpialidocious
-Even though the sound of it is something quite atrocious
-If you say it loud enough, you'll always sound precocious
-Supercalifragilisticexpialidocious
-
-Um diddle, diddle diddle, um diddle ay
-Um diddle, diddle diddle, um diddle ay
-
-No, you can say it backwards, which is dociousaliexpilisticfragicalirupus
-But that's going a bit too far, don't you think?
-
-So when the cat has got your tongue
-There's no need for dismay
-Just summon up this word
-And then you've got a lot to say
-
-But better use it carefully
-Or it could change your life
-For example, yes, one night I said it to me girl
-And now me girl's my wife, oh, and a lovely thing she's too
-
-She's, supercalifragilisticexpialidocious
-Supercalifragilisticexpialidocious
-Supercalifragilisticexpialidocious
-Supercalifragilisticexpialidocious
-.	`, "\n", " . ", -1), " ")
-	ts := []string{"Supercalifragilisticexpialidocious", "song", "lyrics"}
-	st := excerpt.Excerpt(ts, s)
-	if st != "Supercalifragilisticexpialidocious song lyrics. … Um diddle, diddle diddle, um diddle ay. Um diddle, diddle diddle, um diddle ay." {
-		t.Error("'Supercalifragilisticexpialidocious song lyrics' did not work:", st)
-	}
-
-	ss := []string{"Supercalifragilisticexpialidocious", "!"}
-	ssa := ss
-	for i := 0; i < 100; i++ {
-		ssa = append(ssa, ss...)
-	}
-	sst := excerpt.Excerpt(ss, ssa)
-	if sst !=
-		"Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious…" {
-		t.Error("'Supercalifragilisticexpialidocious' did not work:", sst)
-	}
-}
--- a/core/wordlists/en-2012/en-s.log
+++ b/core/wordlists/en-2012/en-s.log
--- a/core/wordlists/en-2012/en.log
+++ b/core/wordlists/en-2012/en.log
@@ -1,4 +0,0 @@
-Total files: 23406
-Unique word count: 521426
-Total word count: 145376051
-Overall word count: 193225723
--- a/core/wordlists/en-2012/en.txt
+++ b/core/wordlists/en-2012/en.txt
--- a/core/wordlists/en-2012/englishwords.go
+++ b/core/wordlists/en-2012/englishwords.go
--- a/core/wordlists/makewordlist.go
+++ b/core/wordlists/makewordlist.go
@@ -1,149 +0,0 @@
-// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
-//
-// This software (Documize Community Edition) is licensed under
-// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
-//
-// You can operate outside the AGPL restrictions by purchasing
-// Documize Enterprise Edition and obtaining a commercial license
-// by contacting <sales@documize.com>.
-//
-// https://documize.com
-
-// Package main creates ordered lists of english words and their stems,
-// based on their frequency.
-package main
-
-import (
-	"bytes"
-	"fmt"
-	"io/ioutil"
-	"sort"
-
-	"github.com/rookii/paicehusk"
-)
-
-type wordFreqEntry struct {
-	rawFreq int
-	Freq    float64
-}
-
-type wordFreqMap map[string]wordFreqEntry
-
-type wordFreqSortEntry struct {
-	Name string
-	Freq float64
-}
-type wordFreqSort []wordFreqSortEntry
-
-// Len is the number of elements in the collection.
-func (wfs wordFreqSort) Len() int { return len(wfs) }
-
-// Less reports whether the element with
-// index i should sort before the element with index j.
-func (wfs wordFreqSort) Less(i, j int) bool { return wfs[i].Freq > wfs[j].Freq }
-
-// Swap swaps the elements with indexes i and j.
-func (wfs wordFreqSort) Swap(i, j int) { wfs[j], wfs[i] = wfs[i], wfs[j] }
-
-func main() {
-
-	txt, err := ioutil.ReadFile("./en-2012/en.txt")
-	if err != nil {
-		panic(err)
-	}
-
-	lines := bytes.Split(txt, []byte("\n"))
-
-	wfm := make(wordFreqMap)
-	rfTot := 0
-	for r, l := range lines {
-		words := bytes.Split(l, []byte(" "))
-		if len(words) >= 2 {
-			var rf int
-			_, err = fmt.Sscanf(string(words[1]), "%d", &rf)
-			if err == nil && len(words[0]) > 0 {
-				if r < 10000 { // only look at the most common 10k words, 100k makes go compile/link unworkable
-					stem := string(words[0]) // NOTE not stemming at present
-					entry, alredythere := wfm[stem]
-					if alredythere {
-						entry.rawFreq += rf
-						wfm[stem] = entry
-					} else {
-						wfm[stem] = wordFreqEntry{rawFreq: rf, Freq: 0.0}
-					}
-				}
-				rfTot += rf
-			}
-		}
-	}
-	for k, v := range wfm {
-		v.Freq = float64(v.rawFreq) / float64(rfTot)
-		wfm[k] = v
-	}
-
-	wfs := make(wordFreqSort, len(wfm))
-	idx := 0
-	for k, v := range wfm {
-		wfs[idx].Name = k
-		wfs[idx].Freq = v.Freq
-		idx++
-	}
-	sort.Sort(wfs)
-	writeWords(wfs, wfm)
-}
-
-func writeWords(wfs wordFreqSort, wfm wordFreqMap) {
-	var goprog bytes.Buffer
-	var err error
-
-	fmt.Fprintf(&goprog, `
-// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
-//
-// This software (Documize Community Edition) is licensed under 
-// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
-//
-// You can operate outside the AGPL restrictions by purchasing
-// Documize Enterprise Edition and obtaining a commercial license
-// by contacting <sales@documize.com>. 
-//
-// https://documize.com
-
-// Package words was auto-generated !
-// From base data at http://invokeit.wordpress.com/frequency-word-lists/ .
-// The word stems were produced using github.com/rookii/paicehusk .
-// DO NOT EDIT BY HAND.
-package words
-
-// Entry type describes the rank and frequency of a prarticular word.
-type Entry struct {
-	Rank    int      // Word Rank order, 1 most frequent.
-	Freq    float64  // Word Frequency, a fraction, larger is more frequent. 
-}
-
-// Map type provides the Entry information for each word.
-type Map map[string]Entry
-
-// Words gives the Entry information on the most frequent words.
-var Words = Map{
-`)
-	for i, v := range wfs {
-		fmt.Fprintf(&goprog, "\t"+`"%s": Entry{Rank:%d,Freq:%g},`+"\n", v.Name, i+1, v.Freq)
-	}
-	fmt.Fprintf(&goprog, "}\n\n")
-
-	sfm := make(map[string]float64)
-	for k, v := range wfm {
-		sfm[paicehusk.DefaultRules.Stem(k)] += v.Freq
-	}
-	fmt.Fprintf(&goprog, "// Stems gives the frequency of word-stems.\nvar Stems = map[string]float64{\n")
-	for k, v := range sfm {
-		fmt.Fprintf(&goprog, "\t"+`"%s": %g,`+"\n", k, v)
-	}
-	fmt.Fprintf(&goprog, "}\n\n")
-
-	err = ioutil.WriteFile("./en-2012/englishwords.go", goprog.Bytes(), 0666)
-
-	if err != nil {
-		panic(err)
-	}
-}