documize/documize/api/convert/excerpt/excerpt.go

// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under 
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>. 
//
// https://documize.com

// Package excerpt provides basic functionality to create excerpts of text in English.
package excerpt

import (
	"sort"
	"strings"
	"unicode"
	"unicode/utf8"

	words "github.com/documize/community/wordsmith/wordlists/en-2012"

	"github.com/rookii/paicehusk"
)

// extractItem is one candidate sentence together with the figures used to rank it.
type extractItem struct {
	sequence int     // position of the sentence in the source text
	score    float64 // accumulated score of the sentence's words
	count    int     // word count used as the divisor when ranking
	sentance string  // text of the sentence
}

// extractList is a collection of extractItems that implements sort.Interface,
// ordering items from the highest to the lowest score per counted word.
type extractList []extractItem

// Len implements sort.Interface and reports how many items the list holds.
func (el extractList) Len() int {
	return len(el)
}

// Less implements sort.Interface. The item at index i sorts before the item
// at index j when its score divided by its count is strictly greater.
func (el extractList) Less(i, j int) bool {
	left := el[i].score / float64(el[i].count)
	right := el[j].score / float64(el[j].count)
	return left > right
}

// Swap implements sort.Interface by exchanging the items at indexes i and j.
func (el extractList) Swap(i, j int) {
	el[i], el[j] = el[j], el[i]
}

// presentItem is a sentence chosen for output, tagged with its original position.
type presentItem struct {
	sequence int    // position of the sentence in the source text
	text     string // text of the sentence
}

// presentList is a collection of presentItems that implements sort.Interface,
// ordering items by ascending sequence number.
type presentList []presentItem

// Len implements sort.Interface and reports how many items the list holds.
func (pl presentList) Len() int {
	return len(pl)
}

// Less implements sort.Interface. The item at index i sorts before the item
// at index j when its sequence number is strictly smaller.
func (pl presentList) Less(i, j int) bool {
	return pl[j].sequence > pl[i].sequence
}

// Swap implements sort.Interface by exchanging the items at indexes i and j.
func (pl presentList) Swap(i, j int) {
	pl[i], pl[j] = pl[j], pl[i]
}

// addWd appends the token wd to the sentence built so far and returns the
// extended sentence, plus true when wd is one of the terminators ".", "!" or
// "?" added to a non-empty sentence.
//
// The token "[" is always discarded. A token added to an empty sentence
// becomes the sentence unchanged and is never reported as a terminator.
func addWd(sentance, wd string) (string, bool) {
	// An opening bracket token never contributes any text.
	if wd == "[" {
		return sentance, false
	}
	// The first token starts the sentence without a leading space.
	if sentance == "" {
		return wd, false
	}

	// A single ASCII digit is glued onto a preceding digit so that numbers
	// split into separate tokens are joined back together.
	if len(wd) == 1 && wd[0] >= '0' && wd[0] <= '9' {
		tail := rune(sentance[len(sentance)-1]) // last byte of the sentence, not last rune
		if unicode.IsDigit(tail) {
			return sentance + wd, false
		}
		return sentance + " " + wd, false
	}

	terminator := wd == "." || wd == "!" || wd == "?"
	// Punctuation attaches directly; every other token is separated by a space.
	if isPunct(wd) {
		return sentance + wd, terminator
	}
	return sentance + " " + wd, terminator
}

// isPunct reports whether every rune in s counts as punctuation. A rune
// qualifies when unicode.IsPunct accepts it or when it is one of the
// explicitly listed characters: backtick, single quote, double quote, opening
// parenthesis or slash. An empty string yields true.
func isPunct(s string) bool {
	for _, ch := range s {
		if unicode.IsPunct(ch) {
			continue
		}
		// Characters treated as punctuation regardless of their Unicode class.
		if ch != '`' && ch != '\'' && ch != '"' && ch != '(' && ch != '/' {
			return false
		}
	}
	return true
}

// Excerpt returns a short extract built from the most statistically
// significant sentences of a document, for use in the Excerpt field.
//
// titleWords and bodyWords are token streams that are processed as a single
// sequence, title first. Tokens are joined into sentences by addWd; a sentence
// is completed when addWd reports a terminator. Tokens that follow the last
// terminator never form a sentence and therefore cannot appear in the result.
//
// Each sentence is scored by summing, for every word that is not among the 100
// highest-ranked entries of the word list, the amount by which the frequency
// of the word's stem in this document exceeds words.Stems for that stem.
// Sentences are then sorted with extractList's ordering and handed to present,
// which chooses and formats the final text.
//
// Neither input slice is modified.
func Excerpt(titleWords, bodyWords []string) string {
	var el extractList

	// Count how often every stem occurs across the whole document.
	stemMap := make(map[string]uint64)
	for _, wd := range bodyWords {
		stemMap[paicehusk.DefaultRules.Stem(wd)]++
	}
	for _, wd := range titleWords {
		stemMap[paicehusk.DefaultRules.Stem(wd)]++ // TODO are words in titles more important?
	}

	// Combine the streams in a fresh slice. append(titleWords, bodyWords...)
	// would write into any spare capacity of titleWords and silently
	// overwrite data that still belongs to the caller.
	wds := make([]string, 0, len(titleWords)+len(bodyWords))
	wds = append(wds, titleWords...)
	wds = append(wds, bodyWords...)
	total := float64(len(wds))

	sentance := ""
	score := 0.0
	count := 0
	seq := 0
	for _, wd := range wds {
		var isStop bool

		sentance, isStop = addWd(sentance, wd)

		if isStop {
			el = append(el, extractItem{
				sequence: seq,
				score:    score,
				count:    count + 1, // must be at least 1, it is used as a divisor
				sentance: sentance,
			})
			// Start the next sentence from scratch. count has to be reset
			// together with score, otherwise later sentences are ranked
			// against the word count of everything that preceded them.
			sentance = ""
			score = 0.0
			count = 0
			seq++
			continue
		}

		// TODO Discuss correct level or maybe find a better algorithm for this
		if ent, ok := words.Words[wd]; ok && ent.Rank <= 100 {
			continue // do not score very common words
		}
		count++

		stem := paicehusk.DefaultRules.Stem(wd)
		usage, used := stemMap[stem]
		if !used {
			continue
		}
		// Only stems that are more frequent here than in the reference
		// list add to the score.
		if relativeStemFreq := float64(usage)/total - words.Stems[stem]; relativeStemFreq > 0.0 {
			score += relativeStemFreq
		}
	}

	sort.Sort(el)

	return present(el)
}

// present turns a ranked list of sentences into the final excerpt text.
//
// Sentences are taken from el in the order given until the running total of
// their count fields reaches 50; sentences of one byte or less, or without any
// content according to notEmpty, are skipped. The chosen sentences are put
// back into document order and joined with single spaces, with " …" inserted
// wherever two neighbours were not consecutive in the source. A leading ". "
// is stripped from every sentence. If the result is longer than 250 bytes it
// is cut back, whole runes at a time, to at most 250 bytes and "…" is appended.
func present(el extractList) (ret string) {
	const excerptWords = 50

	var chosen presentList
	wordTotal := 0
	for idx, item := range el {
		// Once the word budget is used up only the very first item may pass.
		if idx != 0 && wordTotal >= excerptWords {
			continue
		}
		if len(item.sentance) <= 1 || !notEmpty(item.sentance) {
			continue
		}
		wordTotal += item.count
		chosen = append(chosen, presentItem{sequence: item.sequence, text: item.sentance})
	}
	sort.Sort(chosen)

	var out strings.Builder
	prevSeq := 0
	for idx, item := range chosen {
		if idx > 0 {
			if item.sequence != prevSeq+1 {
				out.WriteString(" …") // Horizontal ellipsis marks skipped sentences
			}
			out.WriteString(" ")
		}
		out.WriteString(strings.TrimPrefix(item.text, ". "))
		prevSeq = item.sequence
	}
	ret = out.String()

	if len(ret) <= 250 {
		return ret
	}
	// Too long: drop trailing runes until the text fits, then flag the cut.
	for len(ret) > 250 {
		_, width := utf8.DecodeLastRuneInString(ret)
		ret = ret[:len(ret)-width]
	}
	return ret + "…" // Horizontal ellipsis character added after truncation
}

// notEmpty reports whether wds contains at least one rune that is neither
// punctuation (unicode.IsPunct) nor white space (unicode.IsSpace).
func notEmpty(wds string) bool {
	for _, ch := range wds {
		if unicode.IsPunct(ch) || unicode.IsSpace(ch) {
			continue
		}
		return true
	}
	return false
}
initial commit 2016-07-07 18:54:16 -07:00			`// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.`
			`//`
			`// This software (Documize Community Edition) is licensed under`
			`// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html`
			`//`
			`// You can operate outside the AGPL restrictions by purchasing`
			`// Documize Enterprise Edition and obtaining a commercial license`
			`// by contacting <sales@documize.com>.`
			`//`
			`// https://documize.com`

			`// Package excerpt provides basic functionality to create excerpts of text in English.`
			`package excerpt`

			`import (`
			`"sort"`
			`"strings"`
			`"unicode"`
			`"unicode/utf8"`

			`words "github.com/documize/community/wordsmith/wordlists/en-2012"`

			`"github.com/rookii/paicehusk"`
			`)`

			`type extractItem struct {`
			`sequence int`
			`score float64`
			`count int`
			`sentance string`
			`}`

			`type extractList []extractItem`

			`// the Sort interface`
			`// Len is the number of elements in the collection.`
			`func (a extractList) Len() int { return len(a) }`

			`// Less reports whether the element with`
			`// index i should sort before the element with index j.`
			`func (a extractList) Less(i, j int) bool {`
			`return (a[i].score / float64(a[i].count)) > (a[j].score / float64(a[j].count))`
			`}`

			`// Swap swaps the elements with indexes i and j.`
			`func (a extractList) Swap(i, j int) { a[i], a[j] = a[j], a[i] }`

			`type presentItem struct {`
			`sequence int`
			`text string`
			`}`

			`type presentList []presentItem`

			`// the Sort interface`
			`// Len is the number of elements in the collection.`
			`func (a presentList) Len() int { return len(a) }`

			`// Less reports whether the element with`
			`// index i should sort before the element with index j.`
			`func (a presentList) Less(i, j int) bool {`
			`return a[i].sequence < a[j].sequence`
			`}`

			`// Swap swaps the elements with indexes i and j.`
			`func (a presentList) Swap(i, j int) { a[i], a[j] = a[j], a[i] }`

			`func addWd(sentance, wd string) (string, bool) {`
			`var isStop bool`
			`if len(sentance) == 0 {`
			`if wd != "[" {`
			`sentance = wd`
			`}`
			`} else {`
			`switch wd {`
			`case "[": //NoOp`
			`case "0", "1", "2", "3", "4", "5", "6", "7", "8", "9":`
			`if unicode.IsDigit(rune(sentance[len(sentance)-1])) {`
			`sentance += wd`
			`} else {`
			`sentance += " " + wd`
			`}`
			`case ".", "!", "?":`
			`isStop = true`
			`fallthrough`
			`default:`
			`if isPunct(wd) {`
			`sentance += wd`
			`} else {`
			`sentance += " " + wd`
			`}`
			`}`
			`}`
			`return sentance, isStop`
			`}`

			`func isPunct(s string) bool {`
			`for _, r := range s {`
			`if !unicode.IsPunct(r) {`
			`switch r {`
			case '`', '\'', '"', '(', '/': // still punct
			`default:`
			`return false`
			`}`
			`}`
			`}`
			`return true`
			`}`

			`// Excerpt returns the most statically significant 100 or so words of text for use in the Excerpt field`
			`func Excerpt(titleWords, bodyWords []string) string {`
			`var el extractList`

			`//fmt.Println("DEBUG Excerpt ", len(titleWords), len(bodyWords))`

			`// populate stemMap`
			`stemMap := make(map[string]uint64)`
			`for _, wd := range bodyWords {`
			`stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word`
			`stemMap[stem]++`
			`}`
			`for _, wd := range titleWords {`
			`stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word`
			`stemMap[stem]++ // TODO are words in titles more important?`
			`}`

			`wds := append(titleWords, bodyWords...)`

			`sentance := ""`
			`score := 0.0`
			`count := 0`
			`seq := 0`
			`for _, wd := range wds {`
			`var isStop bool`

			`sentance, isStop = addWd(sentance, wd)`

			`if isStop {`
			`//fmt.Printf(" DEBUG sentance: %3d %3.2f %s\n",`
			`// seq, score*10000/float64(count), sentance)`
			`var ei extractItem`
			`ei.count = count + 1 // must be at least 1`
			`ei.score = score`
			`ei.sentance = sentance`
			`ei.sequence = seq`
			`el = append(el, ei)`
			`sentance = ""`
			`score = 0.0`
			`seq++`
			`} else {`
			`uncommon := true`
			`// TODO Discuss correct level or maybe find a better algorithem for this`
			`ent, ok := words.Words[wd]`
			`if ok {`
			`if ent.Rank <= 100 {`
			`// do not score very common words`
			`uncommon = false`
			`}`
			`}`
			`if uncommon {`
			`stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word`
			`usage, used := stemMap[stem]`
			`if used {`
			`relativeStemFreq := (float64(usage) / float64(len(wds))) - words.Stems[stem]`
			`if relativeStemFreq > 0.0 {`
			`score += relativeStemFreq`
			`}`
			`}`
			`count++`
			`}`
			`}`
			`}`

			`sort.Sort(el)`

			`return present(el)`
			`}`

			`func present(el extractList) (ret string) {`
			`var pl presentList`
			`words := 0`

			`const excerptWords = 50`

			`for s, e := range el {`
			`if (words < excerptWords \|\| s == 0) && len(e.sentance) > 1 &&`
			`notEmpty(e.sentance) {`
			`words += e.count`
			`pl = append(pl, presentItem{sequence: e.sequence, text: e.sentance})`
			`//fmt.Printf("DEBUG With score %3.2f on page %d // %s \n",`
			`// 1000*e.score/float64(e.count), e.sequence, e.sentance)`
			`}`
			`}`
			`sort.Sort(pl)`

			`var lastSeq int`
			`for p := range pl {`
			`txt := strings.TrimPrefix(pl[p].text, ". ")`
			`if p == 0 {`
			`ret = txt`
			`lastSeq = pl[0].sequence`
			`} else {`
			`thisSeq := pl[p].sequence`
			`if lastSeq+1 != thisSeq {`
			`ret += " …" // Horizontal elipsis character`
			`}`
			`ret += " " + txt`
			`lastSeq = thisSeq`
			`}`
			`}`
			`if len(ret) > 250 { // make sure the excerpt is not too long, shorten it if required`
			`for len(ret) > 250 {`
			`_, size := utf8.DecodeLastRuneInString(ret)`
			`ret = ret[:len(ret)-size]`
			`}`
			`return ret + "…" // Horizontal elipsis character added after truncation`
			`}`
			`return ret`
			`}`

			`func notEmpty(wds string) bool {`
			`for _, r := range wds {`
			`if !unicode.IsPunct(r) && !unicode.IsSpace(r) {`
			`return true`
			`}`
			`}`
			`return false`
			`}`