major package structure refactoring

2025-07-19 05:09:42 +02:00 · 2017-07-18 21:55:17 +01:00 · 2017-07-18 21:55:17 +01:00 · cf58f8164d
commit cf58f8164d
parent 7b8cec9a6c
73 changed files with 549 additions and 389 deletions
--- a/core/stringutil/beautify.go
+++ b/core/stringutil/beautify.go
@ -0,0 +1,66 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>.
+//
+// https://documize.com
+
+package stringutil
+
+import (
+	"path/filepath"
+	"strings"
+	"unicode"
+)
+
+// BeautifyFilename takes a filename and attempts to turn it into a readable form,
+// as TitleCase natural language, suitable for the top level of a Document.
+//
+// Any directory part and the final extension are dropped, every rune that is
+// not a letter, a digit or '.' becomes a space, a space is inserted in front of
+// an upper-case letter that starts a camelCase word, and the first letter of
+// every word is upper-cased. For example "DooDah$day.zip" becomes "Doo Dah Day".
+// A name that has no extension is beautified as-is instead of being discarded.
+func BeautifyFilename(fn string) string {
+	_, file := filepath.Split(fn)
+	// Strip only the final extension. filepath.Ext returns "" when the name has
+	// no dot, so an extension-less name keeps all of its text.
+	r := []rune(strings.TrimSuffix(file, filepath.Ext(file)))
+
+	// make any non-letter/digit characters space
+	for i, c := range r {
+		if !unicode.IsLetter(c) && !unicode.IsDigit(c) && c != '.' {
+			r[i] = ' '
+		}
+	}
+
+	// Insert a space in front of every Upper/lower two-letter combination in a
+	// single pass. Nothing is inserted at the start of the name or where a space
+	// already precedes the upper-case letter.
+	out := make([]rune, 0, len(r)+len(r)/2)
+	for i, c := range r {
+		if i > 0 && i+1 < len(r) &&
+			unicode.IsUpper(c) && unicode.IsLower(r[i+1]) && r[i-1] != ' ' {
+			out = append(out, ' ')
+		}
+		out = append(out, c)
+	}
+
+	// make the first letter of each word upper case; the name itself may start
+	// with a space when its first character was a symbol.
+	for i, c := range out {
+		if i == 0 || out[i-1] == ' ' {
+			out[i] = unicode.ToUpper(c)
+		}
+	}
+	return string(out)
+}
--- a/core/stringutil/beautify_test.go
+++ b/core/stringutil/beautify_test.go
@ -0,0 +1,25 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>.
+//
+// https://documize.com
+
+package stringutil
+
+import "testing"
+
+// TestBeautify checks that BeautifyFilename drops the extension, replaces the
+// symbol with a space and splits the camelCase name into title-cased words.
+func TestBeautify(t *testing.T) {
+	cases := []struct{ in, want string }{
+		{"DooDah$day.zip", "Doo Dah Day"},
+	}
+	for _, c := range cases {
+		bs(t, c.in, c.want)
+	}
+}
+
+// bs runs BeautifyFilename on in and reports a test error when the result
+// differs from out.
+func bs(t *testing.T, in, out string) {
+	if got := BeautifyFilename(in); got != out {
+		t.Errorf("BeautifyFilename input `%s` got `%s` expected `%s`\n", in, got, out)
+	}
+}
--- a/core/stringutil/conjoin.go
+++ b/core/stringutil/conjoin.go
@ -0,0 +1,38 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>.
+//
+// https://documize.com
+
+package stringutil
+
+import (
+	"strings"
+)
+
+// Conjoin joins items into a natural-language list using the conjunction conj,
+// e.g. Conjoin("and", []string{"Suzanne", "Fatima", "Brian"}) returns
+// "Suzanne, Fatima and Brian".
+//
+// No items yields "", a single item is returned unchanged and two items are
+// joined as "a conj b". With three or more items the leading items are comma
+// separated; the comma in front of the conjunction is omitted only for "and"
+// (so "or" still gives "a, b, or c"), which is the output this function has
+// always produced. The text of the items themselves is never altered.
+func Conjoin(conj string, items []string) string {
+	switch len(items) {
+	case 0:
+		return ""
+	case 1:
+		return items[0]
+	case 2: // "a and b" not "a, and b"
+		return items[0] + " " + conj + " " + items[1]
+	}
+
+	last := len(items) - 1
+	head := strings.Join(items[:last], ", ")
+
+	// Build the tail directly rather than patching the joined string, so that an
+	// item which itself contains ", and " is left untouched.
+	if conj == "and" {
+		return head + " and " + items[last]
+	}
+	return head + ", " + conj + " " + items[last]
+}
--- a/core/stringutil/html.go
+++ b/core/stringutil/html.go
@ -0,0 +1,158 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>.
+//
+// https://documize.com
+
+package stringutil
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"strings"
+	"unicode/utf8"
+
+	"golang.org/x/net/html"
+	"golang.org/x/net/html/atom"
+)
+
+// HTML is a chunk of HTML markup held as a string. Its Text method returns the
+// plain text contained in that markup.
+type HTML string
+
+// write out the textual element of the html node, if present, then iterate through the child nodes.
+func writeText(n *html.Node, b io.Writer, isTest bool) {
+	if !excluded(n) {
+		switch n.Type {
+		case html.TextNode:
+			_, err := b.Write([]byte(n.Data + string(rune(0x200B)))) // + http://en.wikipedia.org/wiki/Zero-width_space
+			if err != nil {
+			}
+			// TODO This use of zero-width-space (subsequently replaced by ' ' or ignored, depending on context)
+			// TODO works well for in-word breaks, but at the expense of concatenating some words in error.
+			// TODO It may be that better examination of the HTML structure could be used to determine
+			// TODO when a space is, or is not, required. In that event we would not use zero-width-space.
+
+		default:
+			for c := n.FirstChild; c != nil; c = c.NextSibling {
+				writeText(c, b, isTest)
+			}
+			switch n.DataAtom {
+			case 0:
+				if n.Data == "documize" {
+					for _, a := range n.Attr {
+						if a.Key == "type" {
+							if isTest {
+								var err error
+								switch a.Val {
+								case "field-start":
+									_, err = b.Write([]byte(" [ "))
+								case "field-end":
+									_, err = b.Write([]byte(" ] "))
+								default:
+									_, err = b.Write([]byte(" [ ] "))
+								}
+								if err != nil {
+								}
+							}
+							return
+						}
+					}
+				}
+			case atom.Span, atom.U, atom.B, atom.I, atom.Del, atom.Sub, atom.Sup:
+				//NoOp
+			default:
+				_, err := b.Write([]byte(" ")) // add a space after each main element
+				if err != nil {
+				}
+			}
+		}
+	}
+}
+
+// excluded reports whether n is a div element whose class attribute is one of
+// the documize classes that are left out of the extracted text.
+func excluded(n *html.Node) bool {
+	if n.DataAtom != atom.Div {
+		return false
+	}
+	for _, attr := range n.Attr {
+		if attr.Key != "class" {
+			continue
+		}
+		switch attr.Val {
+		case "documize-first-page",
+			"documize-exotic-image",
+			"documize-footnote",
+			"documize-graphictext",
+			"documize-math":
+			return true
+		}
+	}
+	return false
+}
+
+// findBody searches the tree rooted at n, depth first, for the body element and
+// returns it, or nil when the tree has none. Starting text extraction at the
+// body bypasses the page title text.
+func findBody(n *html.Node) *html.Node {
+	if n.DataAtom == atom.Body {
+		return n
+	}
+	for child := n.FirstChild; child != nil; child = child.NextSibling {
+		if found := findBody(child); found != nil {
+			return found
+		}
+	}
+	return nil
+}
+
+// Text returns only the plain text elements of the HTML chunk, for use in the
+// TOC or for text indexing. The chunk is parsed and the text is collected by
+// writeText starting at the body element when there is one, otherwise at the
+// root of the parsed document. isTest is passed through to writeText.
+// A parse failure is returned as the error together with an empty string.
+func (ch HTML) Text(isTest bool) (string, error) {
+	root, err := html.Parse(strings.NewReader(string(ch)))
+	if err != nil {
+		return "", err
+	}
+	if body := findBody(root); body != nil {
+		root = body
+	}
+
+	var out bytes.Buffer
+	writeText(root, &out, isTest)
+	return out.String(), nil
+}
+
+// EscapeHTMLcomplexChars looks for "complex" characters within HTML
+// and replaces them with the HTML escape codes which describe them.
+// "Complex" characters are those encoded in more than one byte by UTF8.
+func EscapeHTMLcomplexChars(s string) string {
+	ret := ""
+	for _, r := range s {
+		if utf8.RuneLen(r) > 1 {
+			ret += fmt.Sprintf("&#%d;", r)
+		} else {
+			ret += string(r)
+		}
+	}
+	return ret
+}
+
+// EscapeHTMLcomplexCharsByte is the byte-slice form of the HTML escaping of
+// "complex" characters: every character that UTF8 encodes in more than one
+// byte is replaced by its decimal numeric reference ("&#NNN;"), while all
+// other bytes are copied through as they are.
+func EscapeHTMLcomplexCharsByte(b []byte) []byte {
+	var out bytes.Buffer
+	for rest := b; len(rest) > 0; {
+		r, width := utf8.DecodeRune(rest)
+		if utf8.RuneLen(r) <= 1 {
+			out.Write(rest[:width])
+		} else {
+			fmt.Fprintf(&out, "&#%d;", r)
+		}
+		rest = rest[width:]
+	}
+	return out.Bytes()
+}
--- a/core/stringutil/html_test.go
+++ b/core/stringutil/html_test.go
@ -0,0 +1,83 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>.
+//
+// https://documize.com
+
+package stringutil
+
+import "testing"
+
+func TestHTML(t *testing.T) {
+	type testConv struct {
+		htm, txt string
+		istest   bool
+	}
+	convTest := []testConv{
+		{
+			`<html><head><title>HTML TITLE</title></head><body><p>This <I>is</I>:</p><ul><li><a href="foo">Example</a><li><a href="/bar/baz">HTML text.</a><div class="documize-math">exclueded</div></ul></body></html>`,
+			"This is : Example HTML text. ", false,
+		},
+		{
+			`<p>This is:</p><ul><li><documize type="field-start"></documize> <documize type="field-end"></documize><documize type="unknown"></documize><li><a href="/bar/baz">HTML text.</a></ul>`,
+			"This is: [ ] [ ] HTML text. ", true,
+		},
+	}
+	for _, tst := range convTest {
+		var ch HTML
+		ch = HTML([]byte(tst.htm))
+		//t.Logf("HTML: %s", ch)
+		txt, err := ch.Text(tst.istest)
+		if err != nil {
+			t.Log(err)
+			t.Fail()
+		}
+		expected := compressSpaces(tst.txt)
+		got := compressSpaces(string(txt))
+		if expected != got {
+			t.Errorf("Conversion to text for `%s`, expected: `%s` got: `%s`\n",
+				ch, expected, got)
+		} //else {
+		//	t.Logf("Text: %s", txt)
+		//}
+	}
+}
+
+// compressSpaces returns s with every run of spaces, tabs, newlines and
+// zero-width spaces replaced by one single space.
+func compressSpaces(s string) string {
+	out := make([]rune, 0, len(s))
+	prevBlank := false
+	for _, r := range s {
+		blank := r == ' ' || r == '\t' || r == '\n' || r == '\u200b' // U+200B is the zero width space
+		if !blank {
+			out = append(out, r)
+		} else if !prevBlank {
+			out = append(out, ' ')
+		}
+		prevBlank = blank
+	}
+	return string(out)
+}
+
+func TestHTMLescape(t *testing.T) {
+	tianchao := "兲朝 test"
+	expected := "&#20850;&#26397; test"
+
+	gotString := EscapeHTMLcomplexChars(tianchao)
+	if gotString != expected {
+		t.Errorf("EscapeHTMLcomplexChars error got `%s` expected `%s`\n", gotString, expected)
+	}
+
+	gotBytes := EscapeHTMLcomplexCharsByte([]byte(tianchao))
+	if string(gotBytes) != expected {
+		t.Errorf("EscapeHTMLcomplexCharsByte error got `%s` expected `%s`\n", string(gotBytes), expected)
+	}
+
+}
--- a/core/stringutil/initials.go
+++ b/core/stringutil/initials.go
@ -0,0 +1,34 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>.
+//
+// https://documize.com
+
+package stringutil
+
+import (
+	"strings"
+)
+
+// MakeInitials returns user initials from firstname and lastname: the first
+// character of each name, after surrounding white space has been trimmed,
+// upper-cased and concatenated. A name that is empty contributes nothing.
+func MakeInitials(firstname, lastname string) string {
+	// initial returns the first character of name. It works on runes so that a
+	// multi-byte first letter (e.g. "É") is kept whole instead of being cut to
+	// its first byte.
+	initial := func(name string) string {
+		runes := []rune(strings.TrimSpace(name))
+		if len(runes) == 0 {
+			return ""
+		}
+		return string(runes[0])
+	}
+
+	return strings.ToUpper(initial(firstname) + initial(lastname))
+}
--- a/core/stringutil/initials_test.go
+++ b/core/stringutil/initials_test.go
@ -0,0 +1,28 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>.
+//
+// https://documize.com
+
+package stringutil
+
+import "testing"
+
+// TestInitials covers both names present, either one missing and both missing.
+func TestInitials(t *testing.T) {
+	cases := []struct{ first, last, want string }{
+		{"Harvey", "Kandola", "HK"},
+		{"Harvey", "", "H"},
+		{"", "Kandola", "K"},
+		{"", "", ""},
+	}
+	for _, c := range cases {
+		in(t, c.first, c.last, c.want)
+	}
+}
+
+// in runs MakeInitials on the two names and reports a test error when the
+// result is not the expected initials.
+func in(t *testing.T, firstname, lastname, expecting string) {
+	if got := MakeInitials(firstname, lastname); got != expecting {
+		t.Errorf("expecting initials of `%s` got `%s`\n", expecting, got)
+	}
+}
--- a/core/stringutil/slug.go
+++ b/core/stringutil/slug.go
@ -0,0 +1,37 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>.
+//
+// https://documize.com
+
+package stringutil
+
+import (
+	"strings"
+	"unicode"
+)
+
+// MakeSlug creates a slug, suitable for use in a URL, from a string.
+//
+// Every rune is lower-cased; lower-case letters and digits are kept and any
+// run of other runes, however long, becomes a single '-'. The slug never
+// starts or ends with '-', so a string without any letter or digit gives "".
+func MakeSlug(str string) string {
+	slug := make([]rune, 0, len(str))
+	for _, r := range str {
+		r = unicode.ToLower(r)
+		if unicode.IsLower(r) || unicode.IsDigit(r) {
+			slug = append(slug, r)
+			continue
+		}
+		// Separator: emit at most one '-' per run, and none at the very start.
+		if n := len(slug); n > 0 && slug[n-1] != '-' {
+			slug = append(slug, '-')
+		}
+	}
+	// At most one '-' can be left over at the end.
+	return strings.TrimSuffix(string(slug), "-")
+}
--- a/core/stringutil/slug_test.go
+++ b/core/stringutil/slug_test.go
@ -0,0 +1,25 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>.
+//
+// https://documize.com
+
+package stringutil
+
+import "testing"
+
+// TestSlug checks lower-casing, hyphen collapsing and trimming in MakeSlug.
+func TestSlug(t *testing.T) {
+	cases := []struct{ in, want string }{
+		{" Zip--up ", "zip-up"},
+	}
+	for _, c := range cases {
+		st(t, c.in, c.want)
+	}
+}
+
+// st runs MakeSlug on in and reports a test error when the result differs
+// from out.
+func st(t *testing.T, in, out string) {
+	if got := MakeSlug(in); got != out {
+		t.Errorf("slug input `%s` got `%s` expected `%s`\n", in, got, out)
+	}
+}
--- a/core/stringutil/words.go
+++ b/core/stringutil/words.go
@ -0,0 +1,78 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>.
+//
+// https://documize.com
+
+package stringutil
+
+import (
+	"unicode"
+
+	nethtml "golang.org/x/net/html"
+)
+
+// Words returns a slice of words, where each word contains no whitespace, and each item of punctuation is its own word.
+// This functionality is provided to enable verification of the text extraction algorithm across different implemntations.
+func Words(ch HTML, inSqBr int, testMode bool) ([]string, int, error) {
+	txt, err := ch.Text(testMode)
+	if err != nil {
+		return nil, inSqBr, err
+	}
+	txt = nethtml.UnescapeString(txt)
+
+	words := []string{""}
+
+	for _, c := range txt {
+		if inSqBr > 0 {
+			switch c {
+			case ']':
+				inSqBr--
+			case '[':
+				inSqBr++
+			}
+		} else {
+			if c == rune(0x200B) { // http://en.wikipedia.org/wiki/Zero-width_space
+				if testMode {
+					c = ' ' // NOTE only replace with a space here if we are testing
+				}
+			}
+			if c != rune(0x200B) { // http://en.wikipedia.org/wiki/Zero-width_space
+				if c == '[' {
+					inSqBr = 1
+					words = append(words, "[") // open square bracket means potentially elided text
+					words = append(words, "")
+				} else {
+					inSqBr = 0
+					if unicode.IsPunct(c) || unicode.IsSymbol(c) || unicode.IsDigit(c) {
+						if words[len(words)-1] == "" {
+							words[len(words)-1] = string(c)
+						} else {
+							words = append(words, string(c))
+						}
+						words = append(words, "")
+					} else {
+						if unicode.IsGraphic(c) || unicode.IsSpace(c) {
+							if unicode.IsSpace(c) {
+								if words[len(words)-1] != "" {
+									words = append(words, "")
+								}
+							} else {
+								words[len(words)-1] += string(c)
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+	if !testMode { // add dummy punctuation if not in test mode to avoid incorrect sentance concatanation
+		words = append(words, ".")
+	}
+	return append(words, ""), inSqBr, nil // make sure there is always a blank entry at the end
+}
--- a/core/stringutil/words_test.go
+++ b/core/stringutil/words_test.go
@ -0,0 +1,57 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>.
+//
+// https://documize.com
+
+package stringutil
+
+import (
+	"sort"
+	"strings"
+	"testing"
+)
+
+// TestWords exercises Words with text inside and outside square brackets, with
+// punctuation, and with a bracket section already open on entry.
+func TestWords(t *testing.T) {
+	cases := []struct {
+		in     string
+		bktIn  int
+		isTest bool
+		out    string
+		bktOut int
+	}{
+		{" the quick brown fox jumps over the lazy dog [ ] [\u200b", 0, true,
+			"the quick brown fox jumps over the lazy dog [ [", 1},
+		{"the quick brown [ dog jumps over the lazy ] fox", 0, false,
+			"the quick brown [ fox .", 0},
+		{"the quick brown;fox;", 0, false,
+			"the quick brown ; fox ; .", 0},
+		{"the ] quick brown fox ", 1, true,
+			"quick brown fox", 0},
+	}
+	for _, c := range cases {
+		ws(t, c.in, c.bktIn, c.isTest, c.out, c.bktOut)
+	}
+}
+
+func ws(t *testing.T, in string, bktIn int, isTest bool, out string, bktOut int) {
+	wds := strings.Split(out, " ")
+	gotX, bo, e := Words(HTML(in), bktIn, isTest)
+	if e != nil {
+		t.Fatal(e)
+	}
+	if bo != bktOut {
+		t.Errorf("wrong bracket count returned: input `%s` bktIn %d bktOut %d\n", in, bktIn, bktOut)
+	}
+	got := make([]string, 0, len(gotX))
+	for _, v := range gotX { // remove empty entries
+		if v != "" {
+			got = append(got, v)
+		}
+	}
+	if len(got) != len(wds) {
+		t.Errorf("wrong number of words found: input `%s` got %d %v expected %d %v`\n", in, len(got), got, len(wds), wds)
+	} else {
+		sort.Strings(wds)
+		sort.Strings(got)
+		for i := range wds {
+			if wds[i] != got[i] {
+				t.Errorf("wrong word[%d]: input `%s` got %v expected %v\n", i, in, got, wds)
+			}
+		}
+	}
+}