1
0
Fork 0
mirror of https://github.com/documize/community.git synced 2025-08-08 23:15:29 +02:00

initial commit

This commit is contained in:
Harvey Kandola 2016-07-07 18:54:16 -07:00
commit 18933c6767
1841 changed files with 810642 additions and 0 deletions

4
wordsmith/README.md Normal file
View file

@ -0,0 +1,4 @@
Wordsmith
=========
Provides common code for all

View file

@ -0,0 +1,44 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
// Package api provides the definitions of types used for communication between different components of the Documize system.
package api
// DocumentConversionRequest is the payload handed to a Convert plugin:
// the file to be converted together with the options for the conversion.
type DocumentConversionRequest struct {
	Filename       string // name of the file to convert
	Filedata       []byte // raw contents of the file
	PageBreakLevel uint   // NOTE(review): presumably the heading level at which pages are split - confirm against the converter
	Token          string // authorisation token
}
// Page holds the contents of a single Documize page:
// a Body of html together with its Title and Level.
type Page struct {
	Level uint64 // overall document is level 1, <H1> => level 2
	Title string // title of the page
	Body  []byte // html body of the page
}
// EmbeddedFile holds the contents of a file embedded in a document.
type EmbeddedFile struct {
	ID   string
	Type string // file type, e.g. "txt"
	Name string // must have the same extension as the type e.g. Type="txt" Name="foo.txt"
	Data []byte // contents of the file
}
// DocumentConversionResponse is what a Convert plugin sends back
// once it has processed a DocumentConversionRequest.
type DocumentConversionResponse struct {
	Err           string         // error text reported by the plugin
	PagesHTML     []byte         // If empty, use Pages
	Pages         []Page         // the converted pages
	EmbeddedFiles []EmbeddedFile // files embedded in the document
	Excerpt       string
}

26
wordsmith/api/request.go Normal file
View file

@ -0,0 +1,26 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package api
// ConversionJobRequest carries the information needed to set up a conversion job.
type ConversionJobRequest struct {
	Job        string // identifies the job
	IndexDepth uint
	OrgID      string // identifies the organisation the job belongs to
}
// DocumentExport is the type exchanged with a document export plugin.
type DocumentExport struct {
	Filename string // name of the exported file
	Format   string // export format
	File     []byte // contents of the exported file
}

90
wordsmith/api/response.go Normal file
View file

@ -0,0 +1,90 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package api
import (
"github.com/documize/community/wordsmith/log"
"encoding/json"
"net/http"
)
// apiJSONResponse is the envelope marshalled to JSON and sent to a Documize client.
type apiJSONResponse struct {
	Code    int         // HTTP status code, repeated inside the body
	Success bool        // false for the error responses written by this package
	Message string      // human-readable description
	Data    interface{} // optional payload
}
// SetJSONResponse marks the response as UTF-8 encoded JSON by setting the
// "Content-Type" HTTP header on w.
func SetJSONResponse(w http.ResponseWriter) {
	headers := w.Header()
	headers.Set("Content-Type", "application/json; charset=utf-8")
}
// WriteError writes err to the http.ResponseWriter as a JSON error response,
// choosing the HTTP status from the error text: "BadRequest" => 400,
// "Unauthorized" => 401, "Forbidden" => 403, "NotFound" => 404 and anything
// else => 500. The same code is repeated inside the JSON body.
//
// A nil err is reported as a 500 with an empty message rather than panicking.
// Failures to marshal or write the response are logged, not returned.
func WriteError(w http.ResponseWriter, err error) {
	// Guard against a nil error, which would otherwise panic on err.Error().
	message := ""
	if err != nil {
		message = err.Error()
	}

	status := http.StatusInternalServerError
	switch message {
	case "BadRequest":
		status = http.StatusBadRequest
	case "Unauthorized":
		status = http.StatusUnauthorized
	case "Forbidden":
		status = http.StatusForbidden
	case "NotFound":
		status = http.StatusNotFound
	}

	response := apiJSONResponse{
		Code:    status,
		Success: false,
		Message: message,
		Data:    nil,
	}
	w.WriteHeader(status)

	// The local is named payload so that it does not shadow the json package.
	payload, marshalErr := json.Marshal(response)
	if marshalErr != nil {
		log.Error("json.Marshal", marshalErr)
	}
	if _, writeErr := w.Write(payload); writeErr != nil {
		log.Error("write to ResponseWriter", writeErr)
	}
}
// WriteErrorBadRequest reports a bad request (HTTP 400) to a Documize client,
// using the given string as the message of the JSON response.
// Failures to marshal or write the response are logged, not returned.
func WriteErrorBadRequest(w http.ResponseWriter, message string) {
	response := apiJSONResponse{
		Code:    400,
		Success: false,
		Message: message,
		Data:    nil,
	}
	w.WriteHeader(http.StatusBadRequest)

	payload, marshalErr := json.Marshal(response)
	if marshalErr != nil {
		log.Error("json.Marshal", marshalErr)
	}
	if _, writeErr := w.Write(payload); writeErr != nil {
		log.Error("write to ResponseWriter", writeErr)
	}
}

View file

@ -0,0 +1,119 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
// Package environment allows environment variables to be obtained from either the environment or the command line.
// Environment variables are always uppercase, with the Prefix; flags are always lowercase without.
package environment
import (
"flag"
"fmt"
"os"
"sort"
"strings"
)
// CallbackT is the type signature of the callback function of GetString().
// Parse calls it with the variable's target pointer and its (lower-case) name;
// a true result makes Parse record the value as set by "setting:<name>".
type CallbackT func(*string, string) bool

// varT describes one variable registered through GetString().
type varT struct {
	target   *string   // where the current value lives
	name     string    // lower-case flag name
	setter   string    // label describing what supplied the value
	value    string    // value recorded at registration time
	required bool      // Parse exits if a required variable is empty
	callback CallbackT // optional hook run by Parse
}

// varsT is the collection of registered variables; it implements
// sort.Interface, ordering the entries by name.
type varsT struct {
	vv []varT
}

// vars holds every variable registered so far.
var vars varsT

// Len is part of sort.Interface.
func (vs *varsT) Len() int {
	return len(vs.vv)
}

// Swap is part of sort.Interface.
func (vs *varsT) Swap(i, j int) {
	held := vs.vv[i]
	vs.vv[i] = vs.vv[j]
	vs.vv[j] = held
}

// Less is part of sort.Interface.
func (vs *varsT) Less(i, j int) bool {
	left, right := vs.vv[i].name, vs.vv[j].name
	return left < right
}
const (
	// Prefix provides the prefix for all Environment variables: it is put in
	// front of the upper-cased flag name to form the environment variable name.
	Prefix = "DOCUMIZE"

	// goInit is the setter label used when a value comes from the Go
	// initialized default rather than from the environment.
	goInit = "(default)"
)
// GetString registers a string setting for later use; it must be called before Parse(), usually in an init().
//
// name is trimmed and lower-cased to give the flag name; the matching
// environment variable is Prefix followed by the upper-cased name. The flag's
// default is the environment variable's value when that is non-empty, otherwise
// the value already held in *target. required and callback are stored for Parse().
func GetString(target *string, name string, required bool, usage string, callback CallbackT) {
	name = strings.ToLower(strings.TrimSpace(name))

	envName := Prefix + strings.ToUpper(name)
	initial, source := os.Getenv(envName), envName
	if initial == "" {
		// nothing in the environment: use the Go initialized value
		initial, source = *target, goInit
	}

	flag.StringVar(target, name, initial, usage)
	vars.vv = append(vars.vv, varT{
		target:   target,
		name:     name,
		setter:   source,
		value:    initial,
		required: required,
		callback: callback,
	})
}
// showSettings is the "-showsettings" flag; when it is true, Parse prints every
// non-empty setting that did not come from the Go default to stdout.
var showSettings = flag.Bool("showsettings", false, "if true, show settings in the log (WARNING: these settings may include passwords)")
// Parse calls flag.Parse() then checks that the required environment variables are all set.
// It should be the first thing called by any main() that uses this library.
// If all the required variables are not present, it prints an error and calls os.Exit(2) like flag.Parse().
//
// The registered variables are sorted by name and processed in two passes:
// the variable whose name equals doFirst is handled in pass 1, all the others
// in pass 2. For each variable the setter label is updated, the callback (if
// any) is run, required values are checked, and the setting is optionally
// printed when the "-showsettings" flag is true.
func Parse(doFirst string) {
flag.Parse()
sort.Sort(&vars)
for pass := 1; pass <= 2; pass++ {
for vi, v := range vars.vv {
// pass 1 handles only doFirst, pass 2 handles everything else
if (pass == 1 && v.name == doFirst) || (pass == 2 && v.name != doFirst) {
typ := "Optional"
// the target no longer holds the value recorded by GetString, so label it as set by the flag
if v.value != *(v.target) || (v.value != "" && *(v.target) == "") {
vars.vv[vi].setter = "-" + v.name // v is a local copy, not the underlying data
}
if v.callback != nil {
// a true result from the callback relabels the value as an application setting
if v.callback(v.target, v.name) {
vars.vv[vi].setter = "setting:" + v.name // v is a local copy, not the underlying data
}
}
if v.required {
if *(v.target) == "" {
// a required value is missing: list every required variable on stderr, show usage and exit
fmt.Fprintln(os.Stderr)
fmt.Fprintln(os.Stderr, "In order to run", os.Args[0], "the following must be provided:")
for _, vv := range vars.vv {
if vv.required {
fmt.Fprintf(os.Stderr, "* setting from environment variable '%s' or flag '-%s' or an application setting '%s', current value: '%s' set by '%s'\n",
Prefix+strings.ToUpper(vv.name), vv.name, vv.name, *(vv.target), vv.setter)
}
}
fmt.Fprintln(os.Stderr)
flag.Usage()
os.Exit(2)
return // not reached: os.Exit does not return
}
typ = "Required"
}
if *showSettings {
// only non-empty values that did not come from the Go default are shown
if *(v.target) != "" && vars.vv[vi].setter != goInit {
fmt.Fprintf(os.Stdout, "%s setting from '%s' is: '%s'\n",
typ, vars.vv[vi].setter, *(v.target))
}
}
}
}
}
}

78
wordsmith/log/logger.go Normal file
View file

@ -0,0 +1,78 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
// Package log provides centralized logging for the Documize application.
package log
import (
"bytes"
"fmt"
"os"
"runtime"
log "github.com/Sirupsen/logrus"
env "github.com/documize/community/wordsmith/environment"
)
// environment is the value of the "env" field attached to every log entry;
// it is registered as the "log" setting so that it can be overridden.
var environment = "Non-production"

// init configures the logger (text format, stdout, debug level) and registers
// the "log" setting together with a callback that announces the environment.
func init() {
	log.SetFormatter(&log.TextFormatter{})
	log.SetOutput(os.Stdout)
	log.SetLevel(log.DebugLevel)

	announce := func(*string, string) bool {
		log.Infoln(environment + " environment logging enabled")
		return false
	}
	env.GetString(&environment, "log", false, "system being logged e.g. 'PRODUCTION'", announce)
}
// Debug logs message at debug level, tagged with the current environment.
func Debug(message string) {
	fields := log.Fields{"env": environment}
	log.WithFields(fields).Debug(message)
}
// Info logs message at info level, tagged with the current environment.
func Info(message string) {
	fields := log.Fields{"env": environment}
	log.WithFields(fields).Info(message)
}
// TestIfErr is used by the test code to signal that a test being run should error.
// A test sets it to true beforehand; ErrorString, and Error when given a non-nil
// error, reset it to false, so a value that is still true means nothing was logged.
var TestIfErr bool
// ErrorString logs message at error level for cases where there is no error
// value to report. It also resets TestIfErr.
func ErrorString(message string) {
	TestIfErr = false
	entry := log.WithFields(log.Fields{"env": environment})
	entry.Error(message)
}
// Error logs err, if non-nil, with a message to give some context.
// The entry carries the environment, the error text and a dump of the current
// goroutine's stack (at most 4096 bytes). It also resets TestIfErr.
// A nil err is ignored.
func Error(message string, err error) {
	if err == nil {
		return
	}
	TestIfErr = false

	trace := make([]byte, 4096)
	runtime.Stack(trace, false)
	// remove trailing nulls from stack dump
	if end := bytes.IndexByte(trace, 0); end > 0 {
		trace = trace[:end]
	}

	fields := log.Fields{
		"env":   environment,
		"error": err.Error(),
		"stack": fmt.Sprintf("%s", trace),
	}
	log.WithFields(fields).Error(message)
}
// IfErr logs err if it is non-nil.
// It is shorthand for Error() with an empty context message.
func IfErr(err error) {
	const noContext = ""
	Error(noContext, err)
}

View file

@ -0,0 +1,66 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import (
"path/filepath"
"strings"
"unicode"
)
// BeautifyFilename takes a filename and attempts to turn it into a readable form,
// as TitleCase natural language, suitable for the top level of a Document.
//
// The directory part and the final extension are dropped; a name without any
// '.' is kept whole. Every character that is not a letter, a digit or '.'
// becomes a space, a space is inserted in front of each upper-case letter that
// starts a new word (e.g. "DooDah" => "Doo Dah"), and the first letter of each
// word is upper-cased.
func BeautifyFilename(fn string) string {
	_, file := filepath.Split(fn)

	// Strip only the last extension. Names with no '.' at all are left intact
	// (they used to be reduced to the empty string).
	if dot := strings.LastIndex(file, "."); dot >= 0 {
		file = file[:dot]
	}
	src := []rune(file)

	// make any non-letter/digit characters space
	for i, c := range src {
		if !(unicode.IsLetter(c) || unicode.IsDigit(c) || c == '.') {
			src[i] = ' '
		}
	}

	// insert spaces in front of any Upper/Lower 2-letter combinations, unless
	// the pair is at the very start or already follows a space
	spaced := make([]rune, 0, len(src)+len(src)/2)
	for i, c := range src {
		startsWord := i >= 1 && i+1 < len(src) &&
			unicode.IsUpper(c) && unicode.IsLower(src[i+1]) && src[i-1] != ' '
		if startsWord {
			spaced = append(spaced, ' ')
		}
		spaced = append(spaced, c)
	}

	// make the first letter of each word upper case; index 1 is deliberately
	// left alone, matching the long-standing behaviour for a leading space
	for i := range spaced {
		if i == 0 || (i > 1 && spaced[i-1] == ' ') {
			spaced[i] = unicode.ToUpper(spaced[i])
		}
	}
	return string(spaced)
}

View file

@ -0,0 +1,25 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import "testing"
// TestBeautify checks BeautifyFilename against known input/output pairs.
func TestBeautify(t *testing.T) {
	cases := []struct{ in, want string }{
		{"DooDah$day.zip", "Doo Dah Day"},
	}
	for _, c := range cases {
		bs(t, c.in, c.want)
	}
}
// bs reports a test error unless BeautifyFilename(in) equals out.
func bs(t *testing.T, in, out string) {
	if got := BeautifyFilename(in); got != out {
		t.Errorf("BeautifyFilename input `%s` got `%s` expected `%s`\n", in, got, out)
	}
}

View file

@ -0,0 +1,59 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import (
"bytes"
"github.com/documize/community/wordsmith/log"
"errors"
"os/exec"
"time"
)
// errTimeout is returned by CommandWithTimeout when the command did not finish
// within the allowed time.
var errTimeout = errors.New("conversion timelimit exceeded")
// CommandWithTimeout runs a command but stops it if it does not finish within the given timeout.
//
// Standard output and standard error of the command are captured together.
// On success the captured output is returned. If the command fails, the output
// captured so far is returned along with the error. If the timeout expires the
// process is killed and errTimeout is returned with no output.
func CommandWithTimeout(command *exec.Cmd, timeout time.Duration) ([]byte, error) {
	var output bytes.Buffer
	command.Stdout = &output
	command.Stderr = &output

	if err := command.Start(); err != nil {
		return nil, err
	}

	// Buffered so that the waiting goroutine can always deliver its result and exit.
	done := make(chan error, 1)
	go func() {
		done <- command.Wait()
	}()

	// An explicit timer, unlike time.After, can be stopped as soon as the
	// command finishes instead of lingering until the timeout elapses.
	timer := time.NewTimer(timeout)
	defer timer.Stop()

	select {
	case <-timer.C:
		if err := command.Process.Kill(); err != nil {
			log.Error("failed to kill: ", err)
		}
		<-done // wait for Wait() to return so the goroutine is not leaked
		return nil, errTimeout
	case err := <-done:
		if err != nil {
			return output.Bytes(), err
		}
		if !command.ProcessState.Success() {
			return nil, errors.New(output.String())
		}
		return output.Bytes(), nil
	}
}

View file

@ -0,0 +1,39 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import "testing"
import "os/exec"
import "time"
// TestCmd exercises CommandWithTimeout with a command that succeeds, a command
// that cannot be started and a command that runs past its timeout.
func TestCmd(t *testing.T) {
	// successful command: its output is returned
	out, err := CommandWithTimeout(exec.Command("echo", "test"), time.Second)
	if err != nil {
		t.Error(err)
		return
	}
	if got := string(out); got != "test\n" {
		t.Error("command did not return `test` it returned:" + got)
	}

	// a command that does not exist must produce an error
	if _, err := CommandWithTimeout(exec.Command("dingbat doodah"), time.Second); err == nil {
		t.Error("bad command did not return an error")
	}

	// a long-running command must be stopped at the timeout
	const limit = 5 * time.Second
	if _, err := CommandWithTimeout(exec.Command("sleep", "50"), limit); err != errTimeout {
		t.Error("sleep command did not timeout:", err)
	}
}

View file

@ -0,0 +1,26 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import "io"
import "github.com/documize/community/wordsmith/log"
// Close is a convenience function to close an io.Closer, usually in a defer.
// Values that are nil or do not implement io.Closer are ignored; an error
// returned by Close() is logged.
func Close(f interface{}) {
	if f == nil {
		return
	}
	closer, isCloser := f.(io.Closer)
	if !isCloser || closer == io.Closer(nil) {
		return
	}
	log.IfErr(closer.Close())
}

View file

@ -0,0 +1,25 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import "testing"
import "os"
import "github.com/documize/community/wordsmith/log"
// TestDefclose checks that closing a nil *os.File through Close() logs an
// error, which is observed through log.TestIfErr being reset.
func TestDefclose(t *testing.T) {
	log.TestIfErr = true

	var nilFile *os.File
	Close(nilFile)

	if log.TestIfErr {
		t.Error("Close() did not error when it should have")
	}
}

13
wordsmith/utility/doc.go Normal file
View file

@ -0,0 +1,13 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
// Package utility contains utility functions used by the whole Documize ecosystem.
package utility

164
wordsmith/utility/html.go Normal file
View file

@ -0,0 +1,164 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import (
"bytes"
"fmt"
"io"
"strings"
"unicode/utf8"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"github.com/documize/community/wordsmith/log"
)
// HTML describes a chunk of HTML held as a string; its Text() method returns
// the plain text found in the chunk.
type HTML string
// writeText writes out the textual element of the html node, if present, then iterates through the child nodes.
//
// Nodes rejected by excluded() are skipped together with their children.
// A text node is written followed by a zero-width space. For any other node
// the children are written first, then:
//   - a <documize> element with a "type" attribute writes a bracket marker
//     (only when isTest is true) and returns without a trailing space;
//   - span, u, b, i, del, sub and sup elements write nothing more;
//   - every other element with a known atom writes a single space.
//
// Write errors are logged, not returned.
func writeText(n *html.Node, b io.Writer, isTest bool) {
if !excluded(n) {
switch n.Type {
case html.TextNode:
_, err := b.Write([]byte(n.Data + string(rune(0x200B)))) // + http://en.wikipedia.org/wiki/Zero-width_space
if err != nil {
log.Error("write TextNode", err)
}
// TODO This use of zero-width-space (subsequently replaced by ' ' or ignored, depending on context)
// TODO works well for in-word breaks, but at the expense of concatenating some words in error.
// TODO It may be that better examination of the HTML structure could be used to determine
// TODO when a space is, or is not, required. In that event we would not use zero-width-space.
default:
// write the children first, then decide what follows this element
for c := n.FirstChild; c != nil; c = c.NextSibling {
writeText(c, b, isTest)
}
switch n.DataAtom {
case 0:
// atom 0 means the tag has no known atom; only the custom "documize" tag is handled
if n.Data == "documize" {
for _, a := range n.Attr {
if a.Key == "type" {
if isTest {
var err error
switch a.Val {
case "field-start":
_, err = b.Write([]byte(" [ "))
case "field-end":
_, err = b.Write([]byte(" ] "))
default:
_, err = b.Write([]byte(" [ ] "))
}
if err != nil {
log.Error("write []", err)
}
}
// the first "type" attribute ends processing of this node
return
}
}
}
case atom.Span, atom.U, atom.B, atom.I, atom.Del, atom.Sub, atom.Sup:
//NoOp
default:
_, err := b.Write([]byte(" ")) // add a space after each main element
if err != nil {
log.Error("write space", err)
}
}
}
}
}
// excluded reports whether n is a <div> carrying one of the Documize classes
// whose content is left out of the extracted text.
func excluded(n *html.Node) bool {
	if n.DataAtom != atom.Div {
		return false
	}
	for _, attr := range n.Attr {
		if attr.Key != "class" {
			continue
		}
		switch attr.Val {
		case "documize-first-page",
			"documize-exotic-image",
			"documize-footnote",
			"documize-graphictext",
			"documize-math":
			return true
		}
	}
	return false
}
// findBody returns the first <body> node found in a depth-first search of the
// tree rooted at n, or nil if there is none. Required to bypass the page title text.
func findBody(n *html.Node) *html.Node {
	if n.DataAtom == atom.Body {
		return n
	}
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		if found := findBody(child); found != nil {
			return found
		}
	}
	return nil
}
// Text returns only the plain text elements of the HTML chunk,
// for use in the TOC or for text indexing.
// The chunk is parsed as HTML and the text is taken from the <body> element,
// or from the whole parsed document when no body is found. isTest is passed
// on to the text writer, where it enables the bracket markers for <documize>
// fields. An error is returned only when the HTML cannot be parsed.
func (ch HTML) Text(isTest bool) (string, error) {
	doc, err := html.Parse(strings.NewReader(string(ch)))
	if err != nil {
		return "", err
	}

	root := findBody(doc)
	if root == nil {
		root = doc
	}

	var text bytes.Buffer
	writeText(root, &text, isTest)
	return text.String(), nil
}
// EscapeHTMLcomplexChars looks for "complex" characters within HTML
// and replaces them with the HTML escape codes which describe them.
// "Complex" characters are those encoded in more than one byte by UTF8;
// each is replaced by its decimal numeric character reference ("&#NNN;").
// Single-byte characters are copied unchanged.
func EscapeHTMLcomplexChars(s string) string {
	// A Builder avoids re-copying the whole result for every character,
	// which string concatenation in a loop would do.
	var ret strings.Builder
	ret.Grow(len(s))
	for _, r := range s {
		if utf8.RuneLen(r) > 1 {
			fmt.Fprintf(&ret, "&#%d;", r)
		} else {
			ret.WriteRune(r)
		}
	}
	return ret.String()
}
// EscapeHTMLcomplexCharsByte looks for "complex" characters within HTML
// and replaces them with the HTML escape codes which describe them.
// "Complex" characters are those encoded in more than one byte by UTF8;
// each is replaced by its decimal numeric character reference ("&#NNN;").
// Single-byte characters are copied unchanged.
func EscapeHTMLcomplexCharsByte(b []byte) []byte {
	var ret bytes.Buffer
	for len(b) > 0 {
		r, size := utf8.DecodeRune(b)
		if utf8.RuneLen(r) > 1 {
			fmt.Fprintf(&ret, "&#%d;", r)
		} else {
			// bytes.Buffer.Write is documented to always return a nil error,
			// so there is nothing to check here.
			ret.Write(b[:size])
		}
		b = b[size:]
	}
	return ret.Bytes()
}

View file

@ -0,0 +1,83 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import "testing"
// TestHTML checks the text extracted from HTML chunks, comparing the result
// with the expected text after runs of white space have been compressed.
func TestHTML(t *testing.T) {
	type testConv struct {
		htm, txt string
		istest   bool
	}
	convTest := []testConv{
		{
			htm:    `<html><head><title>HTML TITLE</title></head><body><p>This <I>is</I>:</p><ul><li><a href="foo">Example</a><li><a href="/bar/baz">HTML text.</a><div class="documize-math">exclueded</div></ul></body></html>`,
			txt:    "This is : Example HTML text. ",
			istest: false,
		},
		{
			htm:    `<p>This is:</p><ul><li><documize type="field-start"></documize> <documize type="field-end"></documize><documize type="unknown"></documize><li><a href="/bar/baz">HTML text.</a></ul>`,
			txt:    "This is: [ ] [ ] HTML text. ",
			istest: true,
		},
	}
	for _, tc := range convTest {
		chunk := HTML(tc.htm)
		text, err := chunk.Text(tc.istest)
		if err != nil {
			t.Log(err)
			t.Fail()
		}
		want, have := compressSpaces(tc.txt), compressSpaces(text)
		if want != have {
			t.Errorf("Conversion to text for `%s`, expected: `%s` got: `%s`\n",
				chunk, want, have)
		}
	}
}
// compressSpaces replaces every run of spaces, tabs, newlines and zero-width
// spaces in s with a single space, leaving all other characters unchanged.
func compressSpaces(s string) string {
	out := make([]rune, 0, len(s))
	prevSpace := false
	for _, r := range s {
		isSpace := r == ' ' || r == '\t' || r == '\n' || r == '\u200b' /*zero width space*/
		if !isSpace {
			out = append(out, r)
		} else if !prevSpace {
			out = append(out, ' ')
		}
		prevSpace = isSpace
	}
	return string(out)
}
// TestHTMLescape checks that both escaping functions turn multi-byte
// characters into numeric character references.
func TestHTMLescape(t *testing.T) {
	const (
		input = "兲朝 test"
		want  = "&#20850;&#26397; test"
	)
	if got := EscapeHTMLcomplexChars(input); got != want {
		t.Errorf("EscapeHTMLcomplexChars error got `%s` expected `%s`\n", got, want)
	}
	if got := string(EscapeHTMLcomplexCharsByte([]byte(input))); got != want {
		t.Errorf("EscapeHTMLcomplexCharsByte error got `%s` expected `%s`\n", got, want)
	}
}

View file

@ -0,0 +1,88 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import (
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"encoding/base64"
"errors"
"io"
)
// key is the hard-wired AES key used by MakeAES and DecryptAES.
// NOTE(review): the key is compiled into the source, so every installation
// shares it - consider loading it from configuration instead.
var key = []byte("8456FHkQW1566etydT46jk39ghjfFhg4") // 32 bytes
// MakeMD5 returns the MD5 hash of a given string, usually a password.
/*
func MakeMD5(password string) []byte {
hash := md5.New()
if _, err := io.WriteString(hash, password); err != nil {
log.Error("error in MakeMD5", err)
}
return hash.Sum(nil)
}
*/
// MakeAES creates an AES encryption of a given string,
// using a hard-wired key value,
// suitable for use as an authentication token.
// The secret is base64 encoded, then encrypted in CFB mode with a random
// initialisation vector; the result is the vector followed by the ciphertext.
func MakeAES(secret string) ([]byte, error) {
	block, err := aes.NewCipher(key)
	if err != nil {
		return nil, err
	}

	encoded := EncodeBase64([]byte(secret))
	out := make([]byte, aes.BlockSize+len(encoded))
	iv, body := out[:aes.BlockSize], out[aes.BlockSize:]
	if _, err = io.ReadFull(rand.Reader, iv); err != nil {
		return nil, err
	}

	cipher.NewCFBEncrypter(block, iv).XORKeyStream(body, encoded)
	return out, nil
}
// DecryptAES decrypts an AES encoded []byte,
// using a hard-wired key value,
// suitable for use when reading an authentication token.
// text must start with the initialisation vector (one AES block), followed by
// the CFB-encrypted, base64 encoded secret. An error is returned when the
// cipher cannot be created, when text is shorter than one block, or when the
// decrypted data is not valid base64. text itself is not modified.
func DecryptAES(text []byte) ([]byte, error) {
	block, err := aes.NewCipher(key)
	if err != nil {
		return nil, errors.New("aes.NewCipher failure: " + err.Error())
	}
	if len(text) < aes.BlockSize {
		return nil, errors.New("ciphertext too short")
	}

	iv, ciphertext := text[:aes.BlockSize], text[aes.BlockSize:]

	// Decrypt into a fresh buffer: decrypting in place would overwrite the
	// caller's token, making it unusable for a second call.
	plain := make([]byte, len(ciphertext))
	cipher.NewCFBDecrypter(block, iv).XORKeyStream(plain, ciphertext)
	return DecodeBase64(plain)
}
// EncodeBase64 is a convenience function that returns the standard
// (StdEncoding) base64 encoding of b as a byte slice.
func EncodeBase64(b []byte) []byte {
	enc := base64.StdEncoding
	out := make([]byte, enc.EncodedLen(len(b)))
	enc.Encode(out, b)
	return out
}
// EncodeBase64AsString is a convenience function to encode using StdEncoding.
/*
func EncodeBase64AsString(b []byte) string {
return base64.StdEncoding.EncodeToString(b)
}
*/
// DecodeBase64 is a convenience function that decodes standard (StdEncoding)
// base64 data, returning the decoded bytes and any decoding error.
func DecodeBase64(b []byte) ([]byte, error) {
	enc := base64.StdEncoding
	out := make([]byte, enc.DecodedLen(len(b)))
	n, err := enc.Decode(out, b)
	return out[:n], err
}

View file

@ -0,0 +1,35 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import "testing"
// TestSecrets checks that a secret survives a MakeAES/DecryptAES round trip
// and that decrypting an empty ciphertext is reported as an error.
func TestSecrets(t *testing.T) {
	const secret = "007"

	sealed, err := MakeAES(secret)
	if err != nil {
		t.Fatal(err)
	}
	opened, err := DecryptAES(sealed)
	if err != nil {
		t.Fatal(err)
	}
	if got := string(opened); secret != got {
		t.Errorf("wanted %s got %s", secret, got)
	}

	if _, err := DecryptAES([]byte{}); err == nil {
		t.Error("should have errored on empty cypher")
	}
}

37
wordsmith/utility/slug.go Normal file
View file

@ -0,0 +1,37 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import (
"strings"
"unicode"
)
// MakeSlug creates a slug, suitable for use in a URL, from a string.
//
// Every rune is lower-cased; lower-case letters and digits are kept, and each
// run of any other characters becomes a single '-'. Leading and trailing
// separators are dropped, so the result never starts or ends with '-' and
// never contains "--", however long the run of separators in the input.
func MakeSlug(str string) string {
	var slug strings.Builder
	slug.Grow(len(str))

	// pendingDash remembers that separators were seen; the '-' is only written
	// once another kept character follows, which trims and collapses in one pass.
	pendingDash := false
	for _, r := range str {
		r = unicode.ToLower(r)
		if !unicode.IsLower(r) && !unicode.IsDigit(r) {
			pendingDash = true
			continue
		}
		if pendingDash && slug.Len() > 0 {
			slug.WriteByte('-')
		}
		pendingDash = false
		slug.WriteRune(r)
	}
	return slug.String()
}

View file

@ -0,0 +1,25 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import "testing"
// TestSlug checks MakeSlug against known input/output pairs.
func TestSlug(t *testing.T) {
	cases := []struct{ in, want string }{
		{" Zip--up ", "zip-up"},
	}
	for _, c := range cases {
		st(t, c.in, c.want)
	}
}
// st reports a test error unless MakeSlug(in) equals out.
func st(t *testing.T, in, out string) {
	if got := MakeSlug(in); got != out {
		t.Errorf("slug input `%s` got `%s` expected `%s`\n", in, got, out)
	}
}

34
wordsmith/utility/user.go Normal file
View file

@ -0,0 +1,34 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import (
"strings"
)
// MakeInitials returns user initials from firstname and lastname.
// Each name is trimmed of surrounding white space and contributes its first
// character, upper-cased; an empty name contributes nothing.
func MakeInitials(firstname, lastname string) string {
	// initial returns the first character of name as a string. Ranging over
	// the string yields whole runes, so a multi-byte first letter such as 'é'
	// is not cut in half as byte slicing would do.
	initial := func(name string) string {
		for _, r := range strings.TrimSpace(name) {
			return string(r)
		}
		return ""
	}
	return strings.ToUpper(initial(firstname) + initial(lastname))
}

View file

@ -0,0 +1,28 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import "testing"
// TestInitials checks MakeInitials with full, partial and empty names.
func TestInitials(t *testing.T) {
	cases := []struct{ first, last, want string }{
		{"Harvey", "Kandola", "HK"},
		{"Harvey", "", "H"},
		{"", "Kandola", "K"},
		{"", "", ""},
	}
	for _, c := range cases {
		in(t, c.first, c.last, c.want)
	}
}
// in reports a test error unless MakeInitials(firstname, lastname) equals expecting.
func in(t *testing.T, firstname, lastname, expecting string) {
	if got := MakeInitials(firstname, lastname); got != expecting {
		t.Errorf("expecting initials of `%s` got `%s`\n", expecting, got)
	}
}

View file

@ -0,0 +1,75 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import "unicode"
import nethtml "golang.org/x/net/html"
// Words returns a slice of words, where each word contains no whitespace, and each item of punctuation is its own word.
// This functionality is provided to enable verification of the text extraction algorithm across different implementations.
//
// ch is converted to plain text with Text(testMode) and HTML entities are unescaped.
// inSqBr is the square-bracket nesting depth on entry; while it is greater than
// zero, characters are skipped and only '[' and ']' adjust the depth. The depth
// at the end of the text is returned as the second result so that it can be fed
// into a following call. The returned slice may contain empty strings and always
// ends with one.
func Words(ch HTML, inSqBr int, testMode bool) ([]string, int, error) {
txt, err := ch.Text(testMode)
if err != nil {
return nil, inSqBr, err
}
txt = nethtml.UnescapeString(txt)
// the last element of words is always the word currently being built
words := []string{""}
for _, c := range txt {
if inSqBr > 0 {
// inside square brackets: skip the text, only track the nesting depth
switch c {
case ']':
inSqBr--
case '[':
inSqBr++
}
} else {
if c == rune(0x200B) { // http://en.wikipedia.org/wiki/Zero-width_space
if testMode {
c = ' ' // NOTE only replace with a space here if we are testing
}
}
// outside test mode a zero-width space is dropped without ending the current word
if c != rune(0x200B) { // http://en.wikipedia.org/wiki/Zero-width_space
if c == '[' {
inSqBr = 1
words = append(words, "[") // open square bracket means potentially elided text
words = append(words, "")
} else {
inSqBr = 0
// punctuation, symbols and digits each form a word of their own
if unicode.IsPunct(c) || unicode.IsSymbol(c) || unicode.IsDigit(c) {
if words[len(words)-1] == "" {
words[len(words)-1] = string(c)
} else {
words = append(words, string(c))
}
words = append(words, "")
} else {
// non-graphic, non-space characters are ignored
if unicode.IsGraphic(c) || unicode.IsSpace(c) {
if unicode.IsSpace(c) {
// white space ends the current word, if there is one
if words[len(words)-1] != "" {
words = append(words, "")
}
} else {
words[len(words)-1] += string(c)
}
}
}
}
}
}
}
if !testMode { // add dummy punctuation if not in test mode to avoid incorrect sentence concatenation
words = append(words, ".")
}
return append(words, ""), inSqBr, nil // make sure there is always a blank entry at the end
}

View file

@ -0,0 +1,57 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import (
"sort"
"strings"
"testing"
)
// TestWords runs Words over a handful of inputs covering square-bracket
// elision, punctuation splitting, a carried-in bracket depth and the
// zero-width space, checking the words and the bracket depth returned.
func TestWords(t *testing.T) {
	cases := []struct {
		in     string // text handed to Words
		bktIn  int    // bracket depth passed in
		isTest bool   // testMode flag passed to Words
		out    string // expected words, space separated
		bktOut int    // expected bracket depth returned
	}{
		{" the quick brown fox jumps over the lazy dog [ ] [" + string(rune(0x200B)), 0, true,
			"the quick brown fox jumps over the lazy dog [ [", 1},
		{"the quick brown [ dog jumps over the lazy ] fox", 0, false,
			"the quick brown [ fox .", 0},
		{"the quick brown;fox;", 0, false,
			"the quick brown ; fox ; .", 0},
		{"the ] quick brown fox ", 1, true,
			"quick brown fox", 0},
	}
	for _, c := range cases {
		ws(t, c.in, c.bktIn, c.isTest, c.out, c.bktOut)
	}
}
// ws is the checking helper used by TestWords. It runs Words over in with the
// given starting bracket depth (bktIn) and test-mode flag (isTest), then
// verifies that:
//   - the returned bracket depth equals bktOut, and
//   - the non-empty words returned match the space-separated words in out.
//
// NOTE(review): both word lists are sorted before being compared, so the
// order in which Words produced them is not verified - presumably deliberate;
// confirm before tightening.
func ws(t *testing.T, in string, bktIn int, isTest bool, out string, bktOut int) {
	wds := strings.Split(out, " ")
	gotX, bo, e := Words(HTML(in), bktIn, isTest)
	if e != nil {
		t.Fatal(e)
	}
	if bo != bktOut {
		// Report the value actually returned as well as the expected one.
		t.Errorf("wrong bracket count returned: input `%s` bktIn %d got %d expected %d\n", in, bktIn, bo, bktOut)
	}
	got := make([]string, 0, len(gotX))
	for _, v := range gotX { // remove empty entries
		if v != "" {
			got = append(got, v)
		}
	}
	if len(got) != len(wds) {
		t.Errorf("wrong number of words found: input `%s` got %d %v expected %d %v\n", in, len(got), got, len(wds), wds)
		return
	}
	sort.Strings(wds)
	sort.Strings(got)
	for i := range wds {
		if wds[i] != got[i] {
			t.Errorf("wrong word[%d]: input `%s` got %v expected %v\n", i, in, got, wds)
		}
	}
}

23406
wordsmith/wordlists/en-2012/en-s.log Executable file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,4 @@
Total files: 23406
Unique word count: 521426
Total word count: 145376051
Overall word count: 193225723

456631
wordsmith/wordlists/en-2012/en.txt Executable file

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,149 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
// Package main creates ordered lists of English words and their stems,
// based on their frequency.
package main
import (
"bytes"
"fmt"
"io/ioutil"
"sort"
"github.com/rookii/paicehusk"
)
type (
	// wordFreqEntry holds the frequency information gathered for one word.
	wordFreqEntry struct {
		rawFreq int     // occurrence count as read from the word list
		Freq    float64 // rawFreq as a fraction of the total count
	}

	// wordFreqMap maps a word to its frequency information.
	wordFreqMap map[string]wordFreqEntry
)
// wordFreqSortEntry pairs a word with its relative frequency for ranking.
type wordFreqSortEntry struct {
	Name string  // the word
	Freq float64 // relative frequency, larger is more frequent
}

// wordFreqSort is a list of words that implements sort.Interface, ordering
// the most frequent word first.
type wordFreqSort []wordFreqSortEntry

// Len is the number of elements in the collection.
func (wfs wordFreqSort) Len() int { return len(wfs) }

// Less reports whether the element with index i should sort before the
// element with index j: higher frequency first, and entries with equal
// frequency ordered by Name so that the resulting ranking does not depend on
// the order in which the entries were collected.
func (wfs wordFreqSort) Less(i, j int) bool {
	if wfs[i].Freq != wfs[j].Freq {
		return wfs[i].Freq > wfs[j].Freq
	}
	return wfs[i].Name < wfs[j].Name
}

// Swap swaps the elements with indexes i and j.
func (wfs wordFreqSort) Swap(i, j int) { wfs[j], wfs[i] = wfs[i], wfs[j] }
// main reads the word list in ./en-2012/en.txt, where each usable line is a
// word followed by a space and its occurrence count, converts the counts into
// relative frequencies, ranks the words by frequency and hands the result to
// writeWords. Any failure to read the input panics.
func main() {
	data, err := ioutil.ReadFile("./en-2012/en.txt")
	if err != nil {
		panic(err)
	}

	freqs := make(wordFreqMap)
	total := 0 // sum of the counts on every usable line, not just the kept ones
	for lineNo, line := range bytes.Split(data, []byte("\n")) {
		fields := bytes.Split(line, []byte(" "))
		if len(fields) < 2 {
			continue
		}
		var count int
		if _, scanErr := fmt.Sscanf(string(fields[1]), "%d", &count); scanErr != nil {
			continue
		}
		if len(fields[0]) == 0 {
			continue
		}
		// Only look at the most common 10k words, 100k makes go compile/link unworkable.
		if lineNo < 10000 {
			word := string(fields[0]) // NOTE not stemming at present
			entry := freqs[word]      // zero value when the word is new
			entry.rawFreq += count
			freqs[word] = entry
		}
		total += count
	}

	// Turn the raw counts into fractions of the total and collect the words
	// into a sortable list in the same pass.
	ranked := make(wordFreqSort, 0, len(freqs))
	for word, entry := range freqs {
		entry.Freq = float64(entry.rawFreq) / float64(total)
		freqs[word] = entry
		ranked = append(ranked, wordFreqSortEntry{Name: word, Freq: entry.Freq})
	}
	sort.Sort(ranked)

	writeWords(ranked, freqs)
}
// writeWords generates the Go source file ./en-2012/englishwords.go.
//
// The file declares package words with two tables:
//   - Words: one Entry per element of wfs, in the order given, with Rank
//     starting at 1;
//   - Stems: the summed frequency of every word in wfm that shares a stem, as
//     computed by paicehusk.DefaultRules.Stem.
//
// Stems is accumulated and written in sorted key order so that repeated runs
// over the same input produce the same Stems table. Keys are written with %q
// so that a word containing a quote or backslash still yields valid Go source.
// A failure to write the file panics.
func writeWords(wfs wordFreqSort, wfm wordFreqMap) {
	var goprog bytes.Buffer

	goprog.WriteString(`
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
// Package words was auto-generated !
// From base data at http://invokeit.wordpress.com/frequency-word-lists/ .
// The word stems were produced using github.com/rookii/paicehusk .
// DO NOT EDIT BY HAND.
package words
// Entry type describes the rank and frequency of a prarticular word.
type Entry struct {
Rank int // Word Rank order, 1 most frequent.
Freq float64 // Word Frequency, a fraction, larger is more frequent.
}
// Map type provides the Entry information for each word.
type Map map[string]Entry
// Words gives the Entry information on the most frequent words.
var Words = Map{
`)
	for i, v := range wfs {
		fmt.Fprintf(&goprog, "\t%q: Entry{Rank:%d,Freq:%g},\n", v.Name, i+1, v.Freq)
	}
	goprog.WriteString("}\n\n")

	// Visit the words in sorted order: map iteration order is random, and the
	// order of floating-point additions can change the low bits of each sum.
	words := make([]string, 0, len(wfm))
	for w := range wfm {
		words = append(words, w)
	}
	sort.Strings(words)

	sfm := make(map[string]float64)
	for _, w := range words {
		sfm[paicehusk.DefaultRules.Stem(w)] += wfm[w].Freq
	}

	stems := make([]string, 0, len(sfm))
	for s := range sfm {
		stems = append(stems, s)
	}
	sort.Strings(stems)

	goprog.WriteString("// Stems gives the frequency of word-stems.\nvar Stems = map[string]float64{\n")
	for _, s := range stems {
		fmt.Fprintf(&goprog, "\t%q: %g,\n", s, sfm[s])
	}
	goprog.WriteString("}\n\n")

	if err := ioutil.WriteFile("./en-2012/englishwords.go", goprog.Bytes(), 0666); err != nil {
		panic(err)
	}
}