mirror of
https://github.com/documize/community.git
synced 2025-08-08 23:15:29 +02:00
initial commit
This commit is contained in:
commit
18933c6767
1841 changed files with 810642 additions and 0 deletions
4
wordsmith/README.md
Normal file
4
wordsmith/README.md
Normal file
|
@ -0,0 +1,4 @@
|
|||
Wordsmith
|
||||
=========
|
||||
|
||||
Provides common code for all
|
44
wordsmith/api/convertapi.go
Normal file
44
wordsmith/api/convertapi.go
Normal file
|
@ -0,0 +1,44 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
// Package api provides the definitions of types used for communication between different components of the Documize system.
|
||||
package api
|
||||
|
||||
// DocumentConversionRequest carries the input that is handed to a Convert plugin.
type DocumentConversionRequest struct {
	Filename       string // file name supplied with the request
	Filedata       []byte // file contents supplied with the request
	PageBreakLevel uint   // NOTE(review): presumably the heading depth at which pages are split - confirm against the plugins
	Token          string // authorisation token
}
|
||||
|
||||
// Page is a single Documize page: an HTML Body together with its Title
// and its Level in the document outline.
type Page struct {
	// Level of the page: the overall document is level 1, <H1> => level 2.
	Level uint64
	Title string
	Body  []byte
}
|
||||
|
||||
// EmbeddedFile is one file embedded in a document, identified by ID.
type EmbeddedFile struct {
	ID   string
	Type string
	// Name must have the same extension as the type e.g. Type="txt" Name="foo.txt".
	Name string
	Data []byte
}
|
||||
|
||||
// DocumentConversionResponse is the response from a Convert plugin.
|
||||
type DocumentConversionResponse struct {
|
||||
Err string
|
||||
PagesHTML []byte // If empty, use Pages
|
||||
Pages []Page
|
||||
EmbeddedFiles []EmbeddedFile
|
||||
Excerpt string
|
||||
}
|
26
wordsmith/api/request.go
Normal file
26
wordsmith/api/request.go
Normal file
|
@ -0,0 +1,26 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package api
|
||||
|
||||
// ConversionJobRequest holds the information used to set up a conversion job.
type ConversionJobRequest struct {
	Job        string
	IndexDepth uint
	OrgID      string
}
|
||||
|
||||
// DocumentExport is the value exchanged with a document export plugin.
type DocumentExport struct {
	Filename string
	Format   string
	File     []byte
}
|
90
wordsmith/api/response.go
Normal file
90
wordsmith/api/response.go
Normal file
|
@ -0,0 +1,90 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package api
|
||||
|
||||
import (
|
||||
"github.com/documize/community/wordsmith/log"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
// apiJSONResponse is the JSON envelope returned to a Documize client.
type apiJSONResponse struct {
	Code    int         // mirrors the HTTP status code written to the response
	Success bool        // set to false by the error writers in this file
	Message string      // human-readable description
	Data    interface{} // payload; the error writers in this file leave it nil
}
|
||||
|
||||
// SetJSONResponse marks the response as UTF-8 encoded JSON by setting the
// Content-Type header on w.
func SetJSONResponse(w http.ResponseWriter) {
	const contentType = "application/json; charset=utf-8"
	w.Header().Set("Content-Type", contentType)
}
|
||||
|
||||
// WriteError to the http.ResponseWriter, taking care to provide the correct
|
||||
// response error code within the JSON response.
|
||||
func WriteError(w http.ResponseWriter, err error) {
|
||||
|
||||
response := apiJSONResponse{}
|
||||
response.Message = err.Error()
|
||||
response.Success = false
|
||||
response.Data = nil
|
||||
|
||||
switch err.Error() {
|
||||
case "BadRequest":
|
||||
response.Code = 400
|
||||
w.WriteHeader(http.StatusBadRequest)
|
||||
case "Unauthorized":
|
||||
response.Code = 401
|
||||
w.WriteHeader(http.StatusUnauthorized)
|
||||
case "Forbidden":
|
||||
response.Code = 403
|
||||
w.WriteHeader(http.StatusForbidden)
|
||||
case "NotFound":
|
||||
response.Code = 404
|
||||
w.WriteHeader(http.StatusNotFound)
|
||||
default:
|
||||
response.Code = 500
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
}
|
||||
|
||||
json, err := json.Marshal(response)
|
||||
if err != nil {
|
||||
log.Error("json.Marshal", err)
|
||||
}
|
||||
|
||||
if _, err := w.Write(json); err != nil {
|
||||
log.Error("write to ResponseWriter", err)
|
||||
}
|
||||
}
|
||||
|
||||
// WriteErrorBadRequest provides feedback to a Documize client on an error,
|
||||
// where that error is described in a string.
|
||||
func WriteErrorBadRequest(w http.ResponseWriter, message string) {
|
||||
|
||||
response := apiJSONResponse{}
|
||||
response.Message = message
|
||||
response.Success = false
|
||||
response.Data = nil
|
||||
|
||||
response.Code = 400
|
||||
w.WriteHeader(http.StatusBadRequest)
|
||||
|
||||
json, err := json.Marshal(response)
|
||||
if err != nil {
|
||||
log.Error("json.Marshal", err)
|
||||
}
|
||||
|
||||
if _, err := w.Write(json); err != nil {
|
||||
log.Error("write to ResponseWriter", err)
|
||||
}
|
||||
}
|
119
wordsmith/environment/environment.go
Normal file
119
wordsmith/environment/environment.go
Normal file
|
@ -0,0 +1,119 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
// Package environment allows environment variables to be obtained from either the environment or the command line.
|
||||
// Environment variables are always uppercase, with the Prefix; flags are always lowercase without.
|
||||
package environment
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// CallbackT is the signature of the callback accepted by GetString().
// Parse() invokes it with the variable's target pointer and its name; when it
// returns true the variable is recorded as having been set by "setting:<name>".
type CallbackT func(*string, string) bool

// varT records one variable registered through GetString().
type varT struct {
	target   *string   // where the value lives
	name     string    // lower-case flag name
	setter   string    // what supplied the value (environment variable name, flag, setting or default)
	value    string    // value at registration time
	required bool      // Parse() exits when a required variable is empty
	callback CallbackT // optional hook run by Parse()
}

// varsT is the collection of registered variables; it implements
// sort.Interface, ordering by name.
type varsT struct {
	vv []varT
}

// vars holds every variable registered so far.
var vars varsT

// Len is part of sort.Interface.
func (vs *varsT) Len() int {
	return len(vs.vv)
}

// Swap is part of sort.Interface.
func (vs *varsT) Swap(i, j int) {
	tmp := vs.vv[i]
	vs.vv[i] = vs.vv[j]
	vs.vv[j] = tmp
}

// Less is part of sort.Interface.
func (vs *varsT) Less(i, j int) bool {
	a, b := vs.vv[i].name, vs.vv[j].name
	return a < b
}
|
||||
|
||||
// Prefix provides the prefix for all Environment variables
|
||||
const Prefix = "DOCUMIZE"
|
||||
|
||||
const goInit = "(default)"
|
||||
|
||||
// GetString sets-up the flag for later use, it must be called before ParseOK(), usually in an init().
|
||||
func GetString(target *string, name string, required bool, usage string, callback CallbackT) {
|
||||
name = strings.ToLower(strings.TrimSpace(name))
|
||||
setter := Prefix + strings.ToUpper(name)
|
||||
value := os.Getenv(setter)
|
||||
if value == "" {
|
||||
value = *target // use the Go initialized value
|
||||
setter = goInit
|
||||
}
|
||||
flag.StringVar(target, name, value, usage)
|
||||
vars.vv = append(vars.vv, varT{target: target, name: name, required: required, callback: callback, value: value, setter: setter})
|
||||
}
|
||||
|
||||
var showSettings = flag.Bool("showsettings", false, "if true, show settings in the log (WARNING: these settings may include passwords)")
|
||||
|
||||
// Parse calls flag.Parse() then checks that the required environment variables are all set.
|
||||
// It should be the first thing called by any main() that uses this library.
|
||||
// If all the required variables are not present, it prints an error and calls os.Exit(2) like flag.Parse().
|
||||
func Parse(doFirst string) {
|
||||
flag.Parse()
|
||||
sort.Sort(&vars)
|
||||
for pass := 1; pass <= 2; pass++ {
|
||||
for vi, v := range vars.vv {
|
||||
if (pass == 1 && v.name == doFirst) || (pass == 2 && v.name != doFirst) {
|
||||
typ := "Optional"
|
||||
if v.value != *(v.target) || (v.value != "" && *(v.target) == "") {
|
||||
vars.vv[vi].setter = "-" + v.name // v is a local copy, not the underlying data
|
||||
}
|
||||
if v.callback != nil {
|
||||
if v.callback(v.target, v.name) {
|
||||
vars.vv[vi].setter = "setting:" + v.name // v is a local copy, not the underlying data
|
||||
}
|
||||
}
|
||||
if v.required {
|
||||
if *(v.target) == "" {
|
||||
fmt.Fprintln(os.Stderr)
|
||||
fmt.Fprintln(os.Stderr, "In order to run", os.Args[0], "the following must be provided:")
|
||||
for _, vv := range vars.vv {
|
||||
if vv.required {
|
||||
fmt.Fprintf(os.Stderr, "* setting from environment variable '%s' or flag '-%s' or an application setting '%s', current value: '%s' set by '%s'\n",
|
||||
Prefix+strings.ToUpper(vv.name), vv.name, vv.name, *(vv.target), vv.setter)
|
||||
}
|
||||
}
|
||||
fmt.Fprintln(os.Stderr)
|
||||
flag.Usage()
|
||||
os.Exit(2)
|
||||
return
|
||||
}
|
||||
typ = "Required"
|
||||
}
|
||||
if *showSettings {
|
||||
if *(v.target) != "" && vars.vv[vi].setter != goInit {
|
||||
fmt.Fprintf(os.Stdout, "%s setting from '%s' is: '%s'\n",
|
||||
typ, vars.vv[vi].setter, *(v.target))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
78
wordsmith/log/logger.go
Normal file
78
wordsmith/log/logger.go
Normal file
|
@ -0,0 +1,78 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
// Package log provides centralized logging for the Documize application.
|
||||
package log
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime"
|
||||
|
||||
log "github.com/Sirupsen/logrus"
|
||||
|
||||
env "github.com/documize/community/wordsmith/environment"
|
||||
)
|
||||
|
||||
var environment = "Non-production"
|
||||
|
||||
func init() {
|
||||
log.SetFormatter(new(log.TextFormatter))
|
||||
log.SetOutput(os.Stdout)
|
||||
log.SetLevel(log.DebugLevel)
|
||||
env.GetString(&environment, "log", false,
|
||||
"system being logged e.g. 'PRODUCTION'",
|
||||
func(*string, string) bool {
|
||||
log.Infoln(environment + " environment logging enabled")
|
||||
return false
|
||||
})
|
||||
}
|
||||
|
||||
// Debug logs a message for debug purposes.
|
||||
func Debug(message string) {
|
||||
log.WithFields(log.Fields{"env": environment}).Debug(message)
|
||||
}
|
||||
|
||||
// Info logs a message for information purposes.
|
||||
func Info(message string) {
|
||||
log.WithFields(log.Fields{"env": environment}).Info(message)
|
||||
}
|
||||
|
||||
// TestIfErr is used by the test code to signal that a test being run should error, it is reset if an error occurs.
|
||||
var TestIfErr bool
|
||||
|
||||
// ErrorString logs an error, where there is not an error value.
|
||||
func ErrorString(message string) {
|
||||
TestIfErr = false
|
||||
log.WithFields(log.Fields{"env": environment}).Error(message)
|
||||
}
|
||||
|
||||
// Error logs an error, if non-nil, with a message to give some context.
|
||||
func Error(message string, err error) {
|
||||
if err != nil {
|
||||
TestIfErr = false
|
||||
stack := make([]byte, 4096)
|
||||
runtime.Stack(stack, false)
|
||||
if idx := bytes.IndexByte(stack, 0); idx > 0 && idx < len(stack) {
|
||||
stack = stack[:idx] // remove trailing nulls from stack dump
|
||||
}
|
||||
log.WithFields(log.Fields{"env": environment, "error": err.Error(), "stack": fmt.Sprintf("%s", stack)}).Error(message)
|
||||
|
||||
//log.WithField("error: "+message, err.Error()).Errorf("%q\n%s\n", err, stack[:])
|
||||
}
|
||||
}
|
||||
|
||||
// IfErr logs an error if one exists.
|
||||
// It is a convenience wrapper for Error(), with no context message.
|
||||
func IfErr(err error) {
|
||||
Error("", err)
|
||||
}
|
66
wordsmith/utility/beautify.go
Normal file
66
wordsmith/utility/beautify.go
Normal file
|
@ -0,0 +1,66 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// BeautifyFilename takes a filename and attempts to turn it into a readable form,
// as TitleCase natural language, suitable for the top level of a Document.
//
// The directory part and the final extension are dropped. Every character that
// is not a letter, a digit or '.' becomes a space, a space is inserted in front
// of an upper-case letter that starts a new word ("DooDah" => "Doo Dah"), and
// the first letter of each word is upper-cased.
//
// A name with no extension (or with only a leading dot) is kept whole rather
// than being reduced to the empty string.
func BeautifyFilename(fn string) string {
	_, file := filepath.Split(fn)

	// strip the last extension only; dot == 0 means a leading-dot name with no stem
	stem := file
	if dot := strings.LastIndex(file, "."); dot > 0 {
		stem = file[:dot]
	}
	r := []rune(stem)

	// make any non-letter/digit characters space
	for i, c := range r {
		if !(unicode.IsLetter(c) || unicode.IsDigit(c) || c == '.') {
			r[i] = ' '
		}
	}

	// insert a space in front of any Upper/Lower 2-letter combination, in a
	// single pass; never at the start of the name nor directly after a space
	spaced := make([]rune, 0, len(r)+len(r)/2)
	for i, c := range r {
		if i > 0 && i+1 < len(r) && unicode.IsUpper(c) && unicode.IsLower(r[i+1]) && r[i-1] != ' ' {
			spaced = append(spaced, ' ')
		}
		spaced = append(spaced, c)
	}

	// make the first letter of each word upper case; index 1 is skipped on
	// purpose because the zero element is not expected to be a space
	for i := range spaced {
		if i == 0 || (i > 1 && spaced[i-1] == ' ') {
			spaced[i] = unicode.ToUpper(spaced[i])
		}
	}
	return string(spaced)
}
|
25
wordsmith/utility/beautify_test.go
Normal file
25
wordsmith/utility/beautify_test.go
Normal file
|
@ -0,0 +1,25 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestBeautify(t *testing.T) {
|
||||
bs(t, "DooDah$day.zip", "Doo Dah Day")
|
||||
}
|
||||
|
||||
func bs(t *testing.T, in, out string) {
|
||||
got := BeautifyFilename(in)
|
||||
if got != out {
|
||||
t.Errorf("BeautifyFilename input `%s` got `%s` expected `%s`\n", in, got, out)
|
||||
}
|
||||
}
|
59
wordsmith/utility/command.go
Normal file
59
wordsmith/utility/command.go
Normal file
|
@ -0,0 +1,59 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"github.com/documize/community/wordsmith/log"
|
||||
"errors"
|
||||
"os/exec"
|
||||
"time"
|
||||
)
|
||||
|
||||
var errTimeout = errors.New("conversion timelimit exceeded")
|
||||
|
||||
// CommandWithTimeout runs a command but stops it if it does not finish within the timout above.
|
||||
func CommandWithTimeout(command *exec.Cmd, timeout time.Duration) ([]byte, error) {
|
||||
var output bytes.Buffer
|
||||
//fmt.Println("DEBUG CommandWithTimeout: %v", command.Args)
|
||||
command.Stdout = &output
|
||||
command.Stderr = &output
|
||||
if err := command.Start(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
done := make(chan error, 1)
|
||||
defer close(done)
|
||||
go func() {
|
||||
done <- command.Wait()
|
||||
}()
|
||||
select {
|
||||
case <-time.After(timeout):
|
||||
if err := command.Process.Kill(); err != nil {
|
||||
log.Error("failed to kill: ", err)
|
||||
}
|
||||
<-done // prevent memory leak
|
||||
//fmt.Println("DEBUG timeout")
|
||||
return nil, errTimeout
|
||||
case err := <-done:
|
||||
if err != nil {
|
||||
//fmt.Println("DEBUG error return")
|
||||
return output.Bytes(), err
|
||||
}
|
||||
if !command.ProcessState.Success() {
|
||||
err = errors.New(string(output.Bytes()))
|
||||
//fmt.Println("DEBUG not successful")
|
||||
return nil, err
|
||||
}
|
||||
//fmt.Println("DEBUG successful return")
|
||||
return output.Bytes(), nil
|
||||
}
|
||||
}
|
39
wordsmith/utility/command_test.go
Normal file
39
wordsmith/utility/command_test.go
Normal file
|
@ -0,0 +1,39 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import "testing"
|
||||
import "os/exec"
|
||||
import "time"
|
||||
|
||||
func TestCmd(t *testing.T) {
|
||||
cmd := exec.Command("echo", "test")
|
||||
buf, err := CommandWithTimeout(cmd,time.Second)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
return
|
||||
}
|
||||
if string(buf) != "test\n" {
|
||||
t.Error("command did not return `test` it returned:" + string(buf))
|
||||
}
|
||||
cmd2 := exec.Command("dingbat doodah")
|
||||
_, err2 := CommandWithTimeout(cmd2,time.Second)
|
||||
if err2 == nil {
|
||||
t.Error("bad command did not return an error")
|
||||
}
|
||||
timeout := 5 * time.Second
|
||||
cmd3 := exec.Command("sleep", "50")
|
||||
_, err3 := CommandWithTimeout(cmd3,timeout)
|
||||
if err3 != errTimeout {
|
||||
t.Error("sleep command did not timeout:", err3)
|
||||
}
|
||||
}
|
26
wordsmith/utility/defclose.go
Normal file
26
wordsmith/utility/defclose.go
Normal file
|
@ -0,0 +1,26 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import "io"
|
||||
import "github.com/documize/community/wordsmith/log"
|
||||
|
||||
// Close is a convenience function to close an io.Closer, usually in a defer.
|
||||
func Close(f interface{}) {
|
||||
if f != nil {
|
||||
if ff, ok := f.(io.Closer); ok {
|
||||
if ff != io.Closer(nil) {
|
||||
log.IfErr(ff.Close())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
25
wordsmith/utility/defclose_test.go
Normal file
25
wordsmith/utility/defclose_test.go
Normal file
|
@ -0,0 +1,25 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import "testing"
|
||||
import "os"
|
||||
import "github.com/documize/community/wordsmith/log"
|
||||
|
||||
func TestDefclose(t *testing.T) {
|
||||
var f *os.File
|
||||
log.TestIfErr=true
|
||||
Close(f)
|
||||
if log.TestIfErr {
|
||||
t.Error("Close() did not error when it should have")
|
||||
}
|
||||
}
|
13
wordsmith/utility/doc.go
Normal file
13
wordsmith/utility/doc.go
Normal file
|
@ -0,0 +1,13 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
// Package utility contains utility functions used by the whole Documize ecosystem.
|
||||
package utility
|
164
wordsmith/utility/html.go
Normal file
164
wordsmith/utility/html.go
Normal file
|
@ -0,0 +1,164 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
"golang.org/x/net/html/atom"
|
||||
|
||||
"github.com/documize/community/wordsmith/log"
|
||||
)
|
||||
|
||||
// HTML describes a chunk of HTML, Text() method returns plain text.
|
||||
type HTML string
|
||||
|
||||
// write out the textual element of the html node, if present, then iterate through the child nodes.
|
||||
func writeText(n *html.Node, b io.Writer, isTest bool) {
|
||||
if !excluded(n) {
|
||||
switch n.Type {
|
||||
case html.TextNode:
|
||||
_, err := b.Write([]byte(n.Data + string(rune(0x200B)))) // + http://en.wikipedia.org/wiki/Zero-width_space
|
||||
if err != nil {
|
||||
log.Error("write TextNode", err)
|
||||
}
|
||||
// TODO This use of zero-width-space (subsequently replaced by ' ' or ignored, depending on context)
|
||||
// TODO works well for in-word breaks, but at the expense of concatenating some words in error.
|
||||
// TODO It may be that better examination of the HTML structure could be used to determine
|
||||
// TODO when a space is, or is not, required. In that event we would not use zero-width-space.
|
||||
|
||||
default:
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
writeText(c, b, isTest)
|
||||
}
|
||||
switch n.DataAtom {
|
||||
case 0:
|
||||
if n.Data == "documize" {
|
||||
for _, a := range n.Attr {
|
||||
if a.Key == "type" {
|
||||
if isTest {
|
||||
var err error
|
||||
switch a.Val {
|
||||
case "field-start":
|
||||
_, err = b.Write([]byte(" [ "))
|
||||
case "field-end":
|
||||
_, err = b.Write([]byte(" ] "))
|
||||
default:
|
||||
_, err = b.Write([]byte(" [ ] "))
|
||||
}
|
||||
if err != nil {
|
||||
log.Error("write []", err)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
case atom.Span, atom.U, atom.B, atom.I, atom.Del, atom.Sub, atom.Sup:
|
||||
//NoOp
|
||||
default:
|
||||
_, err := b.Write([]byte(" ")) // add a space after each main element
|
||||
if err != nil {
|
||||
log.Error("write space", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func excluded(n *html.Node) bool {
|
||||
if n.DataAtom == atom.Div {
|
||||
for _, a := range n.Attr {
|
||||
if a.Key == "class" {
|
||||
switch a.Val {
|
||||
case "documize-first-page",
|
||||
"documize-exotic-image",
|
||||
"documize-footnote",
|
||||
"documize-graphictext",
|
||||
"documize-math":
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// findBody finds the body HTML node if it exists in the tree. Required to bypass the page title text.
|
||||
func findBody(n *html.Node) *html.Node {
|
||||
if n.DataAtom == atom.Body {
|
||||
return n
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
r := findBody(c)
|
||||
if r != nil {
|
||||
return r
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Text returns only the plain text elements of the HTML Chunk, concatanated with "\n",
|
||||
// for use in the TOC or for text indexing.
|
||||
func (ch HTML) Text(isTest bool) (string, error) {
|
||||
var b bytes.Buffer
|
||||
doc, err := html.Parse(strings.NewReader(string(ch)))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
body := findBody(doc)
|
||||
if body == nil {
|
||||
body = doc
|
||||
}
|
||||
writeText(body, &b, isTest)
|
||||
return string(b.Bytes()), nil
|
||||
}
|
||||
|
||||
// EscapeHTMLcomplexChars looks for "complex" characters within HTML
// and replaces them with the HTML escape codes which describe them.
// "Complex" characters are those encoded in more than one byte by UTF8;
// each becomes a decimal numeric character reference such as "&#20850;".
// Single-byte characters are copied unchanged.
func EscapeHTMLcomplexChars(s string) string {
	// Accumulate in a buffer: appending to a string inside the loop would copy
	// the whole result on every iteration.
	var ret bytes.Buffer
	ret.Grow(len(s))
	for _, r := range s {
		if utf8.RuneLen(r) > 1 {
			fmt.Fprintf(&ret, "&#%d;", r)
		} else {
			ret.WriteRune(r)
		}
	}
	return ret.String()
}
|
||||
|
||||
// EscapeHTMLcomplexCharsByte looks for "complex" characters within HTML
// and replaces them with the HTML escape codes which describe them.
// "Complex" characters are those encoded in more than one byte by UTF8;
// each becomes a decimal numeric character reference such as "&#20850;".
// Single-byte characters are copied unchanged. A byte that is not valid UTF-8
// decodes to U+FFFD and is therefore written as "&#65533;".
func EscapeHTMLcomplexCharsByte(b []byte) []byte {
	var ret bytes.Buffer
	for len(b) > 0 {
		r, size := utf8.DecodeRune(b)
		if utf8.RuneLen(r) > 1 {
			fmt.Fprintf(&ret, "&#%d;", r)
		} else {
			ret.Write(b[:size]) // bytes.Buffer.Write always returns a nil error
		}
		b = b[size:]
	}
	return ret.Bytes()
}
|
83
wordsmith/utility/html_test.go
Normal file
83
wordsmith/utility/html_test.go
Normal file
|
@ -0,0 +1,83 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestHTML(t *testing.T) {
|
||||
type testConv struct {
|
||||
htm, txt string
|
||||
istest bool
|
||||
}
|
||||
convTest := []testConv{
|
||||
{
|
||||
`<html><head><title>HTML TITLE</title></head><body><p>This <I>is</I>:</p><ul><li><a href="foo">Example</a><li><a href="/bar/baz">HTML text.</a><div class="documize-math">exclueded</div></ul></body></html>`,
|
||||
"This is : Example HTML text. ", false,
|
||||
},
|
||||
{
|
||||
`<p>This is:</p><ul><li><documize type="field-start"></documize> <documize type="field-end"></documize><documize type="unknown"></documize><li><a href="/bar/baz">HTML text.</a></ul>`,
|
||||
"This is: [ ] [ ] HTML text. ", true,
|
||||
},
|
||||
}
|
||||
for _, tst := range convTest {
|
||||
var ch HTML
|
||||
ch = HTML([]byte(tst.htm))
|
||||
//t.Logf("HTML: %s", ch)
|
||||
txt, err := ch.Text(tst.istest)
|
||||
if err != nil {
|
||||
t.Log(err)
|
||||
t.Fail()
|
||||
}
|
||||
expected := compressSpaces(tst.txt)
|
||||
got := compressSpaces(string(txt))
|
||||
if expected != got {
|
||||
t.Errorf("Conversion to text for `%s`, expected: `%s` got: `%s`\n",
|
||||
ch, expected, got)
|
||||
} //else {
|
||||
// t.Logf("Text: %s", txt)
|
||||
//}
|
||||
}
|
||||
}
|
||||
|
||||
// compressSpaces replaces every run of spaces, tabs, newlines and zero-width
// spaces in s with a single ordinary space; other characters are kept as-is.
func compressSpaces(s string) string {
	out := make([]rune, 0, len(s))
	prevSpace := false
	for _, r := range s {
		isSpace := r == ' ' || r == '\t' || r == '\n' || r == '\u200b' /*zero width space*/
		if !isSpace {
			out = append(out, r)
		} else if !prevSpace {
			out = append(out, ' ')
		}
		prevSpace = isSpace
	}
	return string(out)
}
|
||||
|
||||
func TestHTMLescape(t *testing.T) {
|
||||
tianchao := "兲朝 test"
|
||||
expected := "兲朝 test"
|
||||
|
||||
gotString := EscapeHTMLcomplexChars(tianchao)
|
||||
if gotString != expected {
|
||||
t.Errorf("EscapeHTMLcomplexChars error got `%s` expected `%s`\n", gotString, expected)
|
||||
}
|
||||
|
||||
gotBytes := EscapeHTMLcomplexCharsByte([]byte(tianchao))
|
||||
if string(gotBytes) != expected {
|
||||
t.Errorf("EscapeHTMLcomplexCharsByte error got `%s` expected `%s`\n", string(gotBytes), expected)
|
||||
}
|
||||
|
||||
}
|
88
wordsmith/utility/secrets.go
Normal file
88
wordsmith/utility/secrets.go
Normal file
|
@ -0,0 +1,88 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import (
|
||||
"crypto/aes"
|
||||
"crypto/cipher"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"io"
|
||||
)
|
||||
|
||||
var key = []byte("8456FHkQW1566etydT46jk39ghjfFhg4") // 32 bytes
|
||||
|
||||
// MakeMD5 returns the MD5 hash of a given string, usually a password.
|
||||
/*
|
||||
func MakeMD5(password string) []byte {
|
||||
hash := md5.New()
|
||||
if _, err := io.WriteString(hash, password); err != nil {
|
||||
log.Error("error in MakeMD5", err)
|
||||
}
|
||||
return hash.Sum(nil)
|
||||
}
|
||||
*/
|
||||
|
||||
// MakeAES creates an AES encryption of of a given string,
|
||||
// using a hard-wired key value,
|
||||
// suitable for use as an authentication token.
|
||||
func MakeAES(secret string) ([]byte, error) {
|
||||
block, err := aes.NewCipher(key)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
b := EncodeBase64([]byte(secret))
|
||||
ciphertext := make([]byte, aes.BlockSize+len(b))
|
||||
iv := ciphertext[:aes.BlockSize]
|
||||
if _, err := io.ReadFull(rand.Reader, iv); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
cfb := cipher.NewCFBEncrypter(block, iv)
|
||||
cfb.XORKeyStream(ciphertext[aes.BlockSize:], b)
|
||||
return ciphertext, nil
|
||||
}
|
||||
|
||||
// DecryptAES decrypts an AES encoded []byte,
|
||||
// using a hard-wired key value,
|
||||
// suitable for use when reading an authentication token.
|
||||
func DecryptAES(text []byte) ([]byte, error) {
|
||||
block, err := aes.NewCipher(key)
|
||||
if err != nil {
|
||||
return nil, errors.New("aes.NewCipher failure: " + err.Error())
|
||||
}
|
||||
if len(text) < aes.BlockSize {
|
||||
return nil, errors.New("ciphertext too short")
|
||||
}
|
||||
iv := text[:aes.BlockSize]
|
||||
text = text[aes.BlockSize:]
|
||||
cfb := cipher.NewCFBDecrypter(block, iv)
|
||||
cfb.XORKeyStream(text, text)
|
||||
return DecodeBase64(text)
|
||||
}
|
||||
|
||||
// EncodeBase64 returns the standard (StdEncoding, padded) base64 encoding of b
// as a new byte slice.
func EncodeBase64(b []byte) []byte {
	encoded := make([]byte, base64.StdEncoding.EncodedLen(len(b)))
	base64.StdEncoding.Encode(encoded, b)
	return encoded
}
|
||||
|
||||
// EncodeBase64AsString is a convenience function to encode using StdEncoding.
|
||||
/*
|
||||
func EncodeBase64AsString(b []byte) string {
|
||||
return base64.StdEncoding.EncodeToString(b)
|
||||
}
|
||||
*/
|
||||
|
||||
// DecodeBase64 decodes standard (StdEncoding, padded) base64 data held in b.
// On malformed input it returns the bytes decoded so far together with the
// decoding error.
func DecodeBase64(b []byte) ([]byte, error) {
	decoded := make([]byte, base64.StdEncoding.DecodedLen(len(b)))
	n, err := base64.StdEncoding.Decode(decoded, b)
	return decoded[:n], err
}
|
35
wordsmith/utility/secrets_test.go
Normal file
35
wordsmith/utility/secrets_test.go
Normal file
|
@ -0,0 +1,35 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestSecrets(t *testing.T) {
|
||||
mimi := "007"
|
||||
b, e := MakeAES(mimi)
|
||||
if e != nil {
|
||||
t.Fatal(e)
|
||||
}
|
||||
mm, e2 := DecryptAES(b)
|
||||
if e2 != nil {
|
||||
t.Fatal(e2)
|
||||
}
|
||||
if mimi != string(mm) {
|
||||
t.Errorf("wanted %s got %s", mimi, string(mm))
|
||||
}
|
||||
|
||||
_, ee := DecryptAES([]byte{})
|
||||
if ee == nil {
|
||||
t.Error("should have errored on empty cypher")
|
||||
}
|
||||
|
||||
}
|
37
wordsmith/utility/slug.go
Normal file
37
wordsmith/utility/slug.go
Normal file
|
@ -0,0 +1,37 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// MakeSlug creates a slug, suitable for use in a URL, from a string.
//
// Every rune is lower-cased. Lower-case letters and digits are kept; any other
// rune becomes '-'. Runs of '-' of any length are collapsed to a single '-',
// and leading and trailing '-' are removed, so the result never starts or
// ends with '-' and never contains "--". An input without any letters or
// digits yields the empty string.
func MakeSlug(str string) string {
	slg := strings.Map(
		func(r rune) rune { // individual mapping of runes into a format suitable for use in a URL
			r = unicode.ToLower(r)
			if unicode.IsLower(r) || unicode.IsDigit(r) {
				return r
			}
			return '-'
		}, str)

	// Repeat until no doubled separator is left: a single replacement pass
	// only shortens a run, it does not reduce a long run to one '-'.
	for strings.Contains(slg, "--") {
		slg = strings.Replace(slg, "--", "-", -1)
	}

	return strings.Trim(slg, "-")
}
|
25
wordsmith/utility/slug_test.go
Normal file
25
wordsmith/utility/slug_test.go
Normal file
|
@ -0,0 +1,25 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestSlug(t *testing.T) {
|
||||
st(t, " Zip--up ", "zip-up")
|
||||
}
|
||||
|
||||
func st(t *testing.T, in, out string) {
|
||||
got := MakeSlug(in)
|
||||
if got != out {
|
||||
t.Errorf("slug input `%s` got `%s` expected `%s`\n", in, got, out)
|
||||
}
|
||||
}
|
34
wordsmith/utility/user.go
Normal file
34
wordsmith/utility/user.go
Normal file
|
@ -0,0 +1,34 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
// MakeInitials returns user initials from firstname and lastname.
//
// Surrounding whitespace is ignored. The result is the upper-cased first
// character of each non-empty name, first name first; an empty name
// contributes nothing, so the result has zero, one or two characters.
func MakeInitials(firstname, lastname string) string {
	// initial returns the first character of a trimmed name. It works on runes
	// rather than bytes so that a name starting with a multi-byte UTF-8
	// character is not cut in the middle of that character.
	initial := func(name string) string {
		runes := []rune(strings.TrimSpace(name))
		if len(runes) == 0 {
			return ""
		}
		return string(runes[0])
	}

	return strings.ToUpper(initial(firstname) + initial(lastname))
}
|
28
wordsmith/utility/user_test.go
Normal file
28
wordsmith/utility/user_test.go
Normal file
|
@ -0,0 +1,28 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestInitials(t *testing.T) {
|
||||
in(t, "Harvey", "Kandola", "HK")
|
||||
in(t, "Harvey", "", "H")
|
||||
in(t, "", "Kandola", "K")
|
||||
in(t, "", "", "")
|
||||
}
|
||||
|
||||
func in(t *testing.T, firstname, lastname, expecting string) {
|
||||
initials := MakeInitials(firstname, lastname)
|
||||
if initials != expecting {
|
||||
t.Errorf("expecting initials of `%s` got `%s`\n", expecting, initials)
|
||||
}
|
||||
}
|
75
wordsmith/utility/words.go
Normal file
75
wordsmith/utility/words.go
Normal file
|
@ -0,0 +1,75 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import "unicode"
|
||||
import nethtml "golang.org/x/net/html"
|
||||
|
||||
// Words returns a slice of words, where each word contains no whitespace, and each item of punctuation is its own word.
|
||||
// This functionality is provided to enable verification of the text extraction algorithem across different implemntations.
|
||||
func Words(ch HTML, inSqBr int, testMode bool) ([]string, int, error) {
|
||||
txt, err := ch.Text(testMode)
|
||||
if err != nil {
|
||||
return nil, inSqBr, err
|
||||
}
|
||||
txt = nethtml.UnescapeString(txt)
|
||||
|
||||
words := []string{""}
|
||||
|
||||
for _, c := range txt {
|
||||
if inSqBr > 0 {
|
||||
switch c {
|
||||
case ']':
|
||||
inSqBr--
|
||||
case '[':
|
||||
inSqBr++
|
||||
}
|
||||
} else {
|
||||
if c == rune(0x200B) { // http://en.wikipedia.org/wiki/Zero-width_space
|
||||
if testMode {
|
||||
c = ' ' // NOTE only replace with a space here if we are testing
|
||||
}
|
||||
}
|
||||
if c != rune(0x200B) { // http://en.wikipedia.org/wiki/Zero-width_space
|
||||
if c == '[' {
|
||||
inSqBr = 1
|
||||
words = append(words, "[") // open square bracket means potentially elided text
|
||||
words = append(words, "")
|
||||
} else {
|
||||
inSqBr = 0
|
||||
if unicode.IsPunct(c) || unicode.IsSymbol(c) || unicode.IsDigit(c) {
|
||||
if words[len(words)-1] == "" {
|
||||
words[len(words)-1] = string(c)
|
||||
} else {
|
||||
words = append(words, string(c))
|
||||
}
|
||||
words = append(words, "")
|
||||
} else {
|
||||
if unicode.IsGraphic(c) || unicode.IsSpace(c) {
|
||||
if unicode.IsSpace(c) {
|
||||
if words[len(words)-1] != "" {
|
||||
words = append(words, "")
|
||||
}
|
||||
} else {
|
||||
words[len(words)-1] += string(c)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if !testMode { // add dummy punctuation if not in test mode to avoid incorrect sentance concatanation
|
||||
words = append(words, ".")
|
||||
}
|
||||
return append(words, ""), inSqBr, nil // make sure there is always a blank entry at the end
|
||||
}
|
57
wordsmith/utility/words_test.go
Normal file
57
wordsmith/utility/words_test.go
Normal file
|
@ -0,0 +1,57 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
package utility
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestWords(t *testing.T) {
|
||||
ws(t, " the quick brown fox jumps over the lazy dog [ ] [" +string(rune(0x200B)), 0, true,
|
||||
"the quick brown fox jumps over the lazy dog [ [", 1)
|
||||
ws(t, "the quick brown [ dog jumps over the lazy ] fox", 0, false,
|
||||
"the quick brown [ fox .", 0)
|
||||
ws(t, "the quick brown;fox;", 0, false,
|
||||
"the quick brown ; fox ; .", 0)
|
||||
ws(t, "the ] quick brown fox ", 1, true,
|
||||
"quick brown fox", 0)
|
||||
}
|
||||
|
||||
func ws(t *testing.T, in string, bktIn int, isTest bool, out string, bktOut int) {
|
||||
wds := strings.Split(out, " ")
|
||||
gotX, bo, e := Words(HTML(in), bktIn, isTest)
|
||||
if e != nil {
|
||||
t.Fatal(e)
|
||||
}
|
||||
if bo != bktOut {
|
||||
t.Errorf("wrong bracket count returned: input `%s` bktIn %d bktOut %d\n", in, bktIn, bktOut)
|
||||
}
|
||||
got := make([]string, 0, len(gotX))
|
||||
for _, v := range gotX { // remove empty entries
|
||||
if v != "" {
|
||||
got = append(got, v)
|
||||
}
|
||||
}
|
||||
if len(got) != len(wds) {
|
||||
t.Errorf("wrong number of words found: input `%s` got %d %v expected %d %v`\n", in, len(got), got, len(wds), wds)
|
||||
} else {
|
||||
sort.Strings(wds)
|
||||
sort.Strings(got)
|
||||
for i := range wds {
|
||||
if wds[i] != got[i] {
|
||||
t.Errorf("wrong word[%d]: input `%s` got %v expected %v\n", i, in, got, wds)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
23406
wordsmith/wordlists/en-2012/en-s.log
Executable file
23406
wordsmith/wordlists/en-2012/en-s.log
Executable file
File diff suppressed because it is too large
Load diff
4
wordsmith/wordlists/en-2012/en.log
Executable file
4
wordsmith/wordlists/en-2012/en.log
Executable file
|
@ -0,0 +1,4 @@
|
|||
Total files: 23406
|
||||
Unique word count: 521426
|
||||
Total word count: 145376051
|
||||
Overall word count: 193225723
|
456631
wordsmith/wordlists/en-2012/en.txt
Executable file
456631
wordsmith/wordlists/en-2012/en.txt
Executable file
File diff suppressed because one or more lines are too long
16037
wordsmith/wordlists/en-2012/englishwords.go
Normal file
16037
wordsmith/wordlists/en-2012/englishwords.go
Normal file
File diff suppressed because it is too large
Load diff
149
wordsmith/wordlists/makewordlist.go
Normal file
149
wordsmith/wordlists/makewordlist.go
Normal file
|
@ -0,0 +1,149 @@
|
|||
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
|
||||
//
|
||||
// This software (Documize Community Edition) is licensed under
|
||||
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
|
||||
//
|
||||
// You can operate outside the AGPL restrictions by purchasing
|
||||
// Documize Enterprise Edition and obtaining a commercial license
|
||||
// by contacting <sales@documize.com>.
|
||||
//
|
||||
// https://documize.com
|
||||
|
||||
// Package main creates ordered lists of English words and their stems,
|
||||
// based on their frequency.
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"sort"
|
||||
|
||||
"github.com/rookii/paicehusk"
|
||||
)
|
||||
|
||||
type (
	// wordFreqEntry holds the frequency data collected for one word.
	wordFreqEntry struct {
		rawFreq int     // summed count read from the word list
		Freq    float64 // rawFreq as a fraction of the total count
	}

	// wordFreqMap maps a word to its frequency data.
	wordFreqMap map[string]wordFreqEntry

	// wordFreqSortEntry is a word paired with its fractional frequency.
	wordFreqSortEntry struct {
		Name string
		Freq float64
	}

	// wordFreqSort implements sort.Interface, ordering entries from the
	// highest Freq to the lowest.
	wordFreqSort []wordFreqSortEntry
)

// Len is the number of elements in the collection.
func (s wordFreqSort) Len() int {
	return len(s)
}

// Less reports whether the element with
// index i should sort before the element with index j,
// which is the case when element i has the higher frequency.
func (s wordFreqSort) Less(i, j int) bool {
	return s[i].Freq > s[j].Freq
}

// Swap swaps the elements with indexes i and j.
func (s wordFreqSort) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}
|
||||
|
||||
func main() {
|
||||
|
||||
txt, err := ioutil.ReadFile("./en-2012/en.txt")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
lines := bytes.Split(txt, []byte("\n"))
|
||||
|
||||
wfm := make(wordFreqMap)
|
||||
rfTot := 0
|
||||
for r, l := range lines {
|
||||
words := bytes.Split(l, []byte(" "))
|
||||
if len(words) >= 2 {
|
||||
var rf int
|
||||
_, err = fmt.Sscanf(string(words[1]), "%d", &rf)
|
||||
if err == nil && len(words[0]) > 0 {
|
||||
if r < 10000 { // only look at the most common 10k words, 100k makes go compile/link unworkable
|
||||
stem := string(words[0]) // NOTE not stemming at present
|
||||
entry, alredythere := wfm[stem]
|
||||
if alredythere {
|
||||
entry.rawFreq += rf
|
||||
wfm[stem] = entry
|
||||
} else {
|
||||
wfm[stem] = wordFreqEntry{rawFreq: rf, Freq: 0.0}
|
||||
}
|
||||
}
|
||||
rfTot += rf
|
||||
}
|
||||
}
|
||||
}
|
||||
for k, v := range wfm {
|
||||
v.Freq = float64(v.rawFreq) / float64(rfTot)
|
||||
wfm[k] = v
|
||||
}
|
||||
|
||||
wfs := make(wordFreqSort, len(wfm))
|
||||
idx := 0
|
||||
for k, v := range wfm {
|
||||
wfs[idx].Name = k
|
||||
wfs[idx].Freq = v.Freq
|
||||
idx++
|
||||
}
|
||||
sort.Sort(wfs)
|
||||
writeWords(wfs, wfm)
|
||||
}
|
||||
|
||||
// writeWords generates the Go source file ./en-2012/englishwords.go.
//
// The file declares package words with two tables: Words, which maps each
// word in wfs to its rank (1 for the first element of wfs) and frequency, and
// Stems, which maps each paicehusk stem to the summed frequency of the words
// in wfm sharing that stem.
//
// Words are processed in sorted order so that the stem sums and the order of
// the Stems entries do not depend on map iteration order. Keys are written
// with %q so that a word containing a quote or backslash still yields valid
// Go source. It panics if the file cannot be written.
func writeWords(wfs wordFreqSort, wfm wordFreqMap) {
	var goprog bytes.Buffer

	goprog.WriteString(`
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com

// Package words was auto-generated !
// From base data at http://invokeit.wordpress.com/frequency-word-lists/ .
// The word stems were produced using github.com/rookii/paicehusk .
// DO NOT EDIT BY HAND.
package words

// Entry type describes the rank and frequency of a prarticular word.
type Entry struct {
Rank int // Word Rank order, 1 most frequent.
Freq float64 // Word Frequency, a fraction, larger is more frequent.
}

// Map type provides the Entry information for each word.
type Map map[string]Entry

// Words gives the Entry information on the most frequent words.
var Words = Map{
`)
	for i, v := range wfs {
		fmt.Fprintf(&goprog, "\t%q: Entry{Rank:%d,Freq:%g},\n", v.Name, i+1, v.Freq)
	}
	goprog.WriteString("}\n\n")

	// Visit the words in a fixed order: floating-point addition is not
	// associative, so summing in map order could change the low bits of a
	// stem's total from one run to the next.
	names := make([]string, 0, len(wfm))
	for name := range wfm {
		names = append(names, name)
	}
	sort.Strings(names)

	sfm := make(map[string]float64)
	for _, name := range names {
		sfm[paicehusk.DefaultRules.Stem(name)] += wfm[name].Freq
	}

	stems := make([]string, 0, len(sfm))
	for stem := range sfm {
		stems = append(stems, stem)
	}
	sort.Strings(stems)

	goprog.WriteString("// Stems gives the frequency of word-stems.\nvar Stems = map[string]float64{\n")
	for _, stem := range stems {
		fmt.Fprintf(&goprog, "\t%q: %g,\n", stem, sfm[stem])
	}
	goprog.WriteString("}\n\n")

	if err := ioutil.WriteFile("./en-2012/englishwords.go", goprog.Bytes(), 0666); err != nil {
		panic(err)
	}
}
|
Loading…
Add table
Add a link
Reference in a new issue