1
0
Fork 0
mirror of https://github.com/documize/community.git synced 2025-07-24 15:49:44 +02:00

restructure directories

This commit is contained in:
Elliott Stoneham 2016-07-20 15:58:37 +01:00
parent 7e4ed6545b
commit a2ce777762
159 changed files with 320 additions and 323 deletions

View file

@ -0,0 +1,48 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package apidocumizecom
import (
"crypto/tls"
"errors"
"net/http"
"github.com/documize/community/core/api/request"
)
// endPoint returns the base URL of the Documize API. The value configured
// under LICENSE/endpoint wins; when that is empty the public default is used.
func endPoint() string {
	const defaultEndPoint = "https://api.documize.com"

	if configured := request.ConfigString("LICENSE", "endpoint"); configured != "" {
		return configured
	}
	return defaultEndPoint
}
// token returns the LICENSE/token configuration value, or an error when no
// token has been configured.
func token() (string, error) {
	tok := request.ConfigString("LICENSE", "token")
	if tok != "" {
		// TODO more validation here
		return tok, nil
	}
	return "", errors.New("Documize token is empty")
}
// transport is the HTTP transport used by this package's http.Client when
// talking to the Documize API end-point.
//
// SECURITY NOTE(review): InsecureSkipVerify disables TLS certificate
// verification, so the identity of the server is not checked. The TODO below
// records why it is currently switched on; revisit before relying on the
// connection being authenticated.
var transport = &http.Transport{
TLSClientConfig: &tls.Config{
InsecureSkipVerify: true, // TODO should be glick.InsecureSkipVerifyTLS (from -insecure flag) but get error: x509: certificate signed by unknown authority
}}
// CheckToken returns an error if the Documize LICENSE token is invalid.
// At present "invalid" means whatever token() rejects.
func CheckToken() error {
	if _, err := token(); err != nil {
		return err
	}
	return nil
}

View file

@ -0,0 +1,72 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package apidocumizecom
import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"path/filepath"

	api "github.com/documize/community/core/convapi"
	"golang.org/x/net/context"
)
// Msword type provides a peg to hang the Convert method on.
type Msword struct{}

// Convert converts a file into the Countersoft Documize format.
//
// The request r is JSON-encoded and POSTed to the "/api/word" path of the
// configured end-point, authenticated with the LICENSE token; the JSON reply is
// decoded into reply. An error is returned when no token is configured, when
// the request cannot be encoded or sent, or when the reply cannot be decoded.
//
// NOTE(review): resp.StatusCode is not inspected here; a failure reported by
// the server is only noticed if its body fails to decode or carries an error
// inside the decoded reply - confirm this is sufficient.
// NOTE(review): the client has no Timeout set, so a stalled server can block
// the caller - confirm whether a deadline is wanted.
func (file *Msword) Convert(r api.DocumentConversionRequest, reply *api.DocumentConversionResponse) error {
	// Without a token the request cannot be sent, so check before doing any work.
	tok, err := token()
	if err != nil {
		return err
	}

	byts, err := json.Marshal(r)
	if err != nil {
		return err
	}

	base := filepath.Base(r.Filename)
	fmt.Println("Starting conversion of document: ", base)

	client := &http.Client{Transport: transport}

	// The token is user-supplied configuration: escape it so characters such
	// as '&', '+' or '=' cannot corrupt the query string.
	target := endPoint() + "/api/word?token=" + url.QueryEscape(tok)
	resp, err := client.Post(target, "application/json", bytes.NewReader(byts))
	if err != nil {
		return err
	}
	defer func() {
		if e := resp.Body.Close(); e != nil {
			fmt.Println("resp.Body.Close error: " + e.Error())
		}
	}()

	if err = json.NewDecoder(resp.Body).Decode(reply); err != nil {
		return err
	}

	// Only report completion once the reply has actually been decoded.
	fmt.Println("Finished converting document: ", base)
	return nil
}
// MSwordConvert provides the standard interface for conversion of a MS-Word document.
// It hands the given *api.DocumentConversionRequest to (*Msword).Convert and
// returns the *api.DocumentConversionResponse that call filled in, together
// with its error.
//
// The unchecked type assertion panics if in is not a
// *api.DocumentConversionRequest.
func MSwordConvert(ctx context.Context, in interface{}) (interface{}, error) {
	request := in.(*api.DocumentConversionRequest)
	response := &api.DocumentConversionResponse{}
	converter := &Msword{}

	convErr := converter.Convert(*request, response)
	return response, convErr
}

124
core/api/convert/convert.go Normal file
View file

@ -0,0 +1,124 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
// Package convert provides the gateway to document conversion native and plugin functionality, both in and out of the system.
package convert
import (
"errors"
"github.com/documize/community/core/api/convert/excerpt"
"github.com/documize/community/core/api/convert/html"
"github.com/documize/community/core/api/plugins"
api "github.com/documize/community/core/convapi"
"github.com/documize/community/core/utility"
"golang.org/x/net/context"
)
// Convert provides the entry-point into the document conversion process.
//
// It runs the "Convert" plugin registered for extension xtn, splits any HTML
// the plugin returned into pages and, when the plugin did not supply an
// excerpt, builds one from the page titles and bodies.
//
// NOTE(review): html.SplitIfHTML can record a failure in fileResult.Err, which
// is only examined before that call - confirm whether it should be re-checked.
func Convert(ctx context.Context, xtn string, fileRequest *api.DocumentConversionRequest) (*api.DocumentConversionResponse, error) {
	fileRequest.Token = plugins.Lib.Token("Convert", xtn)

	raw, err := plugins.Lib.Run(ctx, "Convert", xtn, fileRequest)
	if err != nil {
		return nil, err
	}

	fileResult, isResponse := raw.(*api.DocumentConversionResponse)
	switch {
	case !isResponse:
		return nil, errors.New("interface conversion: interface {} is nil, not *api.ConversionFileResponse")
	case fileResult.Err != "":
		return nil, errors.New(fileResult.Err)
	}

	if err = html.SplitIfHTML(fileRequest, fileResult); err != nil {
		return nil, err
	}

	/* TODO add title & body santization that keeps the images & table formatting
	for p, pg := range fileResult.Pages {
		fileResult.Pages[p].Title = titlePolicy.Sanitize(pg.Title)
		fileResult.Pages[p].Body = bodyPolicy.SanitizeBytes(pg.Body)
	}
	*/

	// An excerpt supplied by the plugin is used untouched.
	if fileResult.Excerpt != "" {
		return fileResult, nil
	}

	titleWds := make([]string, 0)
	bodyWds := make([]string, 0)
	pages := fileResult.Pages
	for idx := range pages {
		// Title 0 is already the title of the document, so only later titles count.
		if idx > 0 {
			titleParts, _, wordErr := utility.Words(utility.HTML(pages[idx].Title), 0, false)
			if wordErr != nil {
				return nil, wordErr
			}
			titleWds = append(append(titleWds, titleParts...), ".")
		}

		bodyParts, _, wordErr := utility.Words(utility.HTML(string(pages[idx].Body)), 0, false)
		if wordErr != nil {
			return nil, wordErr
		}
		bodyWds = append(append(bodyWds, bodyParts...), ".")
	}
	fileResult.Excerpt = excerpt.Excerpt(titleWds, bodyWds)

	return fileResult, nil
}
/* TODO add sanitisation for body & title HTML
var titlePolicy, bodyPolicy *bluemonday.Policy
func init() {
policy := bluemonday.UGCPolicy()
policy.RequireNoFollowOnLinks(true)
// URLs must be parseable by net/url.Parse()
policy.RequireParseableURLs(true)
policy.AllowRelativeURLs(false)
policy.AllowURLSchemes("http", "https")
// replacement below for: policy.AllowDataURIImages()
// Supply a function to validate images contained within data URI
policy.AllowURLSchemeWithCustomPolicy(
"data",
func(url *url.URL) (allowUrl bool) {
if url.RawQuery != "" || url.Fragment != "" {
return false
}
// matched := dataURIImagePrefix.FindString(url.Opaque)
// if matched == "" {
// return false
// }
// _, err := base64.StdEncoding.DecodeString(url.Opaque[len(matched):])
// if err != nil {
// return false
// }
return true
})
policy.AllowImages()
// TODO remove links to #tags
// TODO allow DataURI of image/* for LibreOffice ppt output
bodyPolicy = policy
titlePolicy = bluemonday.StrictPolicy()
}
*/

View file

@ -0,0 +1,155 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package convert_test
import (
"strings"
"testing"
"github.com/documize/community/core/api/convert"
"github.com/documize/community/core/api/plugins"
api "github.com/documize/community/core/convapi"
"github.com/documize/community/core/log"
"golang.org/x/net/context"
)
// TestConvert runs the built-in "html" conversion over the yorkweb fixture and
// checks the page split, the generated excerpt, and that an unknown extension
// is reported as an error.
func TestConvert(t *testing.T) {
	plugins.PluginFile = "" // no file as html is built-in

	// The outcome of LibSetup is deliberately ignored: the check that it fails
	// without a plugin.json file was disabled in the original test.
	_ = plugins.LibSetup()

	// Wrap in a closure so KillSubProcs runs when the test returns. A bare
	// `defer log.IfErr(plugins.Lib.KillSubProcs())` evaluates its argument -
	// and therefore kills the sub-processes - at the defer statement itself.
	defer func() {
		log.IfErr(plugins.Lib.KillSubProcs())
	}()

	ctx := context.Background()
	xtn := "html"
	fileRequest := new(api.DocumentConversionRequest)
	fileRequest.Filedata = []byte(yorkweb)

	resp, err := convert.Convert(ctx, xtn, fileRequest)
	if err != nil {
		t.Error(err)
		return
	}

	// Expect the document page plus one page per heading (H1 and H4).
	if len(resp.Pages) != 3 ||
		!strings.HasPrefix(resp.Pages[1].Title, "STARTING") ||
		!strings.HasPrefix(resp.Pages[2].Title, "EXERCISE") {
		for p, pg := range resp.Pages {
			t.Error(p, pg.Level, len(pg.Body), pg.Title)
		}
	}

	exp := "There are lots of ways to create web pages using already coded programmes. … HTML isn' t computer code, but is a language that uses US English to enable texts( words, images, sounds) to be inserted and formatting such as colo( u) r and centre/ erin…"
	if resp.Excerpt != exp {
		t.Errorf("unexpected excerpt wanted: `%s` got: `%s`", exp, resp.Excerpt)
	}

	// check errors are caught
	if _, err = convert.Convert(ctx, "unknown", fileRequest); err == nil {
		t.Error("does not error on unknown extension")
	}
}
// yorkweb is the HTML fixture used by TestConvert, copied from www.york.ac.uk/teaching/cws/wws/webpage1.html
const yorkweb = `
<HMTL>
<HEAD>
<TITLE>webpage1</TITLE>
</HEAD>
<BODY BGCOLOR="FFFFFf" LINK="006666" ALINK="8B4513" VLINK="006666">
<TABLE WIDTH="75%" ALIGN="center">
<TR>
<TD>
<DIV ALIGN="center"><H1>STARTING . . . </H1></DIV>
<DIV ALIGN="justify"><P>There are lots of ways to create web pages using already coded programmes. These lessons will teach you how to use the underlying HyperText Markup Language - HTML.
<BR>
<P>HTML isn't computer code, but is a language that uses US English to enable texts (words, images, sounds) to be inserted and formatting such as colo(u)r and centre/ering to be written in. The process is fairly simple; the main difficulties often lie in small mistakes - if you slip up while word processing your reader may pick up your typos, but the page will still be legible. However, if your HTML is inaccurate the page may not appear - writing web pages is, at the least, very good practice for proof reading!</P>
<P>Learning HTML will enable you to:
<UL>
<LI>create your own simple pages
<LI>read and appreciate pages created by others
<LI>develop an understanding of the creative and literary implications of web-texts
<LI>have the confidence to branch out into more complex web design
</UL></P>
<P>A HTML web page is made up of tags. Tags are placed in brackets like this <B>< tag > </B>. A tag tells the browser how to display information. Most tags need to be opened < tag > and closed < /tag >.
<P> To make a simple web page you need to know only four tags:
<UL>
<LI>< HTML > tells the browser your page is written in HTML format
<LI>< HEAD > this is a kind of preface of vital information that doesn't appear on the screen.
<LI>< TITLE >Write the title of the web page here - this is the information that viewers see on the upper bar of their screen. (I've given this page the title 'webpage1').
<LI>< BODY >This is where you put the content of your page, the words and pictures that people read on the screen.
</UL>
<P>All these tags need to be closed.
<H4>EXERCISE</H4>
<P>Write a simple web page.</P>
<P> Copy out exactly the HTML below, using a WP program such as Notepad.<BR>
Information in <I>italics</I> indicates where you can insert your own text, other information is HTML and needs to be exact. However, make sure there are no spaces between the tag brackets and the text inside.<BR>
(Find Notepad by going to the START menu\ PROGRAMS\ ACCESSORIES\ NOTEPAD).
<P>
< HTML ><BR>
< HEAD ><BR>
< TITLE ><I> title of page</I>< /TITLE ><BR>
< /HEAD ><BR>
< BODY><BR>
<I> write what you like here: 'my first web page', or a piece about what you are reading, or a few thoughts on the course, or copy out a few words from a book or cornflake packet. Just type in your words using no extras such as bold, or italics, as these have special HTML tags, although you may use upper and lower case letters and single spaces. </I><BR>
< /BODY ><BR>
< /HTML ><BR>
<P>Save the file as 'first.html' (ie. call the file anything at all) It's useful if you start a folder - just as you would for word-processing - and call it something like WEBPAGES, and put your first.html file in the folder.
<P>NOW - open your browser.<BR>
On Netscape the process is: <BR>
Top menu; FILE\ OPEN PAGE\ CHOOSE FILE<BR>
Click on your WEBPAGES folder\ FIRST file<BR>
Click 'open' and your page should appear.
<P>On Internet Explorer: <BR>
Top menu; FILE\ OPEN\ BROWSE <BR>
Click on your WEBPAGES folder\ FIRST file<BR>
Click 'open' and your page should appear.<BR>
<P>If the page doesn't open, go back over your notepad typing and make sure that all the HTML tags are correct. Check there are no spaces between tags and internal text; check that all tags are closed; check that you haven't written < HTLM > or < BDDY >. Your page will work eventually.
<P>
Make another page. Call it somethingdifferent.html and place it in the same WEBPAGES folder as detailed above.
<P>start formatting in <A HREF="webpage2.html">lesson two</A>
<BR><A HREF="col3.html">back to wws index</A> </P>
</P>
</DIV>
</TD>
</TR>
</TABLE>
</BODY>
</HTML>
`

View file

@ -0,0 +1,27 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package documizeapi
import (
"encoding/json"
api "github.com/documize/community/core/convapi"
"golang.org/x/net/context"
)
// Convert provides the standard interface for conversion of a ".documizeapi" json document.
// The request's Filedata is unmarshalled straight into a new
// api.DocumentConversionResponse, which is returned even when unmarshalling
// fails (alongside the error).
//
// The unchecked type assertion panics if in is not a
// *api.DocumentConversionRequest.
func Convert(ctx context.Context, in interface{}) (interface{}, error) {
	request := in.(*api.DocumentConversionRequest)
	response := &api.DocumentConversionResponse{}

	decodeErr := json.Unmarshal(request.Filedata, response)
	return response, decodeErr
}

View file

@ -0,0 +1,228 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
// Package excerpt provides basic functionality to create excerpts of text in English.
package excerpt
import (
"sort"
"strings"
"unicode"
"unicode/utf8"
words "github.com/documize/community/core/wordlists/en-2012"
"github.com/rookii/paicehusk"
)
// extractItem is one candidate sentence together with the figures used to rank it.
type extractItem struct {
	sequence int     // position of the sentence in the input
	score    float64 // accumulated score of the sentence's words
	count    int     // divisor applied to score when ranking
	sentance string  // the sentence text
}

// extractList is a list of candidate sentences that sorts best-first,
// i.e. by descending score per count. It implements sort.Interface.
type extractList []extractItem

// Len is the number of elements in the collection.
func (list extractList) Len() int {
	return len(list)
}

// Less reports whether the element with index i should sort before the
// element with index j: the higher score/count ratio comes first.
func (list extractList) Less(i, j int) bool {
	left, right := list[i], list[j]
	return left.score/float64(left.count) > right.score/float64(right.count)
}

// Swap swaps the elements with indexes i and j.
func (list extractList) Swap(i, j int) {
	held := list[i]
	list[i] = list[j]
	list[j] = held
}
// presentItem is a sentence chosen for the excerpt, tagged with its original position.
type presentItem struct {
	sequence int    // position of the sentence in the input
	text     string // the sentence text
}

// presentList is a list of chosen sentences that sorts by ascending
// sequence. It implements sort.Interface.
type presentList []presentItem

// Len is the number of elements in the collection.
func (list presentList) Len() int {
	return len(list)
}

// Less reports whether the element with index i should sort before the
// element with index j, which is the case when it has the lower sequence.
func (list presentList) Less(i, j int) bool {
	first, second := list[i].sequence, list[j].sequence
	return first < second
}

// Swap swaps the elements with indexes i and j.
func (list presentList) Swap(i, j int) {
	held := list[i]
	list[i] = list[j]
	list[j] = held
}
// addWd appends the word wd to sentance and reports whether wd ended the
// sentence.
//
//   - "[" is never added.
//   - The first word of a sentence is taken as-is and never counts as a stop,
//     even if it is ".", "!" or "?".
//   - A single digit is glued onto a sentence that already ends in a digit,
//     otherwise it is separated by a space.
//   - Anything isPunct accepts is glued on without a space; every other word
//     is preceded by a space.
//   - ".", "!" and "?" (other than as first word) end the sentence.
func addWd(sentance, wd string) (string, bool) {
	if wd == "[" {
		return sentance, false
	}
	if sentance == "" {
		return wd, false
	}

	if len(wd) == 1 && '0' <= wd[0] && wd[0] <= '9' {
		lastByte := rune(sentance[len(sentance)-1])
		if unicode.IsDigit(lastByte) {
			return sentance + wd, false
		}
		return sentance + " " + wd, false
	}

	stop := wd == "." || wd == "!" || wd == "?"
	if isPunct(wd) {
		return sentance + wd, stop
	}
	return sentance + " " + wd, stop
}
// isPunct reports whether every rune of s is punctuation. Besides the runes
// unicode.IsPunct accepts, the characters ` ' " ( and / are also treated as
// punctuation. The empty string yields true.
func isPunct(s string) bool {
	for _, ch := range s {
		if unicode.IsPunct(ch) {
			continue
		}
		if ch == '`' || ch == '\'' || ch == '"' || ch == '(' || ch == '/' {
			continue // still punct
		}
		return false
	}
	return true
}
// Excerpt returns the most statistically significant sentences of the text,
// for use in the Excerpt field. The selection and length limits are applied by
// present.
//
// titleWords and bodyWords are word lists in which sentence ends are marked by
// the separate words ".", "!" or "?". Words are assembled into sentences with
// addWd; each sentence is scored by how much more often the stems of its
// uncommon words occur in this text than words.Stems records for them, and the
// scored sentences are then sorted and handed to present.
//
// Neither input slice is modified.
func Excerpt(titleWords, bodyWords []string) string {
	var el extractList

	// Count how often each stem occurs in the whole text.
	stemMap := make(map[string]uint64)
	for _, wd := range bodyWords {
		stemMap[paicehusk.DefaultRules.Stem(wd)]++
	}
	for _, wd := range titleWords {
		stemMap[paicehusk.DefaultRules.Stem(wd)]++ // TODO are words in titles more important?
	}

	// Combine into a fresh slice: append(titleWords, bodyWords...) would write
	// into any spare capacity of the caller's titleWords backing array.
	wds := make([]string, 0, len(titleWords)+len(bodyWords))
	wds = append(wds, titleWords...)
	wds = append(wds, bodyWords...)

	sentance := ""
	score := 0.0
	// NOTE(review): count is not reset when a sentence ends, so it keeps
	// growing across sentences - confirm this is intended before changing it,
	// as the expected test outputs depend on it.
	count := 0
	seq := 0
	for _, wd := range wds {
		var isStop bool
		sentance, isStop = addWd(sentance, wd)
		if isStop {
			el = append(el, extractItem{
				sequence: seq,
				score:    score,
				count:    count + 1, // must be at least 1
				sentance: sentance,
			})
			sentance = ""
			score = 0.0
			seq++
			continue
		}

		// TODO Discuss correct level or maybe find a better algorithm for this
		if ent, ok := words.Words[wd]; ok && ent.Rank <= 100 {
			continue // do not score very common words
		}

		stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
		if usage, used := stemMap[stem]; used {
			relativeStemFreq := (float64(usage) / float64(len(wds))) - words.Stems[stem]
			if relativeStemFreq > 0.0 {
				score += relativeStemFreq
			}
		}
		count++
	}

	sort.Sort(el)
	return present(el)
}
// present turns the ranked sentence list el into the excerpt text.
//
// Sentences are taken in el's order while the running total of their counts is
// below 50 (the first entry is always considered), skipping entries that are
// one byte or shorter or contain nothing but punctuation and spaces. The
// chosen sentences are put back into their original sequence, joined with
// spaces, and " …" marks every gap in the sequence. A result longer than 250
// bytes is cut back rune by rune to at most 250 bytes and "…" is appended.
func present(el extractList) (ret string) {
	const (
		excerptWords = 50
		maxBytes     = 250
	)

	var (
		chosen    presentList
		wordTotal int
	)
	for rank := range el {
		item := el[rank]
		withinBudget := wordTotal < excerptWords || rank == 0
		if !withinBudget || len(item.sentance) <= 1 || !notEmpty(item.sentance) {
			continue
		}
		wordTotal += item.count
		chosen = append(chosen, presentItem{sequence: item.sequence, text: item.sentance})
	}
	sort.Sort(chosen)

	prevSeq := 0
	for idx, item := range chosen {
		txt := strings.TrimPrefix(item.text, ". ")
		if idx == 0 {
			ret = txt
		} else {
			if prevSeq+1 != item.sequence {
				ret += " …" // Horizontal ellipsis character marks skipped sentences
			}
			ret += " " + txt
		}
		prevSeq = item.sequence
	}

	if len(ret) <= maxBytes {
		return ret
	}
	// Too long: drop whole runes from the end until it fits.
	for len(ret) > maxBytes {
		_, size := utf8.DecodeLastRuneInString(ret)
		ret = ret[:len(ret)-size]
	}
	return ret + "…" // Horizontal ellipsis character added after truncation
}
// notEmpty reports whether wds contains at least one rune that is neither
// punctuation nor white space.
func notEmpty(wds string) bool {
	isContent := func(r rune) bool {
		return !unicode.IsPunct(r) && !unicode.IsSpace(r)
	}
	return strings.IndexFunc(wds, isContent) >= 0
}

View file

@ -0,0 +1,130 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package excerpt_test
import "testing"
import "github.com/documize/community/core/api/convert/excerpt"
import "strings"
import "fmt"
// TestExerpt checks excerpt.Excerpt against fixed expected strings for empty
// input, a repeated simple sentence, punctuation/digit handling, a longer
// lyric text, and an over-long result.
func TestExerpt(t *testing.T) {
// Empty or nil word lists must give an empty excerpt.
if excerpt.Excerpt(nil, nil) != "" ||
excerpt.Excerpt([]string{}, []string{}) != "" {
t.Error("empty lists do not return empty string")
}
// One sentence as the title; the body is that same sentence 201 times.
qbf := strings.Split("The quick brown fox jumps over the lazy dog .", " ")
qbf2 := qbf
for i := 0; i < 200; i++ {
qbf2 = append(qbf2, qbf...)
}
tst := excerpt.Excerpt(qbf, qbf2)
if tst !=
"The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog." {
t.Error("'quick brown fox' did not work:", tst)
}
// Punctuation, single digits and a "[" token; the body interleaves a counter
// between repetitions of the title words.
tt123 := strings.Split("Testing , testing ; 1 2 3 is fun ! Bracket [ anyone ? .", " ")
tt123a := tt123
for i := 0; i < 200; i++ {
tt123a = append(tt123a, fmt.Sprintf("%d", i))
tt123a = append(tt123a, tt123...)
}
tst2 := excerpt.Excerpt(tt123, tt123a)
if tst2 !=
"Testing, testing; 123 is fun! … Testing, testing; 123 is fun! … 0 Testing, testing; 123 is fun!" {
t.Error("'Testing testing 123' did not work:", tst2)
}
// Song lyrics: every newline is turned into a " . " stop before splitting on spaces.
s := strings.Split(strings.Replace(`
It's supercalifragilisticexpialidocious
Even though the sound of it is something quite atrocious
If you say it loud enough, you'll always sound precocious
Supercalifragilisticexpialidocious
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
Because I was afraid to speak
When I was just a lad
My father gave me nose a tweak
And told me I was bad
But then one day I learned a word
That saved me achin' nose
The biggest word I ever heard
And this is how it goes, oh
Supercalifragilisticexpialidocious
Even though the sound of it is something quite atrocious
If you say it loud enough, you'll always sound precocious
Supercalifragilisticexpialidocious
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
He traveled all around the world
And everywhere he went
He'd use his word and all would say
There goes a clever gent
When Dukes and Maharajahs
Pass the time of day with me
I say me special word
And then they ask me out to tea
Oh, supercalifragilisticexpialidocious
Even though the sound of it is something quite atrocious
If you say it loud enough, you'll always sound precocious
Supercalifragilisticexpialidocious
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
No, you can say it backwards, which is dociousaliexpilisticfragicalirupus
But that's going a bit too far, don't you think?
So when the cat has got your tongue
There's no need for dismay
Just summon up this word
And then you've got a lot to say
But better use it carefully
Or it could change your life
For example, yes, one night I said it to me girl
And now me girl's my wife, oh, and a lovely thing she's too
She's, supercalifragilisticexpialidocious
Supercalifragilisticexpialidocious
Supercalifragilisticexpialidocious
Supercalifragilisticexpialidocious
. `, "\n", " . ", -1), " ")
ts := []string{"Supercalifragilisticexpialidocious", "song", "lyrics"}
st := excerpt.Excerpt(ts, s)
if st != "Supercalifragilisticexpialidocious song lyrics. … Um diddle, diddle diddle, um diddle ay. Um diddle, diddle diddle, um diddle ay." {
t.Error("'Supercalifragilisticexpialidocious song lyrics' did not work:", st)
}
// One very long word repeated many times: the expected excerpt is cut short
// and ends with an ellipsis.
ss := []string{"Supercalifragilisticexpialidocious", "!"}
ssa := ss
for i := 0; i < 100; i++ {
ssa = append(ssa, ss...)
}
sst := excerpt.Excerpt(ss, ssa)
if sst !=
"Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious…" {
t.Error("'Supercalifragilisticexpialidocious' did not work:", sst)
}
}

View file

@ -0,0 +1,10 @@
How the HTML conversion works
=============================
Uses the "golang.org/x/net" repository package "html" to parse the HTML into a tree,
then walks the tree using processHeadings() to make a series of sections with a heading as their title and the following HTML as the body.
Importantly, if a heading is nested inside some other element, that enclosing element itself is dropped and only its children are processed, so that the heading can start a new section. This mostly works well, but may have some unintended side-effects.
On the subject of unintended side-effects, or rather their avoidance, "script" HTML tags and their contents are not passed through.

View file

@ -0,0 +1,13 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
// Package html documizes html files.
package html

View file

@ -0,0 +1,244 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package html
import (
"bytes"
"fmt"
"strings"
api "github.com/documize/community/core/convapi"
"github.com/documize/community/core/log"
"github.com/documize/community/core/utility"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"golang.org/x/net/context"
)
// maxTitle is the largest title length, in bytes, that newSect keeps as a page
// title; anything beyond it is moved to the start of the page body.
const maxTitle = 2000 // NOTE: must be the same length as database page.title
// maxBody is the largest page body, in bytes, that renderAppend will build
// before it starts a new section; a single rendered element larger than this
// is replaced by a warning message.
const maxBody = 4000000 // NOTE: must be less than the mysql max_allowed_packet limit, amongst other values
// htmlToSplit holds the state used while splitting one HTML document into pages.
type htmlToSplit struct {
// CFR is the conversion response being filled in: finished sections are
// appended to CFR.Pages and a failure while walking the body is stored in CFR.Err.
CFR *api.DocumentConversionResponse
// thisSect is the section currently being accumulated; it is appended to
// CFR.Pages when the next section starts or the body has been processed.
thisSect api.Page
// nodeCache remembers the result of nodeContainsHeading for each node visited.
nodeCache map[*html.Node]bool
}
// Convert provides the standard interface for conversion of an HTML document.
// The HTML needs no conversion, so the request's Filedata is simply handed
// back as the PagesHTML of a new api.DocumentConversionResponse - effectively
// a no-op.
//
// The unchecked type assertion panics if in is not a
// *api.DocumentConversionRequest.
func Convert(ctx context.Context, in interface{}) (interface{}, error) {
	request := in.(*api.DocumentConversionRequest)

	response := new(api.DocumentConversionResponse)
	response.PagesHTML = request.Filedata
	return response, nil
}
// SplitIfHTML splits HTML code into pages, if it exists.
// When res.PagesHTML is empty nothing is done; otherwise the HTML is parsed
// and the resulting pages are appended to res.Pages.
func SplitIfHTML(req *api.DocumentConversionRequest, res *api.DocumentConversionResponse) error {
	if len(res.PagesHTML) > 0 {
		splitter := &htmlToSplit{
			CFR:       res,
			nodeCache: map[*html.Node]bool{},
		}
		// To debug, dump Level, Title and len(Body) of each splitter.CFR.Pages
		// entry after this call.
		return splitter.testableSplit(req, res)
	}
	return nil
}
// testableSplit parses response.PagesHTML and walks every <body> found
// directly under an <html> element, building pages into h.CFR.Pages.
// NOTE pointer receiver so that test code can inspect generated datastructures.
//
// The first page of each body is a level-1 page titled after the request's
// file name. A parse failure or a missing document node is returned as an
// error; a failure while walking the body is recorded in h.CFR.Err instead and
// nil is returned.
func (h *htmlToSplit) testableSplit(request *api.DocumentConversionRequest,
	response *api.DocumentConversionResponse) error {
	root, parseErr := html.Parse(bytes.NewReader(response.PagesHTML))
	if parseErr != nil {
		return parseErr
	}
	if root.Type != html.DocumentNode {
		return fmt.Errorf("no HTML document node")
	}

	for top := root.FirstChild; top != nil; top = top.NextSibling {
		if top.Type != html.ElementNode || top.DataAtom != atom.Html {
			continue
		}
		for child := top.FirstChild; child != nil; child = child.NextSibling {
			if child.Type != html.ElementNode || child.DataAtom != atom.Body {
				continue
			}
			h.thisSect = api.Page{
				Level: 1,
				Title: utility.BeautifyFilename(request.Filename),
				Body:  []byte(``),
			}
			if walkErr := h.processChildren(child); walkErr != nil {
				h.CFR.Err = walkErr.Error()
			}
			// Flush the section that was still open when the body ended.
			h.CFR.Pages = append(h.CFR.Pages, h.thisSect)
		}
	}
	return nil
}
// getLevel maps a heading atom to its page level: H1 gives 2, H2 gives 3 and
// so on up to H6, which gives 7. Any other atom gives 1.
func getLevel(at atom.Atom) uint64 {
	switch at {
	case atom.H1:
		return 2
	case atom.H2:
		return 3
	case atom.H3:
		return 4
	case atom.H4:
		return 5
	case atom.H5:
		return 6
	case atom.H6:
		return 7
	}
	return 1
}
// processChildren handles each direct child of bdy in document order:
// heading elements start a new section, other elements go through
// renderNonHeading, and non-element nodes are appended to the current section.
// The first error stops the walk and is returned.
func (h *htmlToSplit) processChildren(bdy *html.Node) error {
	for node := bdy.FirstChild; node != nil; node = node.NextSibling {
		var err error
		switch {
		case node.Type != html.ElementNode:
			err = h.renderAppend(node)
		case getLevel(node.DataAtom) > 1:
			err = h.renderHeading(node, getLevel(node.DataAtom))
		default:
			err = h.renderNonHeading(node)
		}
		if err != nil {
			return err
		}
	}
	return nil
}
// stripZeroWidthSpaces returns str with every zero width space (U+200B)
// removed. All other runes are kept in order; bytes that are not valid UTF-8
// come out as U+FFFD, as they do when ranging over a string.
//
// The result is built in a pre-sized buffer rather than by repeated string
// concatenation, which copied the whole result for every rune.
func stripZeroWidthSpaces(str string) string {
	const zeroWidthSpace = 0x200B

	var out bytes.Buffer
	out.Grow(len(str)) // the result is never longer than this unless str holds invalid UTF-8
	for _, r := range str {
		if r != zeroWidthSpace {
			out.WriteRune(r)
		}
	}
	return out.String()
}
// renderHeading starts a new section at the given level, titled with the text
// content of heading node c. Zero width spaces are removed from the title and
// headings whose text is blank are ignored.
func (h *htmlToSplit) renderHeading(c *html.Node, level uint64) error {
	inner, err := byteRenderChildren(c) // get heading html
	if err != nil {
		return err
	}

	text, err := utility.HTML(string(inner)).Text(false) // heading text
	if err != nil {
		return err
	}

	text = stripZeroWidthSpaces(text)
	if strings.TrimSpace(text) == "" {
		return nil // only put in non-empty headings
	}
	h.newSect(text, level)
	return nil
}
// newSect closes the section being built by appending it to h.CFR.Pages and
// starts a new, empty one with title tstr at the given level.
//
// A title longer than maxTitle bytes is split: the first part becomes the
// title and the remainder becomes the start of the new section's body. The
// split point is moved back to the nearest rune boundary so that a multi-byte
// UTF-8 character is never cut in half.
func (h *htmlToSplit) newSect(tstr string, level uint64) {
	h.CFR.Pages = append(h.CFR.Pages, h.thisSect)

	title := tstr //was: utility.EscapeHTMLcomplexChars(tstr) -- removed to avoid double-escaping
	body := ``
	if len(title) > maxTitle {
		cut := maxTitle
		// 10xxxxxx marks a UTF-8 continuation byte. A valid rune has at most
		// three of them, so never step back further than that.
		for back := 0; back < 3 && cut > 0 && title[cut]&0xC0 == 0x80; back++ {
			cut--
		}
		body = title[cut:]
		title = title[:cut]
	}

	h.thisSect = api.Page{
		Level: level,
		Title: title,
		Body:  []byte(body)}
}
// renderNonHeading handles an element that is not itself a heading. If a
// heading is nested somewhere inside it, the element is skipped and its
// children are processed individually so that the heading can be reached;
// otherwise the whole element is rendered into the current section.
func (h *htmlToSplit) renderNonHeading(c *html.Node) error {
	if !h.nodeContainsHeading(c) {
		return h.renderAppend(c)
	}
	// ignore this atom in order to get at the contents
	return h.processChildren(c)
}
// renderAppend renders node c to HTML, escapes it with
// utility.EscapeHTMLcomplexCharsByte and appends it to the current section.
//
// An element whose escaped rendering exceeds maxBody is replaced by a warning
// message, which is also logged. If appending would take the current section
// past maxBody, a continuation section titled "-" is started first.
func (h *htmlToSplit) renderAppend(c *html.Node) error {
	rendered, err := byteRender(c)
	if err != nil {
		return err
	}

	chunk := utility.EscapeHTMLcomplexCharsByte(rendered)
	if size := len(chunk); size > maxBody {
		warning := fmt.Sprintf("(Documize warning: HTML render element ignored, size of %d exceeded maxBody of %d.)", size, maxBody)
		log.Info(warning)
		chunk = []byte("<p><b>" + warning + "</b></p>")
	}

	wouldOverflow := len(h.thisSect.Body)+len(chunk) > maxBody
	if wouldOverflow {
		// plus one so that the new "-" one is part of the previous
		h.newSect("-", h.thisSect.Level+1)
	}

	h.thisSect.Body = append(h.thisSect.Body, chunk...)
	return nil
}
// byteRender renders node n, including the node itself, to HTML. Whatever was
// written before a rendering error is returned together with that error.
func byteRender(n *html.Node) ([]byte, error) {
	out := new(bytes.Buffer)
	renderErr := html.Render(out, n)
	return out.Bytes(), renderErr
}
// byteRenderChildren renders the children of n, but not n itself, to HTML in
// document order. On a rendering error nothing but the error is returned.
func byteRenderChildren(n *html.Node) ([]byte, error) {
	out := new(bytes.Buffer)
	child := n.FirstChild
	for child != nil {
		if err := html.Render(out, child); err != nil {
			return nil, err
		}
		child = child.NextSibling
	}
	return out.Bytes(), nil
}
// nodeContainsHeading reports whether n is an H1-H6 heading or has one
// anywhere beneath it. Answers are remembered in h.nodeCache so every node is
// examined only once.
func (h *htmlToSplit) nodeContainsHeading(n *html.Node) bool {
	if cached, seen := h.nodeCache[n]; seen {
		return cached
	}

	found := false
	switch n.DataAtom {
	case atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
		found = true
	default:
		// Stop at the first child that holds a heading.
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			if h.nodeContainsHeading(c) {
				h.nodeCache[c] = true
				found = true
				break
			}
		}
	}

	h.nodeCache[n] = found
	return found
}

View file

@ -0,0 +1,384 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package html_test
import (
"strings"
"testing"
)
import api "github.com/documize/community/core/convapi"
import "github.com/documize/community/core/api/convert/html"
// b is the HTML fixture that TestHTML passes to html.SplitIfHTML. It is an
// HTML rendering of a "Markdown: Basics" page and contains headings from
// <h1> down to <h6>; the <h5> heading sits inside six nested <div> elements
// and is directly followed by the text "Body 555.", which TestHTML expects
// to find in the page at index 10.
const b string = `
<h1>Markdown: Basics</h1>
<ul id="ProjectSubmenu">
<li><a href="/projects/markdown/" title="Markdown Project Page">Main</a></li>
<li><a class="selected" title="Markdown Basics">Basics</a></li>
<li><a href="/projects/markdown/syntax" title="Markdown Syntax Documentation">Syntax</a></li>
<li><a href="/projects/markdown/license" title="Pricing and License Information">License</a></li>
<li><a href="/projects/markdown/dingus" title="Online Markdown Web Form">Dingus</a></li>
</ul>
<h2>Getting the Gist of Markdown's Formatting Syntax</h2>
<p>This page offers a brief overview of what it's like to use Markdown.
The <a href="/projects/markdown/syntax" title="Markdown Syntax">syntax page</a> provides complete, detailed documentation for
every feature, but Markdown should be very easy to pick up simply by
looking at a few examples of it in action. The examples on this page
are written in a before/after style, showing example syntax and the
HTML output produced by Markdown.</p>
<p>It's also helpful to simply try Markdown out; the <a href="/projects/markdown/dingus" title="Markdown Dingus">Dingus</a> is a
web application that allows you type your own Markdown-formatted text
and translate it to XHTML.</p>
<p><strong>Note:</strong> This document is itself written using Markdown; you
can <a href="/projects/markdown/basics.text">see the source for it by adding '.text' to the URL</a>.</p>
<h2>Paragraphs, Headers, Blockquotes</h2>
<p>A paragraph is simply one or more consecutive lines of text, separated
by one or more blank lines. (A blank line is any line that looks like a
blank line -- a line containing nothing spaces or tabs is considered
blank.) Normal paragraphs should not be intended with spaces or tabs.</p>
<p>Markdown offers two styles of headers: <em>Setext</em> and <em>atx</em>.
Setext-style headers for <code>&lt;h1&gt;</code> and <code>&lt;h2&gt;</code> are created by
&quot;underlining&quot; with equal signs (<code>=</code>) and hyphens (<code>-</code>), respectively.
To create an atx-style header, you put 1-6 hash marks (<code>#</code>) at the
beginning of the line -- the number of hashes equals the resulting
HTML header level.</p>
<p>Blockquotes are indicated using email-style '<code>&gt;</code>' angle brackets.</p>
<p>Markdown:</p>
<pre><code>A First Level Header
====================
A Second Level Header
---------------------
Now is the time for all good men to come to
the aid of their country. This is just a
regular paragraph.
The quick brown fox jumped over the lazy
dog's back.
### Header 3
&gt; This is a blockquote.
&gt;
&gt; This is the second paragraph in the blockquote.
&gt;
&gt; ## This is an H2 in a blockquote
</code></pre>
<p>Output:</p>
<pre><code>&lt;h1&gt;A First Level Header&lt;/h1&gt;
&lt;h2&gt;A Second Level Header&lt;/h2&gt;
&lt;p&gt;Now is the time for all good men to come to
the aid of their country. This is just a
regular paragraph.&lt;/p&gt;
&lt;p&gt;The quick brown fox jumped over the lazy
dog's back.&lt;/p&gt;
&lt;h3&gt;Header 3&lt;/h3&gt;
&lt;blockquote&gt;
&lt;p&gt;This is a blockquote.&lt;/p&gt;
&lt;p&gt;This is the second paragraph in the blockquote.&lt;/p&gt;
&lt;h2&gt;This is an H2 in a blockquote&lt;/h2&gt;
&lt;/blockquote&gt;
</code></pre>
<h3>Phrase Emphasis</h3>
<p>Markdown uses asterisks and underscores to indicate spans of emphasis.</p>
<p>Markdown:</p>
<pre><code>Some of these words *are emphasized*.
Some of these words _are emphasized also_.
Use two asterisks for **strong emphasis**.
Or, if you prefer, __use two underscores instead__.
</code></pre>
<p>Output:</p>
<pre><code>&lt;p&gt;Some of these words &lt;em&gt;are emphasized&lt;/em&gt;.
Some of these words &lt;em&gt;are emphasized also&lt;/em&gt;.&lt;/p&gt;
&lt;p&gt;Use two asterisks for &lt;strong&gt;strong emphasis&lt;/strong&gt;.
Or, if you prefer, &lt;strong&gt;use two underscores instead&lt;/strong&gt;.&lt;/p&gt;
</code></pre>
<h2>Lists</h2>
<p>Unordered (bulleted) lists use asterisks, pluses, and hyphens (<code>*</code>,
<code>+</code>, and <code>-</code>) as list markers. These three markers are
interchangable; this:</p>
<pre><code>* Candy.
* Gum.
* Booze.
</code></pre>
<p>this:</p>
<pre><code>+ Candy.
+ Gum.
+ Booze.
</code></pre>
<p>and this:</p>
<pre><code>- Candy.
- Gum.
- Booze.
</code></pre>
<p>all produce the same output:</p>
<pre><code>&lt;ul&gt;
&lt;li&gt;Candy.&lt;/li&gt;
&lt;li&gt;Gum.&lt;/li&gt;
&lt;li&gt;Booze.&lt;/li&gt;
&lt;/ul&gt;
</code></pre>
<p>Ordered (numbered) lists use regular numbers, followed by periods, as
list markers:</p>
<pre><code>1. Red
2. Green
3. Blue
</code></pre>
<p>Output:</p>
<pre><code>&lt;ol&gt;
&lt;li&gt;Red&lt;/li&gt;
&lt;li&gt;Green&lt;/li&gt;
&lt;li&gt;Blue&lt;/li&gt;
&lt;/ol&gt;
</code></pre>
<p>If you put blank lines between items, you'll get <code>&lt;p&gt;</code> tags for the
list item text. You can create multi-paragraph list items by indenting
the paragraphs by 4 spaces or 1 tab:</p>
<pre><code>* A list item.
With multiple paragraphs.
* Another item in the list.
</code></pre>
<p>Output:</p>
<pre><code>&lt;ul&gt;
&lt;li&gt;&lt;p&gt;A list item.&lt;/p&gt;
&lt;p&gt;With multiple paragraphs.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;Another item in the list.&lt;/p&gt;&lt;/li&gt;
&lt;/ul&gt;
</code></pre>
<h3>Links</h3>
<p>Markdown supports two styles for creating links: <em>inline</em> and
<em>reference</em>. With both styles, you use square brackets to delimit the
text you want to turn into a link.</p>
<p>Inline-style links use parentheses immediately after the link text.
For example:</p>
<pre><code>This is an [example link](http://example.com/).
</code></pre>
<p>Output:</p>
<pre><code>&lt;p&gt;This is an &lt;a href=&quot;http://example.com/&quot;&gt;
example link&lt;/a&gt;.&lt;/p&gt;
</code></pre>
<p>Optionally, you may include a title attribute in the parentheses:</p>
<pre><code>This is an [example link](http://example.com/ &quot;With a Title&quot;).
</code></pre>
<p>Output:</p>
<pre><code>&lt;p&gt;This is an &lt;a href=&quot;http://example.com/&quot; title=&quot;With a Title&quot;&gt;
example link&lt;/a&gt;.&lt;/p&gt;
</code></pre>
<p>Reference-style links allow you to refer to your links by names, which
you define elsewhere in your document:</p>
<pre><code>I get 10 times more traffic from [Google][1] than from
[Yahoo][2] or [MSN][3].
[1]: http://google.com/ &quot;Google&quot;
[2]: http://search.yahoo.com/ &quot;Yahoo Search&quot;
[3]: http://search.msn.com/ &quot;MSN Search&quot;
</code></pre>
<p>Output:</p>
<pre><code>&lt;p&gt;I get 10 times more traffic from &lt;a href=&quot;http://google.com/&quot;
title=&quot;Google&quot;&gt;Google&lt;/a&gt; than from &lt;a href=&quot;http://search.yahoo.com/&quot;
title=&quot;Yahoo Search&quot;&gt;Yahoo&lt;/a&gt; or &lt;a href=&quot;http://search.msn.com/&quot;
title=&quot;MSN Search&quot;&gt;MSN&lt;/a&gt;.&lt;/p&gt;
</code></pre>
<p>The title attribute is optional. Link names may contain letters,
numbers and spaces, but are <em>not</em> case sensitive:</p>
<pre><code>I start my morning with a cup of coffee and
[The New York Times][NY Times].
[ny times]: http://www.nytimes.com/
</code></pre>
<p>Output:</p>
<pre><code>&lt;p&gt;I start my morning with a cup of coffee and
&lt;a href=&quot;http://www.nytimes.com/&quot;&gt;The New York Times&lt;/a&gt;.&lt;/p&gt;
</code></pre>
<h3>Images</h3>
<p>Image syntax is very much like link syntax.</p>
<p>Inline (titles are optional):</p>
<pre><code>![alt text](/path/to/img.jpg &quot;Title&quot;)
</code></pre>
<p>Reference-style:</p>
<pre><code>![alt text][id]
[id]: /path/to/img.jpg &quot;Title&quot;
</code></pre>
<p>Both of the above examples produce the same output:</p>
<pre><code>&lt;img src=&quot;/path/to/img.jpg&quot; alt=&quot;alt text&quot; title=&quot;Title&quot; /&gt;
</code></pre>
<h3>Code</h3>
<p>In a regular paragraph, you can create code span by wrapping text in
backtick quotes. Any ampersands (<code>&amp;</code>) and angle brackets (<code>&lt;</code> or
<code>&gt;</code>) will automatically be translated into HTML entities. This makes
it easy to use Markdown to write about HTML example code:</p>
<pre><code>I strongly recommend against using any "&lt;blink&gt;" tags.
I wish SmartyPants used named entities like "&amp;mdash;""
instead of decimal-encoded entites like "&amp;#8212;".
</code></pre>
<p>Output:</p>
<pre><code>&lt;p&gt;I strongly recommend against using any
&lt;code&gt;&amp;lt;blink&amp;gt;&lt;/code&gt; tags.&lt;/p&gt;
&lt;p&gt;I wish SmartyPants used named entities like
&lt;code&gt;&amp;amp;mdash;&lt;/code&gt; instead of decimal-encoded
entites like &lt;code&gt;&amp;amp;#8212;&lt;/code&gt;.&lt;/p&gt;
</code></pre>
<p>To specify an entire block of pre-formatted code, indent every line of
the block by 4 spaces or 1 tab. Just like with code spans, <code>&amp;</code>, <code>&lt;</code>,
and <code>&gt;</code> characters will be escaped automatically.</p>
<p>Markdown:</p>
<pre><code>If you want your page to validate under XHTML 1.0 Strict,
you've got to put paragraph tags in your blockquotes:
&lt;blockquote&gt;
&lt;p&gt;For example.&lt;/p&gt;
&lt;/blockquote&gt;
</code></pre>
<p>Output:</p>
<pre><code>&lt;p&gt;If you want your page to validate under XHTML 1.0 Strict,
you've got to put paragraph tags in your blockquotes:&lt;/p&gt;
&lt;pre&gt;&lt;code&gt;&amp;lt;blockquote&amp;gt;
&amp;lt;p&amp;gt;For example.&amp;lt;/p&amp;gt;
&amp;lt;/blockquote&amp;gt;
&lt;/code&gt;&lt;/pre&gt;
</code></pre>
<h4>Header4</h4>
<div><div><div><div><div><div>
<h5>Header5</h5>Body 555.
</div></div></div></div></div></div>
<h6>Header6</h6>
`
// TestHTML exercises html.SplitIfHTML with three inputs: an empty response,
// a single <h1> whose text is far longer than 2000 bytes, and the fixture b.
//
// Each expectation is checked separately, and every index into res.Pages is
// guarded by a length check, so a regression produces a descriptive failure
// rather than an index-out-of-range panic or a bare "<nil>" message.
func TestHTML(t *testing.T) {
	// Case 1: a response with no PagesHTML must come back still empty.
	req := &api.DocumentConversionRequest{}
	res := &api.DocumentConversionResponse{}
	if err := html.SplitIfHTML(req, res); err != nil {
		t.Fatalf("empty response: unexpected error: %v", err)
	}
	if len(res.PagesHTML) != 0 || len(res.Pages) != 0 || len(res.EmbeddedFiles) != 0 {
		t.Fatalf("empty response: got %d bytes of PagesHTML, %d pages, %d embedded files; want all zero",
			len(res.PagesHTML), len(res.Pages), len(res.EmbeddedFiles))
	}

	// Case 2: an oversized heading must yield a first page whose title is
	// at most 2000 bytes long.
	titleTooBig := []byte("<h1>" + strings.Repeat("title too long ", 2048) + "</h1>")
	req = &api.DocumentConversionRequest{}
	res = &api.DocumentConversionResponse{PagesHTML: titleTooBig}
	if err := html.SplitIfHTML(req, res); err != nil {
		t.Fatalf("oversized title: unexpected error: %v", err)
	}
	if len(res.Pages) == 0 {
		t.Fatal("oversized title: no pages produced")
	}
	if n := len(res.Pages[0].Title); n > 2000 {
		t.Fatalf("oversized title: title is %d bytes long, want at most 2000", n)
	}

	// Case 3: the fixture must put "Header5" / "Body 555." in page index 10.
	req = &api.DocumentConversionRequest{}
	res = &api.DocumentConversionResponse{PagesHTML: []byte(b)}
	if err := html.SplitIfHTML(req, res); err != nil {
		t.Fatalf("fixture: unexpected error: %v", err)
	}
	//for p, pg := range res.Pages {
	//	t.Logf("%d %d %d %s", p, pg.Level, len(pg.Body), pg.Title)
	//}
	if len(res.Pages) <= 10 {
		t.Fatalf("fixture: got %d pages, want at least 11", len(res.Pages))
	}
	page := res.Pages[10]
	if !strings.HasPrefix(page.Title, "Header5") ||
		!strings.HasPrefix(string(page.Body), "Body 555.") {
		t.Errorf("wrong page ten title: `%s` body: `%s`", page.Title, string(page.Body))
	}
}

28
core/api/convert/md/md.go Normal file
View file

@ -0,0 +1,28 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package md
import (
	"fmt"

	"github.com/documize/blackfriday"
	api "github.com/documize/community/core/convapi"
	"golang.org/x/net/context"
)
// Convert provides the standard interface for conversion of a Markdown document.
//
// in must be a non-nil *api.DocumentConversionRequest. On success the result
// is a *api.DocumentConversionResponse whose PagesHTML holds the request's
// Filedata converted by blackfriday.MarkdownCommon. Any other input,
// including a nil request pointer, yields a nil result and an error instead
// of a panic. ctx is not consulted.
func Convert(ctx context.Context, in interface{}) (interface{}, error) {
	// Checked assertion: a wrong dynamic type or a typed nil pointer must be
	// reported to the caller, not dereferenced.
	req, ok := in.(*api.DocumentConversionRequest)
	if !ok || req == nil {
		return nil, fmt.Errorf("md: input must be a non-nil *api.DocumentConversionRequest, got %T", in)
	}
	return &api.DocumentConversionResponse{
		PagesHTML: blackfriday.MarkdownCommon(req.Filedata),
	}, nil
}