restructure directories

2025-07-24 07:39:43 +02:00 · 2016-07-20 15:58:37 +01:00 · 2016-07-20 15:58:37 +01:00 · a2ce777762
commit a2ce777762
parent 7e4ed6545b
159 changed files with 320 additions and 323 deletions
--- a/core/api/convert/html/README.md
+++ b/core/api/convert/html/README.md
@ -0,0 +1,10 @@
+How the HTML conversion works
+=============================
+
+Uses the "golang.org/x/net" repository package "html" to parse the HTML into a tree,
+then walks the tree using processChildren() to make a series of sections, each with a heading as its title and the HTML that follows it as the body.
+
+Importantly, if a heading is within some other structure, that other structure is ignored in order to get the heading into the list. This seems to mostly work well, but may have some unintended side-effects.
+
+On the subject of unintended side-effects, or rather their avoidance, "script" HTML tags and their contents are not passed through.
+
--- a/core/api/convert/html/doc.go
+++ b/core/api/convert/html/doc.go
@ -0,0 +1,13 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under 
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>. 
+//
+// https://documize.com
+
+// Package html splits HTML documents into Documize pages, using headings as page titles.
+package html
--- a/core/api/convert/html/html.go
+++ b/core/api/convert/html/html.go
@ -0,0 +1,244 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>.
+//
+// https://documize.com
+
+package html
+
+import (
+	"bytes"
+	"fmt"
+	"strings"
+
+	api "github.com/documize/community/core/convapi"
+	"github.com/documize/community/core/log"
+	"github.com/documize/community/core/utility"
+
+	"golang.org/x/net/html"
+	"golang.org/x/net/html/atom"
+
+	"golang.org/x/net/context"
+)
+
+// Size limits applied to the pages produced by the splitter.
+const (
+	maxTitle = 2000    // NOTE: must be the same length as database page.title
+	maxBody  = 4000000 // NOTE: must be less than the mysql max_allowed_packet limit, amongst other values
+)
+
+// htmlToSplit holds the state used while splitting one HTML document into pages.
+type htmlToSplit struct {
+	// CFR is the response that completed sections are appended to (Pages) and
+	// where a processing error message is recorded (Err).
+	CFR       *api.DocumentConversionResponse
+	// thisSect is the section currently being built; it is appended to
+	// CFR.Pages when a new section starts and once more when the body ends.
+	thisSect  api.Page
+	// nodeCache memoises nodeContainsHeading results per node.
+	nodeCache map[*html.Node]bool
+}
+
+// Convert provides the standard interface for conversion of an HTML document.
+// HTML needs no conversion, so the function simply returns a pointer to an
+// api.DocumentConversionResponse whose PagesHTML is the request's Filedata.
+//
+// ctx is not used by this implementation.
+// in must be a non-nil *api.DocumentConversionRequest; any other value yields an
+// error instead of a panic from the type assertion or a nil dereference.
+func Convert(ctx context.Context, in interface{}) (interface{}, error) {
+	req, ok := in.(*api.DocumentConversionRequest)
+	if !ok {
+		return nil, fmt.Errorf("html convert: unexpected request type %T", in)
+	}
+	if req == nil {
+		return nil, fmt.Errorf("html convert: nil request")
+	}
+	return &api.DocumentConversionResponse{PagesHTML: req.Filedata}, nil
+}
+
+// SplitIfHTML breaks the HTML held in res.PagesHTML into pages, which are stored on res.
+// When res carries no HTML there is nothing to split and nil is returned.
+func SplitIfHTML(req *api.DocumentConversionRequest, res *api.DocumentConversionResponse) error {
+	if len(res.PagesHTML) > 0 {
+		splitter := &htmlToSplit{nodeCache: make(map[*html.Node]bool), CFR: res}
+		return splitter.testableSplit(req, res)
+	}
+	return nil
+}
+
+// testableSplit parses response.PagesHTML and splits each <body> found directly
+// under an <html> element into sections.
+//
+// For every such body a level-1 section titled from request.Filename is started,
+// the body's children are processed, and the section left open at the end is
+// appended to h.CFR.Pages. An error from processing the children is recorded in
+// h.CFR.Err rather than returned; only a parse failure or a missing document
+// node is returned as an error.
+//
+// NOTE: pointer receiver so that test code can inspect the generated data structures.
+func (h *htmlToSplit) testableSplit(request *api.DocumentConversionRequest,
+	response *api.DocumentConversionResponse) error {
+	root, err := html.Parse(bytes.NewReader(response.PagesHTML))
+	if err != nil {
+		return err
+	}
+	if root.Type != html.DocumentNode {
+		return fmt.Errorf("no HTML document node")
+	}
+	for top := root.FirstChild; top != nil; top = top.NextSibling {
+		if top.Type != html.ElementNode || top.DataAtom != atom.Html {
+			continue // only <html> elements are of interest at this level
+		}
+		for child := top.FirstChild; child != nil; child = child.NextSibling {
+			if child.Type != html.ElementNode || child.DataAtom != atom.Body {
+				continue // skip everything except <body>
+			}
+			h.thisSect = api.Page{
+				Level: 1,
+				Title: utility.BeautifyFilename(request.Filename),
+				Body:  []byte(""),
+			}
+			if procErr := h.processChildren(child); procErr != nil {
+				h.CFR.Err = procErr.Error()
+			}
+			h.CFR.Pages = append(h.CFR.Pages, h.thisSect)
+		}
+	}
+	return nil
+}
+
+// getLevel maps a heading atom to its page level: H1 gives 2, H2 gives 3, and so
+// on up to H6 giving 7. Any other atom gives 1, so callers treat a result
+// greater than 1 as "this is a heading".
+func getLevel(at atom.Atom) uint64 {
+	switch at {
+	case atom.H1:
+		return 2
+	case atom.H2:
+		return 3
+	case atom.H3:
+		return 4
+	case atom.H4:
+		return 5
+	case atom.H5:
+		return 6
+	case atom.H6:
+		return 7
+	}
+	return 1
+}
+
+// processChildren walks the direct children of bdy in document order.
+// Non-element nodes are appended to the current section, heading elements start
+// a new section, and any other element goes through renderNonHeading.
+// The first error encountered stops the walk and is returned.
+func (h *htmlToSplit) processChildren(bdy *html.Node) error {
+	for node := bdy.FirstChild; node != nil; node = node.NextSibling {
+		var err error
+		switch {
+		case node.Type != html.ElementNode:
+			err = h.renderAppend(node)
+		case getLevel(node.DataAtom) > 1:
+			err = h.renderHeading(node, getLevel(node.DataAtom))
+		default:
+			err = h.renderNonHeading(node)
+		}
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// stripZeroWidthSpaces returns str with every zero-width space (U+200B) removed.
+// strings.Map builds the result in a single pass, avoiding the quadratic copying
+// of concatenating one rune at a time onto a string.
+func stripZeroWidthSpaces(str string) string {
+	return strings.Map(func(r rune) rune {
+		if r == 0x200B { // zero width space
+			return -1 // a negative result tells strings.Map to drop the rune
+		}
+		return r
+	}, str)
+}
+
+// renderHeading starts a new section for the heading element c at the given level.
+// The heading's children are rendered to HTML, converted to text through
+// utility.HTML(...).Text and stripped of zero-width spaces. A heading whose text
+// is blank after trimming does not start a section.
+func (h *htmlToSplit) renderHeading(c *html.Node, level uint64) error {
+	inner, err := byteRenderChildren(c) // HTML inside the heading tag
+	if err != nil {
+		return err
+	}
+	text, err := utility.HTML(string(inner)).Text(false)
+	if err != nil {
+		return err
+	}
+	text = stripZeroWidthSpaces(text)
+	if strings.TrimSpace(text) == "" {
+		return nil // empty headings are skipped
+	}
+	h.newSect(text, level)
+	return nil
+}
+
+// newSect closes the section being built and starts a new one.
+//
+// The current section is appended to h.CFR.Pages, then h.thisSect is replaced by
+// a fresh page at the given level titled tstr. A title longer than maxTitle bytes
+// is cut, and the overflow becomes the start of the new section's body. The cut
+// is moved back to a UTF-8 character boundary so that a multi-byte character is
+// never split between the title and the body.
+func (h *htmlToSplit) newSect(tstr string, level uint64) {
+	h.CFR.Pages = append(h.CFR.Pages, h.thisSect)
+	title := tstr // was: utility.EscapeHTMLcomplexChars(tstr) -- removed to avoid double-escaping
+	body := ``
+	if len(title) > maxTitle {
+		cut := maxTitle
+		// UTF-8 continuation bytes have the form 10xxxxxx and a character has at
+		// most three of them, so step back at most three bytes to reach the
+		// first byte of the character that straddles the limit.
+		for back := 0; back < 3 && cut > 0 && title[cut]&0xC0 == 0x80; back++ {
+			cut--
+		}
+		body = title[cut:]
+		title = title[:cut]
+	}
+	h.thisSect = api.Page{
+		Level: level,
+		Title: title,
+		Body:  []byte(body)}
+}
+
+// renderNonHeading handles an element that is not itself a heading.
+// If no heading is nested inside it, the element is rendered whole into the
+// current section. Otherwise the element itself is dropped and its children are
+// processed individually, so that the nested heading can start a section.
+func (h *htmlToSplit) renderNonHeading(c *html.Node) error {
+	if !h.nodeContainsHeading(c) {
+		return h.renderAppend(c)
+	}
+	return h.processChildren(c)
+}
+
+// renderAppend renders node c to HTML, escapes it with
+// utility.EscapeHTMLcomplexCharsByte and appends it to the current section body.
+//
+// A single element larger than maxBody is replaced by a warning paragraph, which
+// is also logged. If adding the element would push the section body past
+// maxBody, a continuation section titled "-" is started one level deeper first.
+func (h *htmlToSplit) renderAppend(c *html.Node) error {
+	raw, err := byteRender(c)
+	if err != nil {
+		return err
+	}
+	chunk := utility.EscapeHTMLcomplexCharsByte(raw)
+	if size := len(chunk); size > maxBody {
+		warning := fmt.Sprintf("(Documize warning: HTML render element ignored, size of %d exceeded maxBody of %d.)", size, maxBody)
+		log.Info(warning)
+		chunk = []byte("<p><b>" + warning + "</b></p>")
+	}
+	if total := len(h.thisSect.Body) + len(chunk); total > maxBody {
+		// plus one so that the new "-" section is part of the previous one
+		h.newSect("-", h.thisSect.Level+1)
+	}
+	h.thisSect.Body = append(h.thisSect.Body, chunk...)
+	return nil
+}
+
+func byteRender(n *html.Node) ([]byte, error) {
+	var b bytes.Buffer
+	err := html.Render(&b, n)
+	return b.Bytes(), err
+}
+
+func byteRenderChildren(n *html.Node) ([]byte, error) {
+	var b bytes.Buffer
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		err := html.Render(&b, c)
+		if err != nil {
+			return nil, err
+		}
+	}
+	return b.Bytes(), nil
+}
+
+// nodeContainsHeading reports whether n is an h1-h6 heading or has one anywhere
+// beneath it. Results are memoised in h.nodeCache, so each node is examined once.
+func (h *htmlToSplit) nodeContainsHeading(n *html.Node) bool {
+	if cached, seen := h.nodeCache[n]; seen {
+		return cached
+	}
+	// getLevel returns more than 1 only for the heading atoms H1..H6.
+	found := getLevel(n.DataAtom) > 1
+	// The recursive call caches each child's own result; stop at the first hit.
+	for child := n.FirstChild; !found && child != nil; child = child.NextSibling {
+		found = h.nodeContainsHeading(child)
+	}
+	h.nodeCache[n] = found
+	return found
+}
--- a/core/api/convert/html/html_test.go
+++ b/core/api/convert/html/html_test.go
@ -0,0 +1,384 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under 
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>. 
+//
+// https://documize.com
+
+package html_test
+
+import (
+	"strings"
+	"testing"
+)
+import api "github.com/documize/community/core/convapi"
+import "github.com/documize/community/core/api/convert/html"
+
+// b is the HTML fixture for TestHTML: a rendering of the "Markdown: Basics" page
+// followed by h4, h5 and h6 headings, with the h5 nested inside six div elements.
+const b string = `
+<h1>Markdown: Basics</h1>
+
+<ul id="ProjectSubmenu">
+    <li><a href="/projects/markdown/" title="Markdown Project Page">Main</a></li>
+    <li><a class="selected" title="Markdown Basics">Basics</a></li>
+    <li><a href="/projects/markdown/syntax" title="Markdown Syntax Documentation">Syntax</a></li>
+    <li><a href="/projects/markdown/license" title="Pricing and License Information">License</a></li>
+    <li><a href="/projects/markdown/dingus" title="Online Markdown Web Form">Dingus</a></li>
+</ul>
+
+<h2>Getting the Gist of Markdown's Formatting Syntax</h2>
+
+<p>This page offers a brief overview of what it's like to use Markdown.
+The <a href="/projects/markdown/syntax" title="Markdown Syntax">syntax page</a> provides complete, detailed documentation for
+every feature, but Markdown should be very easy to pick up simply by
+looking at a few examples of it in action. The examples on this page
+are written in a before/after style, showing example syntax and the
+HTML output produced by Markdown.</p>
+
+<p>It's also helpful to simply try Markdown out; the <a href="/projects/markdown/dingus" title="Markdown Dingus">Dingus</a> is a
+web application that allows you type your own Markdown-formatted text
+and translate it to XHTML.</p>
+
+<p><strong>Note:</strong> This document is itself written using Markdown; you
+can <a href="/projects/markdown/basics.text">see the source for it by adding '.text' to the URL</a>.</p>
+
+<h2>Paragraphs, Headers, Blockquotes</h2>
+
+<p>A paragraph is simply one or more consecutive lines of text, separated
+by one or more blank lines. (A blank line is any line that looks like a
+blank line -- a line containing nothing spaces or tabs is considered
+blank.) Normal paragraphs should not be intended with spaces or tabs.</p>
+
+<p>Markdown offers two styles of headers: <em>Setext</em> and <em>atx</em>.
+Setext-style headers for <code>&lt;h1&gt;</code> and <code>&lt;h2&gt;</code> are created by
+&quot;underlining&quot; with equal signs (<code>=</code>) and hyphens (<code>-</code>), respectively.
+To create an atx-style header, you put 1-6 hash marks (<code>#</code>) at the
+beginning of the line -- the number of hashes equals the resulting
+HTML header level.</p>
+
+<p>Blockquotes are indicated using email-style '<code>&gt;</code>' angle brackets.</p>
+
+<p>Markdown:</p>
+
+<pre><code>A First Level Header
+====================
+
+A Second Level Header
+---------------------
+
+Now is the time for all good men to come to
+the aid of their country. This is just a
+regular paragraph.
+
+The quick brown fox jumped over the lazy
+dog's back.
+
+### Header 3
+
+&gt; This is a blockquote.
+&gt; 
+&gt; This is the second paragraph in the blockquote.
+&gt;
+&gt; ## This is an H2 in a blockquote
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;h1&gt;A First Level Header&lt;/h1&gt;
+
+&lt;h2&gt;A Second Level Header&lt;/h2&gt;
+
+&lt;p&gt;Now is the time for all good men to come to
+the aid of their country. This is just a
+regular paragraph.&lt;/p&gt;
+
+&lt;p&gt;The quick brown fox jumped over the lazy
+dog's back.&lt;/p&gt;
+
+&lt;h3&gt;Header 3&lt;/h3&gt;
+
+&lt;blockquote&gt;
+    &lt;p&gt;This is a blockquote.&lt;/p&gt;
+
+    &lt;p&gt;This is the second paragraph in the blockquote.&lt;/p&gt;
+
+    &lt;h2&gt;This is an H2 in a blockquote&lt;/h2&gt;
+&lt;/blockquote&gt;
+</code></pre>
+
+<h3>Phrase Emphasis</h3>
+
+<p>Markdown uses asterisks and underscores to indicate spans of emphasis.</p>
+
+<p>Markdown:</p>
+
+<pre><code>Some of these words *are emphasized*.
+Some of these words _are emphasized also_.
+
+Use two asterisks for **strong emphasis**.
+Or, if you prefer, __use two underscores instead__.
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;p&gt;Some of these words &lt;em&gt;are emphasized&lt;/em&gt;.
+Some of these words &lt;em&gt;are emphasized also&lt;/em&gt;.&lt;/p&gt;
+
+&lt;p&gt;Use two asterisks for &lt;strong&gt;strong emphasis&lt;/strong&gt;.
+Or, if you prefer, &lt;strong&gt;use two underscores instead&lt;/strong&gt;.&lt;/p&gt;
+</code></pre>
+
+<h2>Lists</h2>
+
+<p>Unordered (bulleted) lists use asterisks, pluses, and hyphens (<code>*</code>,
+<code>+</code>, and <code>-</code>) as list markers. These three markers are
+interchangable; this:</p>
+
+<pre><code>*   Candy.
+*   Gum.
+*   Booze.
+</code></pre>
+
+<p>this:</p>
+
+<pre><code>+   Candy.
+   Gum.
+   Booze.
+</code></pre>
+
+<p>and this:</p>
+
+<pre><code>-   Candy.
+-   Gum.
+-   Booze.
+</code></pre>
+
+<p>all produce the same output:</p>
+
+<pre><code>&lt;ul&gt;
+&lt;li&gt;Candy.&lt;/li&gt;
+&lt;li&gt;Gum.&lt;/li&gt;
+&lt;li&gt;Booze.&lt;/li&gt;
+&lt;/ul&gt;
+</code></pre>
+
+<p>Ordered (numbered) lists use regular numbers, followed by periods, as
+list markers:</p>
+
+<pre><code>1.  Red
+2.  Green
+3.  Blue
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;ol&gt;
+&lt;li&gt;Red&lt;/li&gt;
+&lt;li&gt;Green&lt;/li&gt;
+&lt;li&gt;Blue&lt;/li&gt;
+&lt;/ol&gt;
+</code></pre>
+
+<p>If you put blank lines between items, you'll get <code>&lt;p&gt;</code> tags for the
+list item text. You can create multi-paragraph list items by indenting
+the paragraphs by 4 spaces or 1 tab:</p>
+
+<pre><code>*   A list item.
+
+    With multiple paragraphs.
+
+*   Another item in the list.
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;ul&gt;
+&lt;li&gt;&lt;p&gt;A list item.&lt;/p&gt;
+&lt;p&gt;With multiple paragraphs.&lt;/p&gt;&lt;/li&gt;
+&lt;li&gt;&lt;p&gt;Another item in the list.&lt;/p&gt;&lt;/li&gt;
+&lt;/ul&gt;
+</code></pre>
+
+<h3>Links</h3>
+
+<p>Markdown supports two styles for creating links: <em>inline</em> and
+<em>reference</em>. With both styles, you use square brackets to delimit the
+text you want to turn into a link.</p>
+
+<p>Inline-style links use parentheses immediately after the link text.
+For example:</p>
+
+<pre><code>This is an [example link](http://example.com/).
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;p&gt;This is an &lt;a href=&quot;http://example.com/&quot;&gt;
+example link&lt;/a&gt;.&lt;/p&gt;
+</code></pre>
+
+<p>Optionally, you may include a title attribute in the parentheses:</p>
+
+<pre><code>This is an [example link](http://example.com/ &quot;With a Title&quot;).
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;p&gt;This is an &lt;a href=&quot;http://example.com/&quot; title=&quot;With a Title&quot;&gt;
+example link&lt;/a&gt;.&lt;/p&gt;
+</code></pre>
+
+<p>Reference-style links allow you to refer to your links by names, which
+you define elsewhere in your document:</p>
+
+<pre><code>I get 10 times more traffic from [Google][1] than from
+[Yahoo][2] or [MSN][3].
+
+[1]: http://google.com/        &quot;Google&quot;
+[2]: http://search.yahoo.com/  &quot;Yahoo Search&quot;
+[3]: http://search.msn.com/    &quot;MSN Search&quot;
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;p&gt;I get 10 times more traffic from &lt;a href=&quot;http://google.com/&quot;
+title=&quot;Google&quot;&gt;Google&lt;/a&gt; than from &lt;a href=&quot;http://search.yahoo.com/&quot;
+title=&quot;Yahoo Search&quot;&gt;Yahoo&lt;/a&gt; or &lt;a href=&quot;http://search.msn.com/&quot;
+title=&quot;MSN Search&quot;&gt;MSN&lt;/a&gt;.&lt;/p&gt;
+</code></pre>
+
+<p>The title attribute is optional. Link names may contain letters,
+numbers and spaces, but are <em>not</em> case sensitive:</p>
+
+<pre><code>I start my morning with a cup of coffee and
+[The New York Times][NY Times].
+
+[ny times]: http://www.nytimes.com/
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;p&gt;I start my morning with a cup of coffee and
+&lt;a href=&quot;http://www.nytimes.com/&quot;&gt;The New York Times&lt;/a&gt;.&lt;/p&gt;
+</code></pre>
+
+<h3>Images</h3>
+
+<p>Image syntax is very much like link syntax.</p>
+
+<p>Inline (titles are optional):</p>
+
+<pre><code>![alt text](/path/to/img.jpg &quot;Title&quot;)
+</code></pre>
+
+<p>Reference-style:</p>
+
+<pre><code>![alt text][id]
+
+[id]: /path/to/img.jpg &quot;Title&quot;
+</code></pre>
+
+<p>Both of the above examples produce the same output:</p>
+
+<pre><code>&lt;img src=&quot;/path/to/img.jpg&quot; alt=&quot;alt text&quot; title=&quot;Title&quot; /&gt;
+</code></pre>
+
+<h3>Code</h3>
+
+<p>In a regular paragraph, you can create code span by wrapping text in
+backtick quotes. Any ampersands (<code>&amp;</code>) and angle brackets (<code>&lt;</code> or
+<code>&gt;</code>) will automatically be translated into HTML entities. This makes
+it easy to use Markdown to write about HTML example code:</p>
+
+<pre><code>I strongly recommend against using any "&lt;blink&gt;" tags.
+
+I wish SmartyPants used named entities like "&amp;mdash;""
+instead of decimal-encoded entites like "&amp;#8212;".
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;p&gt;I strongly recommend against using any
+&lt;code&gt;&amp;lt;blink&amp;gt;&lt;/code&gt; tags.&lt;/p&gt;
+
+&lt;p&gt;I wish SmartyPants used named entities like
+&lt;code&gt;&amp;amp;mdash;&lt;/code&gt; instead of decimal-encoded
+entites like &lt;code&gt;&amp;amp;#8212;&lt;/code&gt;.&lt;/p&gt;
+</code></pre>
+
+<p>To specify an entire block of pre-formatted code, indent every line of
+the block by 4 spaces or 1 tab. Just like with code spans, <code>&amp;</code>, <code>&lt;</code>,
+and <code>&gt;</code> characters will be escaped automatically.</p>
+
+<p>Markdown:</p>
+
+<pre><code>If you want your page to validate under XHTML 1.0 Strict,
+you've got to put paragraph tags in your blockquotes:
+
+    &lt;blockquote&gt;
+        &lt;p&gt;For example.&lt;/p&gt;
+    &lt;/blockquote&gt;
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;p&gt;If you want your page to validate under XHTML 1.0 Strict,
+you've got to put paragraph tags in your blockquotes:&lt;/p&gt;
+
+&lt;pre&gt;&lt;code&gt;&amp;lt;blockquote&amp;gt;
+    &amp;lt;p&amp;gt;For example.&amp;lt;/p&amp;gt;
+&amp;lt;/blockquote&amp;gt;
+&lt;/code&gt;&lt;/pre&gt;
+</code></pre>
+
+<h4>Header4</h4>
+<div><div><div><div><div><div>
+<h5>Header5</h5>Body 555.
+</div></div></div></div></div></div>
+<h6>Header6</h6>
+
+`
+
+func TestHTML(t *testing.T) {
+
+	req := &api.DocumentConversionRequest{}
+	res := &api.DocumentConversionResponse{}
+
+	err := html.SplitIfHTML(req, res)
+	if err != nil || len(res.PagesHTML) != 0 || len(res.Pages) != 0 || len(res.EmbeddedFiles) != 0 {
+		t.Error(err)
+		return
+	}
+
+	titleTooBig := []byte("<h1>")
+	for i := 0; i < 2048; i++ {
+		titleTooBig = append(titleTooBig, []byte("title too long ")...)
+	}
+	titleTooBig = append(titleTooBig, []byte("</h1>")...)
+	req = &api.DocumentConversionRequest{}
+	res = &api.DocumentConversionResponse{PagesHTML: titleTooBig}
+	err = html.SplitIfHTML(req, res)
+	if err != nil || len(res.Pages[0].Title) > 2000 {
+		t.Error(err)
+		return
+	}
+
+	req = &api.DocumentConversionRequest{}
+	res = &api.DocumentConversionResponse{PagesHTML: []byte(b)}
+	err = html.SplitIfHTML(req, res)
+	if err != nil {
+		t.Error(err)
+		return
+	}
+	//for p, pg := range res.Pages {
+	//	t.Logf("%d %d %d %s", p, pg.Level, len(pg.Body), pg.Title)
+	//}
+	if !strings.HasPrefix(res.Pages[10].Title, "Header5") ||
+		!strings.HasPrefix(string(res.Pages[10].Body), "Body 555.") {
+		t.Errorf("wrong page ten title: `%s` body: `%s`", res.Pages[10].Title, string(res.Pages[10].Body))
+	}
+
+}