1
0
Fork 0
mirror of https://github.com/documize/community.git synced 2025-07-24 07:39:43 +02:00

restructure directories

This commit is contained in:
Elliott Stoneham 2016-07-20 15:58:37 +01:00
parent 7e4ed6545b
commit a2ce777762
159 changed files with 320 additions and 323 deletions

View file

@ -0,0 +1,10 @@
How the HTML conversion works
=============================
Uses the "html" package from the "golang.org/x/net" repository to parse the HTML into a tree,
then walks the tree using processChildren() to build a series of sections, each with a heading as its title and the HTML that follows it as the body.
Importantly, if a heading is within some other structure, that other structure is ignored in order to get the heading into the list. This seems to mostly work well, but may have some unintended side-effects.
On the subject of unintended side-effects, or rather their avoidance, "script" HTML tags and their contents are not passed through.

View file

@ -0,0 +1,13 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
// Package html converts HTML files into Documize pages, splitting the document into one section per heading.
package html

View file

@ -0,0 +1,244 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package html
import (
	"bytes"
	"fmt"
	"strings"
	"unicode/utf8"

	api "github.com/documize/community/core/convapi"
	"github.com/documize/community/core/log"
	"github.com/documize/community/core/utility"
	"golang.org/x/net/context"
	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)
// maxTitle is the upper bound, in bytes, on a page title; newSect moves any
// text beyond it into the page body.
const maxTitle = 2000 // NOTE: must be the same length as database page.title
// maxBody is the upper bound, in bytes, on a page body; renderAppend starts a
// continuation section rather than grow a body past it, and replaces any single
// rendered element larger than it with a warning.
const maxBody = 4000000 // NOTE: must be less than the mysql max_allowed_packet limit, amongst other values
// htmlToSplit holds the working state used while splitting one HTML document
// into pages.
type htmlToSplit struct {
// CFR is the response being filled in; completed sections are appended to CFR.Pages.
CFR *api.DocumentConversionResponse
// thisSect is the section currently being accumulated.
thisSect api.Page
// nodeCache memoises the results of nodeContainsHeading, keyed by node.
nodeCache map[*html.Node]bool
}
// Convert provides the standard interface for conversion of an HTML document.
//
// in must be a non-nil *api.DocumentConversionRequest. The function returns a
// *api.DocumentConversionResponse whose PagesHTML is the request's Filedata,
// so it is effectively a no-op; the splitting into pages is done separately by
// SplitIfHTML. ctx is not used.
//
// An error is returned (instead of panicking) when in has any other type or is
// a nil request pointer.
func Convert(ctx context.Context, in interface{}) (interface{}, error) {
	req, ok := in.(*api.DocumentConversionRequest)
	if !ok || req == nil {
		return nil, fmt.Errorf("html convert: expected non-nil *api.DocumentConversionRequest, got %T", in)
	}
	return &api.DocumentConversionResponse{PagesHTML: req.Filedata}, nil
}
// SplitIfHTML breaks the HTML held in res.PagesHTML into pages, which are
// appended to res.Pages. When res.PagesHTML is empty there is nothing to split
// and the response is left untouched.
func SplitIfHTML(req *api.DocumentConversionRequest, res *api.DocumentConversionResponse) error {
	if len(res.PagesHTML) > 0 {
		splitter := &htmlToSplit{
			CFR:       res,
			nodeCache: map[*html.Node]bool{},
		}
		// Debugging aid: once testableSplit has returned, splitter.CFR.Pages can
		// be ranged over to print each page's Level, Title and len(Body).
		return splitter.testableSplit(req, res)
	}
	return nil
}
// testableSplit parses response.PagesHTML and turns every <body> element found
// directly under an <html> element into a run of pages on h.CFR.Pages.
//
// It has a pointer receiver so that test code can inspect the data structures
// it builds. Only a parse failure or a missing document node is returned as an
// error; a failure while walking a body is recorded in h.CFR.Err and the
// section built so far is still appended.
func (h *htmlToSplit) testableSplit(request *api.DocumentConversionRequest,
	response *api.DocumentConversionResponse) error {
	root, parseErr := html.Parse(bytes.NewReader(response.PagesHTML))
	if parseErr != nil {
		return parseErr
	}
	if root.Type != html.DocumentNode {
		return fmt.Errorf("no HTML document node")
	}

	for top := root.FirstChild; top != nil; top = top.NextSibling {
		if top.Type != html.ElementNode || top.DataAtom != atom.Html {
			continue
		}
		for part := top.FirstChild; part != nil; part = part.NextSibling {
			if part.Type != html.ElementNode || part.DataAtom != atom.Body {
				continue
			}
			// The lead section is titled after the file and collects
			// everything that precedes the first heading.
			h.thisSect = api.Page{
				Level: 1,
				Title: utility.BeautifyFilename(request.Filename),
				Body:  []byte(``),
			}
			if walkErr := h.processChildren(part); walkErr != nil {
				h.CFR.Err = walkErr.Error()
			}
			h.CFR.Pages = append(h.CFR.Pages, h.thisSect)
		}
	}
	return nil
}
// getLevel maps a heading atom to its page level: H1 is 2, H2 is 3, and so on
// up to H6 which is 7. Any atom that is not a heading yields 1, which callers
// use to mean "not a heading".
func getLevel(at atom.Atom) uint64 {
	switch at {
	case atom.H1:
		return 2
	case atom.H2:
		return 3
	case atom.H3:
		return 4
	case atom.H4:
		return 5
	case atom.H5:
		return 6
	case atom.H6:
		return 7
	}
	return 1
}
// processChildren walks the direct children of bdy in document order and
// dispatches each one: heading elements open a new section, other elements go
// through renderNonHeading, and non-element nodes (text, comments, ...) are
// appended to the current section. The first error stops the walk.
func (h *htmlToSplit) processChildren(bdy *html.Node) error {
	for node := bdy.FirstChild; node != nil; node = node.NextSibling {
		var stepErr error
		switch {
		case node.Type != html.ElementNode:
			stepErr = h.renderAppend(node)
		case getLevel(node.DataAtom) > 1:
			stepErr = h.renderHeading(node, getLevel(node.DataAtom))
		default:
			stepErr = h.renderNonHeading(node)
		}
		if stepErr != nil {
			return stepErr
		}
	}
	return nil
}
// stripZeroWidthSpaces returns str with every zero-width space (U+200B)
// removed. All other runes are copied through in order; as a consequence of
// iterating by rune, bytes that are not valid UTF-8 come out as U+FFFD.
func stripZeroWidthSpaces(str string) string {
	const zeroWidthSpace = '\u200B'

	// A builder keeps this linear; concatenating strings rune by rune would
	// copy the accumulated result on every iteration.
	var out strings.Builder
	out.Grow(len(str))
	for _, r := range str {
		if r != zeroWidthSpace {
			out.WriteRune(r)
		}
	}
	return out.String()
}
// renderHeading starts a new section for the heading element c at the given
// level. The title is the text of the heading's rendered children with
// zero-width spaces removed. A heading whose text is empty or only whitespace
// is dropped: no section is started and nothing is appended.
func (h *htmlToSplit) renderHeading(c *html.Node, level uint64) error {
	inner, err := byteRenderChildren(c) // HTML inside the heading tag
	if err != nil {
		return err
	}
	text, err := utility.HTML(string(inner)).Text(false) // heading as text
	if err != nil {
		return err
	}
	if title := stripZeroWidthSpaces(text); strings.TrimSpace(title) != "" {
		h.newSect(title, level)
	}
	return nil
}
// newSect closes the section currently being built by appending it to
// h.CFR.Pages, then starts a fresh one with the given title and level.
//
// A title longer than maxTitle bytes is truncated and the overflow becomes the
// start of the new section's body. The cut is moved back to a UTF-8 rune
// boundary so that a multi-byte character is never split between title and
// body, which would leave invalid UTF-8 in both.
//
// The title is stored as given: it used to be passed through
// utility.EscapeHTMLcomplexChars, which was removed to avoid double-escaping.
func (h *htmlToSplit) newSect(tstr string, level uint64) {
	h.CFR.Pages = append(h.CFR.Pages, h.thisSect)

	title, overflow := tstr, ""
	if len(title) > maxTitle {
		cut := maxTitle
		// Step back over at most UTFMax-1 continuation bytes; bounding the
		// walk keeps malformed input from shrinking the title arbitrarily.
		for back := 0; back < utf8.UTFMax-1 && cut > 0 && !utf8.RuneStart(title[cut]); back++ {
			cut--
		}
		title, overflow = title[:cut], title[cut:]
	}

	h.thisSect = api.Page{
		Level: level,
		Title: title,
		Body:  []byte(overflow),
	}
}
// renderNonHeading handles an element that is not itself a heading. If no
// heading is nested anywhere inside it, the element is rendered whole into the
// current section. Otherwise the element's own tag is skipped and its children
// are processed individually so that the nested headings can open sections.
func (h *htmlToSplit) renderNonHeading(c *html.Node) error {
	if !h.nodeContainsHeading(c) {
		return h.renderAppend(c)
	}
	return h.processChildren(c)
}
// renderAppend renders node c to HTML, escapes it with
// utility.EscapeHTMLcomplexCharsByte and appends the result to the body of the
// current section.
//
// Two size limits apply, both based on maxBody: a single rendered element
// larger than maxBody is replaced by a logged warning paragraph, and if the
// current body would grow past maxBody a continuation section titled "-" is
// started first, one level below the current section.
func (h *htmlToSplit) renderAppend(c *html.Node) error {
	rendered, err := byteRender(c)
	if err != nil {
		return err
	}

	escaped := utility.EscapeHTMLcomplexCharsByte(rendered)
	if size := len(escaped); size > maxBody {
		warning := fmt.Sprintf("(Documize warning: HTML render element ignored, size of %d exceeded maxBody of %d.)", size, maxBody)
		log.Info(warning)
		escaped = []byte("<p><b>" + warning + "</b></p>")
	}

	if len(h.thisSect.Body)+len(escaped) > maxBody {
		// One level deeper, so the "-" section reads as part of the previous one.
		h.newSect("-", h.thisSect.Level+1)
	}
	h.thisSect.Body = append(h.thisSect.Body, escaped...)
	return nil
}
// byteRender renders node n, including its own tag, to HTML bytes. Whatever
// was written before a render error is returned together with that error.
func byteRender(n *html.Node) ([]byte, error) {
	out := new(bytes.Buffer)
	renderErr := html.Render(out, n)
	return out.Bytes(), renderErr
}
// byteRenderChildren renders each direct child of n, in order, into a single
// HTML byte slice; n's own tag is not included. On the first render error it
// returns nil and that error.
func byteRenderChildren(n *html.Node) ([]byte, error) {
	out := &bytes.Buffer{}
	child := n.FirstChild
	for child != nil {
		if err := html.Render(out, child); err != nil {
			return nil, err
		}
		child = child.NextSibling
	}
	return out.Bytes(), nil
}
// nodeContainsHeading reports whether n is an h1-h6 element or has one
// anywhere among its descendants. Answers are memoised in h.nodeCache, so each
// node's subtree is searched at most once; the search of a node's children
// stops at the first child that contains a heading.
func (h *htmlToSplit) nodeContainsHeading(n *html.Node) bool {
	if cached, seen := h.nodeCache[n]; seen {
		return cached
	}

	found := false
	switch n.DataAtom {
	case atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
		found = true
	default:
		// The recursive call records the child's own answer in the cache.
		for child := n.FirstChild; child != nil && !found; child = child.NextSibling {
			found = h.nodeContainsHeading(child)
		}
	}

	h.nodeCache[n] = found
	return found
}

View file

@ -0,0 +1,384 @@
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package html_test
import (
"strings"
"testing"
)
import api "github.com/documize/community/core/convapi"
import "github.com/documize/community/core/api/convert/html"
const b string = `
<h1>Markdown: Basics</h1>
<ul id="ProjectSubmenu">
<li><a href="/projects/markdown/" title="Markdown Project Page">Main</a></li>
<li><a class="selected" title="Markdown Basics">Basics</a></li>
<li><a href="/projects/markdown/syntax" title="Markdown Syntax Documentation">Syntax</a></li>
<li><a href="/projects/markdown/license" title="Pricing and License Information">License</a></li>
<li><a href="/projects/markdown/dingus" title="Online Markdown Web Form">Dingus</a></li>
</ul>
<h2>Getting the Gist of Markdown's Formatting Syntax</h2>
<p>This page offers a brief overview of what it's like to use Markdown.
The <a href="/projects/markdown/syntax" title="Markdown Syntax">syntax page</a> provides complete, detailed documentation for
every feature, but Markdown should be very easy to pick up simply by
looking at a few examples of it in action. The examples on this page
are written in a before/after style, showing example syntax and the
HTML output produced by Markdown.</p>
<p>It's also helpful to simply try Markdown out; the <a href="/projects/markdown/dingus" title="Markdown Dingus">Dingus</a> is a
web application that allows you type your own Markdown-formatted text
and translate it to XHTML.</p>
<p><strong>Note:</strong> This document is itself written using Markdown; you
can <a href="/projects/markdown/basics.text">see the source for it by adding '.text' to the URL</a>.</p>
<h2>Paragraphs, Headers, Blockquotes</h2>
<p>A paragraph is simply one or more consecutive lines of text, separated
by one or more blank lines. (A blank line is any line that looks like a
blank line -- a line containing nothing spaces or tabs is considered
blank.) Normal paragraphs should not be intended with spaces or tabs.</p>
<p>Markdown offers two styles of headers: <em>Setext</em> and <em>atx</em>.
Setext-style headers for <code>&lt;h1&gt;</code> and <code>&lt;h2&gt;</code> are created by
&quot;underlining&quot; with equal signs (<code>=</code>) and hyphens (<code>-</code>), respectively.
To create an atx-style header, you put 1-6 hash marks (<code>#</code>) at the
beginning of the line -- the number of hashes equals the resulting
HTML header level.</p>
<p>Blockquotes are indicated using email-style '<code>&gt;</code>' angle brackets.</p>
<p>Markdown:</p>
<pre><code>A First Level Header
====================
A Second Level Header
---------------------
Now is the time for all good men to come to
the aid of their country. This is just a
regular paragraph.
The quick brown fox jumped over the lazy
dog's back.
### Header 3
&gt; This is a blockquote.
&gt;
&gt; This is the second paragraph in the blockquote.
&gt;
&gt; ## This is an H2 in a blockquote
</code></pre>
<p>Output:</p>
<pre><code>&lt;h1&gt;A First Level Header&lt;/h1&gt;
&lt;h2&gt;A Second Level Header&lt;/h2&gt;
&lt;p&gt;Now is the time for all good men to come to
the aid of their country. This is just a
regular paragraph.&lt;/p&gt;
&lt;p&gt;The quick brown fox jumped over the lazy
dog's back.&lt;/p&gt;
&lt;h3&gt;Header 3&lt;/h3&gt;
&lt;blockquote&gt;
&lt;p&gt;This is a blockquote.&lt;/p&gt;
&lt;p&gt;This is the second paragraph in the blockquote.&lt;/p&gt;
&lt;h2&gt;This is an H2 in a blockquote&lt;/h2&gt;
&lt;/blockquote&gt;
</code></pre>
<h3>Phrase Emphasis</h3>
<p>Markdown uses asterisks and underscores to indicate spans of emphasis.</p>
<p>Markdown:</p>
<pre><code>Some of these words *are emphasized*.
Some of these words _are emphasized also_.
Use two asterisks for **strong emphasis**.
Or, if you prefer, __use two underscores instead__.
</code></pre>
<p>Output:</p>
<pre><code>&lt;p&gt;Some of these words &lt;em&gt;are emphasized&lt;/em&gt;.
Some of these words &lt;em&gt;are emphasized also&lt;/em&gt;.&lt;/p&gt;
&lt;p&gt;Use two asterisks for &lt;strong&gt;strong emphasis&lt;/strong&gt;.
Or, if you prefer, &lt;strong&gt;use two underscores instead&lt;/strong&gt;.&lt;/p&gt;
</code></pre>
<h2>Lists</h2>
<p>Unordered (bulleted) lists use asterisks, pluses, and hyphens (<code>*</code>,
<code>+</code>, and <code>-</code>) as list markers. These three markers are
interchangable; this:</p>
<pre><code>* Candy.
* Gum.
* Booze.
</code></pre>
<p>this:</p>
<pre><code>+ Candy.
+ Gum.
+ Booze.
</code></pre>
<p>and this:</p>
<pre><code>- Candy.
- Gum.
- Booze.
</code></pre>
<p>all produce the same output:</p>
<pre><code>&lt;ul&gt;
&lt;li&gt;Candy.&lt;/li&gt;
&lt;li&gt;Gum.&lt;/li&gt;
&lt;li&gt;Booze.&lt;/li&gt;
&lt;/ul&gt;
</code></pre>
<p>Ordered (numbered) lists use regular numbers, followed by periods, as
list markers:</p>
<pre><code>1. Red
2. Green
3. Blue
</code></pre>
<p>Output:</p>
<pre><code>&lt;ol&gt;
&lt;li&gt;Red&lt;/li&gt;
&lt;li&gt;Green&lt;/li&gt;
&lt;li&gt;Blue&lt;/li&gt;
&lt;/ol&gt;
</code></pre>
<p>If you put blank lines between items, you'll get <code>&lt;p&gt;</code> tags for the
list item text. You can create multi-paragraph list items by indenting
the paragraphs by 4 spaces or 1 tab:</p>
<pre><code>* A list item.
With multiple paragraphs.
* Another item in the list.
</code></pre>
<p>Output:</p>
<pre><code>&lt;ul&gt;
&lt;li&gt;&lt;p&gt;A list item.&lt;/p&gt;
&lt;p&gt;With multiple paragraphs.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;Another item in the list.&lt;/p&gt;&lt;/li&gt;
&lt;/ul&gt;
</code></pre>
<h3>Links</h3>
<p>Markdown supports two styles for creating links: <em>inline</em> and
<em>reference</em>. With both styles, you use square brackets to delimit the
text you want to turn into a link.</p>
<p>Inline-style links use parentheses immediately after the link text.
For example:</p>
<pre><code>This is an [example link](http://example.com/).
</code></pre>
<p>Output:</p>
<pre><code>&lt;p&gt;This is an &lt;a href=&quot;http://example.com/&quot;&gt;
example link&lt;/a&gt;.&lt;/p&gt;
</code></pre>
<p>Optionally, you may include a title attribute in the parentheses:</p>
<pre><code>This is an [example link](http://example.com/ &quot;With a Title&quot;).
</code></pre>
<p>Output:</p>
<pre><code>&lt;p&gt;This is an &lt;a href=&quot;http://example.com/&quot; title=&quot;With a Title&quot;&gt;
example link&lt;/a&gt;.&lt;/p&gt;
</code></pre>
<p>Reference-style links allow you to refer to your links by names, which
you define elsewhere in your document:</p>
<pre><code>I get 10 times more traffic from [Google][1] than from
[Yahoo][2] or [MSN][3].
[1]: http://google.com/ &quot;Google&quot;
[2]: http://search.yahoo.com/ &quot;Yahoo Search&quot;
[3]: http://search.msn.com/ &quot;MSN Search&quot;
</code></pre>
<p>Output:</p>
<pre><code>&lt;p&gt;I get 10 times more traffic from &lt;a href=&quot;http://google.com/&quot;
title=&quot;Google&quot;&gt;Google&lt;/a&gt; than from &lt;a href=&quot;http://search.yahoo.com/&quot;
title=&quot;Yahoo Search&quot;&gt;Yahoo&lt;/a&gt; or &lt;a href=&quot;http://search.msn.com/&quot;
title=&quot;MSN Search&quot;&gt;MSN&lt;/a&gt;.&lt;/p&gt;
</code></pre>
<p>The title attribute is optional. Link names may contain letters,
numbers and spaces, but are <em>not</em> case sensitive:</p>
<pre><code>I start my morning with a cup of coffee and
[The New York Times][NY Times].
[ny times]: http://www.nytimes.com/
</code></pre>
<p>Output:</p>
<pre><code>&lt;p&gt;I start my morning with a cup of coffee and
&lt;a href=&quot;http://www.nytimes.com/&quot;&gt;The New York Times&lt;/a&gt;.&lt;/p&gt;
</code></pre>
<h3>Images</h3>
<p>Image syntax is very much like link syntax.</p>
<p>Inline (titles are optional):</p>
<pre><code>![alt text](/path/to/img.jpg &quot;Title&quot;)
</code></pre>
<p>Reference-style:</p>
<pre><code>![alt text][id]
[id]: /path/to/img.jpg &quot;Title&quot;
</code></pre>
<p>Both of the above examples produce the same output:</p>
<pre><code>&lt;img src=&quot;/path/to/img.jpg&quot; alt=&quot;alt text&quot; title=&quot;Title&quot; /&gt;
</code></pre>
<h3>Code</h3>
<p>In a regular paragraph, you can create code span by wrapping text in
backtick quotes. Any ampersands (<code>&amp;</code>) and angle brackets (<code>&lt;</code> or
<code>&gt;</code>) will automatically be translated into HTML entities. This makes
it easy to use Markdown to write about HTML example code:</p>
<pre><code>I strongly recommend against using any "&lt;blink&gt;" tags.
I wish SmartyPants used named entities like "&amp;mdash;""
instead of decimal-encoded entites like "&amp;#8212;".
</code></pre>
<p>Output:</p>
<pre><code>&lt;p&gt;I strongly recommend against using any
&lt;code&gt;&amp;lt;blink&amp;gt;&lt;/code&gt; tags.&lt;/p&gt;
&lt;p&gt;I wish SmartyPants used named entities like
&lt;code&gt;&amp;amp;mdash;&lt;/code&gt; instead of decimal-encoded
entites like &lt;code&gt;&amp;amp;#8212;&lt;/code&gt;.&lt;/p&gt;
</code></pre>
<p>To specify an entire block of pre-formatted code, indent every line of
the block by 4 spaces or 1 tab. Just like with code spans, <code>&amp;</code>, <code>&lt;</code>,
and <code>&gt;</code> characters will be escaped automatically.</p>
<p>Markdown:</p>
<pre><code>If you want your page to validate under XHTML 1.0 Strict,
you've got to put paragraph tags in your blockquotes:
&lt;blockquote&gt;
&lt;p&gt;For example.&lt;/p&gt;
&lt;/blockquote&gt;
</code></pre>
<p>Output:</p>
<pre><code>&lt;p&gt;If you want your page to validate under XHTML 1.0 Strict,
you've got to put paragraph tags in your blockquotes:&lt;/p&gt;
&lt;pre&gt;&lt;code&gt;&amp;lt;blockquote&amp;gt;
&amp;lt;p&amp;gt;For example.&amp;lt;/p&amp;gt;
&amp;lt;/blockquote&amp;gt;
&lt;/code&gt;&lt;/pre&gt;
</code></pre>
<h4>Header4</h4>
<div><div><div><div><div><div>
<h5>Header5</h5>Body 555.
</div></div></div></div></div></div>
<h6>Header6</h6>
`
// TestHTML exercises html.SplitIfHTML in three scenarios:
//   - an empty response, which must be left untouched;
//   - a single heading far longer than the title limit, where no resulting
//     page may carry an over-long title;
//   - the sample document b, where the <h5> nested inside six <div>s must
//     still surface as its own page with the text that follows it as body.
func TestHTML(t *testing.T) {
	// Mirrors the title limit enforced by the package under test.
	const titleLimit = 2000

	req := &api.DocumentConversionRequest{}
	res := &api.DocumentConversionResponse{}
	if err := html.SplitIfHTML(req, res); err != nil {
		t.Fatalf("empty input: unexpected error: %v", err)
	}
	if len(res.PagesHTML) != 0 || len(res.Pages) != 0 || len(res.EmbeddedFiles) != 0 {
		t.Fatalf("empty input: response modified: %d bytes of HTML, %d pages, %d embedded files",
			len(res.PagesHTML), len(res.Pages), len(res.EmbeddedFiles))
	}

	titleTooBig := []byte("<h1>" + strings.Repeat("title too long ", 2048) + "</h1>")
	req = &api.DocumentConversionRequest{}
	res = &api.DocumentConversionResponse{PagesHTML: titleTooBig}
	if err := html.SplitIfHTML(req, res); err != nil {
		t.Fatalf("oversized title: unexpected error: %v", err)
	}
	if len(res.Pages) == 0 {
		t.Fatal("oversized title: no pages produced")
	}
	// Check every page: the first page is the lead section named after the
	// file, so the heading under test is not at index 0.
	for i, pg := range res.Pages {
		if len(pg.Title) > titleLimit {
			t.Fatalf("oversized title: page %d title is %d bytes, limit is %d", i, len(pg.Title), titleLimit)
		}
	}

	req = &api.DocumentConversionRequest{}
	res = &api.DocumentConversionResponse{PagesHTML: []byte(b)}
	if err := html.SplitIfHTML(req, res); err != nil {
		t.Fatalf("sample document: unexpected error: %v", err)
	}
	//for p, pg := range res.Pages {
	//	t.Logf("%d %d %d %s", p, pg.Level, len(pg.Body), pg.Title)
	//}
	if len(res.Pages) <= 10 {
		t.Fatalf("sample document: got %d pages, want more than 10", len(res.Pages))
	}
	if !strings.HasPrefix(res.Pages[10].Title, "Header5") ||
		!strings.HasPrefix(string(res.Pages[10].Body), "Body 555.") {
		t.Errorf("wrong page ten title: `%s` body: `%s`", res.Pages[10].Title, string(res.Pages[10].Body))
	}
}