restructure directories

2025-07-24 15:49:44 +02:00 · 2016-07-20 15:58:37 +01:00 · 2016-07-20 15:58:37 +01:00 · a2ce777762
commit a2ce777762
parent 7e4ed6545b
159 changed files with 320 additions and 323 deletions
--- a/core/api/convert/apidocumizecom/init.go
+++ b/core/api/convert/apidocumizecom/init.go
@ -0,0 +1,48 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under 
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>. 
+//
+// https://documize.com
+
+package apidocumizecom
+
+import (
+	"crypto/tls"
+	"errors"
+	"net/http"
+
+	"github.com/documize/community/core/api/request"
+)
+
+func endPoint() string {
+	r := request.ConfigString("LICENSE", "endpoint")
+	if r != "" {
+		return r
+	}
+	return "https://api.documize.com"
+}
+
+// token returns the "token" value of the LICENSE configuration.
+// It returns an error when that value is empty.
+func token() (string, error) {
+	if tok := request.ConfigString("LICENSE", "token"); tok != "" {
+		// TODO more validation here
+		return tok, nil
+	}
+	return "", errors.New("Documize token is empty")
+}
+
+// transport is the package-level HTTP transport.
+// NOTE(review): InsecureSkipVerify is true, so the server certificate is not
+// verified and the TLS peer is not authenticated; see the TODO below.
+var transport = &http.Transport{
+	TLSClientConfig: &tls.Config{
+		InsecureSkipVerify: true, // TODO should be glick.InsecureSkipVerifyTLS (from -insecure flag) but get error: x509: certificate signed by unknown authority
+	}}
+
+// CheckToken returns the error reported by token(), which is non-nil when the
+// Documize LICENSE token is empty.
+func CheckToken() error {
+	if _, err := token(); err != nil {
+		return err
+	}
+	return nil
+}
--- a/core/api/convert/apidocumizecom/msword.go
+++ b/core/api/convert/apidocumizecom/msword.go
@ -0,0 +1,72 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under 
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>. 
+//
+// https://documize.com
+
+package apidocumizecom
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"path/filepath"
+
+	api "github.com/documize/community/core/convapi"
+
+	"golang.org/x/net/context"
+)
+
+// Msword is an empty type that provides a peg to hang the Convert method on.
+type Msword struct{}
+
+// Convert sends the conversion request r, encoded as JSON, to the "/api/word"
+// path of the configured end point and decodes the JSON answer into reply.
+//
+// It returns an error when the request cannot be encoded, when no LICENSE
+// token is configured, when the HTTP request fails, when the response body
+// cannot be decoded, or when the server answers with a status other than
+// 200 OK without reporting an error of its own in reply.Err.
+func (file *Msword) Convert(r api.DocumentConversionRequest, reply *api.DocumentConversionResponse) error {
+	byts, err := json.Marshal(r)
+	if err != nil {
+		return err
+	}
+	base := filepath.Base(r.Filename)
+	fmt.Println("Starting conversion of document: ", base)
+
+	client := &http.Client{Transport: transport}
+
+	tok, err := token()
+	if err != nil {
+		return err
+	}
+
+	// NOTE(review): the token is placed in the query string unescaped.
+	resp, err := client.Post(endPoint()+"/api/word?token="+tok, "application/json", bytes.NewReader(byts))
+	if err != nil {
+		return err
+	}
+	defer func() {
+		if e := resp.Body.Close(); e != nil {
+			fmt.Println("resp.Body.Close error: " + e.Error())
+		}
+	}()
+
+	fmt.Println("Finished converting document: ", base)
+
+	if err = json.NewDecoder(resp.Body).Decode(reply); err != nil {
+		if resp.StatusCode != http.StatusOK {
+			// An undecodable body on a failed request: report the status too.
+			return fmt.Errorf("conversion service returned %s: %v", resp.Status, err)
+		}
+		return err
+	}
+	// A failed request must not pass as success just because its body was
+	// decodable; when reply.Err is set the caller already has the reason.
+	if resp.StatusCode != http.StatusOK && reply.Err == "" {
+		return fmt.Errorf("conversion service returned %s", resp.Status)
+	}
+	return nil
+}
+
+// MSwordConvert provides the standard interface for conversion of a MS-Word document.
+// All the function does is return a pointer to api.DocumentConversionResponse with
+// PagesHTML set to the given (*api.DocumentConversionRequest).Filedata converted by the Documize server.
+func MSwordConvert(ctx context.Context, in interface{}) (interface{}, error) {
+	var msw Msword
+	dcr := in.(*api.DocumentConversionRequest)
+	rep := new(api.DocumentConversionResponse)
+	err := msw.Convert(*dcr, rep)
+	return rep, err
+}
--- a/core/api/convert/convert.go
+++ b/core/api/convert/convert.go
@ -0,0 +1,124 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under 
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>. 
+//
+// https://documize.com
+
+// Package convert provides the gateway to document conversion native and plugin functionality, both in and out of the system.
+package convert
+
+import (
+	"errors"
+	"github.com/documize/community/core/api/convert/excerpt"
+	"github.com/documize/community/core/api/convert/html"
+	"github.com/documize/community/core/api/plugins"
+api	"github.com/documize/community/core/convapi"
+	"github.com/documize/community/core/utility"
+
+	"golang.org/x/net/context"
+)
+
+// Convert provides the entry-point into the document conversion process.
+func Convert(ctx context.Context, xtn string, fileRequest *api.DocumentConversionRequest) (*api.DocumentConversionResponse, error) {
+	fileRequest.Token = plugins.Lib.Token("Convert", xtn)
+	fileResultI, err := plugins.Lib.Run(ctx, "Convert", xtn, fileRequest)
+	if err != nil {
+		return nil, err
+	}
+	fileResult, ok := fileResultI.(*api.DocumentConversionResponse)
+	if !ok {
+		return nil, errors.New("interface conversion: interface {} is nil, not *api.ConversionFileResponse")
+	}
+	if fileResult.Err != "" {
+		return nil, errors.New(fileResult.Err)
+	}
+
+	err = html.SplitIfHTML(fileRequest, fileResult)
+	if err != nil {
+		return nil, err
+	}
+
+	/* TODO add title & body santization that keeps the images & table formatting
+	for p, pg := range fileResult.Pages {
+		fileResult.Pages[p].Title = titlePolicy.Sanitize(pg.Title)
+		fileResult.Pages[p].Body = bodyPolicy.SanitizeBytes(pg.Body)
+	}
+	*/
+
+	if fileResult.Excerpt != "" {
+		//fmt.Println("DEBUG supplied excerpt: " + fileResult.Excerpt)
+	} else {
+		titleWds := []string{}
+		bodyWds := []string{}
+		for p := range fileResult.Pages {
+			var wds []string
+			var err error
+			if p > 0 { // title 0 is already the title of the document
+				wds, _, err = utility.Words(utility.HTML(fileResult.Pages[p].Title), 0, false)
+				if err != nil {
+					return nil, err
+				}
+				titleWds = append(titleWds, wds...)
+				titleWds = append(titleWds, ".")
+			}
+			wds, _, err = utility.Words(utility.HTML(string(fileResult.Pages[p].Body)), 0, false)
+			if err != nil {
+				return nil, err
+			}
+			bodyWds = append(bodyWds, wds...)
+			bodyWds = append(bodyWds, ".")
+		}
+		fileResult.Excerpt = excerpt.Excerpt(titleWds, bodyWds)
+	}
+
+	return fileResult, nil
+}
+
+/* TODO add sanitisation for body & title HTML
+var titlePolicy, bodyPolicy *bluemonday.Policy
+
+func init() {
+	policy := bluemonday.UGCPolicy()
+	policy.RequireNoFollowOnLinks(true)
+
+	// URLs must be parseable by net/url.Parse()
+	policy.RequireParseableURLs(true)
+	policy.AllowRelativeURLs(false)
+	policy.AllowURLSchemes("http", "https")
+
+	// replacement below for:	policy.AllowDataURIImages()
+	// Supply a function to validate images contained within data URI
+	policy.AllowURLSchemeWithCustomPolicy(
+		"data",
+		func(url *url.URL) (allowUrl bool) {
+			if url.RawQuery != "" || url.Fragment != "" {
+				return false
+			}
+
+			//	matched := dataURIImagePrefix.FindString(url.Opaque)
+			//	if matched == "" {
+			//		return false
+			//	}
+
+			//	_, err := base64.StdEncoding.DecodeString(url.Opaque[len(matched):])
+			//	if err != nil {
+			//		return false
+			//	}
+
+			return true
+		})
+	policy.AllowImages()
+
+	// TODO remove links to #tags
+	// TODO allow DataURI of image/* for LibreOffice ppt output
+
+	bodyPolicy = policy
+
+	titlePolicy = bluemonday.StrictPolicy()
+}
+*/
--- a/core/api/convert/convert_test.go
+++ b/core/api/convert/convert_test.go
@ -0,0 +1,155 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>.
+//
+// https://documize.com
+
+package convert_test
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/documize/community/core/api/convert"
+	"github.com/documize/community/core/api/plugins"
+	api "github.com/documize/community/core/convapi"
+	"github.com/documize/community/core/log"
+
+	"golang.org/x/net/context"
+)
+
+// TestConvert converts the yorkweb HTML fixture with the built-in "html"
+// converter and checks the resulting pages and excerpt, then checks that an
+// unknown extension produces an error.
+func TestConvert(t *testing.T) {
+	plugins.PluginFile = "" // no file as html is built-in
+	// The outcome of LibSetup is deliberately not asserted; the original check
+	// is kept below, disabled.
+	//   t.Error("did not error on plugin.Libsetup() with no plugin.json file")
+	_ = plugins.LibSetup()
+	// Wrap the call in a closure: the arguments of a deferred call are
+	// evaluated when the defer statement runs, so deferring
+	// log.IfErr(KillSubProcs()) directly would run KillSubProcs immediately.
+	defer func() {
+		log.IfErr(plugins.Lib.KillSubProcs())
+	}()
+
+	ctx := context.Background()
+	xtn := "html"
+	fileRequest := new(api.DocumentConversionRequest)
+	fileRequest.Filedata = []byte(yorkweb)
+	resp, err := convert.Convert(ctx, xtn, fileRequest)
+	if err != nil {
+		t.Error(err)
+		return
+	}
+	if len(resp.Pages) != 3 ||
+		!strings.HasPrefix(resp.Pages[1].Title, "STARTING") ||
+		!strings.HasPrefix(resp.Pages[2].Title, "EXERCISE") {
+		for p, pg := range resp.Pages {
+			t.Error(p, pg.Level, len(pg.Body), pg.Title)
+		}
+	}
+	exp := "There are lots of ways to create web pages using already coded programmes. … HTML isn' t computer code, but is a language that uses US English to enable texts( words, images, sounds) to be inserted and formatting such as colo( u) r and centre/ erin…"
+	if resp.Excerpt != exp {
+		t.Errorf("unexpected excerpt wanted: `%s` got: `%s`", exp, resp.Excerpt)
+	}
+
+	// check errors are caught
+	if _, err = convert.Convert(ctx, "unknown", fileRequest); err == nil {
+		t.Error("does not error on unknown extension")
+	}
+}
+
+// yorkweb is the HTML test input for TestConvert, taken from www.york.ac.uk/teaching/cws/wws/webpage1.html
+const yorkweb = `
+
+<HMTL>
+<HEAD>
+<TITLE>webpage1</TITLE>
+</HEAD>
+<BODY BGCOLOR="FFFFFf" LINK="006666" ALINK="8B4513" VLINK="006666">
+<TABLE WIDTH="75%" ALIGN="center">
+<TR>
+<TD>
+<DIV ALIGN="center"><H1>STARTING . . . </H1></DIV>
+
+
+<DIV ALIGN="justify"><P>There are lots of ways to create web pages using already coded programmes. These lessons will teach you how to use the underlying HyperText Markup Language -  HTML. 
+<BR>
+<P>HTML isn't computer code, but is a language that uses US English to enable texts (words, images, sounds) to be inserted and formatting such as colo(u)r and centre/ering to be written in. The process is fairly simple; the main difficulties often lie in small mistakes - if you slip up while word processing your reader may pick up your typos, but the page will still be legible. However, if your HTML is inaccurate the page may not appear - writing web pages is, at the least, very good practice for proof reading!</P>
+
+<P>Learning HTML will enable you to:
+<UL>
+<LI>create your own simple pages
+<LI>read and appreciate pages created by others
+<LI>develop an understanding of the creative and literary implications of web-texts
+<LI>have the confidence to branch out into more complex web design 
+</UL></P>
+
+<P>A HTML web page is made up of tags. Tags are placed in brackets like this <B>< tag > </B>. A tag tells the browser how to display information. Most tags need to be opened < tag > and closed < /tag >.
+
+<P> To make a simple web page you need to know only four tags:
+<UL>
+<LI>< HTML > tells the browser your page is written in HTML format
+<LI>< HEAD > this is a kind of preface of vital information that doesn't appear on the screen. 
+<LI>< TITLE >Write the title of the web page here - this is the information that viewers see on the upper bar of their screen. (I've given this page the title 'webpage1').
+<LI>< BODY >This is where you put the content of your page, the words and pictures that people read on the screen. 
+</UL>
+<P>All these tags need to be closed.
+
+<H4>EXERCISE</H4>
+
+<P>Write a simple web page.</P>
+<P> Copy out exactly the HTML below, using a WP program such as Notepad.<BR>
+Information in <I>italics</I> indicates where you can insert your own text, other information is HTML and needs to be exact. However, make sure there are no spaces between the tag brackets and the text inside.<BR>
+(Find Notepad by going to the START menu\ PROGRAMS\ ACCESSORIES\ NOTEPAD). 
+<P>
+< HTML ><BR>
+< HEAD ><BR>
+< TITLE ><I> title of page</I>< /TITLE ><BR>
+< /HEAD ><BR>
+< BODY><BR>
+<I> write what you like here: 'my first web page', or a piece about what you are reading, or a few thoughts on the course, or copy out a few words from a book or cornflake packet.  Just type in your words using no extras such as bold, or italics, as these have special HTML tags, although you may use upper and lower case letters and single spaces. </I><BR>
+
+< /BODY ><BR>
+< /HTML ><BR>
+
+<P>Save the file as 'first.html' (ie. call the file anything at all) It's useful if you start a folder - just as you would for word-processing - and call it something like WEBPAGES, and put your first.html file in the folder.
+
+<P>NOW - open your browser.<BR>
+On Netscape the process is: <BR>  
+Top menu; FILE\ OPEN PAGE\ CHOOSE FILE<BR> 
+Click on your WEBPAGES folder\ FIRST file<BR>
+Click 'open' and your page should appear.
+<P>On Internet Explorer: <BR>
+Top menu; FILE\ OPEN\ BROWSE <BR> 
+Click on your WEBPAGES folder\ FIRST file<BR>
+Click 'open' and your page should appear.<BR>
+
+
+<P>If the page doesn't open, go back over your notepad typing and make sure that all the HTML tags are correct. Check there are no spaces between tags and internal text; check that all tags are closed; check that you haven't written < HTLM > or < BDDY >.  Your page will work eventually. 
+<P>
+Make another page. Call it somethingdifferent.html and place it in the same WEBPAGES folder as detailed above.
+<P>start formatting in <A HREF="webpage2.html">lesson two</A>
+<BR><A HREF="col3.html">back to wws index</A> </P>
+</P>
+ 
+  
+</DIV>
+
+
+</TD>
+</TR>
+</TABLE>
+</BODY>
+</HTML>
+
+
+
+
+
+
+
+
+`
--- a/core/api/convert/documizeapi/documizeapi.go
+++ b/core/api/convert/documizeapi/documizeapi.go
@ -0,0 +1,27 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under 
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>. 
+//
+// https://documize.com
+
+package documizeapi
+
+import (
+	"encoding/json"
+
+	api "github.com/documize/community/core/convapi"
+
+	"golang.org/x/net/context"
+)
+
+// Convert provides the standard interface for conversion of a ".documizeapi" json document.
+// It expects in to be a *api.DocumentConversionRequest (any other type panics)
+// and unmarshals its Filedata as JSON into a new api.DocumentConversionResponse,
+// which is returned together with any unmarshalling error.
+func Convert(ctx context.Context, in interface{}) (interface{}, error) {
+	request := in.(*api.DocumentConversionRequest)
+	response := &api.DocumentConversionResponse{}
+	if err := json.Unmarshal(request.Filedata, response); err != nil {
+		return response, err
+	}
+	return response, nil
+}
--- a/core/api/convert/excerpt/excerpt.go
+++ b/core/api/convert/excerpt/excerpt.go
@ -0,0 +1,228 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under 
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>. 
+//
+// https://documize.com
+
+// Package excerpt provides basic functionality to create excerpts of text in English.
+package excerpt
+
+import (
+	"sort"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+
+	words "github.com/documize/community/core/wordlists/en-2012"
+
+	"github.com/rookii/paicehusk"
+)
+
+// extractItem is one candidate sentence together with the figures used to rank it.
+type extractItem struct {
+	sequence int
+	score    float64
+	count    int
+	sentance string
+}
+
+// extractList is a slice of extractItem that implements sort.Interface,
+// ordering items by descending score divided by count.
+type extractList []extractItem
+
+// Len returns the number of items in the list.
+func (a extractList) Len() int {
+	return len(a)
+}
+
+// Less reports whether item i has a strictly greater score/count ratio than
+// item j, so that higher ratios sort first.
+func (a extractList) Less(i, j int) bool {
+	left, right := a[i], a[j]
+	return left.score/float64(left.count) > right.score/float64(right.count)
+}
+
+// Swap exchanges the items at indexes i and j.
+func (a extractList) Swap(i, j int) {
+	tmp := a[i]
+	a[i] = a[j]
+	a[j] = tmp
+}
+
+// presentItem is a piece of text tagged with its sequence number.
+type presentItem struct {
+	sequence int
+	text     string
+}
+
+// presentList is a slice of presentItem that implements sort.Interface,
+// ordering items by ascending sequence.
+type presentList []presentItem
+
+// Len returns the number of items in the list.
+func (a presentList) Len() int {
+	return len(a)
+}
+
+// Less reports whether item i has a smaller sequence number than item j.
+func (a presentList) Less(i, j int) bool {
+	first, second := a[i].sequence, a[j].sequence
+	return first < second
+}
+
+// Swap exchanges the items at indexes i and j.
+func (a presentList) Swap(i, j int) {
+	tmp := a[i]
+	a[i] = a[j]
+	a[j] = tmp
+}
+
+func addWd(sentance, wd string) (string, bool) {
+	var isStop bool
+	if len(sentance) == 0 {
+		if wd != "[" {
+			sentance = wd
+		}
+	} else {
+		switch wd {
+		case "[": //NoOp
+		case "0", "1", "2", "3", "4", "5", "6", "7", "8", "9":
+			if unicode.IsDigit(rune(sentance[len(sentance)-1])) {
+				sentance += wd
+			} else {
+				sentance += " " + wd
+			}
+		case ".", "!", "?":
+			isStop = true
+			fallthrough
+		default:
+			if isPunct(wd) {
+				sentance += wd
+			} else {
+				sentance += " " + wd
+			}
+		}
+	}
+	return sentance, isStop
+}
+
+// isPunct reports whether every rune of s is either Unicode punctuation or one
+// of the characters ` ' " ( / which are also treated as punctuation here.
+// It returns true for the empty string.
+func isPunct(s string) bool {
+	for _, r := range s {
+		if unicode.IsPunct(r) {
+			continue
+		}
+		if r != '`' && r != '\'' && r != '"' && r != '(' && r != '/' {
+			return false
+		}
+	}
+	return true
+}
+
+// Excerpt returns the most statistically significant sentences of the given
+// text, for use in the Excerpt field.
+//
+// titleWords and bodyWords are token lists; sentences are assembled with addWd,
+// which reports the end of a sentence on ".", "!" and "?" tokens. Each sentence
+// is scored from the relative stem frequency of its words, skipping words whose
+// rank in the word list is 100 or lower, and the ranked sentences are handed to
+// present to build the returned string. Neither input slice is modified.
+func Excerpt(titleWords, bodyWords []string) string {
+	var el extractList
+
+	//fmt.Println("DEBUG Excerpt ", len(titleWords), len(bodyWords))
+
+	// populate stemMap
+	stemMap := make(map[string]uint64)
+	for _, wd := range bodyWords {
+		stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
+		stemMap[stem]++
+	}
+	for _, wd := range titleWords {
+		stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
+		stemMap[stem]++                         // TODO are words in titles more important?
+	}
+
+	// Build the combined word list in a fresh slice: appending bodyWords
+	// directly to titleWords could write into spare capacity of the caller's
+	// backing array.
+	wds := make([]string, 0, len(titleWords)+len(bodyWords))
+	wds = append(wds, titleWords...)
+	wds = append(wds, bodyWords...)
+
+	sentance := ""
+	score := 0.0
+	count := 0
+	seq := 0
+	for _, wd := range wds {
+		var isStop bool
+
+		sentance, isStop = addWd(sentance, wd)
+
+		if isStop {
+			//fmt.Printf(" DEBUG sentance: %3d %3.2f %s\n",
+			//	seq, score*10000/float64(count), sentance)
+			var ei extractItem
+			ei.count = count + 1 // must be at least 1
+			ei.score = score
+			ei.sentance = sentance
+			ei.sequence = seq
+			el = append(el, ei)
+			sentance = ""
+			score = 0.0
+			// NOTE(review): count is not reset here, so it keeps growing
+			// across sentences — confirm this is intended before changing it.
+			seq++
+		} else {
+			uncommon := true
+			// TODO Discuss correct level or maybe find a better algorithm for this
+			ent, ok := words.Words[wd]
+			if ok {
+				if ent.Rank <= 100 {
+					// do not score very common words
+					uncommon = false
+				}
+			}
+			if uncommon {
+				stem := paicehusk.DefaultRules.Stem(wd) // find the stem of the word
+				usage, used := stemMap[stem]
+				if used {
+					relativeStemFreq := (float64(usage) / float64(len(wds))) - words.Stems[stem]
+					if relativeStemFreq > 0.0 {
+						score += relativeStemFreq
+					}
+				}
+				count++
+			}
+		}
+	}
+
+	sort.Sort(el)
+
+	return present(el)
+}
+
+// present builds the excerpt text from the ranked sentences in el.
+//
+// Sentences are taken in the order given until the running total of their
+// count values reaches 50 (the first item is always considered), skipping any
+// that are one byte long or shorter or that fail notEmpty. The chosen sentences
+// are then put back into sequence order and joined with spaces, with " …"
+// inserted wherever consecutive chosen sentences were not adjacent in the
+// source. A result longer than 250 bytes is cut back rune by rune to at most
+// 250 bytes and has "…" appended.
+func present(el extractList) (ret string) {
+	const excerptWords = 50
+
+	var chosen presentList
+	wordTotal := 0
+	for idx, item := range el {
+		if wordTotal >= excerptWords && idx != 0 {
+			continue
+		}
+		if len(item.sentance) <= 1 || !notEmpty(item.sentance) {
+			continue
+		}
+		wordTotal += item.count
+		chosen = append(chosen, presentItem{sequence: item.sequence, text: item.sentance})
+	}
+	sort.Sort(chosen)
+
+	var lastSeq int
+	for idx, item := range chosen {
+		txt := strings.TrimPrefix(item.text, ". ")
+		if idx > 0 {
+			if lastSeq+1 != item.sequence {
+				ret += " …" // horizontal ellipsis marks a gap in the source
+			}
+			ret += " "
+		}
+		ret += txt
+		lastSeq = item.sequence
+	}
+
+	if len(ret) <= 250 {
+		return ret
+	}
+	// Too long: drop whole runes from the end until it fits.
+	for len(ret) > 250 {
+		_, size := utf8.DecodeLastRuneInString(ret)
+		ret = ret[:len(ret)-size]
+	}
+	return ret + "…" // horizontal ellipsis added after truncation
+}
+
+// notEmpty reports whether wds contains at least one rune that is neither
+// punctuation nor white space.
+func notEmpty(wds string) bool {
+	for _, r := range wds {
+		if unicode.IsPunct(r) || unicode.IsSpace(r) {
+			continue
+		}
+		return true
+	}
+	return false
+}
--- a/core/api/convert/excerpt/excerpt_test.go
+++ b/core/api/convert/excerpt/excerpt_test.go
@ -0,0 +1,130 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>.
+//
+// https://documize.com
+
+package excerpt_test
+
+import "testing"
+import "github.com/documize/community/core/api/convert/excerpt"
+import "strings"
+import "fmt"
+
+// TestExerpt checks excerpt.Excerpt against fixed expected strings for empty
+// input, repeated sentences, mixed punctuation and digits, a longer lyric text
+// and a result that has to be truncated.
+func TestExerpt(t *testing.T) {
+	// Empty and nil word lists must both give an empty excerpt.
+	if excerpt.Excerpt(nil, nil) != "" ||
+		excerpt.Excerpt([]string{}, []string{}) != "" {
+		t.Error("empty lists do not return empty string")
+	}
+	// One sentence used as the title and repeated many times as the body.
+	qbf := strings.Split("The quick brown fox jumps over the lazy dog .", " ")
+	qbf2 := qbf
+	for i := 0; i < 200; i++ {
+		qbf2 = append(qbf2, qbf...)
+	}
+	tst := excerpt.Excerpt(qbf, qbf2)
+	if tst !=
+		"The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog." {
+		t.Error("'quick brown fox' did not work:", tst)
+	}
+
+	// Punctuation, digit and "[" tokens, with a counter token between repeats.
+	tt123 := strings.Split("Testing , testing ; 1 2 3 is fun ! Bracket [ anyone ? .", " ")
+	tt123a := tt123
+	for i := 0; i < 200; i++ {
+		tt123a = append(tt123a, fmt.Sprintf("%d", i))
+		tt123a = append(tt123a, tt123...)
+	}
+	tst2 := excerpt.Excerpt(tt123, tt123a)
+	if tst2 !=
+		"Testing, testing; 123 is fun! … Testing, testing; 123 is fun! … 0 Testing, testing; 123 is fun!" {
+		t.Error("'Testing testing 123' did not work:", tst2)
+	}
+
+	// A longer text: every newline is replaced by " . " so each line becomes a sentence.
+	s := strings.Split(strings.Replace(`
It's supercalifragilisticexpialidocious
Even though the sound of it is something quite atrocious
If you say it loud enough, you'll always sound precocious
Supercalifragilisticexpialidocious

Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay

Because I was afraid to speak
When I was just a lad
My father gave me nose a tweak
And told me I was bad

But then one day I learned a word
That saved me achin' nose
The biggest word I ever heard
And this is how it goes, oh

Supercalifragilisticexpialidocious
Even though the sound of it is something quite atrocious
If you say it loud enough, you'll always sound precocious
Supercalifragilisticexpialidocious

Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay

He traveled all around the world
And everywhere he went
He'd use his word and all would say
There goes a clever gent

When Dukes and Maharajahs
Pass the time of day with me
I say me special word
And then they ask me out to tea

Oh, supercalifragilisticexpialidocious
Even though the sound of it is something quite atrocious
If you say it loud enough, you'll always sound precocious
Supercalifragilisticexpialidocious

Um diddle, diddle diddle, um diddle ay
Um diddle, diddle diddle, um diddle ay

No, you can say it backwards, which is dociousaliexpilisticfragicalirupus
But that's going a bit too far, don't you think?

So when the cat has got your tongue
There's no need for dismay
Just summon up this word
And then you've got a lot to say

But better use it carefully
Or it could change your life
For example, yes, one night I said it to me girl
And now me girl's my wife, oh, and a lovely thing she's too

She's, supercalifragilisticexpialidocious
Supercalifragilisticexpialidocious
Supercalifragilisticexpialidocious
Supercalifragilisticexpialidocious
+.	`, "\n", " . ", -1), " ")
+	ts := []string{"Supercalifragilisticexpialidocious", "song", "lyrics"}
+	st := excerpt.Excerpt(ts, s)
+	if st != "Supercalifragilisticexpialidocious song lyrics. … Um diddle, diddle diddle, um diddle ay. Um diddle, diddle diddle, um diddle ay." {
+		t.Error("'Supercalifragilisticexpialidocious song lyrics' did not work:", st)
+	}
+
+	// A long one-word sentence repeated: the expected excerpt ends in "…" after truncation.
+	ss := []string{"Supercalifragilisticexpialidocious", "!"}
+	ssa := ss
+	for i := 0; i < 100; i++ {
+		ssa = append(ssa, ss...)
+	}
+	sst := excerpt.Excerpt(ss, ssa)
+	if sst !=
+		"Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious! Supercalifragilisticexpialidocious…" {
+		t.Error("'Supercalifragilisticexpialidocious' did not work:", sst)
+	}
+}
--- a/core/api/convert/html/README.md
+++ b/core/api/convert/html/README.md
@ -0,0 +1,10 @@
+How the HTML conversion works
+=============================
+
+The converter uses the "html" package from the "golang.org/x/net" repository to parse the HTML into a tree,
+then walks the tree using processChildren() to make a series of sections with a heading as their title and the following HTML as the body. 
+
+Importantly, if a heading is within some other structure, that other structure is ignored in order to get the heading into the list. This seems to mostly work well, but may have some unintended side-effects.
+
+On the subject of unintended side-effects, or rather their avoidance, "script" HTML tags and their contents are not passed through.
+
--- a/core/api/convert/html/doc.go
+++ b/core/api/convert/html/doc.go
@ -0,0 +1,13 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under 
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>. 
+//
+// https://documize.com
+
+// Package html documizes html files.
+package html
--- a/core/api/convert/html/html.go
+++ b/core/api/convert/html/html.go
@ -0,0 +1,244 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>.
+//
+// https://documize.com
+
+package html
+
+import (
+	"bytes"
+	"fmt"
+	"strings"
+
+	api "github.com/documize/community/core/convapi"
+	"github.com/documize/community/core/log"
+	"github.com/documize/community/core/utility"
+
+	"golang.org/x/net/html"
+	"golang.org/x/net/html/atom"
+
+	"golang.org/x/net/context"
+)
+
+// Size limits, in bytes, checked with len() when building page titles and bodies.
+const maxTitle = 2000   // NOTE: must be the same length as database page.title
+const maxBody = 4000000 // NOTE: must be less than the mysql max_allowed_packet limit, amongst other values
+
+// htmlToSplit holds the working state used while splitting an HTML document
+// into pages.
+type htmlToSplit struct {
+	// CFR is the conversion response whose Pages (and Err) are filled in.
+	CFR       *api.DocumentConversionResponse
+	// thisSect is the page currently being accumulated; it is appended to
+	// CFR.Pages when the next section starts or the body ends.
+	thisSect  api.Page
+	// nodeCache memoises the results of nodeContainsHeading per node.
+	nodeCache map[*html.Node]bool
+}
+
+// Convert provides the standard interface for conversion of an HTML document.
+// It expects in to be a *api.DocumentConversionRequest (any other type panics)
+// and simply hands the request's Filedata back as the PagesHTML of a new
+// api.DocumentConversionResponse - so effectively a no-op. The error is always nil.
+func Convert(ctx context.Context, in interface{}) (interface{}, error) {
+	request := in.(*api.DocumentConversionRequest)
+	response := new(api.DocumentConversionResponse)
+	response.PagesHTML = request.Filedata
+	return response, nil
+}
+
+// SplitIfHTML splits the HTML held in res.PagesHTML into pages.
+// It does nothing and returns nil when res.PagesHTML is empty.
+func SplitIfHTML(req *api.DocumentConversionRequest, res *api.DocumentConversionResponse) error {
+	if len(res.PagesHTML) == 0 {
+		return nil
+	}
+	splitter := &htmlToSplit{
+		CFR:       res,
+		nodeCache: make(map[*html.Node]bool),
+	}
+	return splitter.testableSplit(req, res)
+}
+
+// testableSplit parses response.PagesHTML and, for every body element found
+// directly under an html element, appends the pages produced by
+// processChildren to h.CFR.Pages. The first page of each body is level 1 and
+// is titled with the beautified request file name.
+//
+// Only a parse failure or a missing document node is returned as an error; an
+// error from processChildren is recorded in h.CFR.Err instead.
+// NOTE pointer receiver so that test code can inspect generated datastructures.
+func (h *htmlToSplit) testableSplit(request *api.DocumentConversionRequest,
+	response *api.DocumentConversionResponse) error {
+	root, err := html.Parse(bytes.NewReader(response.PagesHTML))
+	if err != nil {
+		return err
+	}
+	if root.Type != html.DocumentNode {
+		return fmt.Errorf("no HTML document node")
+	}
+	for top := root.FirstChild; top != nil; top = top.NextSibling {
+		if top.Type != html.ElementNode || top.DataAtom != atom.Html {
+			continue
+		}
+		for child := top.FirstChild; child != nil; child = child.NextSibling {
+			if child.Type != html.ElementNode || child.DataAtom != atom.Body {
+				continue
+			}
+			h.thisSect = api.Page{
+				Level: 1,
+				Title: utility.BeautifyFilename(request.Filename),
+				Body:  []byte(``),
+			}
+			if perr := h.processChildren(child); perr != nil {
+				h.CFR.Err = perr.Error()
+			}
+			h.CFR.Pages = append(h.CFR.Pages, h.thisSect)
+		}
+	}
+	return nil
+}
+
+// getLevel maps a heading atom to a page level: h1 gives 2, h2 gives 3, and so
+// on up to h6 which gives 7. Any other atom gives 1.
+func getLevel(at atom.Atom) uint64 {
+	switch at {
+	case atom.H1:
+		return 2
+	case atom.H2:
+		return 3
+	case atom.H3:
+		return 4
+	case atom.H4:
+		return 5
+	case atom.H5:
+		return 6
+	case atom.H6:
+		return 7
+	default:
+		return 1
+	}
+}
+
+// processChildren walks the direct children of bdy in order: heading elements
+// go to renderHeading, other elements to renderNonHeading, and non-element
+// nodes to renderAppend. It stops at, and returns, the first error.
+func (h *htmlToSplit) processChildren(bdy *html.Node) error {
+	for child := bdy.FirstChild; child != nil; child = child.NextSibling {
+		var err error
+		switch {
+		case child.Type != html.ElementNode:
+			err = h.renderAppend(child)
+		case getLevel(child.DataAtom) > 1:
+			err = h.renderHeading(child, getLevel(child.DataAtom))
+		default:
+			err = h.renderNonHeading(child)
+		}
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func stripZeroWidthSpaces(str string) string {
+	ret := ""
+	for _, r := range str {
+		if r != 0x200B { // zero width space
+			ret += string(r) // stripped of zero-width spaces
+		}
+	}
+	return ret
+}
+
+// renderHeading starts a new section titled with the text of heading node c at
+// the given level. The title is the text of the heading's rendered children
+// with zero width spaces removed; a heading whose text is blank starts no
+// section. Errors from rendering or text extraction are returned.
+func (h *htmlToSplit) renderHeading(c *html.Node, level uint64) error {
+	rendered, err := byteRenderChildren(c) // get heading html
+	if err != nil {
+		return err
+	}
+	text, err := utility.HTML(string(rendered)).Text(false) // heading text
+	if err != nil {
+		return err
+	}
+	if text = stripZeroWidthSpaces(text); strings.TrimSpace(text) == "" {
+		return nil // only put in non-empty headings
+	}
+	h.newSect(text, level)
+	return nil
+}
+
+// newSect closes the current section by appending it to h.CFR.Pages and starts
+// a new one with the given title and level.
+//
+// A title longer than maxTitle bytes is split: the part that fits becomes the
+// title and the remainder becomes the start of the new section's body. The
+// split point is moved back to a UTF-8 character boundary so that neither part
+// contains a broken multi-byte character.
+func (h *htmlToSplit) newSect(tstr string, level uint64) {
+	h.CFR.Pages = append(h.CFR.Pages, h.thisSect)
+	title := tstr //was: utility.EscapeHTMLcomplexChars(tstr) -- removed to avoid double-escaping
+	body := ``
+	if len(title) > maxTitle {
+		cut := maxTitle
+		// Bytes of the form 10xxxxxx are UTF-8 continuation bytes; step back
+		// over them (a character has at most three) to reach its first byte.
+		for back := 0; back < 3 && cut > 0 && title[cut]&0xC0 == 0x80; back++ {
+			cut--
+		}
+		body = title[cut:]
+		title = title[:cut]
+	}
+	h.thisSect = api.Page{
+		Level: level,
+		Title: title,
+		Body:  []byte(body)}
+}
+
+// renderNonHeading handles a non-heading element c. When c contains a heading
+// somewhere below it, c itself is ignored and its children are processed
+// individually so the heading can start a section; otherwise c is rendered
+// whole into the current section.
+func (h *htmlToSplit) renderNonHeading(c *html.Node) error {
+	if !h.nodeContainsHeading(c) {
+		return h.renderAppend(c)
+	}
+	// ignore this atom in order to get at the contents
+	return h.processChildren(c)
+}
+
+// renderAppend renders node c to HTML, escapes it with
+// utility.EscapeHTMLcomplexCharsByte and appends it to the body of the current
+// section.
+//
+// An element whose escaped form alone exceeds maxBody is replaced by a warning
+// paragraph, which is also logged. When appending would take the current body
+// past maxBody, a new section titled "-" is started one level below the
+// current one and the content goes there instead.
+func (h *htmlToSplit) renderAppend(c *html.Node) error {
+	rendered, err := byteRender(c)
+	if err != nil {
+		return err
+	}
+	escaped := utility.EscapeHTMLcomplexCharsByte(rendered)
+	if size := len(escaped); size > maxBody {
+		warning := fmt.Sprintf("(Documize warning: HTML render element ignored, size of %d exceeded maxBody of %d.)", size, maxBody)
+		log.Info(warning)
+		escaped = []byte("<p><b>" + warning + "</b></p>")
+	}
+	if total := len(h.thisSect.Body) + len(escaped); total > maxBody {
+		h.newSect("-", h.thisSect.Level+1) // plus one so that the new "-" one is part of the previous
+	}
+	h.thisSect.Body = append(h.thisSect.Body, escaped...)
+	return nil
+}
+
+func byteRender(n *html.Node) ([]byte, error) {
+	var b bytes.Buffer
+	err := html.Render(&b, n)
+	return b.Bytes(), err
+}
+
+func byteRenderChildren(n *html.Node) ([]byte, error) {
+	var b bytes.Buffer
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		err := html.Render(&b, c)
+		if err != nil {
+			return nil, err
+		}
+	}
+	return b.Bytes(), nil
+}
+
+// nodeContainsHeading reports whether n is an h1-h6 heading or has one
+// anywhere beneath it. Results are remembered in h.nodeCache so each node is
+// examined only once.
+func (h *htmlToSplit) nodeContainsHeading(n *html.Node) bool {
+	if cached, seen := h.nodeCache[n]; seen {
+		return cached
+	}
+
+	found := false
+	switch n.DataAtom {
+	case atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
+		found = true
+	default:
+		// Stop at the first child that holds a heading.
+		for c := n.FirstChild; c != nil && !found; c = c.NextSibling {
+			if h.nodeContainsHeading(c) {
+				h.nodeCache[c] = true
+				found = true
+			}
+		}
+	}
+
+	h.nodeCache[n] = found
+	return found
+}
--- a/core/api/convert/html/html_test.go
+++ b/core/api/convert/html/html_test.go
@ -0,0 +1,384 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under 
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>. 
+//
+// https://documize.com
+
+package html_test
+
+import (
+	"strings"
+	"testing"
+)
+import api "github.com/documize/community/core/convapi"
+import "github.com/documize/community/core/api/convert/html"
+
+const b string = `
+<h1>Markdown: Basics</h1>
+
+<ul id="ProjectSubmenu">
+    <li><a href="/projects/markdown/" title="Markdown Project Page">Main</a></li>
+    <li><a class="selected" title="Markdown Basics">Basics</a></li>
+    <li><a href="/projects/markdown/syntax" title="Markdown Syntax Documentation">Syntax</a></li>
+    <li><a href="/projects/markdown/license" title="Pricing and License Information">License</a></li>
+    <li><a href="/projects/markdown/dingus" title="Online Markdown Web Form">Dingus</a></li>
+</ul>
+
+<h2>Getting the Gist of Markdown's Formatting Syntax</h2>
+
+<p>This page offers a brief overview of what it's like to use Markdown.
+The <a href="/projects/markdown/syntax" title="Markdown Syntax">syntax page</a> provides complete, detailed documentation for
+every feature, but Markdown should be very easy to pick up simply by
+looking at a few examples of it in action. The examples on this page
+are written in a before/after style, showing example syntax and the
+HTML output produced by Markdown.</p>
+
+<p>It's also helpful to simply try Markdown out; the <a href="/projects/markdown/dingus" title="Markdown Dingus">Dingus</a> is a
+web application that allows you type your own Markdown-formatted text
+and translate it to XHTML.</p>
+
+<p><strong>Note:</strong> This document is itself written using Markdown; you
+can <a href="/projects/markdown/basics.text">see the source for it by adding '.text' to the URL</a>.</p>
+
+<h2>Paragraphs, Headers, Blockquotes</h2>
+
+<p>A paragraph is simply one or more consecutive lines of text, separated
+by one or more blank lines. (A blank line is any line that looks like a
+blank line -- a line containing nothing spaces or tabs is considered
+blank.) Normal paragraphs should not be intended with spaces or tabs.</p>
+
+<p>Markdown offers two styles of headers: <em>Setext</em> and <em>atx</em>.
+Setext-style headers for <code>&lt;h1&gt;</code> and <code>&lt;h2&gt;</code> are created by
+&quot;underlining&quot; with equal signs (<code>=</code>) and hyphens (<code>-</code>), respectively.
+To create an atx-style header, you put 1-6 hash marks (<code>#</code>) at the
+beginning of the line -- the number of hashes equals the resulting
+HTML header level.</p>
+
+<p>Blockquotes are indicated using email-style '<code>&gt;</code>' angle brackets.</p>
+
+<p>Markdown:</p>
+
+<pre><code>A First Level Header
+====================
+
+A Second Level Header
+---------------------
+
+Now is the time for all good men to come to
+the aid of their country. This is just a
+regular paragraph.
+
+The quick brown fox jumped over the lazy
+dog's back.
+
+### Header 3
+
+&gt; This is a blockquote.
+&gt; 
+&gt; This is the second paragraph in the blockquote.
+&gt;
+&gt; ## This is an H2 in a blockquote
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;h1&gt;A First Level Header&lt;/h1&gt;
+
+&lt;h2&gt;A Second Level Header&lt;/h2&gt;
+
+&lt;p&gt;Now is the time for all good men to come to
+the aid of their country. This is just a
+regular paragraph.&lt;/p&gt;
+
+&lt;p&gt;The quick brown fox jumped over the lazy
+dog's back.&lt;/p&gt;
+
+&lt;h3&gt;Header 3&lt;/h3&gt;
+
+&lt;blockquote&gt;
+    &lt;p&gt;This is a blockquote.&lt;/p&gt;
+
+    &lt;p&gt;This is the second paragraph in the blockquote.&lt;/p&gt;
+
+    &lt;h2&gt;This is an H2 in a blockquote&lt;/h2&gt;
+&lt;/blockquote&gt;
+</code></pre>
+
+<h3>Phrase Emphasis</h3>
+
+<p>Markdown uses asterisks and underscores to indicate spans of emphasis.</p>
+
+<p>Markdown:</p>
+
+<pre><code>Some of these words *are emphasized*.
+Some of these words _are emphasized also_.
+
+Use two asterisks for **strong emphasis**.
+Or, if you prefer, __use two underscores instead__.
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;p&gt;Some of these words &lt;em&gt;are emphasized&lt;/em&gt;.
+Some of these words &lt;em&gt;are emphasized also&lt;/em&gt;.&lt;/p&gt;
+
+&lt;p&gt;Use two asterisks for &lt;strong&gt;strong emphasis&lt;/strong&gt;.
+Or, if you prefer, &lt;strong&gt;use two underscores instead&lt;/strong&gt;.&lt;/p&gt;
+</code></pre>
+
+<h2>Lists</h2>
+
+<p>Unordered (bulleted) lists use asterisks, pluses, and hyphens (<code>*</code>,
+<code>+</code>, and <code>-</code>) as list markers. These three markers are
+interchangable; this:</p>
+
+<pre><code>*   Candy.
+*   Gum.
+*   Booze.
+</code></pre>
+
+<p>this:</p>
+
+<pre><code>+   Candy.
+   Gum.
+   Booze.
+</code></pre>
+
+<p>and this:</p>
+
+<pre><code>-   Candy.
+-   Gum.
+-   Booze.
+</code></pre>
+
+<p>all produce the same output:</p>
+
+<pre><code>&lt;ul&gt;
+&lt;li&gt;Candy.&lt;/li&gt;
+&lt;li&gt;Gum.&lt;/li&gt;
+&lt;li&gt;Booze.&lt;/li&gt;
+&lt;/ul&gt;
+</code></pre>
+
+<p>Ordered (numbered) lists use regular numbers, followed by periods, as
+list markers:</p>
+
+<pre><code>1.  Red
+2.  Green
+3.  Blue
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;ol&gt;
+&lt;li&gt;Red&lt;/li&gt;
+&lt;li&gt;Green&lt;/li&gt;
+&lt;li&gt;Blue&lt;/li&gt;
+&lt;/ol&gt;
+</code></pre>
+
+<p>If you put blank lines between items, you'll get <code>&lt;p&gt;</code> tags for the
+list item text. You can create multi-paragraph list items by indenting
+the paragraphs by 4 spaces or 1 tab:</p>
+
+<pre><code>*   A list item.
+
+    With multiple paragraphs.
+
+*   Another item in the list.
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;ul&gt;
+&lt;li&gt;&lt;p&gt;A list item.&lt;/p&gt;
+&lt;p&gt;With multiple paragraphs.&lt;/p&gt;&lt;/li&gt;
+&lt;li&gt;&lt;p&gt;Another item in the list.&lt;/p&gt;&lt;/li&gt;
+&lt;/ul&gt;
+</code></pre>
+
+<h3>Links</h3>
+
+<p>Markdown supports two styles for creating links: <em>inline</em> and
+<em>reference</em>. With both styles, you use square brackets to delimit the
+text you want to turn into a link.</p>
+
+<p>Inline-style links use parentheses immediately after the link text.
+For example:</p>
+
+<pre><code>This is an [example link](http://example.com/).
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;p&gt;This is an &lt;a href=&quot;http://example.com/&quot;&gt;
+example link&lt;/a&gt;.&lt;/p&gt;
+</code></pre>
+
+<p>Optionally, you may include a title attribute in the parentheses:</p>
+
+<pre><code>This is an [example link](http://example.com/ &quot;With a Title&quot;).
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;p&gt;This is an &lt;a href=&quot;http://example.com/&quot; title=&quot;With a Title&quot;&gt;
+example link&lt;/a&gt;.&lt;/p&gt;
+</code></pre>
+
+<p>Reference-style links allow you to refer to your links by names, which
+you define elsewhere in your document:</p>
+
+<pre><code>I get 10 times more traffic from [Google][1] than from
+[Yahoo][2] or [MSN][3].
+
+[1]: http://google.com/        &quot;Google&quot;
+[2]: http://search.yahoo.com/  &quot;Yahoo Search&quot;
+[3]: http://search.msn.com/    &quot;MSN Search&quot;
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;p&gt;I get 10 times more traffic from &lt;a href=&quot;http://google.com/&quot;
+title=&quot;Google&quot;&gt;Google&lt;/a&gt; than from &lt;a href=&quot;http://search.yahoo.com/&quot;
+title=&quot;Yahoo Search&quot;&gt;Yahoo&lt;/a&gt; or &lt;a href=&quot;http://search.msn.com/&quot;
+title=&quot;MSN Search&quot;&gt;MSN&lt;/a&gt;.&lt;/p&gt;
+</code></pre>
+
+<p>The title attribute is optional. Link names may contain letters,
+numbers and spaces, but are <em>not</em> case sensitive:</p>
+
+<pre><code>I start my morning with a cup of coffee and
+[The New York Times][NY Times].
+
+[ny times]: http://www.nytimes.com/
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;p&gt;I start my morning with a cup of coffee and
+&lt;a href=&quot;http://www.nytimes.com/&quot;&gt;The New York Times&lt;/a&gt;.&lt;/p&gt;
+</code></pre>
+
+<h3>Images</h3>
+
+<p>Image syntax is very much like link syntax.</p>
+
+<p>Inline (titles are optional):</p>
+
+<pre><code>![alt text](/path/to/img.jpg &quot;Title&quot;)
+</code></pre>
+
+<p>Reference-style:</p>
+
+<pre><code>![alt text][id]
+
+[id]: /path/to/img.jpg &quot;Title&quot;
+</code></pre>
+
+<p>Both of the above examples produce the same output:</p>
+
+<pre><code>&lt;img src=&quot;/path/to/img.jpg&quot; alt=&quot;alt text&quot; title=&quot;Title&quot; /&gt;
+</code></pre>
+
+<h3>Code</h3>
+
+<p>In a regular paragraph, you can create code span by wrapping text in
+backtick quotes. Any ampersands (<code>&amp;</code>) and angle brackets (<code>&lt;</code> or
+<code>&gt;</code>) will automatically be translated into HTML entities. This makes
+it easy to use Markdown to write about HTML example code:</p>
+
+<pre><code>I strongly recommend against using any "&lt;blink&gt;" tags.
+
+I wish SmartyPants used named entities like "&amp;mdash;""
+instead of decimal-encoded entites like "&amp;#8212;".
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;p&gt;I strongly recommend against using any
+&lt;code&gt;&amp;lt;blink&amp;gt;&lt;/code&gt; tags.&lt;/p&gt;
+
+&lt;p&gt;I wish SmartyPants used named entities like
+&lt;code&gt;&amp;amp;mdash;&lt;/code&gt; instead of decimal-encoded
+entites like &lt;code&gt;&amp;amp;#8212;&lt;/code&gt;.&lt;/p&gt;
+</code></pre>
+
+<p>To specify an entire block of pre-formatted code, indent every line of
+the block by 4 spaces or 1 tab. Just like with code spans, <code>&amp;</code>, <code>&lt;</code>,
+and <code>&gt;</code> characters will be escaped automatically.</p>
+
+<p>Markdown:</p>
+
+<pre><code>If you want your page to validate under XHTML 1.0 Strict,
+you've got to put paragraph tags in your blockquotes:
+
+    &lt;blockquote&gt;
+        &lt;p&gt;For example.&lt;/p&gt;
+    &lt;/blockquote&gt;
+</code></pre>
+
+<p>Output:</p>
+
+<pre><code>&lt;p&gt;If you want your page to validate under XHTML 1.0 Strict,
+you've got to put paragraph tags in your blockquotes:&lt;/p&gt;
+
+&lt;pre&gt;&lt;code&gt;&amp;lt;blockquote&amp;gt;
+    &amp;lt;p&amp;gt;For example.&amp;lt;/p&amp;gt;
+&amp;lt;/blockquote&amp;gt;
+&lt;/code&gt;&lt;/pre&gt;
+</code></pre>
+
+<h4>Header4</h4>
+<div><div><div><div><div><div>
+<h5>Header5</h5>Body 555.
+</div></div></div></div></div></div>
+<h6>Header6</h6>
+
+`
+
+func TestHTML(t *testing.T) {
+
+	req := &api.DocumentConversionRequest{}
+	res := &api.DocumentConversionResponse{}
+
+	err := html.SplitIfHTML(req, res)
+	if err != nil || len(res.PagesHTML) != 0 || len(res.Pages) != 0 || len(res.EmbeddedFiles) != 0 {
+		t.Error(err)
+		return
+	}
+
+	titleTooBig := []byte("<h1>")
+	for i := 0; i < 2048; i++ {
+		titleTooBig = append(titleTooBig, []byte("title too long ")...)
+	}
+	titleTooBig = append(titleTooBig, []byte("</h1>")...)
+	req = &api.DocumentConversionRequest{}
+	res = &api.DocumentConversionResponse{PagesHTML: titleTooBig}
+	err = html.SplitIfHTML(req, res)
+	if err != nil || len(res.Pages[0].Title) > 2000 {
+		t.Error(err)
+		return
+	}
+
+	req = &api.DocumentConversionRequest{}
+	res = &api.DocumentConversionResponse{PagesHTML: []byte(b)}
+	err = html.SplitIfHTML(req, res)
+	if err != nil {
+		t.Error(err)
+		return
+	}
+	//for p, pg := range res.Pages {
+	//	t.Logf("%d %d %d %s", p, pg.Level, len(pg.Body), pg.Title)
+	//}
+	if !strings.HasPrefix(res.Pages[10].Title, "Header5") ||
+		!strings.HasPrefix(string(res.Pages[10].Body), "Body 555.") {
+		t.Errorf("wrong page ten title: `%s` body: `%s`", res.Pages[10].Title, string(res.Pages[10].Body))
+	}
+
+}
--- a/core/api/convert/md/md.go
+++ b/core/api/convert/md/md.go
@ -0,0 +1,28 @@
+// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
+//
+// This software (Documize Community Edition) is licensed under 
+// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
+//
+// You can operate outside the AGPL restrictions by purchasing
+// Documize Enterprise Edition and obtaining a commercial license
+// by contacting <sales@documize.com>. 
+//
+// https://documize.com
+
+package md
+
+import (
+	"fmt"
+
+	"github.com/documize/blackfriday"
+	api "github.com/documize/community/core/convapi"
+	"golang.org/x/net/context"
+)
+
+// Convert provides the standard interface for conversion of a Markdown document.
+// in must be a non-nil *api.DocumentConversionRequest; its Filedata is converted
+// by blackfriday.MarkdownCommon and returned as the PagesHTML field of a
+// *api.DocumentConversionResponse.
+// Any other value for in (wrong type or nil pointer) yields an error rather
+// than a panic. ctx is not used by this function.
+func Convert(ctx context.Context, in interface{}) (interface{}, error) {
+	req, ok := in.(*api.DocumentConversionRequest)
+	if !ok || req == nil {
+		return nil, fmt.Errorf("md: expected non-nil *api.DocumentConversionRequest, got %T", in)
+	}
+	return &api.DocumentConversionResponse{
+		PagesHTML: blackfriday.MarkdownCommon(req.Filedata)}, nil
+}