diff --git a/modules/git/blob.go b/modules/git/blob.go index 14ca2b1445..4eef5f0e2a 100644 --- a/modules/git/blob.go +++ b/modules/git/blob.go @@ -220,7 +220,7 @@ func (b *Blob) GuessContentType() (typesniffer.SniffedType, error) { } defer r.Close() - return typesniffer.DetectContentTypeFromReader(r) + return typesniffer.DetectContentTypeFromReader(r, b.Name()) } // GetBlob finds the blob object in the repository. diff --git a/modules/httplib/serve.go b/modules/httplib/serve.go index c5f0658d4e..d385ac21c9 100644 --- a/modules/httplib/serve.go +++ b/modules/httplib/serve.go @@ -99,7 +99,7 @@ func setServeHeadersByFile(r *http.Request, w http.ResponseWriter, filePath stri Filename: path.Base(filePath), } - sniffedType := typesniffer.DetectContentType(mineBuf) + sniffedType := typesniffer.DetectContentType(mineBuf, opts.Filename) // the "render" parameter came from year 2016: 638dd24c, it doesn't have clear meaning, so I think it could be removed later isPlain := sniffedType.IsText() || r.FormValue("render") != "" diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index c53b7a2e6d..4c8b5f2a86 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -177,7 +177,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro fileContents, err := io.ReadAll(io.LimitReader(batchReader, size)) if err != nil { return err - } else if !typesniffer.DetectContentType(fileContents).IsText() { + } else if !typesniffer.DetectContentType(fileContents, update.Filename).IsText() { // FIXME: UTF-16 files will probably fail here // Even if the file is not recognized as a "text file", we could still put its name into the indexers to make the filename become searchable, while leave the content to empty. fileContents = nil diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go index 3903d77fe0..9b11f56fb7 100644 --- a/modules/indexer/code/elasticsearch/elasticsearch.go +++ b/modules/indexer/code/elasticsearch/elasticsearch.go @@ -144,7 +144,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro fileContents, err := io.ReadAll(io.LimitReader(batchReader, size)) if err != nil { return nil, err - } else if !typesniffer.DetectContentType(fileContents).IsText() { + } else if !typesniffer.DetectContentType(fileContents, update.Filename).IsText() { // FIXME: UTF-16 files will probably fail here return nil, nil } diff --git a/modules/typesniffer/typesniffer.go b/modules/typesniffer/typesniffer.go index 262feb2b05..8cb1513a88 100644 --- a/modules/typesniffer/typesniffer.go +++ b/modules/typesniffer/typesniffer.go @@ -124,7 +124,7 @@ func (ct SniffedType) GetMimeType() string { } // DetectContentType extends http.DetectContentType with more content types. Defaults to text/unknown if input is empty. -func DetectContentType(data []byte) SniffedType { +func DetectContentType(data []byte, filename string) SniffedType { if len(data) == 0 { return SniffedType{"text/unknown"} } @@ -176,6 +176,13 @@ func DetectContentType(data []byte) SniffedType { } } + if ct == "application/octet-stream" && + filename != "" && + !strings.HasSuffix(strings.ToUpper(filename), ".LCOM") && + bytes.Contains(data, []byte("(DEFINE-FILE-INFO ")) { + ct = "text/vnd.interlisp" + } + // GLTF is unsupported by http.DetectContentType // hexdump -n 4 -C glTF.glb if bytes.HasPrefix(data, []byte("glTF")) { @@ -186,7 +193,7 @@ func DetectContentType(data []byte) SniffedType { } // DetectContentTypeFromReader guesses the content type contained in the reader. -func DetectContentTypeFromReader(r io.Reader) (SniffedType, error) { +func DetectContentTypeFromReader(r io.Reader, filename string) (SniffedType, error) { buf := make([]byte, sniffLen) n, err := util.ReadAtMost(r, buf) if err != nil { @@ -194,5 +201,5 @@ func DetectContentTypeFromReader(r io.Reader) (SniffedType, error) { } buf = buf[:n] - return DetectContentType(buf), nil + return DetectContentType(buf, filename), nil } diff --git a/modules/typesniffer/typesniffer_test.go b/modules/typesniffer/typesniffer_test.go index 176d3658bb..d2b7ed4f21 100644 --- a/modules/typesniffer/typesniffer_test.go +++ b/modules/typesniffer/typesniffer_test.go @@ -16,63 +16,63 @@ import ( func TestDetectContentTypeLongerThanSniffLen(t *testing.T) { // Pre-condition: Shorter than sniffLen detects SVG. - assert.Equal(t, "image/svg+xml", DetectContentType([]byte(``)).contentType) + assert.Equal(t, "image/svg+xml", DetectContentType([]byte(``), "").contentType) // Longer than sniffLen detects something else. - assert.NotEqual(t, "image/svg+xml", DetectContentType([]byte(``)).contentType) + assert.NotEqual(t, "image/svg+xml", DetectContentType([]byte(``), "").contentType) } func TestIsTextFile(t *testing.T) { - assert.True(t, DetectContentType([]byte{}).IsText()) - assert.True(t, DetectContentType([]byte("lorem ipsum")).IsText()) + assert.True(t, DetectContentType([]byte{}, "").IsText()) + assert.True(t, DetectContentType([]byte("lorem ipsum"), "").IsText()) } func TestIsSvgImage(t *testing.T) { - assert.True(t, DetectContentType([]byte("")).IsSvgImage()) - assert.True(t, DetectContentType([]byte(" ")).IsSvgImage()) - assert.True(t, DetectContentType([]byte(``)).IsSvgImage()) - assert.True(t, DetectContentType([]byte(``)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(""), "").IsSvgImage()) + assert.True(t, DetectContentType([]byte(" "), "").IsSvgImage()) + assert.True(t, DetectContentType([]byte(``), "").IsSvgImage()) + assert.True(t, DetectContentType([]byte(``), "").IsSvgImage()) assert.True(t, DetectContentType([]byte(` - `)).IsSvgImage()) + `), "").IsSvgImage()) assert.True(t, DetectContentType([]byte(` - `)).IsSvgImage()) + `), "").IsSvgImage()) assert.True(t, DetectContentType([]byte(` - `)).IsSvgImage()) + `), "").IsSvgImage()) assert.True(t, DetectContentType([]byte(` - `)).IsSvgImage()) + `), "").IsSvgImage()) assert.True(t, DetectContentType([]byte(` - `)).IsSvgImage()) + `), "").IsSvgImage()) assert.True(t, DetectContentType([]byte(` - `)).IsSvgImage()) + `), "").IsSvgImage()) assert.True(t, DetectContentType([]byte(` - `)).IsSvgImage()) + `), "").IsSvgImage()) assert.True(t, DetectContentType([]byte(` - `)).IsSvgImage()) + `), "").IsSvgImage()) // the DetectContentType should work for incomplete data, because only beginning bytes are used for detection - assert.True(t, DetectContentType([]byte(`....`)).IsSvgImage()) + assert.True(t, DetectContentType([]byte(`....`), "").IsSvgImage()) - assert.False(t, DetectContentType([]byte{}).IsSvgImage()) - assert.False(t, DetectContentType([]byte("svg")).IsSvgImage()) - assert.False(t, DetectContentType([]byte("")).IsSvgImage()) - assert.False(t, DetectContentType([]byte("text")).IsSvgImage()) - assert.False(t, DetectContentType([]byte("")).IsSvgImage()) - assert.False(t, DetectContentType([]byte(``)).IsSvgImage()) + assert.False(t, DetectContentType([]byte{}, "").IsSvgImage()) + assert.False(t, DetectContentType([]byte("svg"), "").IsSvgImage()) + assert.False(t, DetectContentType([]byte(""), "").IsSvgImage()) + assert.False(t, DetectContentType([]byte("text"), "").IsSvgImage()) + assert.False(t, DetectContentType([]byte(""), "").IsSvgImage()) + assert.False(t, DetectContentType([]byte(``), "").IsSvgImage()) assert.False(t, DetectContentType([]byte(` - `)).IsSvgImage()) + `), "").IsSvgImage()) assert.False(t, DetectContentType([]byte(` - `)).IsSvgImage()) + `), "").IsSvgImage()) assert.False(t, DetectContentType([]byte(` @@ -80,7 +80,7 @@ func TestIsSvgImage(t *testing.T) { -`)).IsSvgImage()) +`), "").IsSvgImage()) assert.False(t, DetectContentType([]byte(` -`)).IsSvgImage()) - assert.False(t, DetectContentType([]byte(``)).IsSvgImage()) - assert.False(t, DetectContentType([]byte(``)).IsSvgImage()) +`), "").IsSvgImage()) + assert.False(t, DetectContentType([]byte(``), "").IsSvgImage()) + assert.False(t, DetectContentType([]byte(``), "").IsSvgImage()) } func TestIsPDF(t *testing.T) { pdf, _ := base64.StdEncoding.DecodeString("JVBERi0xLjYKJcOkw7zDtsOfCjIgMCBvYmoKPDwvTGVuZ3RoIDMgMCBSL0ZpbHRlci9GbGF0ZURlY29kZT4+CnN0cmVhbQp4nF3NPwsCMQwF8D2f4s2CNYk1baF0EHRwOwg4iJt/NsFb/PpevUE4Mjwe") - assert.True(t, DetectContentType(pdf).IsPDF()) - assert.False(t, DetectContentType([]byte("plain text")).IsPDF()) + assert.True(t, DetectContentType(pdf, "").IsPDF()) + assert.False(t, DetectContentType([]byte("plain text"), "").IsPDF()) } func TestIsVideo(t *testing.T) { mp4, _ := base64.StdEncoding.DecodeString("AAAAGGZ0eXBtcDQyAAAAAGlzb21tcDQyAAEI721vb3YAAABsbXZoZAAAAADaBlwX2gZcFwAAA+gA") - assert.True(t, DetectContentType(mp4).IsVideo()) - assert.False(t, DetectContentType([]byte("plain text")).IsVideo()) + assert.True(t, DetectContentType(mp4, "").IsVideo()) + assert.False(t, DetectContentType([]byte("plain text"), "").IsVideo()) } func TestIsAudio(t *testing.T) { mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl") - assert.True(t, DetectContentType(mp3).IsAudio()) - assert.False(t, DetectContentType([]byte("plain text")).IsAudio()) + assert.True(t, DetectContentType(mp3, "").IsAudio()) + assert.False(t, DetectContentType([]byte("plain text"), "").IsAudio()) - assert.True(t, DetectContentType([]byte("ID3Toy\000")).IsAudio()) - assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ...")).IsText()) // test ID3 tag for plain text - assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ..."+"🌛"[0:2])).IsText()) // test ID3 tag with incomplete UTF8 char + assert.True(t, DetectContentType([]byte("ID3Toy\000"), "").IsAudio()) + assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ..."), "").IsText()) // test ID3 tag for plain text + assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ..."+"🌛"[0:2]), "").IsText()) // test ID3 tag with incomplete UTF8 char } func TestIsGLB(t *testing.T) { glb, _ := hex.DecodeString("676c5446") - assert.True(t, DetectContentType(glb).IsGLB()) - assert.True(t, DetectContentType(glb).Is3DModel()) - assert.False(t, DetectContentType([]byte("plain text")).IsGLB()) - assert.False(t, DetectContentType([]byte("plain text")).Is3DModel()) + assert.True(t, DetectContentType(glb, "").IsGLB()) + assert.True(t, DetectContentType(glb, "").Is3DModel()) + assert.False(t, DetectContentType([]byte("plain text"), "").IsGLB()) + assert.False(t, DetectContentType([]byte("plain text"), "").Is3DModel()) } func TestDetectContentTypeFromReader(t *testing.T) { mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl") - st, err := DetectContentTypeFromReader(bytes.NewReader(mp3)) + st, err := DetectContentTypeFromReader(bytes.NewReader(mp3), "") require.NoError(t, err) assert.True(t, st.IsAudio()) } func TestDetectContentTypeOgg(t *testing.T) { oggAudio, _ := hex.DecodeString("4f67675300020000000000000000352f0000000000007dc39163011e01766f72626973000000000244ac0000000000000071020000000000b8014f6767530000") - st, err := DetectContentTypeFromReader(bytes.NewReader(oggAudio)) + st, err := DetectContentTypeFromReader(bytes.NewReader(oggAudio), "") require.NoError(t, err) assert.True(t, st.IsAudio()) oggVideo, _ := hex.DecodeString("4f676753000200000000000000007d9747ef000000009b59daf3012a807468656f7261030201001e00110001e000010e00020000001e00000001000001000001") - st, err = DetectContentTypeFromReader(bytes.NewReader(oggVideo)) + st, err = DetectContentTypeFromReader(bytes.NewReader(oggVideo), "") require.NoError(t, err) assert.True(t, st.IsVideo()) } @@ -148,7 +148,7 @@ func TestDetectContentTypeAvif(t *testing.T) { avifImage, err := hex.DecodeString("000000206674797061766966") require.NoError(t, err) - st, err := DetectContentTypeFromReader(bytes.NewReader(avifImage)) + st, err := DetectContentTypeFromReader(bytes.NewReader(avifImage), "") require.NoError(t, err) assert.True(t, st.IsImage()) @@ -158,10 +158,24 @@ func TestDetectContentTypeModelGLB(t *testing.T) { glb, err := hex.DecodeString("676c5446") require.NoError(t, err) - st, err := DetectContentTypeFromReader(bytes.NewReader(glb)) + st, err := DetectContentTypeFromReader(bytes.NewReader(glb), "") require.NoError(t, err) // print st for debugging assert.Equal(t, "model/gltf-binary", st.GetMimeType()) assert.True(t, st.IsGLB()) } + +func TestDetectInterlisp(t *testing.T) { + interlisp, err := base64.StdEncoding.DecodeString("ICAKKERFRklORS1GSUxFLUlORk8gHlBBQ0tBR0UgIklOVEVSTElTUCIgHlJFQURUQUJMRSAiSU5URVJMSVNQIiAeQkFTRSAxMCkKCgYB") + require.NoError(t, err) + st, err := DetectContentTypeFromReader(bytes.NewReader(interlisp), "test") + require.NoError(t, err) + assert.True(t, st.IsText()) + st, err = DetectContentTypeFromReader(bytes.NewReader(interlisp), "") + require.NoError(t, err) + assert.False(t, st.IsText()) + st, err = DetectContentTypeFromReader(bytes.NewReader(interlisp), "test.lcom") + require.NoError(t, err) + assert.False(t, st.IsText()) +} diff --git a/routers/web/repo/editor.go b/routers/web/repo/editor.go index 5114cc9c05..3e3cb0016d 100644 --- a/routers/web/repo/editor.go +++ b/routers/web/repo/editor.go @@ -189,7 +189,7 @@ func editFile(ctx *context.Context, isNewFile bool) { buf = buf[:n] // Only some file types are editable online as text. - if !typesniffer.DetectContentType(buf).IsRepresentableAsText() { + if !typesniffer.DetectContentType(buf, blob.Name()).IsRepresentableAsText() { ctx.NotFound("typesniffer.IsRepresentableAsText", nil) return } diff --git a/routers/web/repo/render.go b/routers/web/repo/render.go index b31e2e203a..05eeadc519 100644 --- a/routers/web/repo/render.go +++ b/routers/web/repo/render.go @@ -41,7 +41,7 @@ func RenderFile(ctx *context.Context) { n, _ := util.ReadAtMost(dataRc, buf) buf = buf[:n] - st := typesniffer.DetectContentType(buf) + st := typesniffer.DetectContentType(buf, blob.Name()) isTextFile := st.IsText() rd := charset.ToUTF8WithFallbackReader(io.MultiReader(bytes.NewReader(buf), dataRc), charset.ConvertOpts{}) diff --git a/routers/web/repo/setting/avatar.go b/routers/web/repo/setting/avatar.go index 84d7cccdb8..20e211316d 100644 --- a/routers/web/repo/setting/avatar.go +++ b/routers/web/repo/setting/avatar.go @@ -45,7 +45,7 @@ func UpdateAvatarSetting(ctx *context.Context, form forms.AvatarForm) error { if err != nil { return fmt.Errorf("io.ReadAll: %w", err) } - st := typesniffer.DetectContentType(data) + st := typesniffer.DetectContentType(data, "") if !st.IsImage() || st.IsSvgImage() { return errors.New(ctx.Locale.TrString("settings.uploaded_avatar_not_a_image")) } diff --git a/routers/web/repo/setting/lfs.go b/routers/web/repo/setting/lfs.go index b9cb86bd08..9930d03e8e 100644 --- a/routers/web/repo/setting/lfs.go +++ b/routers/web/repo/setting/lfs.go @@ -291,7 +291,7 @@ func LFSFileGet(ctx *context.Context) { } buf = buf[:n] - st := typesniffer.DetectContentType(buf) + st := typesniffer.DetectContentType(buf, "") ctx.Data["IsTextFile"] = st.IsText() isRepresentableAsText := st.IsRepresentableAsText() diff --git a/routers/web/repo/view.go b/routers/web/repo/view.go index bb3e1388a8..d00f85a134 100644 --- a/routers/web/repo/view.go +++ b/routers/web/repo/view.go @@ -228,7 +228,7 @@ func getFileReader(ctx gocontext.Context, repoID int64, blob *git.Blob) ([]byte, n, _ := util.ReadAtMost(dataRc, buf) buf = buf[:n] - st := typesniffer.DetectContentType(buf) + st := typesniffer.DetectContentType(buf, blob.Name()) isTextFile := st.IsText() // FIXME: what happens when README file is an image? @@ -262,7 +262,7 @@ func getFileReader(ctx gocontext.Context, repoID int64, blob *git.Blob) ([]byte, } buf = buf[:n] - st = typesniffer.DetectContentType(buf) + st = typesniffer.DetectContentType(buf, blob.Name()) return buf, dataRc, &fileInfo{st.IsText(), true, meta.Size, &meta.Pointer, st}, nil } diff --git a/routers/web/user/setting/profile.go b/routers/web/user/setting/profile.go index 400ee71f08..e0ce88b582 100644 --- a/routers/web/user/setting/profile.go +++ b/routers/web/user/setting/profile.go @@ -151,7 +151,7 @@ func UpdateAvatarSetting(ctx *context.Context, form *forms.AvatarForm, ctxUser * return fmt.Errorf("io.ReadAll: %w", err) } - st := typesniffer.DetectContentType(data) + st := typesniffer.DetectContentType(data, "") if !st.IsImage() || st.IsSvgImage() { return errors.New(ctx.Locale.TrString("settings.uploaded_avatar_not_a_image")) }