Feature/improve error message on scrape (#476)

* add better feedback on failed scrape
* fix json download link
* add better recipe parser
* dump deps
* fix force open on mobile
* formatting
* rewrite scraper to use new library
* fix failing tests
* bookmarklet support
* bookmarklet instructions
* recipes changelog

Co-authored-by: hay-kot <hay-kot@pm.me>
2025-07-24 15:49:42 +02:00 · 2021-06-09 13:04:54 -08:00 · 2021-06-09 13:04:54 -08:00 · a78fbea711
commit a78fbea711
parent 3702331630
22 changed files with 658 additions and 15582 deletions
--- a/tests/unit_tests/test_cleaner.py
+++ b/tests/unit_tests/test_cleaner.py
@@ -2,8 +2,8 @@ import json
 import re

 import pytest
-from mealie.services.scraper.cleaner import Cleaner
-from mealie.services.scraper.scraper import extract_recipe_from_html
+from mealie.services.scraper import cleaner
+from mealie.services.scraper.scraper import open_graph
 from tests.test_config import TEST_RAW_HTML, TEST_RAW_RECIPES

 # https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
@@ -39,23 +39,23 @@ url_validation_regex = re.compile(
    ],
 )
 def test_cleaner_clean(json_file, num_steps):
-    recipe_data = Cleaner.clean(json.load(open(TEST_RAW_RECIPES.joinpath(json_file))))
+    recipe_data = cleaner.clean(json.load(open(TEST_RAW_RECIPES.joinpath(json_file))))
    assert len(recipe_data["recipeInstructions"]) == num_steps


 def test_clean_category():
-    assert Cleaner.category("my-category") == ["my-category"]
+    assert cleaner.category("my-category") == ["my-category"]


-def test_clean_html():
-    assert Cleaner.html("<div>Hello World</div>") == "Hello World"
+def test_clean_string():
+    assert cleaner.clean_string("<div>Hello World</div>") == "Hello World"


 def test_clean_image():
-    assert Cleaner.image(None) == "no image"
-    assert Cleaner.image("https://my.image/path/") == "https://my.image/path/"
-    assert Cleaner.image({"url": "My URL!"}) == "My URL!"
-    assert Cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"
+    assert cleaner.image(None) == "no image"
+    assert cleaner.image("https://my.image/path/") == "https://my.image/path/"
+    assert cleaner.image({"url": "My URL!"}) == "My URL!"
+    assert cleaner.image(["My URL!", "MY SECOND URL"]) == "My URL!"


@pytest.mark.parametrize(
@@ -70,7 +70,7 @@ def test_clean_image():
    ],
 )
 def test_cleaner_instructions(instructions):
-    assert Cleaner.instructions(instructions) == [
+    assert cleaner.instructions(instructions) == [
        {"text": "A"},
        {"text": "B"},
        {"text": "C"},
@@ -80,20 +80,18 @@ def test_cleaner_instructions(instructions):
 def test_html_with_recipe_data():
    path = TEST_RAW_HTML.joinpath("healthy_pasta_bake_60759.html")
    url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"
-    recipe_data = extract_recipe_from_html(open(path, encoding="utf8").read(), url)
+    recipe_data = open_graph.basic_recipe_from_opengraph(open(path, encoding="utf8").read(), url)

    assert len(recipe_data["name"]) > 10
    assert len(recipe_data["slug"]) > 10
    assert recipe_data["orgURL"] == url
    assert len(recipe_data["description"]) > 100
    assert url_validation_regex.match(recipe_data["image"])
-    assert len(recipe_data["recipeIngredient"]) == 13
-    assert len(recipe_data["recipeInstructions"]) == 4


 def test_time_cleaner():

    my_time_delta = "PT2H30M"
-    return_delta = Cleaner.time(my_time_delta)
+    return_delta = cleaner.clean_time(my_time_delta)

    assert return_delta == "2 Hours 30 Minutes"