mirror of
https://github.com/mealie-recipes/mealie.git
synced 2025-08-03 04:25:24 +02:00
refactor: rewrite cleaner functions for parsing recipe dicts (#1743)
* rewrite cleaner functions * unify verbage * try importing dep during check * fix syntax * allow override defaults * satisfy mypy
This commit is contained in:
parent
77316d639b
commit
89d0cae51d
7 changed files with 918 additions and 347 deletions
|
@ -0,0 +1,56 @@
|
|||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from mealie.services.scraper import cleaner
|
||||
from mealie.services.scraper.scraper_strategies import RecipeScraperOpenGraph
|
||||
from tests import data as test_data
|
||||
|
||||
# https://github.com/django/django/blob/stable/1.3.x/django/core/validators.py#L45
|
||||
url_validation_regex = re.compile(
|
||||
r"^(?:http|ftp)s?://" # http:// or https://
|
||||
r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" # domain...
|
||||
r"localhost|" # localhost...
|
||||
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip
|
||||
r"(?::\d+)?" # optional port
|
||||
r"(?:/?|[/?]\S+)$",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
test_cleaner_data = [
|
||||
(test_data.json_best_homemade_salsa_recipe, 2),
|
||||
(test_data.json_blue_cheese_stuffed_turkey_meatballs_with_raspberry_balsamic_glaze_2, 3),
|
||||
(test_data.json_bon_appetit, 8),
|
||||
(test_data.json_chunky_apple_cake, 4),
|
||||
(test_data.json_dairy_free_impossible_pumpkin_pie, 7),
|
||||
(test_data.json_how_to_make_instant_pot_spaghetti, 8),
|
||||
(test_data.json_instant_pot_chicken_and_potatoes, 4),
|
||||
(test_data.json_instant_pot_kerala_vegetable_stew, 13),
|
||||
(test_data.json_jalapeno_popper_dip, 4),
|
||||
(test_data.json_microwave_sweet_potatoes_04783, 4),
|
||||
(test_data.json_moroccan_skirt_steak_with_roasted_pepper_couscous, 4),
|
||||
(test_data.json_pizza_knoblauch_champignon_paprika_vegan_html, 3),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("json_file,num_steps", test_cleaner_data)
|
||||
def test_cleaner_clean(json_file: Path, num_steps):
|
||||
recipe_data = cleaner.clean(json.loads(json_file.read_text()))
|
||||
assert len(recipe_data["recipeInstructions"]) == num_steps
|
||||
|
||||
|
||||
def test_html_with_recipe_data():
|
||||
path = test_data.html_healthy_pasta_bake_60759
|
||||
url = "https://www.bbc.co.uk/food/recipes/healthy_pasta_bake_60759"
|
||||
|
||||
open_graph_strategy = RecipeScraperOpenGraph(url)
|
||||
|
||||
recipe_data = open_graph_strategy.get_recipe_fields(path.read_text())
|
||||
|
||||
assert len(recipe_data["name"]) > 10
|
||||
assert len(recipe_data["slug"]) > 10
|
||||
assert recipe_data["orgURL"] == url
|
||||
assert len(recipe_data["description"]) > 100
|
||||
assert url_validation_regex.match(recipe_data["image"])
|
|
@ -0,0 +1,541 @@
|
|||
from dataclasses import dataclass
|
||||
from datetime import timedelta
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from mealie.services.scraper import cleaner
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class CleanerCase:
|
||||
test_id: str
|
||||
input: Any
|
||||
expected: Any
|
||||
exception: Any = None
|
||||
|
||||
|
||||
clean_string_test_cases = (
|
||||
CleanerCase(
|
||||
test_id="empty_string",
|
||||
input="",
|
||||
expected="",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="html",
|
||||
input="<p> Hello World </p>",
|
||||
expected="Hello World",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="no_change",
|
||||
input="Hello World",
|
||||
expected="Hello World",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="html_with_extra_closing_tag",
|
||||
input="<p> Hello World </p></p>",
|
||||
expected="Hello World",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="multiple_spaces",
|
||||
input="Hello World",
|
||||
expected="Hello World",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="tabs",
|
||||
input="\tHello World\t",
|
||||
expected="Hello World",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="nbsp",
|
||||
input="\xa0Hello World\xa0",
|
||||
expected="Hello World",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="list",
|
||||
input=["Hello World", "Goodbye World"],
|
||||
expected="Hello World",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="int",
|
||||
input=1,
|
||||
expected="1",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("case", clean_string_test_cases, ids=(x.test_id for x in clean_string_test_cases))
|
||||
def test_cleaner_clean_string(case: CleanerCase) -> None:
|
||||
assert case.expected == cleaner.clean_string(case.input)
|
||||
|
||||
|
||||
image_cleaner_test_cases = (
|
||||
CleanerCase(
|
||||
test_id="empty_string",
|
||||
input="",
|
||||
expected="no image",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="no_change",
|
||||
input="https://example.com/image.jpg",
|
||||
expected="https://example.com/image.jpg",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="dict with url key",
|
||||
input={"url": "https://example.com/image.jpg"},
|
||||
expected="https://example.com/image.jpg",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="list of strings",
|
||||
input=["https://example.com/image.jpg"],
|
||||
expected="https://example.com/image.jpg",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("case", image_cleaner_test_cases, ids=(x.test_id for x in image_cleaner_test_cases))
|
||||
def test_cleaner_image_cleaner(case: CleanerCase):
|
||||
result = cleaner.clean_image(case.input)
|
||||
assert case.expected == result
|
||||
|
||||
|
||||
instruction_test_cases = (
|
||||
CleanerCase(
|
||||
test_id="single string",
|
||||
input="Instruction A\nInstruction B\nInstruction C",
|
||||
expected=None,
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="single string multiple newlines",
|
||||
input="Instruction A\n\nInstruction B\n\nInstruction C",
|
||||
expected=None,
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="common list of dicts",
|
||||
input=[
|
||||
{"text": "Instruction A"},
|
||||
{"text": "Instruction B"},
|
||||
{"text": "Instruction C"},
|
||||
],
|
||||
expected=None,
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="dict with int keys",
|
||||
input={
|
||||
0: {"text": "Instruction A"},
|
||||
1: {"text": "Instruction B"},
|
||||
2: {"text": "Instruction C"},
|
||||
},
|
||||
expected=None,
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="dict with str num keys",
|
||||
input={
|
||||
"0": {"text": "Instruction A"},
|
||||
"1": {"text": "Instruction B"},
|
||||
"2": {"text": "Instruction C"},
|
||||
},
|
||||
expected=None,
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="dict with str num keys",
|
||||
input={
|
||||
"1": {"text": "Instruction A"},
|
||||
"2": {"text": "Instruction B"},
|
||||
"3": {"text": "Instruction C"},
|
||||
},
|
||||
expected=None,
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="dict with str num keys",
|
||||
input={
|
||||
1: {"text": "Instruction A"},
|
||||
2: {"text": "Instruction B"},
|
||||
3: {"text": "Instruction C"},
|
||||
},
|
||||
expected=None,
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="raw json str",
|
||||
input='{"0": {"text": "Instruction A"}, "1": {"text": "Instruction B"}, "2": {"text": "Instruction C"}}',
|
||||
expected=None,
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="how to steps",
|
||||
input=[
|
||||
{
|
||||
"@type": "HowToSection",
|
||||
"itemListElement": [
|
||||
{
|
||||
"@type": "HowToStep",
|
||||
"text": "Instruction A",
|
||||
},
|
||||
{
|
||||
"@type": "HowToStep",
|
||||
"text": "Instruction B",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"@type": "HowToSection",
|
||||
"itemListElement": [
|
||||
{
|
||||
"@type": "HowToStep",
|
||||
"text": "Instruction C",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
expected=None,
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="excessive whitespace str (1)",
|
||||
input="Instruction A\n\nInstruction B\n\nInstruction C\n\n",
|
||||
expected=None,
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="excessive whitespace str (2)",
|
||||
input="Instruction A\nInstruction B\nInstruction C\n",
|
||||
expected=None,
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="excessive whitespace str (3)",
|
||||
input="Instruction A\r\n\r\nInstruction B\r\n\r\nInstruction C\r\n\r\n",
|
||||
expected=None,
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="excessive whitespace str (4)",
|
||||
input="Instruction A\r\nInstruction B\r\nInstruction C\r\n",
|
||||
expected=None,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("instructions", instruction_test_cases, ids=(x.test_id for x in instruction_test_cases))
|
||||
def test_cleaner_instructions(instructions: CleanerCase):
|
||||
reuslt = cleaner.clean_instructions(instructions.input)
|
||||
|
||||
expected = [
|
||||
{"text": "Instruction A"},
|
||||
{"text": "Instruction B"},
|
||||
{"text": "Instruction C"},
|
||||
]
|
||||
|
||||
assert reuslt == expected
|
||||
|
||||
|
||||
ingredients_test_cases = (
|
||||
CleanerCase(
|
||||
input="",
|
||||
expected=[],
|
||||
test_id="empty string",
|
||||
),
|
||||
CleanerCase(
|
||||
input="1 cup of flour",
|
||||
expected=["1 cup of flour"],
|
||||
test_id="single ingredient string",
|
||||
),
|
||||
CleanerCase(
|
||||
input=["1 cup of flour"],
|
||||
expected=["1 cup of flour"],
|
||||
test_id="single ingredient list",
|
||||
),
|
||||
CleanerCase(
|
||||
input=["1 cup of flour", "1 cup of sugar"],
|
||||
expected=["1 cup of flour", "1 cup of sugar"],
|
||||
test_id="multiple ingredient list",
|
||||
),
|
||||
CleanerCase(
|
||||
input={"0": "1 cup of flour", "1": "1 cup of sugar"},
|
||||
expected=None,
|
||||
test_id="multiple ingredient dictionary",
|
||||
exception=TypeError,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("ingredients", ingredients_test_cases, ids=(x.test_id for x in ingredients_test_cases))
|
||||
def test_cleaner_clean_ingredients(ingredients: CleanerCase):
|
||||
|
||||
if ingredients.exception:
|
||||
with pytest.raises(ingredients.exception):
|
||||
cleaner.clean_ingredients(ingredients.input)
|
||||
|
||||
return
|
||||
|
||||
assert ingredients.expected == cleaner.clean_ingredients(ingredients.input)
|
||||
|
||||
|
||||
yield_test_cases = (
|
||||
CleanerCase(
|
||||
test_id="empty string",
|
||||
input="",
|
||||
expected="",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="list of strings",
|
||||
input=["Makes 4 Batches", "4 Batches"],
|
||||
expected="4 Batches",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="basic string",
|
||||
input="Makes 4 Batches",
|
||||
expected="Makes 4 Batches",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="empty list",
|
||||
input=[],
|
||||
expected="",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("case", yield_test_cases, ids=(x.test_id for x in yield_test_cases))
|
||||
def test_cleaner_clean_yield_amount(case: CleanerCase):
|
||||
result = cleaner.clean_yield(case.input)
|
||||
assert case.expected == result
|
||||
|
||||
|
||||
time_test_cases = (
|
||||
CleanerCase(
|
||||
test_id="empty string",
|
||||
input="",
|
||||
expected=None,
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="emtpy whitespace",
|
||||
input=" ",
|
||||
expected=None,
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="none",
|
||||
input=None,
|
||||
expected=None,
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="invalid string",
|
||||
input="invalid",
|
||||
expected="invalid",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="timedelta",
|
||||
input=timedelta(minutes=30),
|
||||
expected="30 Minutes",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="timedelta string (1)",
|
||||
input="PT2H30M",
|
||||
expected="2 Hours 30 Minutes",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="timedelta string (2)",
|
||||
input="PT30M",
|
||||
expected="30 Minutes",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="timedelta string (3)",
|
||||
input="PT2H",
|
||||
expected="2 Hours",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="timedelta string (4)",
|
||||
input="P1DT1H1M1S",
|
||||
expected="1 day 1 Hour 1 Minute 1 Second",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="timedelta string (4)",
|
||||
input="P1DT1H1M1.53S",
|
||||
expected="1 day 1 Hour 1 Minute 1 Second",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="timedelta string (5) invalid",
|
||||
input="PT",
|
||||
expected="none",
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="timedelta string (6) PT-3H",
|
||||
input="PT-3H",
|
||||
expected="PT-3H",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("case", time_test_cases, ids=(x.test_id for x in time_test_cases))
|
||||
def test_cleaner_clean_time(case: CleanerCase):
|
||||
result = cleaner.clean_time(case.input)
|
||||
assert case.expected == result
|
||||
|
||||
|
||||
category_test_cases = (
|
||||
CleanerCase(
|
||||
test_id="empty string",
|
||||
input="",
|
||||
expected=[],
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="emtpy whitespace",
|
||||
input=" ",
|
||||
expected=[],
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="emtpy list",
|
||||
input=[],
|
||||
expected=[],
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="single string",
|
||||
input="Dessert",
|
||||
expected=["Dessert"],
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="nested dictionary",
|
||||
input=[
|
||||
{"name": "Dessert", "slug": "dessert"},
|
||||
{"name": "Lunch", "slug": "lunch"},
|
||||
],
|
||||
expected=["Dessert", "Lunch"],
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("case", category_test_cases, ids=(x.test_id for x in category_test_cases))
|
||||
def test_cleaner_clean_categories(case: CleanerCase):
|
||||
result = cleaner.clean_categories(case.input)
|
||||
assert case.expected == result
|
||||
|
||||
|
||||
tag_test_cases = (
|
||||
CleanerCase(
|
||||
test_id="empty string",
|
||||
input="",
|
||||
expected=[],
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="single tag",
|
||||
input="tag",
|
||||
expected=["Tag"],
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="comma separated tags",
|
||||
input="tag1, tag2, tag3",
|
||||
expected=["Tag1", "Tag2", "Tag3"],
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="list of tags",
|
||||
input=["tag1", "tag2", "tag3"],
|
||||
expected=["Tag1", "Tag2", "Tag3"],
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("case", tag_test_cases, ids=(x.test_id for x in tag_test_cases))
|
||||
def test_cleaner_clean_tags(case: CleanerCase):
|
||||
result = cleaner.clean_tags(case.input)
|
||||
assert case.expected == result
|
||||
|
||||
|
||||
nutrition_test_cases = (
|
||||
CleanerCase(
|
||||
test_id="empty dict",
|
||||
input={},
|
||||
expected={},
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="valid kets",
|
||||
input={
|
||||
"calories": "100mg",
|
||||
"fatContent": "10",
|
||||
},
|
||||
expected={
|
||||
"calories": "100",
|
||||
"fatContent": "10",
|
||||
},
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="invalid keys get removed",
|
||||
input={
|
||||
"calories": "100mg",
|
||||
"fatContent": "10",
|
||||
"invalid": "invalid",
|
||||
},
|
||||
expected={
|
||||
"calories": "100",
|
||||
"fatContent": "10",
|
||||
},
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="support `,` seperated numbers instead of `.` (common in Europe)",
|
||||
input={
|
||||
"calories": "100,000mg",
|
||||
"fatContent": "10,000",
|
||||
},
|
||||
expected={
|
||||
"calories": "100.000",
|
||||
"fatContent": "10.000",
|
||||
},
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="special support for sodiumContent (g -> mg)",
|
||||
input={
|
||||
"sodiumContent": "10g",
|
||||
},
|
||||
expected={
|
||||
"sodiumContent": "10000.0",
|
||||
},
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="special support for sodiumContent (mg -> mg)",
|
||||
input={
|
||||
"sodiumContent": "10000mg",
|
||||
},
|
||||
expected={
|
||||
"sodiumContent": "10000",
|
||||
},
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="strip units",
|
||||
input={
|
||||
"calories": "100 kcal",
|
||||
},
|
||||
expected={
|
||||
"calories": "100",
|
||||
},
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="list as value continues after first value",
|
||||
input={
|
||||
"calories": ["100 kcal"],
|
||||
"sugarContent": "but still tries 555.321",
|
||||
},
|
||||
expected={
|
||||
"sugarContent": "555.321",
|
||||
},
|
||||
),
|
||||
CleanerCase(
|
||||
test_id="multiple decimals",
|
||||
input={
|
||||
"sodiumContent": "10.1.2g",
|
||||
},
|
||||
expected={
|
||||
"sodiumContent": "10100.0",
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("case", nutrition_test_cases, ids=(x.test_id for x in nutrition_test_cases))
|
||||
def test_cleaner_clean_nutrition(case: CleanerCase):
|
||||
result = cleaner.clean_nutrition(case.input)
|
||||
assert case.expected == result
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"t,max_components,max_decimal_places,expected",
|
||||
[
|
||||
(timedelta(days=2, seconds=17280), None, 2, "2 days 4 Hours 48 Minutes"),
|
||||
(timedelta(days=2, seconds=17280), 1, 2, "2.2 days"),
|
||||
(timedelta(days=365), None, 2, "1 year"),
|
||||
],
|
||||
)
|
||||
def test_pretty_print_timedelta(t, max_components, max_decimal_places, expected):
|
||||
assert cleaner.pretty_print_timedelta(t, max_components, max_decimal_places) == expected
|
Loading…
Add table
Add a link
Reference in a new issue