Mirror of https://github.com/mealie-recipes/mealie.git (synced 2025-07-19 13:19:41 +02:00)
feat: Improve Recipe Imports with Cleaner (#4517)
Parent: 085c489b05 · Commit: bcd0fcc920 · 6 changed files with 51 additions and 11 deletions
@@ -268,6 +268,5 @@ class BaseMigrator(BaseService):
         with contextlib.suppress(KeyError):
             del recipe_dict["id"]
 
-        recipe_dict = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None))
-
-        return Recipe(**recipe_dict)
+        recipe = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None))
+        return recipe
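Note on the hunk above: cleaner.clean is now the single point where raw data becomes a validated Recipe, so the migrator no longer calls Recipe(**recipe_dict) itself. A minimal usage sketch of the new return type, assuming a mealie environment (the sample dictionary and the local_provider import path are illustrative, not taken from this commit):

    # Sketch only: cleaner.clean now returns a Recipe model instead of a dict.
    # The input dictionary below is made up for illustration.
    from mealie.lang.providers import local_provider  # assumed import path
    from mealie.services.scraper import cleaner

    raw = {"name": "Pancakes", "recipeInstructions": ["Mix the batter", "Fry"]}
    recipe = cleaner.clean(raw, local_provider())
    print(type(recipe).__name__)                  # Recipe, not dict
    print(len(recipe.recipe_instructions or []))  # 2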
@@ -32,6 +32,7 @@ from mealie.schema.user.user import PrivateUser, UserRatingCreate
 from mealie.services._base_service import BaseService
 from mealie.services.openai import OpenAIDataInjection, OpenAILocalImage, OpenAIService
 from mealie.services.recipe.recipe_data_service import RecipeDataService
+from mealie.services.scraper import cleaner
 
 from .template_service import TemplateService
 
@@ -297,6 +298,7 @@ class RecipeService(RecipeServiceBase):
         recipe_data = await openai_recipe_service.build_recipe_from_images(
             local_images, translate_language=translate_language
         )
+        recipe_data = cleaner.clean(recipe_data, self.translator)
 
         recipe = self.create_one(recipe_data)
         data_service = RecipeDataService(recipe.id)
@@ -11,6 +11,7 @@ from slugify import slugify
 
 from mealie.core.root_logger import get_logger
 from mealie.lang.providers import Translator
+from mealie.schema.recipe.recipe import Recipe
 
 logger = get_logger("recipe-scraper")
 
@@ -33,16 +34,23 @@ MATCH_ERRONEOUS_WHITE_SPACE = re.compile(r"\n\s*\n")
 """ Matches multiple new lines and removes erroneous white space """
 
 
-def clean(recipe_data: dict, translator: Translator, url=None) -> dict:
+def clean(recipe_data: Recipe | dict, translator: Translator, url=None) -> Recipe:
     """Main entrypoint to clean a recipe extracted from the web
     and format the data into an accectable format for the database
 
     Args:
-        recipe_data (dict): raw recipe dicitonary
+        recipe_data (dict): raw recipe or recipe dictionary
 
     Returns:
         dict: cleaned recipe dictionary
     """
+    if not isinstance(recipe_data, dict):
+        # format the recipe like a scraped dictionary
+        recipe_data_dict = recipe_data.model_dump(by_alias=True)
+        recipe_data_dict["recipeIngredient"] = [ing.display for ing in recipe_data.recipe_ingredient]
+
+        recipe_data = recipe_data_dict
+
     recipe_data["description"] = clean_string(recipe_data.get("description", ""))
 
     # Times
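The isinstance branch above is what lets the same pipeline accept either input shape: a Recipe model is dumped back to its camelCase scraped-dict form, with ingredients flattened to their display strings, before the usual dict cleaning runs. A self-contained sketch of that dump-and-flatten step, using a stand-in pydantic model rather than mealie's actual Recipe class:

    # Stand-in models (not mealie's) showing the model_dump(by_alias=True)
    # plus ingredient-flattening pattern from the branch above.
    from pydantic import BaseModel, Field

    class Ingredient(BaseModel):
        display: str

    class MiniRecipe(BaseModel):
        name: str
        recipe_ingredient: list[Ingredient] = Field(default=[], alias="recipeIngredient")
        model_config = {"populate_by_name": True}

    recipe = MiniRecipe(name="Soup", recipe_ingredient=[Ingredient(display="1 cup stock")])
    data = recipe.model_dump(by_alias=True)  # camelCase keys, nested ingredient dicts
    data["recipeIngredient"] = [ing.display for ing in recipe.recipe_ingredient]
    print(data)  # {'name': 'Soup', 'recipeIngredient': ['1 cup stock']}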
@@ -59,7 +67,7 @@ def clean(recipe_data: dict, translator: Translator, url=None) -> dict:
     recipe_data["notes"] = clean_notes(recipe_data.get("notes"))
     recipe_data["rating"] = clean_int(recipe_data.get("rating"))
 
-    return recipe_data
+    return Recipe(**recipe_data)
 
 
 def clean_string(text: str | list | int) -> str:
@@ -1,5 +1,7 @@
+from mealie.core.root_logger import get_logger
 from mealie.lang.providers import Translator
 from mealie.schema.recipe.recipe import Recipe
+from mealie.services.scraper import cleaner
 from mealie.services.scraper.scraped_extras import ScrapedExtras
 
 from .scraper_strategies import (
@@ -31,6 +33,7 @@ class RecipeScraper:
 
         self.scrapers = scrapers
         self.translator = translator
+        self.logger = get_logger()
 
     async def scrape(self, url: str, html: str | None = None) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
         """
@@ -41,9 +44,23 @@
         raw_html = html or await safe_scrape_html(url)
         for scraper_type in self.scrapers:
             scraper = scraper_type(url, self.translator, raw_html=raw_html)
-            result = await scraper.parse()
 
-            if result is not None:
-                return result
+            try:
+                result = await scraper.parse()
+            except Exception:
+                self.logger.exception(f"Failed to scrape HTML with {scraper.__class__.__name__}")
+                result = None
+
+            if result is None or result[0] is None:
+                continue
+
+            recipe_result, extras = result
+            try:
+                recipe = cleaner.clean(recipe_result, self.translator)
+            except Exception:
+                self.logger.exception(f"Failed to clean recipe data from {scraper.__class__.__name__}")
+                continue
+
+            return recipe, extras
 
         return None, None
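With the rewritten loop, one misbehaving strategy no longer aborts the whole scrape: parse failures and cleaner failures are both logged via the new self.logger, and the next strategy gets a turn. The control flow in isolation, as a runnable standalone sketch (the logger name and strategy shape are illustrative):

    # Distilled version of the fallback control flow above, independent of
    # mealie: any strategy may raise, and a failure only skips that strategy.
    import logging

    logger = logging.getLogger("recipe-scraper")

    def scrape_with_fallback(strategies, url):
        for strategy in strategies:
            try:
                result = strategy(url)
            except Exception:
                logger.exception("Failed to scrape with %s", strategy.__name__)
                result = None
            if result is None:
                continue  # move on to the next strategy
            return result
        return None  # every strategy failed or returned nothing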
@@ -253,6 +253,18 @@ class RecipeScraperOpenAI(RecipeScraperPackage):
     rather than trying to scrape it directly.
     """
 
+    def extract_json_ld_data_from_html(self, soup: bs4.BeautifulSoup) -> str:
+        data_parts: list[str] = []
+        for script in soup.find_all("script", type="application/ld+json"):
+            try:
+                script_data = script.string
+                if script_data:
+                    data_parts.append(str(script_data))
+            except AttributeError:
+                pass
+
+        return "\n\n".join(data_parts)
+
     def find_image(self, soup: bs4.BeautifulSoup) -> str | None:
         # find the open graph image tag
         og_image = soup.find("meta", property="og:image")
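The new extract_json_ld_data_from_html helper collects the raw ld+json blocks so they can be appended to the text the OpenAI scraper works from, even when the visible page text is sparse. A self-contained demonstration of the same BeautifulSoup query against a made-up page (requires bs4 and lxml):

    # Same find_all query as the helper above, run on an inline document.
    # The HTML snippet is invented for this example.
    import bs4

    html = """<html><head>
    <script type="application/ld+json">{"@type": "Recipe", "name": "Chili"}</script>
    </head><body>Plain page text</body></html>"""

    soup = bs4.BeautifulSoup(html, "lxml")
    parts = [str(s.string) for s in soup.find_all("script", type="application/ld+json") if s.string]
    print("\n\n".join(parts))  # {"@type": "Recipe", "name": "Chili"}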
@@ -285,8 +297,10 @@ class RecipeScraperOpenAI(RecipeScraperPackage):
         soup = bs4.BeautifulSoup(html, "lxml")
+
         text = soup.get_text(separator="\n", strip=True)
+        text += self.extract_json_ld_data_from_html(soup)
         if not text:
-            raise Exception("No text found in HTML")
+            raise Exception("No text or ld+json data found in HTML")
 
         try:
             image = self.find_image(soup)
         except Exception:
@@ -40,7 +40,7 @@ test_cleaner_data = [
 def test_cleaner_clean(json_file: Path, num_steps):
     translator = local_provider()
     recipe_data = cleaner.clean(json.loads(json_file.read_text()), translator)
-    assert len(recipe_data["recipeInstructions"]) == num_steps
+    assert len(recipe_data.recipe_instructions or []) == num_steps
 
 
 def test_html_with_recipe_data():
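Since clean now returns a Recipe, the assertion above moves from dict indexing on "recipeInstructions" to snake_case attribute access on the model, and the "or []" guards against the field being None, since len(None) raises. A minimal illustration of that guard with a stand-in pydantic model:

    # Why the "or []": an Optional list field may be None, and len(None) raises.
    from pydantic import BaseModel

    class Sketch(BaseModel):  # stand-in, not mealie's Recipe
        recipe_instructions: list[str] | None = None

    assert len(Sketch().recipe_instructions or []) == 0
    assert len(Sketch(recipe_instructions=["Mix"]).recipe_instructions or []) == 1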