mirror of
https://github.com/mealie-recipes/mealie.git
synced 2025-07-19 13:19:41 +02:00
feat: Improve Recipe Imports with Cleaner (#4517)
This commit is contained in:
parent
085c489b05
commit
bcd0fcc920
6 changed files with 51 additions and 11 deletions
|
@ -268,6 +268,5 @@ class BaseMigrator(BaseService):
|
|||
with contextlib.suppress(KeyError):
|
||||
del recipe_dict["id"]
|
||||
|
||||
recipe_dict = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None))
|
||||
|
||||
return Recipe(**recipe_dict)
|
||||
recipe = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None))
|
||||
return recipe
|
||||
|
|
|
@ -32,6 +32,7 @@ from mealie.schema.user.user import PrivateUser, UserRatingCreate
|
|||
from mealie.services._base_service import BaseService
|
||||
from mealie.services.openai import OpenAIDataInjection, OpenAILocalImage, OpenAIService
|
||||
from mealie.services.recipe.recipe_data_service import RecipeDataService
|
||||
from mealie.services.scraper import cleaner
|
||||
|
||||
from .template_service import TemplateService
|
||||
|
||||
|
@ -297,6 +298,7 @@ class RecipeService(RecipeServiceBase):
|
|||
recipe_data = await openai_recipe_service.build_recipe_from_images(
|
||||
local_images, translate_language=translate_language
|
||||
)
|
||||
recipe_data = cleaner.clean(recipe_data, self.translator)
|
||||
|
||||
recipe = self.create_one(recipe_data)
|
||||
data_service = RecipeDataService(recipe.id)
|
||||
|
|
|
@ -11,6 +11,7 @@ from slugify import slugify
|
|||
|
||||
from mealie.core.root_logger import get_logger
|
||||
from mealie.lang.providers import Translator
|
||||
from mealie.schema.recipe.recipe import Recipe
|
||||
|
||||
logger = get_logger("recipe-scraper")
|
||||
|
||||
|
@ -33,16 +34,23 @@ MATCH_ERRONEOUS_WHITE_SPACE = re.compile(r"\n\s*\n")
|
|||
""" Matches multiple new lines and removes erroneous white space """
|
||||
|
||||
|
||||
def clean(recipe_data: dict, translator: Translator, url=None) -> dict:
|
||||
def clean(recipe_data: Recipe | dict, translator: Translator, url=None) -> Recipe:
|
||||
"""Main entrypoint to clean a recipe extracted from the web
|
||||
and format the data into an acceptable format for the database
|
||||
|
||||
Args:
|
||||
recipe_data (dict): raw recipe dicitonary
|
||||
recipe_data (dict): raw recipe or recipe dictionary
|
||||
|
||||
Returns:
|
||||
dict: cleaned recipe dictionary
|
||||
"""
|
||||
if not isinstance(recipe_data, dict):
|
||||
# format the recipe like a scraped dictionary
|
||||
recipe_data_dict = recipe_data.model_dump(by_alias=True)
|
||||
recipe_data_dict["recipeIngredient"] = [ing.display for ing in recipe_data.recipe_ingredient]
|
||||
|
||||
recipe_data = recipe_data_dict
|
||||
|
||||
recipe_data["description"] = clean_string(recipe_data.get("description", ""))
|
||||
|
||||
# Times
|
||||
|
@ -59,7 +67,7 @@ def clean(recipe_data: dict, translator: Translator, url=None) -> dict:
|
|||
recipe_data["notes"] = clean_notes(recipe_data.get("notes"))
|
||||
recipe_data["rating"] = clean_int(recipe_data.get("rating"))
|
||||
|
||||
return recipe_data
|
||||
return Recipe(**recipe_data)
|
||||
|
||||
|
||||
def clean_string(text: str | list | int) -> str:
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
from mealie.core.root_logger import get_logger
|
||||
from mealie.lang.providers import Translator
|
||||
from mealie.schema.recipe.recipe import Recipe
|
||||
from mealie.services.scraper import cleaner
|
||||
from mealie.services.scraper.scraped_extras import ScrapedExtras
|
||||
|
||||
from .scraper_strategies import (
|
||||
|
@ -31,6 +33,7 @@ class RecipeScraper:
|
|||
|
||||
self.scrapers = scrapers
|
||||
self.translator = translator
|
||||
self.logger = get_logger()
|
||||
|
||||
async def scrape(self, url: str, html: str | None = None) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
|
||||
"""
|
||||
|
@ -41,9 +44,23 @@ class RecipeScraper:
|
|||
raw_html = html or await safe_scrape_html(url)
|
||||
for scraper_type in self.scrapers:
|
||||
scraper = scraper_type(url, self.translator, raw_html=raw_html)
|
||||
result = await scraper.parse()
|
||||
|
||||
if result is not None:
|
||||
return result
|
||||
try:
|
||||
result = await scraper.parse()
|
||||
except Exception:
|
||||
self.logger.exception(f"Failed to scrape HTML with {scraper.__class__.__name__}")
|
||||
result = None
|
||||
|
||||
if result is None or result[0] is None:
|
||||
continue
|
||||
|
||||
recipe_result, extras = result
|
||||
try:
|
||||
recipe = cleaner.clean(recipe_result, self.translator)
|
||||
except Exception:
|
||||
self.logger.exception(f"Failed to clean recipe data from {scraper.__class__.__name__}")
|
||||
continue
|
||||
|
||||
return recipe, extras
|
||||
|
||||
return None, None
|
||||
|
|
|
@ -253,6 +253,18 @@ class RecipeScraperOpenAI(RecipeScraperPackage):
|
|||
rather than trying to scrape it directly.
|
||||
"""
|
||||
|
||||
def extract_json_ld_data_from_html(self, soup: bs4.BeautifulSoup) -> str:
    """Gather the contents of every JSON-LD script tag found in ``soup``.

    Args:
        soup: parsed document that is searched for
            ``<script type="application/ld+json">`` tags.

    Returns:
        The non-empty script contents, in the order ``find_all`` returns
        them, joined by a blank line. An empty string when nothing is found.
    """

    def _script_text(tag) -> str | None:
        # Best effort: a tag that raises AttributeError is skipped, not fatal.
        try:
            raw = tag.string
            return str(raw) if raw else None
        except AttributeError:
            return None

    candidates = map(_script_text, soup.find_all("script", type="application/ld+json"))
    return "\n\n".join(text for text in candidates if text is not None)
|
||||
|
||||
def find_image(self, soup: bs4.BeautifulSoup) -> str | None:
|
||||
# find the open graph image tag
|
||||
og_image = soup.find("meta", property="og:image")
|
||||
|
@ -285,8 +297,10 @@ class RecipeScraperOpenAI(RecipeScraperPackage):
|
|||
soup = bs4.BeautifulSoup(html, "lxml")
|
||||
|
||||
text = soup.get_text(separator="\n", strip=True)
|
||||
text += self.extract_json_ld_data_from_html(soup)
|
||||
if not text:
|
||||
raise Exception("No text found in HTML")
|
||||
raise Exception("No text or ld+json data found in HTML")
|
||||
|
||||
try:
|
||||
image = self.find_image(soup)
|
||||
except Exception:
|
||||
|
|
|
@ -40,7 +40,7 @@ test_cleaner_data = [
|
|||
def test_cleaner_clean(json_file: Path, num_steps):
|
||||
translator = local_provider()
|
||||
recipe_data = cleaner.clean(json.loads(json_file.read_text()), translator)
|
||||
assert len(recipe_data["recipeInstructions"]) == num_steps
|
||||
assert len(recipe_data.recipe_instructions or []) == num_steps
|
||||
|
||||
|
||||
def test_html_with_recipe_data():
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue