mirror of
https://github.com/mealie-recipes/mealie.git
synced 2025-08-03 04:25:24 +02:00
added translator to scraper
This commit is contained in:
parent
2cfc63b302
commit
408df286fd
8 changed files with 52 additions and 31 deletions
|
@ -44,6 +44,7 @@ class GroupMigrationController(BaseUserController):
|
|||
"user_id": self.user.id,
|
||||
"group_id": self.group_id,
|
||||
"add_migration_tag": add_migration_tag,
|
||||
"translator": self.translator,
|
||||
}
|
||||
|
||||
table: dict[SupportedMigrations, type[BaseMigrator]] = {
|
||||
|
|
|
@ -167,7 +167,7 @@ class RecipeController(BaseRecipeController):
|
|||
async def parse_recipe_url(self, req: ScrapeRecipe):
|
||||
"""Takes in a URL and attempts to scrape data and load it into the database"""
|
||||
try:
|
||||
recipe, extras = await create_from_url(req.url)
|
||||
recipe, extras = await create_from_url(req.url, self.translator)
|
||||
except ForceTimeoutException as e:
|
||||
raise HTTPException(
|
||||
status_code=408, detail=ErrorResponse.respond(message="Recipe Scraping Timed Out")
|
||||
|
@ -196,7 +196,7 @@ class RecipeController(BaseRecipeController):
|
|||
@router.post("/create-url/bulk", status_code=202)
|
||||
def parse_recipe_url_bulk(self, bulk: CreateRecipeByUrlBulk, bg_tasks: BackgroundTasks):
|
||||
"""Takes in a URL and attempts to scrape data and load it into the database"""
|
||||
bulk_scraper = RecipeBulkScraperService(self.service, self.repos, self.group)
|
||||
bulk_scraper = RecipeBulkScraperService(self.service, self.repos, self.group, self.translator)
|
||||
report_id = bulk_scraper.get_report_id()
|
||||
bg_tasks.add_task(bulk_scraper.scrape, bulk)
|
||||
|
||||
|
@ -211,7 +211,7 @@ class RecipeController(BaseRecipeController):
|
|||
async def test_parse_recipe_url(self, url: ScrapeRecipeTest):
|
||||
# Debugger should produce the same result as the scraper sees before cleaning
|
||||
try:
|
||||
if scraped_data := await RecipeScraperPackage(url.url).scrape_url():
|
||||
if scraped_data := await RecipeScraperPackage(url.url, self.translator).scrape_url():
|
||||
return scraped_data.schema.data
|
||||
except ForceTimeoutException as e:
|
||||
raise HTTPException(
|
||||
|
|
|
@ -6,6 +6,7 @@ from pydantic import UUID4
|
|||
|
||||
from mealie.core import root_logger
|
||||
from mealie.core.exceptions import UnexpectedNone
|
||||
from mealie.lang.providers import Translator
|
||||
from mealie.repos.all_repositories import AllRepositories
|
||||
from mealie.schema.recipe import Recipe
|
||||
from mealie.schema.recipe.recipe_settings import RecipeSettings
|
||||
|
@ -35,12 +36,20 @@ class BaseMigrator(BaseService):
|
|||
helpers: DatabaseMigrationHelpers
|
||||
|
||||
def __init__(
|
||||
self, archive: Path, db: AllRepositories, session, user_id: UUID4, group_id: UUID, add_migration_tag: bool
|
||||
self,
|
||||
archive: Path,
|
||||
db: AllRepositories,
|
||||
session,
|
||||
user_id: UUID4,
|
||||
group_id: UUID,
|
||||
add_migration_tag: bool,
|
||||
translator: Translator,
|
||||
):
|
||||
self.archive = archive
|
||||
self.db = db
|
||||
self.session = session
|
||||
self.add_migration_tag = add_migration_tag
|
||||
self.translator = translator
|
||||
|
||||
user = db.users.get_one(user_id)
|
||||
if not user:
|
||||
|
@ -225,6 +234,6 @@ class BaseMigrator(BaseService):
|
|||
with contextlib.suppress(KeyError):
|
||||
del recipe_dict["id"]
|
||||
|
||||
recipe_dict = cleaner.clean(recipe_dict, url=recipe_dict.get("org_url", None))
|
||||
recipe_dict = cleaner.clean(recipe_dict, self.translator, url=recipe_dict.get("org_url", None))
|
||||
|
||||
return Recipe(**recipe_dict)
|
||||
|
|
|
@ -10,6 +10,7 @@ from datetime import datetime, timedelta
|
|||
from slugify import slugify
|
||||
|
||||
from mealie.core.root_logger import get_logger
|
||||
from mealie.lang.providers import Translator
|
||||
|
||||
logger = get_logger("recipe-scraper")
|
||||
|
||||
|
@ -32,7 +33,7 @@ MATCH_ERRONEOUS_WHITE_SPACE = re.compile(r"\n\s*\n")
|
|||
""" Matches multiple new lines and removes erroneous white space """
|
||||
|
||||
|
||||
def clean(recipe_data: dict, url=None) -> dict:
|
||||
def clean(recipe_data: dict, translator: Translator, url=None) -> dict:
|
||||
"""Main entrypoint to clean a recipe extracted from the web
|
||||
and format the data into an acceptable format for the database
|
||||
|
||||
|
@ -45,9 +46,9 @@ def clean(recipe_data: dict, url=None) -> dict:
|
|||
recipe_data["description"] = clean_string(recipe_data.get("description", ""))
|
||||
|
||||
# Times
|
||||
recipe_data["prepTime"] = clean_time(recipe_data.get("prepTime"))
|
||||
recipe_data["performTime"] = clean_time(recipe_data.get("performTime"))
|
||||
recipe_data["totalTime"] = clean_time(recipe_data.get("totalTime"))
|
||||
recipe_data["prepTime"] = clean_time(recipe_data.get("prepTime"), translator)
|
||||
recipe_data["performTime"] = clean_time(recipe_data.get("performTime"), translator)
|
||||
recipe_data["totalTime"] = clean_time(recipe_data.get("totalTime"), translator)
|
||||
recipe_data["recipeCategory"] = clean_categories(recipe_data.get("recipeCategory", []))
|
||||
recipe_data["recipeYield"] = clean_yield(recipe_data.get("recipeYield"))
|
||||
recipe_data["recipeIngredient"] = clean_ingredients(recipe_data.get("recipeIngredient", []))
|
||||
|
@ -332,7 +333,7 @@ def clean_yield(yld: str | list[str] | None) -> str:
|
|||
return yld
|
||||
|
||||
|
||||
def clean_time(time_entry: str | timedelta | None) -> None | str:
|
||||
def clean_time(time_entry: str | timedelta | None, translator: Translator) -> None | str:
|
||||
"""Convert a scraped time entry into a display string, or None if its structure is not recognised.
|
||||
|
||||
Supported Structures:
|
||||
|
@ -358,11 +359,11 @@ def clean_time(time_entry: str | timedelta | None) -> None | str:
|
|||
|
||||
try:
|
||||
time_delta_instructionsect = parse_duration(time_entry)
|
||||
return pretty_print_timedelta(time_delta_instructionsect)
|
||||
return pretty_print_timedelta(time_delta_instructionsect, translator)
|
||||
except ValueError:
|
||||
return str(time_entry)
|
||||
case timedelta():
|
||||
return pretty_print_timedelta(time_entry)
|
||||
return pretty_print_timedelta(time_entry, translator)
|
||||
case {"minValue": str(value)}:
|
||||
return clean_time(value)
|
||||
case [str(), *_]:
|
||||
|
@ -371,7 +372,7 @@ def clean_time(time_entry: str | timedelta | None) -> None | str:
|
|||
# TODO: Not sure what to do here
|
||||
return str(time_entry)
|
||||
case _:
|
||||
logger.warning("[SCRAPER] Unexpected type or structure for time_entrys")
|
||||
logger.warning("[SCRAPER] Unexpected type or structure for time_entries")
|
||||
return None
|
||||
|
||||
|
||||
|
@ -405,25 +406,25 @@ def parse_duration(iso_duration: str) -> timedelta:
|
|||
return timedelta(**times)
|
||||
|
||||
|
||||
def pretty_print_timedelta(t: timedelta, max_components=None, max_decimal_places=2):
|
||||
def pretty_print_timedelta(t: timedelta, translator: Translator, max_components=None, max_decimal_places=2):
|
||||
"""
|
||||
Print a pretty string for a timedelta.
|
||||
For example datetime.timedelta(days=2, seconds=17280) will be printed as '2 days 4 Hours 48 Minutes'.
|
||||
Setting max_components to e.g. 1 will change this to '2.2 days', where the number of decimal
|
||||
points can also be set.
|
||||
"""
|
||||
time_scale_names_dict = {
|
||||
timedelta(days=365): "year",
|
||||
timedelta(days=1): "day",
|
||||
timedelta(hours=1): "Hour",
|
||||
timedelta(minutes=1): "Minute",
|
||||
timedelta(seconds=1): "Second",
|
||||
timedelta(microseconds=1000): "millisecond",
|
||||
timedelta(microseconds=1): "microsecond",
|
||||
time_scale_translation_keys_dict = {
|
||||
timedelta(days=365): "datetime.year",
|
||||
timedelta(days=1): "datetime.day",
|
||||
timedelta(hours=1): "datetime.hour",
|
||||
timedelta(minutes=1): "datetime.minute",
|
||||
timedelta(seconds=1): "datetime.second",
|
||||
timedelta(microseconds=1000): "datetime.millisecond",
|
||||
timedelta(microseconds=1): "datetime.microsecond",
|
||||
}
|
||||
count = 0
|
||||
out_list = []
|
||||
for scale, scale_name in time_scale_names_dict.items():
|
||||
for scale, scale_translation_key in time_scale_translation_keys_dict.items():
|
||||
if t >= scale:
|
||||
count += 1
|
||||
n = t / scale if count == max_components else int(t / scale)
|
||||
|
@ -433,7 +434,8 @@ def pretty_print_timedelta(t: timedelta, max_components=None, max_decimal_places
|
|||
if n_txt[-2:] == ".0":
|
||||
n_txt = n_txt[:-2]
|
||||
|
||||
out_list.append(f"{n_txt} {scale_name}{'s' if n > 1 else ''}")
|
||||
scale_value = translator.t(scale_translation_key, count=n)
|
||||
out_list.append(f"{n_txt} {scale_value}")
|
||||
|
||||
if out_list == []:
|
||||
return "none"
|
||||
|
|
|
@ -2,6 +2,7 @@ import asyncio
|
|||
|
||||
from pydantic import UUID4
|
||||
|
||||
from mealie.lang.providers import Translator
|
||||
from mealie.repos.repository_factory import AllRepositories
|
||||
from mealie.schema.recipe.recipe import CreateRecipeByUrlBulk, Recipe
|
||||
from mealie.schema.reports.reports import (
|
||||
|
@ -20,11 +21,14 @@ from mealie.services.scraper.scraper import create_from_url
|
|||
class RecipeBulkScraperService(BaseService):
|
||||
report_entries: list[ReportEntryCreate]
|
||||
|
||||
def __init__(self, service: RecipeService, repos: AllRepositories, group: GroupInDB) -> None:
|
||||
def __init__(
|
||||
self, service: RecipeService, repos: AllRepositories, group: GroupInDB, translator: Translator
|
||||
) -> None:
|
||||
self.service = service
|
||||
self.repos = repos
|
||||
self.group = group
|
||||
self.report_entries = []
|
||||
self.translator = translator
|
||||
|
||||
super().__init__()
|
||||
|
||||
|
@ -81,7 +85,7 @@ class RecipeBulkScraperService(BaseService):
|
|||
async def _do(url: str) -> Recipe | None:
|
||||
async with sem:
|
||||
try:
|
||||
recipe, _ = await create_from_url(url)
|
||||
recipe, _ = await create_from_url(url, self.translator)
|
||||
return recipe
|
||||
except Exception as e:
|
||||
self.service.logger.error(f"failed to scrape url during bulk url import {url}")
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
from mealie.lang.providers import Translator
|
||||
from mealie.schema.recipe.recipe import Recipe
|
||||
from mealie.services.scraper.scraped_extras import ScrapedExtras
|
||||
|
||||
|
@ -14,11 +15,12 @@ class RecipeScraper:
|
|||
# List of recipe scrapers. Note that order matters
|
||||
scrapers: list[type[ABCScraperStrategy]]
|
||||
|
||||
def __init__(self, scrapers: list[type[ABCScraperStrategy]] | None = None) -> None:
|
||||
def __init__(self, translator: Translator, scrapers: list[type[ABCScraperStrategy]] | None = None) -> None:
|
||||
if scrapers is None:
|
||||
scrapers = DEFAULT_SCRAPER_STRATEGIES
|
||||
|
||||
self.scrapers = scrapers
|
||||
self.translator = translator
|
||||
|
||||
async def scrape(self, url: str) -> tuple[Recipe, ScrapedExtras] | tuple[None, None]:
|
||||
"""
|
||||
|
@ -26,7 +28,7 @@ class RecipeScraper:
|
|||
"""
|
||||
|
||||
for scraper_type in self.scrapers:
|
||||
scraper = scraper_type(url)
|
||||
scraper = scraper_type(url, self.translator)
|
||||
result = await scraper.parse()
|
||||
|
||||
if result is not None:
|
||||
|
|
|
@ -5,6 +5,7 @@ from fastapi import HTTPException, status
|
|||
from slugify import slugify
|
||||
|
||||
from mealie.core.root_logger import get_logger
|
||||
from mealie.lang.providers import Translator
|
||||
from mealie.pkgs import cache
|
||||
from mealie.schema.recipe import Recipe
|
||||
from mealie.services.recipe.recipe_data_service import RecipeDataService
|
||||
|
@ -19,7 +20,7 @@ class ParserErrors(str, Enum):
|
|||
CONNECTION_ERROR = "CONNECTION_ERROR"
|
||||
|
||||
|
||||
async def create_from_url(url: str) -> tuple[Recipe, ScrapedExtras | None]:
|
||||
async def create_from_url(url: str, translator: Translator) -> tuple[Recipe, ScrapedExtras | None]:
|
||||
"""Main entry point for generating a recipe from a URL. Pass in a URL and
|
||||
a Recipe object will be returned if successful.
|
||||
|
||||
|
@ -29,7 +30,7 @@ async def create_from_url(url: str) -> tuple[Recipe, ScrapedExtras | None]:
|
|||
Returns:
|
||||
Recipe: Recipe Object
|
||||
"""
|
||||
scraper = RecipeScraper()
|
||||
scraper = RecipeScraper(translator)
|
||||
new_recipe, extras = await scraper.scrape(url)
|
||||
|
||||
if not new_recipe:
|
||||
|
|
|
@ -11,6 +11,7 @@ from slugify import slugify
|
|||
from w3lib.html import get_base_url
|
||||
|
||||
from mealie.core.root_logger import get_logger
|
||||
from mealie.lang.providers import Translator
|
||||
from mealie.schema.recipe.recipe import Recipe, RecipeStep
|
||||
from mealie.services.scraper.scraped_extras import ScrapedExtras
|
||||
|
||||
|
@ -77,9 +78,10 @@ class ABCScraperStrategy(ABC):
|
|||
|
||||
url: str
|
||||
|
||||
def __init__(self, url: str) -> None:
|
||||
def __init__(self, url: str, translator: Translator) -> None:
|
||||
self.logger = get_logger()
|
||||
self.url = url
|
||||
self.translator = translator
|
||||
|
||||
@abstractmethod
|
||||
async def get_html(self, url: str) -> str:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue