import json
from enum import Enum
from typing import Any, Callable
from uuid import uuid4

import requests
from fastapi import HTTPException, status
from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, WebsiteNotImplementedError, scrape_me
from slugify import slugify

from mealie.core.config import app_dirs
from mealie.core.root_logger import get_logger
from mealie.schema.recipe import Recipe, RecipeStep
from mealie.services.image.image import scrape_image
from mealie.services.scraper import cleaner, open_graph

LAST_JSON = app_dirs.DEBUG_DIR.joinpath("last_recipe.json")

logger = get_logger()


def create_from_url(url: str) -> Recipe:
    """Main entry point for generating a recipe from a URL. Pass in a URL and
    a Recipe object will be returned if successful.

    Args:
        url (str): a valid string representing a URL

    Returns:
        Recipe: Recipe Object
    """
    new_recipe = scrape_from_url(url)

    logger.info(f"Image {new_recipe.image}")
    new_recipe.image = download_image_for_recipe(new_recipe.slug, new_recipe.image)

    if new_recipe.name is None or new_recipe.name == "":
        new_recipe.name = "No Recipe Found - " + uuid4().hex
        new_recipe.slug = slugify(new_recipe.name)

    return new_recipe
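

# Usage sketch (hypothetical URL, for illustration only; not executed on import):
#
#     recipe = create_from_url("https://example.com/posts/chocolate-cake")
#     recipe.slug   # e.g. "chocolate-cake"
#     recipe.image  # filename returned by download_image_for_recipe, or "no image"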


class ParserErrors(str, Enum):
    BAD_RECIPE_DATA = "BAD_RECIPE_DATA"
    NO_RECIPE_DATA = "NO_RECIPE_DATA"
    CONNECTION_ERROR = "CONNECTION_ERROR"
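

# These keywords reach the client inside the HTTPException detail payload, so a
# failed parse produces a 400 response whose JSON body looks roughly like this
# (illustrative):
#
#     {"detail": {"details": "BAD_RECIPE_DATA"}}
#
# The frontend maps the keyword to a localized message.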


def extract_open_graph_values(url) -> Recipe:
    """Fallback parser that builds a minimal Recipe from a page's Open Graph metadata."""
    r = requests.get(url)
    recipe = open_graph.basic_recipe_from_opengraph(r.text, url)

    return Recipe(**recipe)
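

# A page with no schema.org recipe markup can still be imported if it carries
# basic Open Graph tags, e.g. (illustrative HTML, not a mealie requirement):
#
#     <meta property="og:title" content="Chocolate Cake" />
#     <meta property="og:image" content="https://example.com/cake.jpg" />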


def scrape_from_url(url: str) -> Recipe:
    """Entry function for generating a recipe object from a URL.
    This determines whether the URL can be parsed and raises an appropriate error keyword
    if it cannot. The keyword is used on the frontend to reference a localized string to
    present on the UI.

    Args:
        url (str): String representing the URL

    Raises:
        HTTPException: 400_BAD_REQUEST - See the ParserErrors class for key details

    Returns:
        Recipe: Recipe Model
    """
    try:
        scraped_schema = scrape_me(url)
    except (WebsiteNotImplementedError, AttributeError):
        try:
            # Retry in wild mode, which attempts to parse any site regardless of
            # whether recipe_scrapers has a dedicated scraper for it.
            scraped_schema = scrape_me(url, wild_mode=True)
        except (NoSchemaFoundInWildMode, AttributeError):
            recipe = extract_open_graph_values(url)
            if recipe.name != "":
                return recipe
            raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.BAD_RECIPE_DATA.value})
    except ConnectionError:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.CONNECTION_ERROR.value})

    # Reject pages that yielded neither instructions nor ingredients.
    try:
        instruct = scraped_schema.instructions()
    except Exception:
        instruct = []

    try:
        ing = scraped_schema.ingredients()
    except Exception:
        ing = []

    if not instruct and not ing:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, {"details": ParserErrors.NO_RECIPE_DATA.value})

    return clean_scraper(scraped_schema, url)
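

# Sketch of how a route might call this (hypothetical endpoint, for illustration
# only; the real route lives elsewhere in mealie):
#
#     @router.post("/recipes/create-url")
#     def parse_recipe_url(url: str):
#         recipe = scrape_from_url(url)  # raises HTTPException(400) on failure
#         return recipe.slug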


def clean_scraper(scraped_data: SchemaScraperFactory.SchemaScraper, url: str) -> Recipe:
    def try_get_default(func_call: Callable, get_attr: str, default: Any, clean_func=None):
        value = default

        # Callers intentionally pass func_call=None to skip straight to the raw
        # schema.org attribute lookup below.
        if func_call:
            try:
                value = func_call()
            except Exception:
                logger.error(f"Error parsing recipe func_call for '{get_attr}'")

        if value == default:
            try:
                value = scraped_data.schema.data.get(get_attr)
            except Exception:
                logger.error(f"Error parsing recipe attribute '{get_attr}'")

        if clean_func:
            value = clean_func(value)

        return value

    def get_instructions() -> list[RecipeStep]:
        instruction_as_text = try_get_default(
            scraped_data.instructions, "recipeInstructions", ["No Instructions Found"]
        )

        logger.info(f"Scraped Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")

        instruction_as_text = cleaner.instructions(instruction_as_text)

        logger.info(f"Cleaned Instructions: (Type: {type(instruction_as_text)}) \n {instruction_as_text}")

        try:
            return [RecipeStep(title="", text=x.get("text")) for x in instruction_as_text]
        except TypeError:
            return []

    return Recipe(
        name=try_get_default(scraped_data.title, "name", "No Name Found", cleaner.clean_string),
        slug="",
        image=try_get_default(scraped_data.image, "image", None),
        description=try_get_default(None, "description", "", cleaner.clean_string),
        recipe_yield=try_get_default(scraped_data.yields, "recipeYield", "1", cleaner.clean_string),
        recipe_ingredient=try_get_default(scraped_data.ingredients, "recipeIngredient", [""], cleaner.ingredient),
        recipe_instructions=get_instructions(),
        total_time=try_get_default(None, "totalTime", None, cleaner.clean_time),
        prep_time=try_get_default(None, "prepTime", None, cleaner.clean_time),
        perform_time=try_get_default(None, "performTime", None, cleaner.clean_time),
        org_url=url,
    )
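

# try_get_default resolution order, illustrated with the "name" field
# (hypothetical values):
#
#     1. scraped_data.title()                  -> "Chocolate Cake"
#     2. scraped_data.schema.data.get("name")  -> raw schema.org value (fallback)
#     3. cleaner.clean_string(value)           -> normalized string
#
# Fields constructed with func_call=None ("description" and the time fields
# above) rely on steps 2 and 3 only.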


def download_image_for_recipe(slug, image_url) -> str:
    img_name = None
    try:
        img_path = scrape_image(image_url, slug)
        img_name = img_path.name
    except Exception as e:
        logger.error(f"Error Scraping Image: {e}")

    return img_name or "no image"


def dump_last_json(recipe_data: dict):
    with open(LAST_JSON, "w") as f:
        f.write(json.dumps(recipe_data, indent=4, default=str))
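

if __name__ == "__main__":
    # Minimal smoke-test sketch (assumes a configured mealie environment and a
    # reachable, hypothetical URL); run the module directly to try one scrape.
    demo_recipe = create_from_url("https://example.com/some-recipe")
    print(demo_recipe.json(indent=2))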