diff --git a/mealie/services/recipe/recipe_data_service.py b/mealie/services/recipe/recipe_data_service.py
index a4d0432ac..f92270aff 100644
--- a/mealie/services/recipe/recipe_data_service.py
+++ b/mealie/services/recipe/recipe_data_service.py
@@ -9,13 +9,7 @@ from mealie.pkgs import img, safehttp
 from mealie.pkgs.safehttp.transport import AsyncSafeTransport
 from mealie.schema.recipe.recipe import Recipe
 from mealie.services._base_service import BaseService
-
-try:
-    from recipe_scrapers._abstract import HEADERS
-
-    _FIREFOX_UA = HEADERS["User-Agent"]
-except (ImportError, KeyError):
-    _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0"
+from mealie.services.scraper.user_agents_manager import get_user_agents_manager
 
 
 async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
@@ -32,13 +26,15 @@ async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
 
 
 async def largest_content_len(urls: list[str]) -> tuple[str, int]:
+    user_agent_manager = get_user_agents_manager()
+
     largest_url = ""
     largest_len = 0
 
     max_concurrency = 10
 
     async def do(client: AsyncClient, url: str) -> Response:
-        return await client.head(url, headers={"User-Agent": _FIREFOX_UA})
+        return await client.head(url, headers=user_agent_manager.get_scrape_headers())
 
     async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
         tasks = [do(client, url) for url in urls]
@@ -110,6 +106,7 @@ class RecipeDataService(BaseService):
 
     async def scrape_image(self, image_url: str | dict[str, str] | list[str]) -> None:
         self.logger.info(f"Image URL: {image_url}")
+        user_agent = get_user_agents_manager().user_agents[0]
 
         image_url_str = ""
 
@@ -140,7 +137,7 @@ class RecipeDataService(BaseService):
 
         async with AsyncClient(transport=AsyncSafeTransport()) as client:
             try:
-                r = await client.get(image_url_str, headers={"User-Agent": _FIREFOX_UA})
+                r = await client.get(image_url_str, headers={"User-Agent": user_agent})
             except Exception:
                 self.logger.exception("Fatal Image Request Exception")
                 return None
diff --git a/mealie/services/scraper/scraper_strategies.py b/mealie/services/scraper/scraper_strategies.py
index 9cd01b76e..35fdfca72 100644
--- a/mealie/services/scraper/scraper_strategies.py
+++ b/mealie/services/scraper/scraper_strategies.py
@@ -6,7 +6,7 @@ from typing import Any
 import bs4
 import extruct
 from fastapi import HTTPException, status
-from httpx import AsyncClient
+from httpx import AsyncClient, Response
 from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, scrape_html
 from slugify import slugify
 from w3lib.html import get_base_url
@@ -20,16 +20,10 @@ from mealie.services.openai import OpenAIService
 from mealie.services.scraper.scraped_extras import ScrapedExtras
 
 from . import cleaner
-
-try:
-    from recipe_scrapers._abstract import HEADERS
-
-    _FIREFOX_UA = HEADERS["User-Agent"]
-except (ImportError, KeyError):
-    _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0"
-
+from .user_agents_manager import get_user_agents_manager
 
 SCRAPER_TIMEOUT = 15
+logger = get_logger()
 
 
 class ForceTimeoutException(Exception):
@@ -42,32 +36,50 @@ async def safe_scrape_html(url: str) -> str:
     if the request takes longer than 15 seconds. This is used to mitigate
     DDOS attacks from users providing a url with arbitrary large content.
     """
+    user_agents_manager = get_user_agents_manager()
+
+    logger.debug(f"Scraping URL: {url}")
     async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
-        html_bytes = b""
-        async with client.stream(
-            "GET", url, timeout=SCRAPER_TIMEOUT, headers={"User-Agent": _FIREFOX_UA}, follow_redirects=True
-        ) as resp:
-            start_time = time.time()
+        for user_agent in user_agents_manager.user_agents:
+            logger.debug(f'Trying User-Agent: "{user_agent}"')
 
-            async for chunk in resp.aiter_bytes(chunk_size=1024):
-                html_bytes += chunk
+            response: Response | None = None
+            html_bytes = b""
+            async with client.stream(
+                "GET",
+                url,
+                timeout=SCRAPER_TIMEOUT,
+                headers=user_agents_manager.get_scrape_headers(user_agent),
+                follow_redirects=True,
+            ) as resp:
+                if resp.status_code == status.HTTP_403_FORBIDDEN:
+                    logger.debug(f'403 Forbidden with User-Agent: "{user_agent}"')
+                    continue
 
-                if time.time() - start_time > SCRAPER_TIMEOUT:
-                    raise ForceTimeoutException()
+                start_time = time.time()
+
+                async for chunk in resp.aiter_bytes(chunk_size=1024):
+                    html_bytes += chunk
+
+                    if time.time() - start_time > SCRAPER_TIMEOUT:
+                        raise ForceTimeoutException()
+
+                response = resp
+                break
+
+        if not (response and html_bytes):
+            return ""
 
         # =====================================
         # Copied from requests text property
 
         # Try charset from content-type
         content = None
-        encoding = resp.encoding
-
-        if not html_bytes:
-            return ""
+        encoding = response.encoding
 
         # Fallback to auto-detected encoding.
         if encoding is None:
-            encoding = resp.apparent_encoding
+            encoding = response.apparent_encoding
 
         # Decode unicode from given encoding.
         try:
diff --git a/mealie/services/scraper/user-agents.txt b/mealie/services/scraper/user-agents.txt
new file mode 100644
index 000000000..9b67731d8
--- /dev/null
+++ b/mealie/services/scraper/user-agents.txt
@@ -0,0 +1,3 @@
+Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Mobile/15E148 Safari/604.
+Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.
+Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.6834.164 Mobile Safari/537.36 EdgA/131.0.2903.87
diff --git a/mealie/services/scraper/user_agents_manager.py b/mealie/services/scraper/user_agents_manager.py
new file mode 100644
index 000000000..531380480
--- /dev/null
+++ b/mealie/services/scraper/user_agents_manager.py
@@ -0,0 +1,65 @@
+from __future__ import annotations
+
+import os
+import random
+
+_USER_AGENTS_MANAGER: UserAgentsManager | None = None
+
+
+def get_user_agents_manager() -> UserAgentsManager:
+    global _USER_AGENTS_MANAGER
+
+    if not _USER_AGENTS_MANAGER:
+        _USER_AGENTS_MANAGER = UserAgentsManager()
+
+    return _USER_AGENTS_MANAGER
+
+
+class UserAgentsManager:
+    def __init__(self) -> None:
+        self._user_agents: list[str] | None = None
+        self._user_agents_text_path = os.path.join(os.path.dirname(__file__), "user-agents.txt")
+
+    def get_scrape_headers(self, user_agent: str | None = None) -> dict[str, str]:
+        # From: https://scrapeops.io/web-scraping-playbook/403-forbidden-error-web-scraping/#optimize-request-headers
+        if user_agent is None:
+            user_agent = random.choice(self.user_agents)
+
+        return {
+            "User-Agent": user_agent,
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.5",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "none",
+            "Sec-Fetch-User": "?1",
+            "Cache-Control": "max-age=0",
+        }
+
+    @property
+    def user_agents(self) -> list[str]:
+        if not self._user_agents:
+            self._user_agents = self._fetch_user_agents()
+
+        return self._user_agents
+
+    def _fetch_user_agents(self) -> list[str]:
+        user_agents: list[str] = []
+
+        try:
+            from recipe_scrapers._abstract import HEADERS
+
+            user_agents.append(HEADERS["User-Agent"])
+        except (ImportError, KeyError):
+            user_agents.append("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0")
+
+        with open(self._user_agents_text_path) as f:
+            for line in f:
+                if not line.strip():
+                    continue
+                user_agents.append(line.strip())
+
+        return user_agents
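
For reference, a minimal usage sketch (not part of the patch) of how a caller might consume the new manager to retry a request with a different User-Agent after a 403. Only get_user_agents_manager(), .user_agents, and .get_scrape_headers() come from the patch; the function name fetch_html and the example URL are illustrative.

# Illustrative only: fetch_html and the URL below are made up for this example.
import asyncio

from httpx import AsyncClient

from mealie.services.scraper.user_agents_manager import get_user_agents_manager


async def fetch_html(url: str) -> str:
    manager = get_user_agents_manager()

    async with AsyncClient() as client:
        for user_agent in manager.user_agents:
            # get_scrape_headers() wraps the chosen User-Agent in browser-like request headers
            resp = await client.get(url, headers=manager.get_scrape_headers(user_agent), follow_redirects=True)
            if resp.status_code == 403:
                continue  # this User-Agent was blocked; try the next one
            return resp.text

    return ""


if __name__ == "__main__":
    print(asyncio.run(fetch_html("https://example.com/recipe")))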