
feat: Better Scraping/More User Agents (#5091)

Michael Genson 2025-02-25 04:57:05 -06:00 committed by GitHub
parent 173e8792a6
commit 48484e5b1a
4 changed files with 109 additions and 32 deletions

mealie/services/recipe/recipe_data_service.py

@@ -9,13 +9,7 @@ from mealie.pkgs import img, safehttp
 from mealie.pkgs.safehttp.transport import AsyncSafeTransport
 from mealie.schema.recipe.recipe import Recipe
 from mealie.services._base_service import BaseService
-
-try:
-    from recipe_scrapers._abstract import HEADERS
-
-    _FIREFOX_UA = HEADERS["User-Agent"]
-except (ImportError, KeyError):
-    _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0"
+from mealie.services.scraper.user_agents_manager import get_user_agents_manager


 async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
@@ -32,13 +26,15 @@ async def gather_with_concurrency(n, *coros, ignore_exceptions=False):

 async def largest_content_len(urls: list[str]) -> tuple[str, int]:
+    user_agent_manager = get_user_agents_manager()
+
     largest_url = ""
     largest_len = 0

     max_concurrency = 10

     async def do(client: AsyncClient, url: str) -> Response:
-        return await client.head(url, headers={"User-Agent": _FIREFOX_UA})
+        return await client.head(url, headers=user_agent_manager.get_scrape_headers())

     async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
         tasks = [do(client, url) for url in urls]
@@ -110,6 +106,7 @@ class RecipeDataService(BaseService):
     async def scrape_image(self, image_url: str | dict[str, str] | list[str]) -> None:
         self.logger.info(f"Image URL: {image_url}")
+        user_agent = get_user_agents_manager().user_agents[0]

         image_url_str = ""
@@ -140,7 +137,7 @@ class RecipeDataService(BaseService):

         async with AsyncClient(transport=AsyncSafeTransport()) as client:
             try:
-                r = await client.get(image_url_str, headers={"User-Agent": _FIREFOX_UA})
+                r = await client.get(image_url_str, headers={"User-Agent": user_agent})
             except Exception:
                 self.logger.exception("Fatal Image Request Exception")
                 return None
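
Note: the hunks above reference gather_with_concurrency, but only its signature appears as diff context; its body is unchanged by this commit and not shown here. For orientation, a minimal semaphore-based implementation of that signature could look like this sketch (assumed, not taken from the repository):

import asyncio


async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
    # Let at most n of the supplied coroutines run concurrently.
    semaphore = asyncio.Semaphore(n)

    async def sem_coro(coro):
        async with semaphore:
            return await coro

    results = await asyncio.gather(*(sem_coro(c) for c in coros), return_exceptions=ignore_exceptions)
    if ignore_exceptions:
        # With return_exceptions=True, failures come back as values; drop them.
        results = [r for r in results if not isinstance(r, BaseException)]
    return results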

mealie/services/scraper/scraper_strategies.py

@@ -6,7 +6,7 @@ from typing import Any
 import bs4
 import extruct
 from fastapi import HTTPException, status
-from httpx import AsyncClient
+from httpx import AsyncClient, Response
 from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, scrape_html
 from slugify import slugify
 from w3lib.html import get_base_url
@@ -20,16 +20,10 @@ from mealie.services.openai import OpenAIService
 from mealie.services.scraper.scraped_extras import ScrapedExtras

 from . import cleaner
-
-try:
-    from recipe_scrapers._abstract import HEADERS
-
-    _FIREFOX_UA = HEADERS["User-Agent"]
-except (ImportError, KeyError):
-    _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0"
+from .user_agents_manager import get_user_agents_manager

 SCRAPER_TIMEOUT = 15

 logger = get_logger()


 class ForceTimeoutException(Exception):
@@ -42,32 +36,50 @@ async def safe_scrape_html(url: str) -> str:
     if the request takes longer than 15 seconds. This is used to mitigate
     DDOS attacks from users providing a url with arbitrary large content.
     """
+    user_agents_manager = get_user_agents_manager()
+
     logger.debug(f"Scraping URL: {url}")

     async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
-        html_bytes = b""
-        async with client.stream(
-            "GET", url, timeout=SCRAPER_TIMEOUT, headers={"User-Agent": _FIREFOX_UA}, follow_redirects=True
-        ) as resp:
-            start_time = time.time()
+        for user_agent in user_agents_manager.user_agents:
+            logger.debug(f'Trying User-Agent: "{user_agent}"')

-            async for chunk in resp.aiter_bytes(chunk_size=1024):
-                html_bytes += chunk
+            response: Response | None = None
+            html_bytes = b""
+            async with client.stream(
+                "GET",
+                url,
+                timeout=SCRAPER_TIMEOUT,
+                headers=user_agents_manager.get_scrape_headers(user_agent),
+                follow_redirects=True,
+            ) as resp:
+                if resp.status_code == status.HTTP_403_FORBIDDEN:
+                    logger.debug(f'403 Forbidden with User-Agent: "{user_agent}"')
+                    continue

-                if time.time() - start_time > SCRAPER_TIMEOUT:
-                    raise ForceTimeoutException()
+                start_time = time.time()
+                async for chunk in resp.aiter_bytes(chunk_size=1024):
+                    html_bytes += chunk
+
+                    if time.time() - start_time > SCRAPER_TIMEOUT:
+                        raise ForceTimeoutException()
+
+                response = resp
+
+            break
+
+        if not (response and html_bytes):
+            return ""

     # =====================================
     # Copied from requests text property

     # Try charset from content-type
     content = None
-    encoding = resp.encoding
-
-    if not html_bytes:
-        return ""
+    encoding = response.encoding

     # Fallback to auto-detected encoding.
     if encoding is None:
-        encoding = resp.apparent_encoding
+        encoding = response.apparent_encoding

     # Decode unicode from given encoding.
     try:
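
The hunk's context ends just before the decode. Per the "Copied from requests text property" comment, the unchanged code after the final try: presumably mirrors requests' decoding behavior, roughly as in this sketch (the except clauses are an assumption based on requests, not shown in the diff):

# Decode with the declared or detected charset; replace undecodable
# bytes instead of raising, as requests' text property does.
try:
    content = str(html_bytes, encoding, errors="replace")
except (LookupError, TypeError):
    # LookupError: unknown codec name; TypeError: encoding was None.
    content = str(html_bytes, errors="replace")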

mealie/services/scraper/user-agents.txt

@@ -0,0 +1,3 @@
+Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Mobile/15E148 Safari/604.1
+Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0
+Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.6834.164 Mobile Safari/537.36 EdgA/131.0.2903.87

mealie/services/scraper/user_agents_manager.py

@@ -0,0 +1,65 @@
+from __future__ import annotations
+
+import os
+import random
+
+_USER_AGENTS_MANAGER: UserAgentsManager | None = None
+
+
+def get_user_agents_manager() -> UserAgentsManager:
+    global _USER_AGENTS_MANAGER
+    if not _USER_AGENTS_MANAGER:
+        _USER_AGENTS_MANAGER = UserAgentsManager()
+
+    return _USER_AGENTS_MANAGER
+
+
+class UserAgentsManager:
+    def __init__(self) -> None:
+        self._user_agents: list[str] | None = None
+        self._user_agents_text_path = os.path.join(os.path.dirname(__file__), "user-agents.txt")
+
+    def get_scrape_headers(self, user_agent: str | None = None) -> dict[str, str]:
+        # From: https://scrapeops.io/web-scraping-playbook/403-forbidden-error-web-scraping/#optimize-request-headers
+        if user_agent is None:
+            user_agent = random.choice(self.user_agents)
+
+        return {
+            "User-Agent": user_agent,
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.5",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "none",
+            "Sec-Fetch-User": "?1",
+            "Cache-Control": "max-age=0",
+        }
+
+    @property
+    def user_agents(self) -> list[str]:
+        if not self._user_agents:
+            self._user_agents = self._fetch_user_agents()
+
+        return self._user_agents
+
+    def _fetch_user_agents(self) -> list[str]:
+        user_agents: list[str] = []
+
+        try:
+            from recipe_scrapers._abstract import HEADERS
+
+            user_agents.append(HEADERS["User-Agent"])
+        except (ImportError, KeyError):
+            user_agents.append("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0")
+
+        with open(self._user_agents_text_path) as f:
+            for line in f:
+                if not line:
+                    continue
+
+                user_agents.append(line.strip())
+
+        return user_agents
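
End-to-end, consuming the new module looks roughly like the sketch below; the fetch_status helper and example URL are illustrative, not part of the commit.

import asyncio

from httpx import AsyncClient

from mealie.services.scraper.user_agents_manager import get_user_agents_manager


async def fetch_status(url: str) -> int:
    manager = get_user_agents_manager()  # lazily created module-level singleton

    # With no argument, get_scrape_headers picks a random known-good
    # User-Agent and pairs it with browser-like request headers.
    headers = manager.get_scrape_headers()

    # Callers can instead walk manager.user_agents and pass each agent in,
    # as safe_scrape_html does when a site answers 403 Forbidden.
    async with AsyncClient() as client:
        response = await client.get(url, headers=headers)
        return response.status_code


if __name__ == "__main__":
    print(asyncio.run(fetch_status("https://example.com")))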