Mirror of https://github.com/mealie-recipes/mealie.git

feat: Better Scraping/More User Agents (#5091)

Author: Michael Genson, 2025-02-25 04:57:05 -06:00 (committed by GitHub)
Parent: 173e8792a6
Commit: 48484e5b1a
GPG key ID: B5690EEEBB952194 (no known key found for this signature in database)

4 changed files with 109 additions and 32 deletions

mealie/services/recipe/recipe_data_service.py

@@ -9,13 +9,7 @@ from mealie.pkgs import img, safehttp
 from mealie.pkgs.safehttp.transport import AsyncSafeTransport
 from mealie.schema.recipe.recipe import Recipe
 from mealie.services._base_service import BaseService
-
-try:
-    from recipe_scrapers._abstract import HEADERS
-
-    _FIREFOX_UA = HEADERS["User-Agent"]
-except (ImportError, KeyError):
-    _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0"
+from mealie.services.scraper.user_agents_manager import get_user_agents_manager
 
 
 async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
@@ -32,13 +26,15 @@ async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
 
 async def largest_content_len(urls: list[str]) -> tuple[str, int]:
+    user_agent_manager = get_user_agents_manager()
+
     largest_url = ""
     largest_len = 0
     max_concurrency = 10
 
     async def do(client: AsyncClient, url: str) -> Response:
-        return await client.head(url, headers={"User-Agent": _FIREFOX_UA})
+        return await client.head(url, headers=user_agent_manager.get_scrape_headers())
 
     async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
         tasks = [do(client, url) for url in urls]
@@ -110,6 +106,7 @@ class RecipeDataService(BaseService):
 
     async def scrape_image(self, image_url: str | dict[str, str] | list[str]) -> None:
         self.logger.info(f"Image URL: {image_url}")
+        user_agent = get_user_agents_manager().user_agents[0]
 
         image_url_str = ""
@@ -140,7 +137,7 @@ class RecipeDataService(BaseService):
 
         async with AsyncClient(transport=AsyncSafeTransport()) as client:
             try:
-                r = await client.get(image_url_str, headers={"User-Agent": _FIREFOX_UA})
+                r = await client.get(image_url_str, headers={"User-Agent": user_agent})
             except Exception:
                 self.logger.exception("Fatal Image Request Exception")
                 return None
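For reference, a minimal sketch of how a caller might exercise the new helper outside Mealie's service layer, mirroring the header handling that largest_content_len now uses. It assumes the mealie package is importable; the function name and URL are illustrative only, not part of this commit.

import asyncio

from httpx import AsyncClient

from mealie.services.scraper.user_agents_manager import get_user_agents_manager


async def head_content_length(url: str) -> int:
    # Illustrative helper: get_scrape_headers() with no argument picks a random
    # User-Agent and adds the browser-like Accept/Sec-Fetch headers defined in
    # user_agents_manager.py (shown further down in this commit).
    manager = get_user_agents_manager()
    async with AsyncClient() as client:
        resp = await client.head(url, headers=manager.get_scrape_headers(), follow_redirects=True)
        return int(resp.headers.get("Content-Length", "0"))


if __name__ == "__main__":
    print(asyncio.run(head_content_length("https://example.com")))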

mealie/services/scraper/scraper_strategies.py

@@ -6,7 +6,7 @@ from typing import Any
 import bs4
 import extruct
 from fastapi import HTTPException, status
-from httpx import AsyncClient
+from httpx import AsyncClient, Response
 from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, scrape_html
 from slugify import slugify
 from w3lib.html import get_base_url
@@ -20,16 +20,10 @@ from mealie.services.openai import OpenAIService
 from mealie.services.scraper.scraped_extras import ScrapedExtras
 
 from . import cleaner
-
-try:
-    from recipe_scrapers._abstract import HEADERS
-
-    _FIREFOX_UA = HEADERS["User-Agent"]
-except (ImportError, KeyError):
-    _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0"
+from .user_agents_manager import get_user_agents_manager
 
 SCRAPER_TIMEOUT = 15
+logger = get_logger()
 
 
 class ForceTimeoutException(Exception):
@@ -42,32 +36,50 @@ async def safe_scrape_html(url: str) -> str:
     if the request takes longer than 15 seconds. This is used to mitigate
     DDOS attacks from users providing a url with arbitrary large content.
     """
+    user_agents_manager = get_user_agents_manager()
+
+    logger.debug(f"Scraping URL: {url}")
     async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
-        html_bytes = b""
-        async with client.stream(
-            "GET", url, timeout=SCRAPER_TIMEOUT, headers={"User-Agent": _FIREFOX_UA}, follow_redirects=True
-        ) as resp:
-            start_time = time.time()
-
-            async for chunk in resp.aiter_bytes(chunk_size=1024):
-                html_bytes += chunk
-
-                if time.time() - start_time > SCRAPER_TIMEOUT:
-                    raise ForceTimeoutException()
+        for user_agent in user_agents_manager.user_agents:
+            logger.debug(f'Trying User-Agent: "{user_agent}"')
+
+            response: Response | None = None
+            html_bytes = b""
+            async with client.stream(
+                "GET",
+                url,
+                timeout=SCRAPER_TIMEOUT,
+                headers=user_agents_manager.get_scrape_headers(user_agent),
+                follow_redirects=True,
+            ) as resp:
+                if resp.status_code == status.HTTP_403_FORBIDDEN:
+                    logger.debug(f'403 Forbidden with User-Agent: "{user_agent}"')
+                    continue
+
+                start_time = time.time()
+
+                async for chunk in resp.aiter_bytes(chunk_size=1024):
+                    html_bytes += chunk
+
+                    if time.time() - start_time > SCRAPER_TIMEOUT:
+                        raise ForceTimeoutException()
+
+                response = resp
+                break
+
+        if not (response and html_bytes):
+            return ""
 
         # =====================================
         # Copied from requests text property
 
         # Try charset from content-type
         content = None
-        encoding = resp.encoding
-
-        if not html_bytes:
-            return ""
+        encoding = response.encoding
 
         # Fallback to auto-detected encoding.
         if encoding is None:
-            encoding = resp.apparent_encoding
+            encoding = response.apparent_encoding
 
         # Decode unicode from given encoding.
         try:
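The User-Agent fallback above is the heart of the change. Below is a simplified, self-contained sketch of the same pattern, using a plain GET instead of Mealie's streamed response, per-chunk timeout, and safe transport; the agent list and URL are placeholders, not part of this commit.

import asyncio

from httpx import AsyncClient

# Placeholder agents; Mealie reads its list from user-agents.txt via UserAgentsManager.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0",
    "Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.6834.164 Mobile Safari/537.36 EdgA/131.0.2903.87",
]


async def fetch_html(url: str) -> str:
    async with AsyncClient(follow_redirects=True, timeout=15) as client:
        for user_agent in USER_AGENTS:
            resp = await client.get(url, headers={"User-Agent": user_agent})
            if resp.status_code == 403:
                continue  # blocked with this agent, try the next one
            return resp.text
    return ""  # every User-Agent was rejected


if __name__ == "__main__":
    print(len(asyncio.run(fetch_html("https://example.com"))))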

mealie/services/scraper/user-agents.txt

@@ -0,0 +1,3 @@
+Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Mobile/15E148 Safari/604.1
+Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0
+Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.6834.164 Mobile Safari/537.36 EdgA/131.0.2903.87

mealie/services/scraper/user_agents_manager.py

@@ -0,0 +1,65 @@
+from __future__ import annotations
+
+import os
+import random
+
+_USER_AGENTS_MANAGER: UserAgentsManager | None = None
+
+
+def get_user_agents_manager() -> UserAgentsManager:
+    global _USER_AGENTS_MANAGER
+    if not _USER_AGENTS_MANAGER:
+        _USER_AGENTS_MANAGER = UserAgentsManager()
+
+    return _USER_AGENTS_MANAGER
+
+
+class UserAgentsManager:
+    def __init__(self) -> None:
+        self._user_agents: list[str] | None = None
+        self._user_agents_text_path = os.path.join(os.path.dirname(__file__), "user-agents.txt")
+
+    def get_scrape_headers(self, user_agent: str | None = None) -> dict[str, str]:
+        # From: https://scrapeops.io/web-scraping-playbook/403-forbidden-error-web-scraping/#optimize-request-headers
+        if user_agent is None:
+            user_agent = random.choice(self.user_agents)
+
+        return {
+            "User-Agent": user_agent,
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.5",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "none",
+            "Sec-Fetch-User": "?1",
+            "Cache-Control": "max-age=0",
+        }
+
+    @property
+    def user_agents(self) -> list[str]:
+        if not self._user_agents:
+            self._user_agents = self._fetch_user_agents()
+
+        return self._user_agents
+
+    def _fetch_user_agents(self) -> list[str]:
+        user_agents: list[str] = []
+
+        try:
+            from recipe_scrapers._abstract import HEADERS
+
+            user_agents.append(HEADERS["User-Agent"])
+        except (ImportError, KeyError):
+            user_agents.append("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0")
+
+        with open(self._user_agents_text_path) as f:
+            for line in f:
+                if not line:
+                    continue
+
+                user_agents.append(line.strip())
+
+        return user_agents
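A quick, hypothetical way to poke at the new manager from a Python shell (assumes a Mealie checkout on the import path; nothing here is part of the commit itself):

from mealie.services.scraper.user_agents_manager import get_user_agents_manager

manager = get_user_agents_manager()

# The first entry is the recipe_scrapers default (or the Firefox fallback),
# followed by the agents read from user-agents.txt.
print(manager.user_agents)

# Pass an agent explicitly to reuse one that already worked...
print(manager.get_scrape_headers(manager.user_agents[0])["User-Agent"])

# ...or omit it to get a random agent plus the browser-like headers.
print(sorted(manager.get_scrape_headers()))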