mirror of https://github.com/mealie-recipes/mealie.git

feat: Better Scraping/More User Agents (#5091)
parent 173e8792a6
commit 48484e5b1a

4 changed files with 109 additions and 32 deletions
@@ -9,13 +9,7 @@ from mealie.pkgs import img, safehttp
 from mealie.pkgs.safehttp.transport import AsyncSafeTransport
 from mealie.schema.recipe.recipe import Recipe
 from mealie.services._base_service import BaseService
-
-try:
-    from recipe_scrapers._abstract import HEADERS
-
-    _FIREFOX_UA = HEADERS["User-Agent"]
-except (ImportError, KeyError):
-    _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0"
+from mealie.services.scraper.user_agents_manager import get_user_agents_manager


 async def gather_with_concurrency(n, *coros, ignore_exceptions=False):
@@ -32,13 +26,15 @@ async def gather_with_concurrency(n, *coros, ignore_exceptions=False):


 async def largest_content_len(urls: list[str]) -> tuple[str, int]:
+    user_agent_manager = get_user_agents_manager()
+
     largest_url = ""
     largest_len = 0

     max_concurrency = 10

     async def do(client: AsyncClient, url: str) -> Response:
-        return await client.head(url, headers={"User-Agent": _FIREFOX_UA})
+        return await client.head(url, headers=user_agent_manager.get_scrape_headers())

     async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
         tasks = [do(client, url) for url in urls]
@@ -110,6 +106,7 @@ class RecipeDataService(BaseService):

     async def scrape_image(self, image_url: str | dict[str, str] | list[str]) -> None:
         self.logger.info(f"Image URL: {image_url}")
+        user_agent = get_user_agents_manager().user_agents[0]

         image_url_str = ""

@@ -140,7 +137,7 @@ class RecipeDataService(BaseService):

         async with AsyncClient(transport=AsyncSafeTransport()) as client:
             try:
-                r = await client.get(image_url_str, headers={"User-Agent": _FIREFOX_UA})
+                r = await client.get(image_url_str, headers={"User-Agent": user_agent})
             except Exception:
                 self.logger.exception("Fatal Image Request Exception")
                 return None
@@ -6,7 +6,7 @@ from typing import Any
 import bs4
 import extruct
 from fastapi import HTTPException, status
-from httpx import AsyncClient
+from httpx import AsyncClient, Response
 from recipe_scrapers import NoSchemaFoundInWildMode, SchemaScraperFactory, scrape_html
 from slugify import slugify
 from w3lib.html import get_base_url
@@ -20,16 +20,10 @@ from mealie.services.openai import OpenAIService
 from mealie.services.scraper.scraped_extras import ScrapedExtras

 from . import cleaner
-
-try:
-    from recipe_scrapers._abstract import HEADERS
-
-    _FIREFOX_UA = HEADERS["User-Agent"]
-except (ImportError, KeyError):
-    _FIREFOX_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0"
-
+from .user_agents_manager import get_user_agents_manager

 SCRAPER_TIMEOUT = 15
+logger = get_logger()


 class ForceTimeoutException(Exception):
@@ -42,11 +36,26 @@ async def safe_scrape_html(url: str) -> str:
     if the request takes longer than 15 seconds. This is used to mitigate
     DDOS attacks from users providing a url with arbitrary large content.
     """
+    user_agents_manager = get_user_agents_manager()
+
+    logger.debug(f"Scraping URL: {url}")
     async with AsyncClient(transport=safehttp.AsyncSafeTransport()) as client:
-        html_bytes = b""
-        async with client.stream(
-            "GET", url, timeout=SCRAPER_TIMEOUT, headers={"User-Agent": _FIREFOX_UA}, follow_redirects=True
-        ) as resp:
-            start_time = time.time()
-
-            async for chunk in resp.aiter_bytes(chunk_size=1024):
+        for user_agent in user_agents_manager.user_agents:
+            logger.debug(f'Trying User-Agent: "{user_agent}"')
+
+            response: Response | None = None
+            html_bytes = b""
+            async with client.stream(
+                "GET",
+                url,
+                timeout=SCRAPER_TIMEOUT,
+                headers=user_agents_manager.get_scrape_headers(user_agent),
+                follow_redirects=True,
+            ) as resp:
+                if resp.status_code == status.HTTP_403_FORBIDDEN:
+                    logger.debug(f'403 Forbidden with User-Agent: "{user_agent}"')
+                    continue
+
+                start_time = time.time()
+
+                async for chunk in resp.aiter_bytes(chunk_size=1024):
@@ -55,19 +64,22 @@ async def safe_scrape_html(url: str) -> str:
-                if time.time() - start_time > SCRAPER_TIMEOUT:
-                    raise ForceTimeoutException()
+                    if time.time() - start_time > SCRAPER_TIMEOUT:
+                        raise ForceTimeoutException()

+            response = resp
+            break
+
+    if not (response and html_bytes):
+        return ""
+
     # =====================================
     # Copied from requests text property

     # Try charset from content-type
     content = None
-    encoding = resp.encoding
-
-    if not html_bytes:
-        return ""
+    encoding = response.encoding

     # Fallback to auto-detected encoding.
     if encoding is None:
-        encoding = resp.apparent_encoding
+        encoding = response.apparent_encoding

     # Decode unicode from given encoding.
     try:
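Note: the two hunks above boil down to a rotate-User-Agent-and-retry-on-403 loop. Below is a minimal standalone sketch of that pattern (not Mealie code): the USER_AGENTS list and fetch_html name are illustrative, and it deliberately skips Mealie's streaming/size-guard and header-set details.

import asyncio

import httpx

# Hypothetical list for illustration; Mealie loads its list from user-agents.txt.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0",
    "Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.6834.164 Mobile Safari/537.36 EdgA/131.0.2903.87",
]


async def fetch_html(url: str, timeout: int = 15) -> str:
    async with httpx.AsyncClient() as client:
        for user_agent in USER_AGENTS:
            resp = await client.get(
                url,
                timeout=timeout,
                headers={"User-Agent": user_agent},
                follow_redirects=True,
            )
            if resp.status_code == 403:
                # Blocked by the site: rotate to the next User-Agent.
                continue
            return resp.text
    # Every User-Agent was rejected.
    return ""


if __name__ == "__main__":
    print(asyncio.run(fetch_html("https://example.com"))[:200])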
mealie/services/scraper/user-agents.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
+Mozilla/5.0 (iPhone; CPU iPhone OS 18_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Mobile/15E148 Safari/604.
+Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.
+Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.6834.164 Mobile Safari/537.36 EdgA/131.0.2903.87
mealie/services/scraper/user_agents_manager.py (new file, 65 lines)
@@ -0,0 +1,65 @@
+from __future__ import annotations
+
+import os
+import random
+
+_USER_AGENTS_MANAGER: UserAgentsManager | None = None
+
+
+def get_user_agents_manager() -> UserAgentsManager:
+    global _USER_AGENTS_MANAGER
+
+    if not _USER_AGENTS_MANAGER:
+        _USER_AGENTS_MANAGER = UserAgentsManager()
+
+    return _USER_AGENTS_MANAGER
+
+
+class UserAgentsManager:
+    def __init__(self) -> None:
+        self._user_agents: list[str] | None = None
+        self._user_agents_text_path = os.path.join(os.path.dirname(__file__), "user-agents.txt")
+
+    def get_scrape_headers(self, user_agent: str | None = None) -> dict[str, str]:
+        # From: https://scrapeops.io/web-scraping-playbook/403-forbidden-error-web-scraping/#optimize-request-headers
+        if user_agent is None:
+            user_agent = random.choice(self.user_agents)
+
+        return {
+            "User-Agent": user_agent,
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.5",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "none",
+            "Sec-Fetch-User": "?1",
+            "Cache-Control": "max-age=0",
+        }
+
+    @property
+    def user_agents(self) -> list[str]:
+        if not self._user_agents:
+            self._user_agents = self._fetch_user_agents()
+
+        return self._user_agents
+
+    def _fetch_user_agents(self) -> list[str]:
+        user_agents: list[str] = []
+
+        try:
+            from recipe_scrapers._abstract import HEADERS
+
+            user_agents.append(HEADERS["User-Agent"])
+        except (ImportError, KeyError):
+            user_agents.append("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/128.0")
+
+        with open(self._user_agents_text_path) as f:
+            for line in f:
+                if not line:
+                    continue
+                user_agents.append(line.strip())
+
+        return user_agents
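Note: a hypothetical usage sketch of the new module (assumes the mealie package is importable; every call used below appears in the file above).

from mealie.services.scraper.user_agents_manager import get_user_agents_manager

manager = get_user_agents_manager()             # lazily created module-level singleton
print(manager.user_agents[0])                   # recipe_scrapers' UA, or the Firefox fallback
print(manager.get_scrape_headers())             # browser-like header set with a random User-Agent
print(manager.get_scrape_headers("MyBot/1.0"))  # or pin a specific (hypothetical) User-Agent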