"""
Web scraping functionality for price tracking
"""
import asyncio
import aiohttp
import logging
import random
import re
from typing import Dict, List, Optional, Any, Tuple
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

from .config import Config

logger = logging.getLogger(__name__)

class PriceScraper:
"""Base class for price scraping functionality."""
def __init__(self, config: Config):
self.config = config
self.ua = UserAgent()
self.session = None
async def __aenter__(self):
"""Async context manager entry."""
connector = aiohttp.TCPConnector(limit=self.config.max_concurrent_requests)
timeout = aiohttp.ClientTimeout(total=self.config.timeout)
self.session = aiohttp.ClientSession(
connector=connector,
timeout=timeout,
headers={'User-Agent': self.ua.random}
)
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
"""Async context manager exit."""
if self.session:
await self.session.close()

    def _get_headers(self, url: Optional[str] = None) -> Dict[str, str]:
        """Get request headers with random user agent and site-specific headers."""
        user_agents = self.config.user_agents
        if user_agents:
            user_agent = random.choice(user_agents)
        else:
            user_agent = self.ua.random
        headers = {
            'User-Agent': user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
        }
        # Add site-specific headers
        if url:
            if 'amazon.co.uk' in url:
                headers.update({
                    'Referer': 'https://www.amazon.co.uk/',
                })
            elif 'jjfoodservice.com' in url:
                headers.update({
                    'Referer': 'https://www.jjfoodservice.com/',
                })
            elif 'atoz-catering.co.uk' in url:
                headers.update({
                    'Referer': 'https://www.atoz-catering.co.uk/',
                })
        return headers

    async def _fetch_page(self, url: str) -> Optional[str]:
        """Fetch a web page with retry logic and anti-bot measures."""
        base_delay = random.uniform(1, 3)  # Random delay between 1-3 seconds
        for attempt in range(self.config.retry_attempts):
            try:
                # Add delay before each request (except first)
                if attempt > 0:
                    delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                    await asyncio.sleep(delay)
                headers = self._get_headers(url)
                async with self.session.get(url, headers=headers) as response:
                    if response.status == 200:
                        return await response.text()
                    elif response.status == 403:
                        logger.warning(f"Access denied (403) for {url} - may be blocked by anti-bot measures")
                        # For 403 errors, wait longer before retry
                        if attempt < self.config.retry_attempts - 1:
                            await asyncio.sleep(random.uniform(5, 10))
                    elif response.status == 429:
                        logger.warning(f"Rate limited (429) for {url}")
                        # For rate limiting, wait even longer
                        if attempt < self.config.retry_attempts - 1:
                            await asyncio.sleep(random.uniform(10, 20))
                    else:
                        logger.warning(f"HTTP {response.status} for {url}")
            except Exception as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt < self.config.retry_attempts - 1:
                    await asyncio.sleep(base_delay * (2 ** attempt))
        logger.error(f"Failed to fetch {url} after {self.config.retry_attempts} attempts")
        return None

    def _extract_price(self, soup: BeautifulSoup, selectors: List[str]) -> Optional[float]:
        """Extract price from HTML using CSS selectors."""
        for selector in selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_price(price_text)
                    if price is not None:
                        return price
            except Exception as e:
                logger.debug(f"Error with selector {selector}: {e}")
                continue
        return None

    def _parse_price(self, price_text: str) -> Optional[float]:
        """Parse price from text string."""
        if not price_text:
            return None
        # Remove common currency symbols and clean text
        price_text = re.sub(r'[^\d.,]+', '', price_text)
        price_text = price_text.replace(',', '')
        # Try to extract price as float
        try:
            return float(price_text)
        except (ValueError, TypeError):
            # Try to find price pattern
            price_match = re.search(r'(\d+\.?\d*)', price_text)
            if price_match:
                return float(price_match.group(1))
        return None

    def _extract_text(self, soup: BeautifulSoup, selectors: List[str]) -> Optional[str]:
        """Extract text from HTML using CSS selectors."""
        for selector in selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    return element.get_text(strip=True)
            except Exception as e:
                logger.debug(f"Error with selector {selector}: {e}")
                continue
        return None

    def _detect_site(self, url: str) -> Optional[str]:
        """Detect which site this URL belongs to."""
        domain = urlparse(url).netloc.lower()
        # UK Catering sites (handled by UKCateringScraper)
        if 'jjfoodservice.com' in domain:
            return 'jjfoodservice'
        elif 'atoz-catering.co.uk' in domain:
            return 'atoz_catering'
        elif 'amazon.co.uk' in domain:
            return 'amazon_uk'
        # International sites (handled by base PriceScraper)
        elif 'amazon.com' in domain or 'amazon.' in domain:
            return 'amazon'
        elif 'ebay' in domain:
            return 'ebay'
        elif 'walmart' in domain:
            return 'walmart'
        return None

    async def scrape_product_price(self, url: str, site_name: Optional[str] = None) -> Dict[str, Any]:
        """Scrape price for a single product from a URL."""
        result = {
            'success': False,
            'price': None,
            'currency': 'GBP',
            'title': None,
            'availability': None,
            'url': url,
            'error': None
        }
        try:
            # Auto-detect site if not provided
            if not site_name:
                site_name = self._detect_site(url)
                if not site_name:
                    result['error'] = "Could not detect site from URL"
                    return result
            # Get site configuration
            site_config = self.config.get_site_config(site_name)
            if not site_config:
                result['error'] = f"No configuration found for site: {site_name}"
                return result
            if not self.config.is_site_enabled(site_name):
                result['error'] = f"Site {site_name} is disabled"
                return result
            # Fetch page content
            html_content = await self._fetch_page(url)
            if not html_content:
                result['error'] = "Failed to fetch page content"
                return result
            # Parse HTML
            soup = BeautifulSoup(html_content, 'html.parser')
            # Extract price
            price_selectors = site_config.get('selectors', {}).get('price', [])
            price = self._extract_price(soup, price_selectors)
            if price is None:
                result['error'] = "Could not extract price from page"
                return result
            # Extract additional information
            title_selectors = site_config.get('selectors', {}).get('title', [])
            title = self._extract_text(soup, title_selectors)
            availability_selectors = site_config.get('selectors', {}).get('availability', [])
            availability_text = self._extract_text(soup, availability_selectors)
            availability = self._parse_availability(availability_text)
            result.update({
                'success': True,
                'price': price,
                'title': title,
                'availability': availability
            })
            logger.info(f"Successfully scraped {site_name}: £{price}")
        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")
            result['error'] = str(e)
        return result

    def _parse_availability(self, availability_text: Optional[str]) -> bool:
        """Parse availability from text."""
        if not availability_text:
            return True  # Assume available if no info
        availability_text = availability_text.lower()
        # Common out of stock indicators
        out_of_stock_indicators = [
            'out of stock', 'unavailable', 'sold out', 'not available',
            'temporarily out of stock', 'currently unavailable'
        ]
        for indicator in out_of_stock_indicators:
            if indicator in availability_text:
                return False
        return True

    def should_use_uk_scraper(self, url: str) -> bool:
        """Determine if this URL should use the UK catering scraper."""
        site_name = self._detect_site(url)
        uk_sites = {'jjfoodservice', 'atoz_catering', 'amazon_uk'}
        return site_name in uk_sites

    @classmethod
    def get_uk_catering_sites(cls) -> set:
        """Get the set of UK catering sites."""
        return {'jjfoodservice', 'atoz_catering', 'amazon_uk'}


class ScraperManager:
    """Manages multiple price scrapers and coordinates scraping tasks."""

    def __init__(self, config: Config):
        self.config = config
        self.semaphore = asyncio.Semaphore(config.max_concurrent_requests)

    async def scrape_product(self, product: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
        """Scrape prices for a single product across all configured sites."""
        product_id = product['id']
        urls = product['urls']
        results = {}
        async with PriceScraper(self.config) as scraper:
            tasks = []
            for site_name, url in urls.items():
                if self.config.is_site_enabled(site_name):
                    # Schedule the request as a task so sites are scraped
                    # concurrently, bounded by the semaphore
                    task = asyncio.create_task(
                        self._scrape_with_semaphore(scraper, url, site_name)
                    )
                    tasks.append((site_name, task))
                    # Stagger task start-up between requests
                    await asyncio.sleep(self.config.delay_between_requests)
            # Wait for all tasks to complete
            for site_name, task in tasks:
                try:
                    result = await task
                    results[site_name] = result
                except Exception as e:
                    logger.error(f"Error scraping {site_name} for product {product_id}: {e}")
                    results[site_name] = {
                        'success': False,
                        'error': str(e)
                    }
        return results

    async def _scrape_with_semaphore(self, scraper: PriceScraper, url: str, site_name: str):
        """Scrape with semaphore to limit concurrent requests."""
        async with self.semaphore:
            return await scraper.scrape_product_price(url, site_name)

    async def scrape_all_products(self, products: List[Dict[str, Any]]) -> Dict[int, Dict[str, Dict[str, Any]]]:
        """Scrape prices for all products."""
        results = {}
        for product in products:
            try:
                product_id = product['id']
                logger.info(f"Scraping product: {product['name']} (ID: {product_id})")
                product_results = await self.scrape_product(product)
                results[product_id] = product_results
                # Add delay between products
                await asyncio.sleep(self.config.delay_between_requests)
            except Exception as e:
                logger.error(f"Error scraping product {product.get('id', 'unknown')}: {e}")
        return results
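

# Minimal usage sketch, kept as a comment. Assumptions (not confirmed by this file):
# Config() can be constructed with defaults and exposes the attributes referenced
# above (max_concurrent_requests, retry_attempts, delay_between_requests, plus
# get_site_config()/is_site_enabled()). The product dict shape follows what
# scrape_product() expects: 'id', 'name', and a 'urls' mapping of site name -> URL;
# the example URL below is a placeholder.
#
# if __name__ == "__main__":
#     import json
#
#     async def main():
#         config = Config()  # assumed default constructor
#         manager = ScraperManager(config)
#         products = [{
#             'id': 1,
#             'name': 'Example product',
#             'urls': {'jjfoodservice': 'https://www.jjfoodservice.com/product/example'},
#         }]
#         results = await manager.scrape_all_products(products)
#         print(json.dumps(results, indent=2))
#
#     asyncio.run(main())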