"""
|
|
Web scraping functionality for price tracking
|
|
"""
|
|
|
|
import asyncio
|
|
import aiohttp
|
|
import logging
|
|
import random
|
|
import re
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
from urllib.parse import urljoin, urlparse
|
|
from bs4 import BeautifulSoup
|
|
from fake_useragent import UserAgent
|
|
|
|
from .config import Config
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PriceScraper:
    """Base class for price scraping functionality."""

    def __init__(self, config: Config):
        self.config = config
        self.ua = UserAgent()
        self.session = None

    async def __aenter__(self):
        """Async context manager entry."""
        connector = aiohttp.TCPConnector(limit=self.config.max_concurrent_requests)
        timeout = aiohttp.ClientTimeout(total=self.config.timeout)
        self.session = aiohttp.ClientSession(
            connector=connector,
            timeout=timeout,
            headers={'User-Agent': self.ua.random}
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        if self.session:
            await self.session.close()

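    # Illustrative usage (a sketch): PriceScraper is an async context manager,
    # so the aiohttp session is opened on entry and always closed on exit:
    #
    #     async with PriceScraper(config) as scraper:
    #         result = await scraper.scrape_product_price(url)
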
    def _get_headers(self, url: Optional[str] = None) -> Dict[str, str]:
        """Get request headers with a random user agent and site-specific headers."""
        user_agents = self.config.user_agents
        if user_agents:
            user_agent = random.choice(user_agents)
        else:
            user_agent = self.ua.random

        headers = {
            'User-Agent': user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
        }

        # Add site-specific headers
        if url:
            if 'amazon.co.uk' in url:
                headers.update({
                    'Referer': 'https://www.amazon.co.uk/',
                })
            elif 'jjfoodservice.com' in url:
                headers.update({
                    'Referer': 'https://www.jjfoodservice.com/',
                })
            elif 'atoz-catering.co.uk' in url:
                headers.update({
                    'Referer': 'https://www.atoz-catering.co.uk/',
                })

        return headers

    async def _fetch_page(self, url: str) -> Optional[str]:
        """Fetch a web page with retry logic and anti-bot measures."""
        base_delay = random.uniform(1, 3)  # Random delay between 1-3 seconds

        for attempt in range(self.config.retry_attempts):
            try:
                # Add delay before each request (except the first)
                if attempt > 0:
                    delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                    await asyncio.sleep(delay)

                headers = self._get_headers(url)

                async with self.session.get(url, headers=headers) as response:
                    if response.status == 200:
                        return await response.text()
                    elif response.status == 403:
                        logger.warning(f"Access denied (403) for {url} - may be blocked by anti-bot measures")
                        # For 403 errors, wait longer before retrying
                        if attempt < self.config.retry_attempts - 1:
                            await asyncio.sleep(random.uniform(5, 10))
                    elif response.status == 429:
                        logger.warning(f"Rate limited (429) for {url}")
                        # For rate limiting, wait even longer
                        if attempt < self.config.retry_attempts - 1:
                            await asyncio.sleep(random.uniform(10, 20))
                    else:
                        logger.warning(f"HTTP {response.status} for {url}")

            except Exception as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt < self.config.retry_attempts - 1:
                    await asyncio.sleep(base_delay * (2 ** attempt))

        logger.error(f"Failed to fetch {url} after {self.config.retry_attempts} attempts")
        return None

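    # Retry timing of _fetch_page (illustrative): with base_delay drawn from 1-3 s,
    # the second attempt waits roughly base_delay * 2 plus jitter, the third roughly
    # base_delay * 4, and so on; 403/429 responses add their own longer waits.
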
    def _extract_price(self, soup: BeautifulSoup, selectors: List[str]) -> Optional[float]:
        """Extract price from HTML using CSS selectors."""
        for selector in selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_price(price_text)
                    if price is not None:
                        return price
            except Exception as e:
                logger.debug(f"Error with selector {selector}: {e}")
                continue

        return None

    def _parse_price(self, price_text: str) -> Optional[float]:
        """Parse price from text string."""
        if not price_text:
            return None

        # Remove common currency symbols and clean text
        price_text = re.sub(r'[^\d.,]+', '', price_text)
        price_text = price_text.replace(',', '')

        # Try to extract price as float
        try:
            return float(price_text)
        except (ValueError, TypeError):
            # Try to find price pattern
            price_match = re.search(r'(\d+\.?\d*)', price_text)
            if price_match:
                return float(price_match.group(1))

        return None

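    # Example behaviour of _parse_price (illustrative):
    #   "£12.99"       -> 12.99
    #   "1,299.50"     -> 1299.5
    #   "Out of stock" -> None
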
    def _extract_text(self, soup: BeautifulSoup, selectors: List[str]) -> Optional[str]:
        """Extract text from HTML using CSS selectors."""
        for selector in selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    return element.get_text(strip=True)
            except Exception as e:
                logger.debug(f"Error with selector {selector}: {e}")
                continue

        return None

    def _detect_site(self, url: str) -> Optional[str]:
        """Detect which site this URL belongs to."""
        domain = urlparse(url).netloc.lower()

        # UK catering sites (handled by UKCateringScraper)
        if 'jjfoodservice.com' in domain:
            return 'jjfoodservice'
        elif 'atoz-catering.co.uk' in domain:
            return 'atoz_catering'
        elif 'amazon.co.uk' in domain:
            return 'amazon_uk'

        # International sites (handled by the base PriceScraper)
        elif 'amazon.com' in domain or 'amazon.' in domain:
            return 'amazon'
        elif 'ebay' in domain:
            return 'ebay'
        elif 'walmart' in domain:
            return 'walmart'

        return None

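    # Example behaviour of _detect_site (illustrative):
    #   'https://www.jjfoodservice.com/product/123' -> 'jjfoodservice'
    #   a URL whose domain matches none of the rules -> None
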
    async def scrape_product_price(self, url: str, site_name: Optional[str] = None) -> Dict[str, Any]:
        """Scrape price for a single product from a URL."""
        result = {
            'success': False,
            'price': None,
            'currency': 'GBP',
            'title': None,
            'availability': None,
            'url': url,
            'error': None
        }

        try:
            # Auto-detect site if not provided
            if not site_name:
                site_name = self._detect_site(url)
                if not site_name:
                    result['error'] = "Could not detect site from URL"
                    return result

            # Get site configuration
            site_config = self.config.get_site_config(site_name)
            if not site_config:
                result['error'] = f"No configuration found for site: {site_name}"
                return result

            if not self.config.is_site_enabled(site_name):
                result['error'] = f"Site {site_name} is disabled"
                return result

            # Fetch page content
            html_content = await self._fetch_page(url)
            if not html_content:
                result['error'] = "Failed to fetch page content"
                return result

            # Parse HTML
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract price
            price_selectors = site_config.get('selectors', {}).get('price', [])
            price = self._extract_price(soup, price_selectors)

            if price is None:
                result['error'] = "Could not extract price from page"
                return result

            # Extract additional information
            title_selectors = site_config.get('selectors', {}).get('title', [])
            title = self._extract_text(soup, title_selectors)

            availability_selectors = site_config.get('selectors', {}).get('availability', [])
            availability_text = self._extract_text(soup, availability_selectors)
            availability = self._parse_availability(availability_text)

            result.update({
                'success': True,
                'price': price,
                'title': title,
                'availability': availability
            })

            logger.info(f"Successfully scraped {site_name}: {price} {result['currency']}")

        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")
            result['error'] = str(e)

        return result

    def _parse_availability(self, availability_text: Optional[str]) -> bool:
        """Parse availability from text."""
        if not availability_text:
            return True  # Assume available if no info

        availability_text = availability_text.lower()

        # Common out of stock indicators
        out_of_stock_indicators = [
            'out of stock', 'unavailable', 'sold out', 'not available',
            'temporarily out of stock', 'currently unavailable'
        ]

        for indicator in out_of_stock_indicators:
            if indicator in availability_text:
                return False

        return True

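    # Example behaviour of _parse_availability (illustrative):
    #   "In Stock"                 -> True
    #   "Currently unavailable"    -> False
    #   None (no selector matched) -> True
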
    def should_use_uk_scraper(self, url: str) -> bool:
        """Determine whether this URL should use the UK catering scraper."""
        site_name = self._detect_site(url)
        uk_sites = {'jjfoodservice', 'atoz_catering', 'amazon_uk'}
        return site_name in uk_sites

    @classmethod
    def get_uk_catering_sites(cls) -> set:
        """Get the set of UK catering sites."""
        return {'jjfoodservice', 'atoz_catering', 'amazon_uk'}


class ScraperManager:
    """Manages multiple price scrapers and coordinates scraping tasks."""

    def __init__(self, config: Config):
        self.config = config
        self.semaphore = asyncio.Semaphore(config.max_concurrent_requests)

    async def scrape_product(self, product: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
        """Scrape prices for a single product across all configured sites."""
        product_id = product['id']
        urls = product['urls']

        results = {}

        async with PriceScraper(self.config) as scraper:
            tasks = []

            for site_name, url in urls.items():
                if self.config.is_site_enabled(site_name):
                    # Schedule the scrape as a task so requests can run
                    # concurrently, bounded by the semaphore
                    task = asyncio.create_task(
                        self._scrape_with_semaphore(scraper, url, site_name)
                    )
                    tasks.append((site_name, task))

                # Add delay between requests
                await asyncio.sleep(self.config.delay_between_requests)

            # Wait for all tasks to complete
            for site_name, task in tasks:
                try:
                    result = await task
                    results[site_name] = result
                except Exception as e:
                    logger.error(f"Error scraping {site_name} for product {product_id}: {e}")
                    results[site_name] = {
                        'success': False,
                        'error': str(e)
                    }

        return results

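    # Shape of the mapping returned by scrape_product (illustrative):
    #   {'jjfoodservice': {'success': True, 'price': 12.99, 'currency': 'GBP',
    #                      'title': '...', 'availability': True,
    #                      'url': 'https://...', 'error': None}}
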
    async def _scrape_with_semaphore(self, scraper: PriceScraper, url: str, site_name: str):
        """Scrape with semaphore to limit concurrent requests."""
        async with self.semaphore:
            return await scraper.scrape_product_price(url, site_name)

    async def scrape_all_products(self, products: List[Dict[str, Any]]) -> Dict[int, Dict[str, Dict[str, Any]]]:
        """Scrape prices for all products."""
        results = {}

        for product in products:
            try:
                product_id = product['id']
                logger.info(f"Scraping product: {product['name']} (ID: {product_id})")

                product_results = await self.scrape_product(product)
                results[product_id] = product_results

                # Add delay between products
                await asyncio.sleep(self.config.delay_between_requests)

            except Exception as e:
                logger.error(f"Error scraping product {product.get('id', 'unknown')}: {e}")

        return results
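

# Example usage (a sketch, not part of the module's public API; assumes Config()
# can be constructed with defaults and that each product dict provides 'id',
# 'name', and a {site_name: url} mapping under 'urls'):
#
#     import asyncio
#     from .config import Config
#
#     async def main() -> None:
#         config = Config()
#         manager = ScraperManager(config)
#         products = [{
#             'id': 1,
#             'name': 'Sunflower oil 5L',
#             'urls': {'jjfoodservice': 'https://www.jjfoodservice.com/...'},
#         }]
#         results = await manager.scrape_all_products(products)
#         print(results)
#
#     asyncio.run(main())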