""" Web scraping functionality for price tracking """ import asyncio import aiohttp import logging import random import re from typing import Dict, List, Optional, Any, Tuple from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup from fake_useragent import UserAgent from .config import Config logger = logging.getLogger(__name__) class PriceScraper: """Base class for price scraping functionality.""" def __init__(self, config: Config): self.config = config self.ua = UserAgent() self.session = None async def __aenter__(self): """Async context manager entry.""" connector = aiohttp.TCPConnector(limit=self.config.max_concurrent_requests) timeout = aiohttp.ClientTimeout(total=self.config.timeout) self.session = aiohttp.ClientSession( connector=connector, timeout=timeout, headers={'User-Agent': self.ua.random} ) return self async def __aexit__(self, exc_type, exc_val, exc_tb): """Async context manager exit.""" if self.session: await self.session.close() def _get_headers(self, url: str = None) -> Dict[str, str]: """Get request headers with random user agent and site-specific headers.""" user_agents = self.config.user_agents if user_agents: user_agent = random.choice(user_agents) else: user_agent = self.ua.random headers = { 'User-Agent': user_agent, 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'DNT': '1', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'none', } # Add site-specific headers if url: if 'amazon.co.uk' in url: headers.update({ 'Referer': 'https://www.amazon.co.uk/', }) elif 'jjfoodservice.com' in url: headers.update({ 'Referer': 'https://www.jjfoodservice.com/', }) elif 'atoz-catering.co.uk' in url: headers.update({ 'Referer': 'https://www.atoz-catering.co.uk/', }) return headers async def _fetch_page(self, url: str) -> Optional[str]: """Fetch a web page with retry logic and anti-bot measures.""" base_delay = random.uniform(1, 3) # Random delay between 1-3 seconds for attempt in range(self.config.retry_attempts): try: # Add delay before each request (except first) if attempt > 0: delay = base_delay * (2 ** attempt) + random.uniform(0, 1) await asyncio.sleep(delay) headers = self._get_headers(url) async with self.session.get(url, headers=headers) as response: if response.status == 200: return await response.text() elif response.status == 403: logger.warning(f"Access denied (403) for {url} - may be blocked by anti-bot measures") # For 403 errors, wait longer before retry if attempt < self.config.retry_attempts - 1: await asyncio.sleep(random.uniform(5, 10)) elif response.status == 429: logger.warning(f"Rate limited (429) for {url}") # For rate limiting, wait even longer if attempt < self.config.retry_attempts - 1: await asyncio.sleep(random.uniform(10, 20)) else: logger.warning(f"HTTP {response.status} for {url}") except Exception as e: logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}") if attempt < self.config.retry_attempts - 1: await asyncio.sleep(base_delay * (2 ** attempt)) logger.error(f"Failed to fetch {url} after {self.config.retry_attempts} attempts") return None def _extract_price(self, soup: BeautifulSoup, selectors: List[str]) -> Optional[float]: """Extract price from HTML using CSS selectors.""" for selector in selectors: try: elements = soup.select(selector) for element in elements: price_text = 
element.get_text(strip=True) price = self._parse_price(price_text) if price is not None: return price except Exception as e: logger.debug(f"Error with selector {selector}: {e}") continue return None def _parse_price(self, price_text: str) -> Optional[float]: """Parse price from text string.""" if not price_text: return None # Remove common currency symbols and clean text price_text = re.sub(r'[^\d.,]+', '', price_text) price_text = price_text.replace(',', '') # Try to extract price as float try: return float(price_text) except (ValueError, TypeError): # Try to find price pattern price_match = re.search(r'(\d+\.?\d*)', price_text) if price_match: return float(price_match.group(1)) return None def _extract_text(self, soup: BeautifulSoup, selectors: List[str]) -> Optional[str]: """Extract text from HTML using CSS selectors.""" for selector in selectors: try: element = soup.select_one(selector) if element: return element.get_text(strip=True) except Exception as e: logger.debug(f"Error with selector {selector}: {e}") continue return None def _detect_site(self, url: str) -> Optional[str]: """Detect which site this URL belongs to.""" domain = urlparse(url).netloc.lower() # UK Catering sites (handled by UKCateringScraper) if 'jjfoodservice.com' in domain: return 'jjfoodservice' elif 'atoz-catering.co.uk' in domain: return 'atoz_catering' elif 'amazon.co.uk' in domain: return 'amazon_uk' # International sites (handled by base PriceScraper) elif 'amazon.com' in domain or 'amazon.' in domain: return 'amazon' elif 'ebay' in domain: return 'ebay' elif 'walmart' in domain: return 'walmart' return None async def scrape_product_price(self, url: str, site_name: str = None) -> Dict[str, Any]: """Scrape price for a single product from a URL.""" result = { 'success': False, 'price': None, 'currency': 'GBP', 'title': None, 'availability': None, 'url': url, 'error': None } try: # Auto-detect site if not provided if not site_name: site_name = self._detect_site(url) if not site_name: result['error'] = "Could not detect site from URL" return result # Get site configuration site_config = self.config.get_site_config(site_name) if not site_config: result['error'] = f"No configuration found for site: {site_name}" return result if not self.config.is_site_enabled(site_name): result['error'] = f"Site {site_name} is disabled" return result # Fetch page content html_content = await self._fetch_page(url) if not html_content: result['error'] = "Failed to fetch page content" return result # Parse HTML soup = BeautifulSoup(html_content, 'html.parser') # Extract price price_selectors = site_config.get('selectors', {}).get('price', []) price = self._extract_price(soup, price_selectors) if price is None: result['error'] = "Could not extract price from page" return result # Extract additional information title_selectors = site_config.get('selectors', {}).get('title', []) title = self._extract_text(soup, title_selectors) availability_selectors = site_config.get('selectors', {}).get('availability', []) availability_text = self._extract_text(soup, availability_selectors) availability = self._parse_availability(availability_text) result.update({ 'success': True, 'price': price, 'title': title, 'availability': availability }) logger.info(f"Successfully scraped {site_name}: ${price}") except Exception as e: logger.error(f"Error scraping {url}: {e}") result['error'] = str(e) return result def _parse_availability(self, availability_text: str) -> bool: """Parse availability from text.""" if not availability_text: return True # Assume 
available if no info availability_text = availability_text.lower() # Common out of stock indicators out_of_stock_indicators = [ 'out of stock', 'unavailable', 'sold out', 'not available', 'temporarily out of stock', 'currently unavailable' ] for indicator in out_of_stock_indicators: if indicator in availability_text: return False return True def should_use_uk_scraper(self, url: str) -> bool: """Determine if this URL should use the UK catering scraper.""" site_name = self._detect_site(url) uk_sites = {'jjfoodservice', 'atoz_catering', 'amazon_uk'} return site_name in uk_sites @classmethod def get_uk_catering_sites(cls) -> set: """Get the list of UK catering sites.""" return {'jjfoodservice', 'atoz_catering', 'amazon_uk'} class ScraperManager: """Manages multiple price scrapers and coordinates scraping tasks.""" def __init__(self, config: Config): self.config = config self.semaphore = asyncio.Semaphore(config.max_concurrent_requests) async def scrape_product(self, product: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: """Scrape prices for a single product across all configured sites.""" product_id = product['id'] urls = product['urls'] results = {} async with PriceScraper(self.config) as scraper: tasks = [] for site_name, url in urls.items(): if self.config.is_site_enabled(site_name): task = self._scrape_with_semaphore(scraper, url, site_name) tasks.append((site_name, task)) # Add delay between requests await asyncio.sleep(self.config.delay_between_requests) # Wait for all tasks to complete for site_name, task in tasks: try: result = await task results[site_name] = result except Exception as e: logger.error(f"Error scraping {site_name} for product {product_id}: {e}") results[site_name] = { 'success': False, 'error': str(e) } return results async def _scrape_with_semaphore(self, scraper: PriceScraper, url: str, site_name: str): """Scrape with semaphore to limit concurrent requests.""" async with self.semaphore: return await scraper.scrape_product_price(url, site_name) async def scrape_all_products(self, products: List[Dict[str, Any]]) -> Dict[int, Dict[str, Dict[str, Any]]]: """Scrape prices for all products.""" results = {} for product in products: try: product_id = product['id'] logger.info(f"Scraping product: {product['name']} (ID: {product_id})") product_results = await self.scrape_product(product) results[product_id] = product_results # Add delay between products await asyncio.sleep(self.config.delay_between_requests) except Exception as e: logger.error(f"Error scraping product {product.get('id', 'unknown')}: {e}") return results
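

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). This assumes Config() can be constructed
# with defaults and that each product dict carries the 'id', 'name' and 'urls'
# keys that scrape_all_products() expects; the product below and its URL are
# hypothetical placeholders, not real data. Because of the relative import of
# Config above, run this as a module within its package, e.g.
# `python -m <your_package>.scraper` (package name is an assumption).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    async def _demo() -> None:
        config = Config()  # assumed default construction
        manager = ScraperManager(config)
        products = [
            {
                'id': 1,
                'name': 'Example product',
                'urls': {
                    'jjfoodservice': 'https://www.jjfoodservice.com/product/example',  # placeholder URL
                },
            }
        ]
        results = await manager.scrape_all_products(products)
        for product_id, site_results in results.items():
            for site_name, result in site_results.items():
                print(product_id, site_name, result.get('price'), result.get('error'))

    asyncio.run(_demo())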