Initial Push
src/scraper.py (new file, 334 lines)
@@ -0,0 +1,334 @@
"""
Web scraping functionality for price tracking.
"""

import asyncio
import aiohttp
import logging
import random
import re
from typing import Dict, List, Optional, Any, Tuple
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from fake_useragent import UserAgent

from .config import Config

logger = logging.getLogger(__name__)


class PriceScraper:
    """Base class for price scraping functionality."""

    def __init__(self, config: Config):
        self.config = config
        self.ua = UserAgent()
        self.session = None

    async def __aenter__(self):
        """Async context manager entry."""
        connector = aiohttp.TCPConnector(limit=self.config.max_concurrent_requests)
        timeout = aiohttp.ClientTimeout(total=self.config.timeout)
        self.session = aiohttp.ClientSession(
            connector=connector,
            timeout=timeout,
            headers={'User-Agent': self.ua.random}
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        if self.session:
            await self.session.close()

    def _get_headers(self, url: Optional[str] = None) -> Dict[str, str]:
        """Get request headers with a random user agent and site-specific headers."""
        user_agents = self.config.user_agents
        if user_agents:
            user_agent = random.choice(user_agents)
        else:
            user_agent = self.ua.random

        headers = {
            'User-Agent': user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
        }

        # Add site-specific Referer headers
        if url:
            if 'amazon.co.uk' in url:
                headers.update({
                    'Referer': 'https://www.amazon.co.uk/',
                })
            elif 'jjfoodservice.com' in url:
                headers.update({
                    'Referer': 'https://www.jjfoodservice.com/',
                })
            elif 'atoz-catering.co.uk' in url:
                headers.update({
                    'Referer': 'https://www.atoz-catering.co.uk/',
                })

        return headers

    async def _fetch_page(self, url: str) -> Optional[str]:
        """Fetch a web page with retry logic and anti-bot measures."""
        base_delay = random.uniform(1, 3)  # Random delay between 1-3 seconds

        for attempt in range(self.config.retry_attempts):
            try:
                # Exponential backoff with jitter before each retry (skipped on
                # the first attempt): roughly base_delay * 2, * 4, * 8, ... seconds.
                if attempt > 0:
                    delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                    await asyncio.sleep(delay)

                headers = self._get_headers(url)

                async with self.session.get(url, headers=headers) as response:
                    if response.status == 200:
                        return await response.text()
                    elif response.status == 403:
                        logger.warning(f"Access denied (403) for {url} - may be blocked by anti-bot measures")
                        # For 403 errors, wait longer before retrying
                        if attempt < self.config.retry_attempts - 1:
                            await asyncio.sleep(random.uniform(5, 10))
                    elif response.status == 429:
                        logger.warning(f"Rate limited (429) for {url}")
                        # For rate limiting, wait even longer
                        if attempt < self.config.retry_attempts - 1:
                            await asyncio.sleep(random.uniform(10, 20))
                    else:
                        logger.warning(f"HTTP {response.status} for {url}")

            except Exception as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt < self.config.retry_attempts - 1:
                    await asyncio.sleep(base_delay * (2 ** attempt))

        logger.error(f"Failed to fetch {url} after {self.config.retry_attempts} attempts")
        return None

    def _extract_price(self, soup: BeautifulSoup, selectors: List[str]) -> Optional[float]:
        """Extract price from HTML using CSS selectors."""
        for selector in selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_price(price_text)
                    if price is not None:
                        return price
            except Exception as e:
                logger.debug(f"Error with selector {selector}: {e}")
                continue

        return None

    def _parse_price(self, price_text: str) -> Optional[float]:
        """Parse price from text string."""
        if not price_text:
            return None

        # Remove common currency symbols and clean text
        price_text = re.sub(r'[^\d.,]+', '', price_text)
        price_text = price_text.replace(',', '')

        # Try to extract price as float
        try:
            return float(price_text)
        except (ValueError, TypeError):
            # Try to find price pattern
            price_match = re.search(r'(\d+\.?\d*)', price_text)
            if price_match:
                return float(price_match.group(1))

        return None

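    # Illustrative behaviour of _parse_price (worked examples, not part of the
    # original commit): "£12.99" -> 12.99, "£1,234.50" -> 1234.5, and text with
    # no digits (e.g. "Price unavailable") -> None.
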
    def _extract_text(self, soup: BeautifulSoup, selectors: List[str]) -> Optional[str]:
        """Extract text from HTML using CSS selectors."""
        for selector in selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    return element.get_text(strip=True)
            except Exception as e:
                logger.debug(f"Error with selector {selector}: {e}")
                continue

        return None

    def _detect_site(self, url: str) -> Optional[str]:
        """Detect which site this URL belongs to."""
        domain = urlparse(url).netloc.lower()

        if 'amazon' in domain:
            return 'amazon'
        elif 'ebay' in domain:
            return 'ebay'
        elif 'walmart' in domain:
            return 'walmart'
        # Add more site detection logic here

        return None

    async def scrape_product_price(self, url: str, site_name: Optional[str] = None) -> Dict[str, Any]:
        """Scrape the price for a single product from a URL."""
        result = {
            'success': False,
            'price': None,
            'currency': 'GBP',
            'title': None,
            'availability': None,
            'url': url,
            'error': None
        }

        try:
            # Auto-detect site if not provided
            if not site_name:
                site_name = self._detect_site(url)
                if not site_name:
                    result['error'] = "Could not detect site from URL"
                    return result

            # Get site configuration
            site_config = self.config.get_site_config(site_name)
            if not site_config:
                result['error'] = f"No configuration found for site: {site_name}"
                return result

            if not self.config.is_site_enabled(site_name):
                result['error'] = f"Site {site_name} is disabled"
                return result

            # Fetch page content
            html_content = await self._fetch_page(url)
            if not html_content:
                result['error'] = "Failed to fetch page content"
                return result

            # Parse HTML
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract price
            price_selectors = site_config.get('selectors', {}).get('price', [])
            price = self._extract_price(soup, price_selectors)

            if price is None:
                result['error'] = "Could not extract price from page"
                return result

            # Extract additional information
            title_selectors = site_config.get('selectors', {}).get('title', [])
            title = self._extract_text(soup, title_selectors)

            availability_selectors = site_config.get('selectors', {}).get('availability', [])
            availability_text = self._extract_text(soup, availability_selectors)
            availability = self._parse_availability(availability_text)

            result.update({
                'success': True,
                'price': price,
                'title': title,
                'availability': availability
            })

            logger.info(f"Successfully scraped {site_name}: £{price}")

        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")
            result['error'] = str(e)

        return result

    def _parse_availability(self, availability_text: Optional[str]) -> bool:
        """Parse availability from text."""
        if not availability_text:
            return True  # Assume available if no info

        availability_text = availability_text.lower()

        # Common out-of-stock indicators
        out_of_stock_indicators = [
            'out of stock', 'unavailable', 'sold out', 'not available',
            'temporarily out of stock', 'currently unavailable'
        ]

        for indicator in out_of_stock_indicators:
            if indicator in availability_text:
                return False

        return True
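
    # Illustrative behaviour (examples, not part of the original commit):
    # "In stock" -> True, "Currently unavailable" -> False, None/empty -> True.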

class ScraperManager:
    """Manages multiple price scrapers and coordinates scraping tasks."""

    def __init__(self, config: Config):
        self.config = config
        self.semaphore = asyncio.Semaphore(config.max_concurrent_requests)

    async def scrape_product(self, product: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
        """Scrape prices for a single product across all configured sites."""
        product_id = product['id']
        urls = product['urls']

        results = {}

        async with PriceScraper(self.config) as scraper:
            tasks = []

            for site_name, url in urls.items():
                if self.config.is_site_enabled(site_name):
                    # Schedule the scrape so sites run concurrently,
                    # bounded by the semaphore.
                    task = asyncio.create_task(
                        self._scrape_with_semaphore(scraper, url, site_name)
                    )
                    tasks.append((site_name, task))

                # Add delay between requests
                await asyncio.sleep(self.config.delay_between_requests)

            # Wait for all tasks to complete
            for site_name, task in tasks:
                try:
                    result = await task
                    results[site_name] = result
                except Exception as e:
                    logger.error(f"Error scraping {site_name} for product {product_id}: {e}")
                    results[site_name] = {
                        'success': False,
                        'error': str(e)
                    }

        return results

    async def _scrape_with_semaphore(self, scraper: PriceScraper, url: str, site_name: str):
        """Scrape with a semaphore to limit concurrent requests."""
        async with self.semaphore:
            return await scraper.scrape_product_price(url, site_name)

    async def scrape_all_products(self, products: List[Dict[str, Any]]) -> Dict[int, Dict[str, Dict[str, Any]]]:
        """Scrape prices for all products."""
        results = {}

        for product in products:
            try:
                product_id = product['id']
                logger.info(f"Scraping product: {product['name']} (ID: {product_id})")

                product_results = await self.scrape_product(product)
                results[product_id] = product_results

                # Add delay between products
                await asyncio.sleep(self.config.delay_between_requests)

            except Exception as e:
                logger.error(f"Error scraping product {product.get('id', 'unknown')}: {e}")

        return results
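
# --- Usage sketch (illustrative, not part of the original commit) ---
# A minimal way this module might be driven, assuming Config can be constructed
# with no arguments and exposes the attributes referenced above
# (max_concurrent_requests, timeout, retry_attempts, delay_between_requests,
# user_agents, get_site_config, is_site_enabled). The product dict follows the
# id/name/urls layout expected by ScraperManager.scrape_product(); the URL and
# site key below are placeholders.
if __name__ == "__main__":
    # Run as a module (e.g. `python -m <package>.scraper`) so the relative
    # import of Config resolves.
    example_products = [
        {
            'id': 1,
            'name': 'Example product',
            'urls': {'amazon': 'https://www.amazon.co.uk/dp/EXAMPLE'},
        }
    ]

    async def _demo():
        manager = ScraperManager(Config())
        results = await manager.scrape_all_products(example_products)
        print(results)

    asyncio.run(_demo())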