scrape fix

2025-06-27 17:25:56 +01:00
parent ee0142121a
commit 5726183115
27 changed files with 2353 additions and 621 deletions
--- a/debug_atoz_pricing.py
+++ b/debug_atoz_pricing.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+"""
+Debug script specifically for A to Z Catering pricing issues
+"""
+
+import requests
+from bs4 import BeautifulSoup
+import re
+import sys
+import os
+
+# Put the sibling src/ directory at the front of the module search path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
+
+def fetch_and_analyze_atoz_page(url):
+    """Fetch an A to Z Catering product page and print a pricing report.
+
+    The report goes to stdout in six numbered sections:
+
+    1. every text node that looks like a price (a pound sign plus digits),
+    2. up to five text nodes per delivery keyword,
+    3. h3/h4 headings whose text contains a pound sign,
+    4. how many elements each candidate CSS selector matches,
+    5. the raw HTML surrounding two specific prices (12.99 and 1.39),
+    6. the minimum, maximum and last price that a few general price
+       selectors would collect.
+
+    Args:
+        url: Address of the product page to download.
+
+    Returns:
+        None. All output is printed; a non-200 response or a caught
+        exception ends the report early.
+    """
+    print(f"Analyzing A to Z page: {url}")
+    print("=" * 80)
+
+    request_headers = {
+        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+
+    try:
+        response = requests.get(url, headers=request_headers, timeout=30)
+        print(f"HTTP Status: {response.status_code}")
+
+        if response.status_code != 200:
+            print("Failed to fetch page")
+            return
+
+        html = response.text
+        soup = BeautifulSoup(html, 'html.parser')
+
+        # 1. Find all elements containing prices
+        print("\n1. ALL PRICE ELEMENTS FOUND:")
+        print("-" * 40)
+        price_pattern = re.compile(r'£\d+\.?\d*')
+
+        for i, price_text in enumerate(soup.find_all(string=price_pattern), start=1):
+            parent = price_text.parent
+            parent_class = parent.get('class', []) if parent else []
+            parent_tag = parent.name if parent else 'N/A'
+            print(f"  {i:2d}. '{price_text.strip()}' in <{parent_tag}> class={parent_class}")
+
+        # 2. Check for delivery-specific elements
+        print("\n2. DELIVERY-RELATED ELEMENTS:")
+        print("-" * 40)
+
+        for keyword in ('delivery', 'delivered'):
+            keyword_pattern = re.compile(keyword, re.IGNORECASE)
+            for elem in soup.find_all(string=keyword_pattern)[:5]:  # Show first 5
+                parent = elem.parent
+                parent_class = parent.get('class', []) if parent else []
+                print(f"  '{elem.strip()[:100]}' in class={parent_class}")
+
+        # 3. Check h3 and h4 elements (A to Z specific)
+        print("\n3. H3/H4 ELEMENTS WITH PRICES:")
+        print("-" * 40)
+        for heading in soup.find_all(['h3', 'h4']):
+            text = heading.get_text(strip=True)
+            if '£' in text:
+                print(f"  <{heading.name}>: {text}")
+
+        # 4. Test specific selectors from our config
+        print("\n4. TESTING OUR SELECTORS:")
+        print("-" * 40)
+
+        test_selectors = [
+            '.delivery-price',
+            '.price-delivery',
+            '.price',
+            '.product-price',
+            '.collection-price',
+            'span:contains("£")',
+            'h3:contains("Delivery")',
+            'h4:contains("Delivery")',
+            '*[class*="price"]',
+        ]
+
+        # ':contains(...)' selectors are emulated by hand rather than passed
+        # to soup.select(): each maps to a tag name and a lower-case needle
+        # searched for in that tag's lower-cased text.
+        contains_lookups = {
+            'span:contains("£")': ('span', '£'),
+            'h3:contains("Delivery")': ('h3', 'delivery'),
+            'h4:contains("Delivery")': ('h4', 'delivery'),
+        }
+
+        for selector in test_selectors:
+            try:
+                if selector in contains_lookups:
+                    tag, needle = contains_lookups[selector]
+                    elements = [el for el in soup.find_all(tag) if needle in el.get_text().lower()]
+                elif ':contains(' in selector:
+                    # No hand-written emulation for this one.
+                    elements = []
+                else:
+                    elements = soup.select(selector)
+
+                if elements:
+                    print(f"  ✓ {selector} -> {len(elements)} elements:")
+                    # Show the first 3, skipping any without a pound sign
+                    for i, elem in enumerate(elements[:3], start=1):
+                        text = elem.get_text(strip=True)
+                        if '£' in text:
+                            print(f"     [{i}] {text}")
+                else:
+                    print(f"  ✗ {selector} -> No elements")
+            except Exception as e:
+                print(f"  ⚠ {selector} -> Error: {e}")
+
+        # 5. Look for the specific prices mentioned (12.99 and 1.39)
+        print("\n5. SPECIFIC PRICE ANALYSIS:")
+        print("-" * 40)
+
+        for amount in ('12.99', '1.39'):
+            # Digit guards on both sides stop a longer number such as
+            # "11.39" or "1.399" from being reported as a hit for "1.39".
+            amount_pattern = re.compile(r'(?<!\d)' + re.escape(amount) + r'(?!\d)')
+            matches = list(amount_pattern.finditer(html))
+            if not matches:
+                print(f"✗ £{amount} NOT found in page content")
+                continue
+            print(f"✓ £{amount} found in page content")
+            for match in matches[:3]:  # Show first 3 occurrences
+                start = max(0, match.start() - 100)
+                end = min(len(html), match.end() + 100)
+                context = html[start:end].replace('\n', ' ').replace('\t', ' ')
+                print(f"  Context: ...{context}...")
+
+        # 6. Try to simulate our current parsing logic
+        print("\n6. SIMULATING CURRENT PARSING LOGIC:")
+        print("-" * 40)
+
+        # Test our general price selectors
+        general_selectors = [
+            '.price',
+            '.product-price',
+            'span:contains("£")',
+            '.price-value',
+        ]
+
+        amount_in_text = re.compile(r'£(\d+\.?\d*)')
+        found_prices = []  # (value, selector, element text) in discovery order
+        for selector in general_selectors:
+            try:
+                if selector == 'span:contains("£")':
+                    elements = [s for s in soup.find_all('span') if '£' in s.get_text()]
+                else:
+                    elements = soup.select(selector)
+
+                for element in elements:
+                    price_text = element.get_text(strip=True)
+                    # Every capture starts with a digit, so float() accepts it
+                    for amount in amount_in_text.findall(price_text):
+                        found_prices.append((float(amount), selector, price_text))
+            except Exception as e:
+                print(f"Error with {selector}: {e}")
+
+        print(f"Found {len(found_prices)} prices total:")
+        for price, selector, text in found_prices:
+            print(f"  £{price} from '{selector}': {text[:50]}")
+
+        if found_prices:
+            # Show what our current logic would select
+            values = [price for price, _, _ in found_prices]
+            print("\nCurrent logic would likely select:")
+            print(f"  Minimum price: £{min(values)}")
+            print(f"  Maximum price: £{max(values)}")
+            print(f"  Last price found: £{values[-1]}")
+
+    except Exception as e:
+        print(f"Error: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    # Product page used when debugging the A to Z pricing issues
+    fetch_and_analyze_atoz_page("https://www.atoz-catering.co.uk/products/product/coca-cola-cans--coke-gb---24")