diff --git a/.github/workflows/advanced-mirror.yml b/.github/workflows/advanced-mirror.yml new file mode 100644 index 0000000..9ea6d76 --- /dev/null +++ b/.github/workflows/advanced-mirror.yml @@ -0,0 +1,68 @@ +name: Advanced Mirror to Azure DevOps + +on: + push: + branches: [ main, master, develop ] + pull_request: + types: [closed] + branches: [ main, master ] + workflow_dispatch: + inputs: + force_push: + description: 'Force push to Azure DevOps' + required: false + default: 'false' + +jobs: + mirror: + runs-on: ubuntu-latest + if: github.event_name == 'push' || (github.event_name == 'pull_request' && github.event.pull_request.merged == true) || github.event_name == 'workflow_dispatch' + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Setup Git + run: | + git config --global user.name "GitHub Mirror Bot" + git config --global user.email "github-actions[bot]@users.noreply.github.com" + + - name: Add Azure DevOps Remote + env: + AZURE_DEVOPS_TOKEN: ${{ secrets.AZURE_DEVOPS_PAT }} + run: | + # URL encode the repository name for spaces + ENCODED_URL="https://oauth2:${AZURE_DEVOPS_TOKEN}@dev.azure.com/ptslondon/_git/Price%20Tracker" + git remote add azure "$ENCODED_URL" + + - name: Mirror Repository + env: + FORCE_PUSH: ${{ github.event.inputs.force_push }} + run: | + # Set force flag + FORCE_FLAG="" + if [ "$FORCE_PUSH" = "true" ] || [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + FORCE_FLAG="--force" + fi + + # Push current branch + CURRENT_BRANCH=${GITHUB_REF#refs/heads/} + echo "Mirroring branch: $CURRENT_BRANCH" + + git push azure "$CURRENT_BRANCH" $FORCE_FLAG + + # Push tags + git push azure --tags $FORCE_FLAG + + echo "✅ Successfully mirrored to Azure DevOps" + + - name: Verify Mirror + run: | + echo "Mirror completed for:" + echo "- Repository: Price Tracker" + echo "- Branch: ${GITHUB_REF#refs/heads/}" + echo "- Commit: ${{ github.sha }}" + echo "- Azure DevOps URL: https://dev.azure.com/ptslondon/_git/Price%20Tracker" diff --git a/.github/workflows/mirror-to-azure.yml b/.github/workflows/mirror-to-azure.yml new file mode 100644 index 0000000..837b573 --- /dev/null +++ b/.github/workflows/mirror-to-azure.yml @@ -0,0 +1,34 @@ +name: Mirror to Azure DevOps + +on: + push: + branches: [ main, master, develop ] # Add branches you want to mirror + workflow_dispatch: # Allows manual triggering + +jobs: + mirror: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch full history for complete mirror + + - name: Mirror to Azure DevOps + env: + AZURE_DEVOPS_URL: https://dev.azure.com/ptslondon/_git/Price%20Tracker + AZURE_DEVOPS_TOKEN: ${{ secrets.AZURE_DEVOPS_PAT }} + run: | + # Configure git + git config --global user.name "GitHub Mirror Bot" + git config --global user.email "noreply@github.com" + + # Add Azure DevOps as remote + git remote add azure https://oauth2:${AZURE_DEVOPS_TOKEN}@dev.azure.com/ptslondon/_git/Price%20Tracker + + # Push all branches and tags + git push azure --all --force + git push azure --tags --force + + echo "Successfully mirrored to Azure DevOps" diff --git a/README.md b/README.md index 882f661..c6dffe6 100644 --- a/README.md +++ b/README.md @@ -4,13 +4,14 @@ A comprehensive web scraper for tracking product prices across multiple e-commer ## Features ✨ -- **Multi-site Price Tracking**: Monitor prices across Amazon, eBay, Walmart, and more +- **Multi-site Price Tracking**: Monitor prices 
across JJ Food Service, A to Z Catering, and Amazon UK - **Beautiful Web UI**: Clean, responsive interface for managing products and viewing price history - **Price Alerts**: Get notified when products reach your target price - **Historical Data**: View price trends with interactive charts - **Automated Scraping**: Schedule regular price checks - **Multiple Notifications**: Email and webhook notifications - **Robust Scraping**: Built-in retry logic, rotating user agents, and rate limiting +- **Special Pricing Detection**: Automatically detects and prioritizes delivery prices and special offers ## Quick Start 🚀 @@ -106,13 +107,20 @@ Add new e-commerce sites by extending the sites configuration: ```json { "sites": { - "your_site": { + "atoz_catering": { "enabled": true, - "base_url": "https://www.yoursite.com", + "base_url": "https://www.atoz-catering.co.uk", "selectors": { - "price": [".price", ".cost"], - "title": [".product-title"], - "availability": [".stock-status"] + "price": [ + ".my-price.price-offer", + ".delivery-price", + ".price" + ], + "special_offer": [ + ".my-price.price-offer", + ".special-offer", + "del:contains('£')" + ] } } } diff --git a/SCRAPER_ARCHITECTURE.md b/SCRAPER_ARCHITECTURE.md new file mode 100644 index 0000000..f020ad9 --- /dev/null +++ b/SCRAPER_ARCHITECTURE.md @@ -0,0 +1,80 @@ +# Price Tracker - Scraper Architecture + +## Current Structure + +### 1. **`scraper.py` - Base Scraper Class** +- **Purpose**: Foundation class for all price scraping +- **Handles**: Generic e-commerce sites (Amazon.com, eBay, Walmart, etc.) +- **Key Features**: + - Base `PriceScraper` class with HTTP session management + - Anti-bot measures (headers, delays, retries) + - Generic price extraction methods + - Site detection logic + +### 2. **`uk_scraper.py` - UK Catering Specialist** +- **Purpose**: Specialized scraper for UK catering supply websites +- **Handles**: JJ Food Service, A to Z Catering, Amazon UK +- **Key Features**: + - Inherits from `PriceScraper` base class + - UK currency handling (£ symbol) + - Delivery vs Collection price prioritization + - Special pricing detection (offers, strikethrough, was/now pricing) + - Site-specific CSS selectors (e.g., `.my-price.price-offer` for A to Z) + +### 3. **`scraper_manager.py` - Orchestration Layer** +- **Purpose**: Routes scraping tasks to appropriate scrapers +- **Logic**: + - Detects UK catering sites → uses `UKCateringScraper` + - Detects other sites → uses base `PriceScraper` + - Manages concurrent requests and error handling + +## Site Mapping + +### UK Catering Sites (UKCateringScraper): +- `jjfoodservice` → JJ Food Service +- `atoz_catering` → A to Z Catering +- `amazon_uk` → Amazon UK + +### International Sites (PriceScraper): +- `amazon` → Amazon.com +- `ebay` → eBay +- `walmart` → Walmart +- *(Future sites can be added here)* + +## Key Benefits of Current Structure + +✅ **Separation of Concerns**: UK-specific logic is isolated +✅ **Extensibility**: Easy to add new UK sites or international sites +✅ **Maintainability**: Changes to UK logic don't affect international scraping +✅ **Specialization**: UK scraper handles currency, delivery pricing, special offers + +## Recommendations + +### ✅ **KEEP CURRENT STRUCTURE** - It's well-designed! + +The separation between `scraper.py` and `uk_scraper.py` is actually **good architecture** because: + +1. **UK catering sites have unique requirements** (delivery vs collection, £ pricing, special offers) +2. 
**International sites have different patterns** (USD pricing, different site structures) +3. **Easy to maintain and extend** each scraper independently + +### Minor Improvements Made: + +1. **Enhanced site detection** in base scraper +2. **Added helper methods** to determine scraper routing +3. **Improved scraper manager** logic for clarity +4. **Fixed A to Z pricing** with `.my-price.price-offer` selector + +## Final File Structure + +``` +src/ +├── scraper.py # Base scraper (international sites) +├── uk_scraper.py # UK catering specialist +├── scraper_manager.py # Orchestration layer +├── config.py # Configuration management +├── database.py # Data persistence +└── web_ui.py # Flask web interface +``` + +This structure supports both current UK catering needs and future expansion to international e-commerce sites. diff --git a/SPECIAL_PRICING.md b/SPECIAL_PRICING.md new file mode 100644 index 0000000..7492182 --- /dev/null +++ b/SPECIAL_PRICING.md @@ -0,0 +1,177 @@ +# Special Pricing Features - Price Tracker + +## Overview + +The UK Price Tracker now includes enhanced special pricing detection capabilities to identify and prioritize discounted, sale, and special offer prices across supported UK catering sites. + +## Features + +### 🎯 Special Pricing Detection +- **Strikethrough Pricing**: Detects crossed-out prices with sale prices +- **Was/Now Patterns**: Identifies "Was £X Now £Y" pricing patterns +- **Offer Labels**: Recognizes sale/discount/special offer badges and containers +- **Percentage Discounts**: Detects "X% OFF" promotional pricing +- **Member/Trade Pricing**: Special pricing for registered customers (JJ Food Service) + +### 🚚 Delivery Price Priority +- Automatically prioritizes delivery prices over collection prices +- Identifies delivery-specific special offers +- Handles mixed pricing scenarios (delivery vs collection vs general) + +### 🏪 Site-Specific Enhancements + +#### JJ Food Service +- Member pricing detection +- Trade pricing identification +- Bulk discount recognition +- Quantity-based pricing + +#### A to Z Catering +- Header-based delivery pricing (H3/H4 elements) +- Inline strikethrough detection +- Special delivery offer containers +- Style-based strikethrough recognition + +#### Amazon UK +- Deal price detection +- Strike-through pricing +- Sale badge recognition +- RRP vs Sale price comparison + +## Configuration + +Special pricing is configured in `config.json`: + +```json +{ + "scraping": { + "special_pricing": { + "enabled": true, + "prefer_delivery_prices": true, + "detect_strikethrough": true, + "detect_was_now_patterns": true, + "detect_percentage_discounts": true, + "min_discount_threshold": 0.05, + "max_price_difference_ratio": 0.5 + } + }, + "sites": { + "jjfoodservice": { + "selectors": { + "special_offer": [ + ".special-offer", + ".member-price", + "del:contains('£')", + ".was-price" + ] + } + } + } +} +``` + +## Testing + +### Test Suite +Run the comprehensive test suite: +```bash +python test_special_pricing.py +``` + +This tests: +- Price parsing with various formats +- Special pricing context detection +- Site-specific extraction methods +- Mock HTML scenarios + +### Debug Tool +Debug real URLs: +```bash +python debug_special_pricing.py [--verbose] +``` + +Examples: +```bash +# Debug a JJ Food Service product +python debug_special_pricing.py "https://www.jjfoodservice.com/product/example" --verbose + +# Debug an A to Z Catering product +python debug_special_pricing.py "https://www.atoz-catering.co.uk/product/example" + +# Debug an Amazon UK 
product +python debug_special_pricing.py "https://www.amazon.co.uk/product/example" +``` + +## How It Works + +### 1. Context Detection +The scraper analyzes HTML elements and their parent containers to detect special pricing context: +- Strikethrough elements (`<del>`, `<s>`, `<strike>`) +- CSS styling (`text-decoration: line-through`) +- Keyword patterns (`was`, `now`, `sale`, `offer`, `discount`) +- Percentage discount patterns (`20% off`, etc.) + +### 2. Price Extraction +When multiple prices are found: +- **With special context**: Returns the lowest price (offer price) +- **Delivery preference**: Prioritizes delivery over collection prices +- **Multiple prices**: Takes the last/lowest price found + +### 3. Site-Specific Logic +Each site has tailored extraction methods: +- **JJ Food Service**: Focuses on member/trade pricing +- **A to Z Catering**: Enhanced header and delivery price detection +- **Amazon UK**: Deal and promotional price recognition + +## Examples + +### Strikethrough Pricing +```html +
+<div class="product-price"> +  <del>£15.99</del> +  <span class="offer-price">£12.99</span> +</div>
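+<!-- The <del> tag marks £15.99 as struck-through, so the lowest price found (£12.99) is treated as the offer price -->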
+``` +**Result**: £12.99 (special offer detected) + +### Was/Now Pricing +```html +
+<div class="product-price"> +  Was £20.50, now £17.25 +</div>
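+<!-- Matches the "was £X ... now £Y" regex, so £17.25 is taken as the current price -->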
+``` +**Result**: £17.25 (was/now pattern detected) + +### Delivery Special Offers +```html +

+<div class="delivery-price-special"> +  <h3>Delivery: <del>£25.00</del> £19.99</h3> +</div>
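+<!-- The "Delivery" label plus the struck-through £25.00 means £19.99 is kept as the delivery offer price -->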

+``` +**Result**: £19.99 (delivery + special offer) + +## Troubleshooting + +### No Special Prices Detected +1. Check if the site uses non-standard markup +2. Add custom selectors to `config.json` +3. Use debug tool to see what selectors are matching +4. Verify special pricing is enabled in config + +### Wrong Price Selected +1. Check if delivery preference is correctly configured +2. Verify the HTML structure matches expected patterns +3. Use verbose debugging to see all detected prices +4. Consider adding site-specific selectors + +### Performance Issues +1. Reduce the number of special offer selectors +2. Increase delays between requests +3. Use more specific CSS selectors +4. Enable only necessary special pricing features + +## Future Enhancements + +- **Machine Learning**: Auto-detect pricing patterns +- **More Sites**: Extend to additional UK catering suppliers +- **Price History**: Track special offer frequency and patterns +- **Alerts**: Notify when special offers are detected +- **Comparison**: Cross-site special offer comparison diff --git a/debug_atoz_pricing.py b/debug_atoz_pricing.py new file mode 100644 index 0000000..d18db18 --- /dev/null +++ b/debug_atoz_pricing.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +""" +Debug script specifically for A to Z Catering pricing issues +""" + +import requests +from bs4 import BeautifulSoup +import re +import sys +import os + +# Add the src directory to the path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) + +def fetch_and_analyze_atoz_page(url): + """Fetch and analyze the A to Z page to identify pricing issues.""" + + print(f"Analyzing A to Z page: {url}") + print("=" * 80) + + headers = { + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + + try: + response = requests.get(url, headers=headers, timeout=30) + print(f"HTTP Status: {response.status_code}") + + if response.status_code != 200: + print("Failed to fetch page") + return + + soup = BeautifulSoup(response.text, 'html.parser') + + # 1. Find all elements containing prices + print("\n1. ALL PRICE ELEMENTS FOUND:") + print("-" * 40) + price_pattern = re.compile(r'£\d+\.?\d*') + price_elements = soup.find_all(string=price_pattern) + + for i, price_text in enumerate(price_elements): + parent = price_text.parent if hasattr(price_text, 'parent') else None + parent_class = parent.get('class', []) if parent else [] + parent_tag = parent.name if parent else 'N/A' + + print(f" {i+1:2d}. '{price_text.strip()}' in <{parent_tag}> class={parent_class}") + + # 2. Check for delivery-specific elements + print("\n2. DELIVERY-RELATED ELEMENTS:") + print("-" * 40) + delivery_keywords = ['delivery', 'delivered'] + + for keyword in delivery_keywords: + elements = soup.find_all(string=re.compile(keyword, re.IGNORECASE)) + for elem in elements[:5]: # Show first 5 + parent = elem.parent if hasattr(elem, 'parent') else None + parent_class = parent.get('class', []) if parent else [] + text = elem.strip()[:100] + print(f" '{text}' in class={parent_class}") + + # 3. Check h3 and h4 elements (A to Z specific) + print("\n3. H3/H4 ELEMENTS WITH PRICES:") + print("-" * 40) + headers = soup.find_all(['h3', 'h4']) + for header in headers: + text = header.get_text(strip=True) + if '£' in text: + print(f" <{header.name}>: {text}") + + # 4. Test specific selectors from our config + print("\n4. 
TESTING OUR SELECTORS:") + print("-" * 40) + + test_selectors = [ + '.delivery-price', + '.price-delivery', + '.price', + '.product-price', + '.collection-price', + 'span:contains("£")', + 'h3:contains("Delivery")', + 'h4:contains("Delivery")', + '*[class*="price"]' + ] + + for selector in test_selectors: + try: + if ':contains(' in selector: + # Handle contains selectors differently + if 'h3:contains("Delivery")' == selector: + elements = [h for h in soup.find_all('h3') if 'delivery' in h.get_text().lower()] + elif 'h4:contains("Delivery")' == selector: + elements = [h for h in soup.find_all('h4') if 'delivery' in h.get_text().lower()] + elif 'span:contains("£")' == selector: + elements = [s for s in soup.find_all('span') if '£' in s.get_text()] + else: + elements = [] + else: + elements = soup.select(selector) + + if elements: + print(f" ✓ {selector} -> {len(elements)} elements:") + for i, elem in enumerate(elements[:3]): # Show first 3 + text = elem.get_text(strip=True) + if '£' in text: + print(f" [{i+1}] {text}") + else: + print(f" ✗ {selector} -> No elements") + + except Exception as e: + print(f" ⚠ {selector} -> Error: {e}") + + # 5. Look for the specific prices mentioned (12.99 and 1.39) + print("\n5. SPECIFIC PRICE ANALYSIS:") + print("-" * 40) + + if '12.99' in response.text: + print("✓ £12.99 found in page content") + # Find context around 12.99 + matches = list(re.finditer(r'12\.99', response.text)) + for match in matches[:3]: # Show first 3 occurrences + start = max(0, match.start() - 100) + end = min(len(response.text), match.end() + 100) + context = response.text[start:end].replace('\n', ' ').replace('\t', ' ') + print(f" Context: ...{context}...") + else: + print("✗ £12.99 NOT found in page content") + + if '1.39' in response.text: + print("✓ £1.39 found in page content") + # Find context around 1.39 + matches = list(re.finditer(r'1\.39', response.text)) + for match in matches[:3]: # Show first 3 occurrences + start = max(0, match.start() - 100) + end = min(len(response.text), match.end() + 100) + context = response.text[start:end].replace('\n', ' ').replace('\t', ' ') + print(f" Context: ...{context}...") + else: + print("✗ £1.39 NOT found in page content") + + # 6. Try to simulate our current parsing logic + print("\n6. 
SIMULATING CURRENT PARSING LOGIC:") + print("-" * 40) + + # Test our general price selectors + general_selectors = [ + '.price', + '.product-price', + 'span:contains("£")', + '.price-value', + ] + + found_prices = [] + for selector in general_selectors: + try: + if selector == 'span:contains("£")': + elements = [s for s in soup.find_all('span') if '£' in s.get_text()] + else: + elements = soup.select(selector) + + for element in elements: + price_text = element.get_text(strip=True) + if '£' in price_text: + # Extract price using regex + price_matches = re.findall(r'£(\d+\.?\d*)', price_text) + for match in price_matches: + try: + price_value = float(match) + found_prices.append((price_value, selector, price_text)) + except ValueError: + pass + + except Exception as e: + print(f"Error with {selector}: {e}") + + print(f"Found {len(found_prices)} prices total:") + for price, selector, text in found_prices: + print(f" £{price} from '{selector}': {text[:50]}") + + if found_prices: + # Show what our current logic would select + min_price = min(price for price, _, _ in found_prices) + max_price = max(price for price, _, _ in found_prices) + last_price = found_prices[-1][0] if found_prices else None + + print(f"\nCurrent logic would likely select:") + print(f" Minimum price: £{min_price}") + print(f" Maximum price: £{max_price}") + print(f" Last price found: £{last_price}") + + except Exception as e: + print(f"Error: {e}") + import traceback + traceback.print_exc() + +if __name__ == "__main__": + url = "https://www.atoz-catering.co.uk/products/product/coca-cola-cans--coke-gb---24" + fetch_and_analyze_atoz_page(url) diff --git a/debug_jj.py b/debug_jj.py new file mode 100644 index 0000000..9581c91 --- /dev/null +++ b/debug_jj.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +""" +Debug script to test JJ Food Service scraping +""" + +import asyncio +import logging +import sys +import os + +# Add the src directory to the path +sys.path.append(os.path.join(os.path.dirname(__file__))) + +from src.config import Config +from src.uk_scraper import UKCateringScraper + +# Set up logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +async def test_jj_scraping(): + config = Config() + + print(f"JJ Food Service enabled: {config.is_site_enabled('jjfoodservice')}") + print(f"A to Z enabled: {config.is_site_enabled('atoz_catering')}") + + url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/" + + async with UKCateringScraper(config) as scraper: + print(f"\nTesting JJ Food Service URL: {url}") + result = await scraper.scrape_product_price(url, 'jjfoodservice') + print(f"Result: {result}") + +if __name__ == "__main__": + asyncio.run(test_jj_scraping()) diff --git a/debug_special_pricing.py b/debug_special_pricing.py new file mode 100644 index 0000000..6665864 --- /dev/null +++ b/debug_special_pricing.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 +""" +Special Pricing Debug Tool for UK Price Tracker + +This tool helps debug and monitor special pricing detection on real websites. +It can be used to test URLs and see exactly what pricing information is being detected. 
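+ +Usage (mirrors the argparse options defined below): +    python debug_special_pricing.py <url> [--verbose] [--test-selectors]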
+""" + +import sys +import os +import asyncio +import logging +import argparse +from typing import Dict, Any + +# Add the src directory to the path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) + +from uk_scraper import UKCateringScraper +from config import Config + +# Set up logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def detect_site_from_url(url: str) -> str: + """Detect which site the URL belongs to.""" + if 'jjfoodservice.com' in url: + return 'jjfoodservice' + elif 'atoz-catering.co.uk' in url: + return 'atoz_catering' + elif 'amazon.co.uk' in url: + return 'amazon_uk' + else: + return 'unknown' + + +async def debug_url_pricing(url: str, verbose: bool = False): + """Debug pricing extraction for a specific URL.""" + + config = Config() + scraper = UKCateringScraper(config) + + site_name = detect_site_from_url(url) + + print(f"Debugging URL: {url}") + print(f"Detected site: {site_name}") + print("-" * 60) + + if site_name == 'unknown': + print("❌ Unknown site - cannot process") + return + + try: + # Fetch the page content + print("🌐 Fetching page content...") + html_content = await scraper._fetch_page(url) + + if not html_content: + print("❌ Failed to fetch page content") + return + + print("✅ Page content fetched successfully") + + # Parse with BeautifulSoup + from bs4 import BeautifulSoup + soup = BeautifulSoup(html_content, 'html.parser') + + # Debug special pricing detection + print("\n🔍 Looking for special offer prices...") + special_prices = scraper._find_special_offer_prices(soup, site_name) + + if special_prices: + print(f"✅ Found {len(special_prices)} special offer prices:") + for price, selector in special_prices: + print(f" £{price} (found with: {selector})") + + best_special_price = min(price for price, _ in special_prices) + print(f"🎯 Best special offer price: £{best_special_price}") + else: + print("❌ No special offer prices found") + + # Test the main extraction method + print(f"\n🔍 Testing {site_name} extraction method...") + + if site_name == 'jjfoodservice': + result = scraper._extract_jjfoodservice_data(soup) + elif site_name == 'atoz_catering': + result = scraper._extract_atoz_catering_data(soup) + elif site_name == 'amazon_uk': + result = scraper._extract_amazon_uk_data(soup) + + print(f"✅ Extraction result:") + print(f" Price: £{result['price']}" if result['price'] else " Price: Not found") + print(f" Title: {result.get('title', 'Not found')}") + print(f" Available: {result.get('availability', 'Unknown')}") + print(f" Currency: {result.get('currency', 'Unknown')}") + + # If verbose, show more debugging info + if verbose: + print(f"\n🔍 Verbose debugging for {site_name}...") + + # Get site selectors from config + site_config = config.get_site_config(site_name) + if site_config and 'selectors' in site_config: + selectors = site_config['selectors'] + + # Test each selector type + for selector_type, selector_list in selectors.items(): + print(f"\n Testing {selector_type} selectors:") + + for selector in selector_list: + try: + elements = soup.select(selector) + if elements: + print(f" ✅ {selector} -> Found {len(elements)} elements") + for i, elem in enumerate(elements[:3]): # Show first 3 + text = elem.get_text(strip=True)[:100] # Truncate long text + print(f" [{i+1}] {text}") + else: + print(f" ❌ {selector} -> No elements found") + except Exception as e: + print(f" ⚠️ {selector} -> Error: {e}") + + # Test the full scraping method + print(f"\n🔍 
Testing full scrape_product_price method...") + full_result = await scraper.scrape_product_price(url, site_name) + + print("✅ Full scraping result:") + print(f" Success: {full_result['success']}") + print(f" Price: £{full_result['price']}" if full_result['price'] else " Price: Not found") + print(f" Error: {full_result.get('error', 'None')}") + + except Exception as e: + print(f"❌ Error during debugging: {e}") + if verbose: + import traceback + traceback.print_exc() + + +def main(): + """Main function to run the debug tool.""" + + parser = argparse.ArgumentParser(description='Debug special pricing detection for UK price tracker') + parser.add_argument('url', help='URL to debug') + parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose output') + parser.add_argument('--test-selectors', action='store_true', help='Test all selectors from config') + + args = parser.parse_args() + + print("UK Price Tracker - Special Pricing Debug Tool") + print("=" * 60) + + # Run the debugging + asyncio.run(debug_url_pricing(args.url, args.verbose)) + + +if __name__ == "__main__": + main() diff --git a/purge_database.py b/purge_database.py new file mode 100644 index 0000000..a85e7cf --- /dev/null +++ b/purge_database.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +""" +Simple script to purge all price data from the database +This will reset the database so the next scrape acts as the first one +""" + +import sqlite3 +import os +from src.config import Config + +def purge_database(): + """Purge all data from the price tracker database.""" + config = Config() + db_path = config.database_path + + if not os.path.exists(db_path): + print(f"Database file {db_path} does not exist. Nothing to purge.") + return + + try: + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + # Get all table names + cursor.execute("SELECT name FROM sqlite_master WHERE type='table';") + tables = cursor.fetchall() + + if not tables: + print("No tables found in database.") + conn.close() + return + + print(f"Found {len(tables)} tables in database:") + for table in tables: + table_name = table[0] + cursor.execute(f"SELECT COUNT(*) FROM {table_name}") + count = cursor.fetchone()[0] + print(f" - {table_name}: {count} records") + + # Confirm purge + response = input("\nDo you want to purge all data? 
(yes/no): ").lower().strip() + + if response in ['yes', 'y']: + # Delete all data from all tables + for table in tables: + table_name = table[0] + cursor.execute(f"DELETE FROM {table_name}") + print(f"Purged all data from {table_name}") + + conn.commit() + print("\n✅ Database purged successfully!") + print("The next scrape will act as the first one and log all prices.") + else: + print("Purge cancelled.") + + conn.close() + + except sqlite3.Error as e: + print(f"Database error: {e}") + except Exception as e: + print(f"Error: {e}") + +if __name__ == "__main__": + purge_database() diff --git a/simple_test.py b/simple_test.py new file mode 100644 index 0000000..79856a0 --- /dev/null +++ b/simple_test.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +""" +Simple test for special pricing functionality +""" + +import sys +import os + +# Add the src directory to the path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) + +def test_imports(): + """Test that all modules can be imported.""" + try: + print("Testing imports...") + + # Basic imports + import re + import logging + from typing import Dict, Any, Optional, List, Tuple + print("✓ Basic Python modules imported") + + # Third-party imports + from bs4 import BeautifulSoup, Tag + print("✓ BeautifulSoup imported") + + # Local imports + from config import Config + print("✓ Config imported") + + from scraper import PriceScraper + print("✓ PriceScraper imported") + + from uk_scraper import UKCateringScraper + print("✓ UKCateringScraper imported") + + return True + + except Exception as e: + print(f"✗ Import error: {e}") + import traceback + traceback.print_exc() + return False + +def test_basic_functionality(): + """Test basic functionality of the special pricing.""" + try: + from config import Config + from uk_scraper import UKCateringScraper + + print("\nTesting basic functionality...") + + # Create config and scraper + config = Config() + scraper = UKCateringScraper(config) + print("✓ Scraper created successfully") + + # Test price parsing + test_price = scraper._parse_uk_price("£12.99") + if test_price == 12.99: + print("✓ Basic price parsing works") + else: + print(f"✗ Price parsing failed: got {test_price}, expected 12.99") + + # Test special pricing + special_price = scraper._parse_uk_price("Was £20.00 Now £15.99", detect_special_offers=True) + if special_price == 15.99: + print("✓ Special price parsing works") + else: + print(f"✗ Special price parsing failed: got {special_price}, expected 15.99") + + return True + + except Exception as e: + print(f"✗ Functionality error: {e}") + import traceback + traceback.print_exc() + return False + +def test_html_parsing(): + """Test HTML parsing for special pricing.""" + try: + from bs4 import BeautifulSoup + from uk_scraper import UKCateringScraper + from config import Config + + print("\nTesting HTML parsing...") + + config = Config() + scraper = UKCateringScraper(config) + + # Test strikethrough detection + html = '
<div class="my-price price-offer"><del>£20.00</del> £15.99</div>
' + soup = BeautifulSoup(html, 'html.parser') + + special_prices = scraper._find_special_offer_prices(soup, 'atoz_catering') + if special_prices: + print(f"✓ Special offer detection works: found {len(special_prices)} prices") + else: + print("✗ Special offer detection failed") + + return True + + except Exception as e: + print(f"✗ HTML parsing error: {e}") + import traceback + traceback.print_exc() + return False + +if __name__ == "__main__": + print("Simple Special Pricing Test") + print("=" * 40) + + success = True + + # Test imports + if not test_imports(): + success = False + + # Test basic functionality + if not test_basic_functionality(): + success = False + + # Test HTML parsing + if not test_html_parsing(): + success = False + + print("\n" + "=" * 40) + if success: + print("✅ All tests passed!") + else: + print("❌ Some tests failed!") + sys.exit(1) diff --git a/src/database.py b/src/database.py index 70b5de8..6a9cd0c 100644 --- a/src/database.py +++ b/src/database.py @@ -147,6 +147,15 @@ class DatabaseManager: UPDATE products SET active = 0, updated_at = ? WHERE id = ? ''', (datetime.now(), product_id)) + def delete_product(self, product_id: int): + """Delete a product and all its associated price history.""" + with sqlite3.connect(self.db_path) as conn: + # Delete price history first (due to foreign key constraints) + conn.execute('DELETE FROM price_history WHERE product_id = ?', (product_id,)) + + # Delete the product + conn.execute('DELETE FROM products WHERE id = ?', (product_id,)) + def save_price_history(self, product_id: int, site_name: str, price: float, currency: str = 'GBP', availability: bool = True, timestamp: datetime = None): diff --git a/src/scraper.py b/src/scraper.py index ffd6c27..0116dac 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -169,13 +169,21 @@ class PriceScraper: """Detect which site this URL belongs to.""" domain = urlparse(url).netloc.lower() - if 'amazon' in domain: + # UK Catering sites (handled by UKCateringScraper) + if 'jjfoodservice.com' in domain: + return 'jjfoodservice' + elif 'atoz-catering.co.uk' in domain: + return 'atoz_catering' + elif 'amazon.co.uk' in domain: + return 'amazon_uk' + + # International sites (handled by base PriceScraper) + elif 'amazon.com' in domain or 'amazon.' 
in domain: return 'amazon' elif 'ebay' in domain: return 'ebay' elif 'walmart' in domain: return 'walmart' - # Add more site detection logic here return None @@ -267,6 +275,17 @@ class PriceScraper: return False return True + + def should_use_uk_scraper(self, url: str) -> bool: + """Determine if this URL should use the UK catering scraper.""" + site_name = self._detect_site(url) + uk_sites = {'jjfoodservice', 'atoz_catering', 'amazon_uk'} + return site_name in uk_sites + + @classmethod + def get_uk_catering_sites(cls) -> set: + """Get the list of UK catering sites.""" + return {'jjfoodservice', 'atoz_catering', 'amazon_uk'} class ScraperManager: diff --git a/src/scraper_manager.py b/src/scraper_manager.py index 9d1a670..b06836c 100644 --- a/src/scraper_manager.py +++ b/src/scraper_manager.py @@ -17,6 +17,7 @@ class ScraperManager(BaseScraper): def __init__(self, config): super().__init__(config) self.active_tasks = {} + self.semaphore = asyncio.Semaphore(config.max_concurrent_requests) async def scrape_product_by_id(self, product_id: int, product_data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: """Scrape a specific product by ID with task tracking.""" @@ -36,6 +37,79 @@ class ScraperManager(BaseScraper): if product_id in self.active_tasks: del self.active_tasks[product_id] + async def scrape_product(self, product: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: + """Scrape prices for a single product across all configured sites.""" + product_id = product['id'] + urls = product['urls'] + + results = {} + + # Check if this product has UK catering sites + uk_catering_sites = {'jjfoodservice', 'atoz_catering', 'amazon_uk'} + has_uk_sites = any(site in uk_catering_sites for site in urls.keys()) + + if has_uk_sites: + # Use UK-specific scraper + async with UKCateringScraper(self.config) as scraper: + tasks = [] + + for site_name, url in urls.items(): + if self.config.is_site_enabled(site_name): + task = self._scrape_with_semaphore_uk(scraper, url, site_name) + tasks.append((site_name, task)) + + # Add delay between requests + await asyncio.sleep(self.config.delay_between_requests) + + # Wait for all tasks to complete + for site_name, task in tasks: + try: + result = await task + results[site_name] = result + except Exception as e: + logger.error(f"Error scraping {site_name} for product {product_id}: {e}") + results[site_name] = { + 'success': False, + 'error': str(e) + } + else: + # Use generic scraper for non-UK sites + from .scraper import PriceScraper + async with PriceScraper(self.config) as scraper: + tasks = [] + + for site_name, url in urls.items(): + if self.config.is_site_enabled(site_name): + task = self._scrape_with_semaphore(scraper, url, site_name) + tasks.append((site_name, task)) + + # Add delay between requests + await asyncio.sleep(self.config.delay_between_requests) + + # Wait for all tasks to complete + for site_name, task in tasks: + try: + result = await task + results[site_name] = result + except Exception as e: + logger.error(f"Error scraping {site_name} for product {product_id}: {e}") + results[site_name] = { + 'success': False, + 'error': str(e) + } + + return results + + async def _scrape_with_semaphore_uk(self, scraper: UKCateringScraper, url: str, site_name: str): + """Scrape with semaphore using UK scraper.""" + async with self.semaphore: + return await scraper.scrape_product_price(url, site_name) + + async def _scrape_with_semaphore(self, scraper, url: str, site_name: str): + """Scrape with semaphore using generic scraper.""" + async with self.semaphore: + return 
await scraper.scrape_product_price(url, site_name) + async def cancel_product_scraping(self, product_id: int) -> bool: """Cancel scraping for a specific product.""" if product_id in self.active_tasks: diff --git a/src/uk_scraper.py b/src/uk_scraper.py index cc7b72a..c23d0cf 100644 --- a/src/uk_scraper.py +++ b/src/uk_scraper.py @@ -4,8 +4,8 @@ Specialized scrapers for UK catering supply sites import re import logging -from typing import Dict, Any, Optional -from bs4 import BeautifulSoup +from typing import Dict, Any, Optional, List, Tuple +from bs4 import BeautifulSoup, Tag from .scraper import PriceScraper logger = logging.getLogger(__name__) @@ -14,35 +14,153 @@ logger = logging.getLogger(__name__) class UKCateringScraper(PriceScraper): """Specialized scraper for UK catering supply websites.""" - def _parse_uk_price(self, price_text: str) -> Optional[float]: - """Parse UK price format with £ symbol.""" + def _extract_special_pricing_context(self, element: Tag) -> Dict[str, Any]: + """Extract special pricing context from an element and its surroundings.""" + context = { + 'has_strikethrough': False, + 'has_offer_label': False, + 'has_was_now': False, + 'prices': [], + 'price_types': [] + } + + # Get parent elements to check for special pricing context + parents = [element] + [p for p in element.parents if p.name][:3] # Check up to 3 levels up + + for parent in parents: + parent_text = parent.get_text().lower() if parent else "" + + # Check for strikethrough pricing + strikethrough_elements = parent.find_all(['del', 's', 'strike']) if parent else [] + if strikethrough_elements: + context['has_strikethrough'] = True + for strike_elem in strikethrough_elements: + strike_price = self._parse_uk_price(strike_elem.get_text()) + if strike_price: + context['prices'].append(strike_price) + context['price_types'].append('was_price') + + # Check for offer/sale/discount labels + offer_patterns = [ + r'\bsale\b', r'\boffer\b', r'\bdeal\b', r'\bdiscount\b', + r'\bspecial\b', r'\bpromo\b', r'\breduced\b', r'\bsave\b', + r'\bwas\s*£', r'\bnow\s*£', r'\b\d+%\s*off\b' + ] + + for pattern in offer_patterns: + if re.search(pattern, parent_text): + context['has_offer_label'] = True + break + + # Look for "was/now" pricing patterns + was_now_match = re.search(r'was\s*£([\d.]+).*?now\s*£([\d.]+)', parent_text, re.IGNORECASE) + if was_now_match: + context['has_was_now'] = True + was_price = float(was_now_match.group(1)) + now_price = float(was_now_match.group(2)) + context['prices'].extend([was_price, now_price]) + context['price_types'].extend(['was_price', 'now_price']) + + return context + + def _parse_uk_price(self, price_text: str, prefer_delivery: bool = False) -> Optional[float]: + """Simple, conservative UK price parsing - just extract the first reasonable price.""" if not price_text: return None - # Remove common text and normalize - price_text = price_text.lower() - price_text = re.sub(r'delivery:|collection:|was:|now:|offer:|from:', '', price_text) + # Skip very long text blocks that are unlikely to contain just prices + if len(price_text) > 100: + return None + + # Check if this is delivery or collection pricing + is_delivery = 'delivery' in price_text.lower() + is_collection = 'collection' in price_text.lower() + + # If we prefer delivery and this is explicitly collection, skip it + if prefer_delivery and is_collection and not is_delivery: + return None + + # Simple regex to find prices - be very specific + price_match = re.search(r'£(\d{1,3}(?:\.\d{2})?)', price_text) - # Find price with £ symbol - 
price_match = re.search(r'£(\d+\.?\d*)', price_text) if price_match: try: - return float(price_match.group(1)) - except ValueError: - pass - - # Try without £ symbol but with decimal - price_match = re.search(r'(\d+\.\d{2})', price_text) - if price_match: - try: - return float(price_match.group(1)) + price_val = float(price_match.group(1)) + # Only accept reasonable food product prices + if 2.0 <= price_val <= 100.0: + return price_val except ValueError: pass return None + def _find_special_offer_prices(self, soup: BeautifulSoup, site_name: str) -> List[Tuple[float, str]]: + """Find special offer prices using enhanced selectors.""" + special_prices = [] + + # Enhanced selectors for special offers + special_offer_selectors = [ + # General special offer containers + '.special-offer', '.sale-price', '.offer-price', '.discount-price', + '.promo-price', '.reduced-price', '.deal-price', + + # Strikethrough and comparison pricing + 'del:contains("£"), s:contains("£"), strike:contains("£")', + '.was-price', '.original-price', '.rrp-price', + + # Was/Now pricing containers + '.was-now-pricing', '.price-comparison', '.before-after-price', + + # Sale badges and labels + '.sale-badge', '.offer-badge', '.discount-badge', + '*[class*="sale"]:contains("£")', + '*[class*="offer"]:contains("£")', + '*[class*="discount"]:contains("£")', + + # Site-specific patterns + '.product-price-wrapper', '.price-container', '.pricing-section' + ] + + if site_name == 'atoz_catering': + # A to Z specific selectors - prioritize the offer price class + special_offer_selectors.extend([ + '.my-price.price-offer', # Primary A to Z offer price selector + 'h3:contains("£")', 'h4:contains("£")', + '.delivery-price-special', '.collection-price-special', + '*[style*="text-decoration: line-through"]', + '*[style*="text-decoration:line-through"]' + ]) + elif site_name == 'jjfoodservice': + # JJ Food Service specific selectors + special_offer_selectors.extend([ + '.member-price', '.trade-price', '.bulk-price', + '.quantity-discount', '.volume-discount' + ]) + elif site_name == 'amazon_uk': + # Amazon UK specific selectors + special_offer_selectors.extend([ + '.a-price.a-text-price.a-size-medium.apexPriceToPay .a-offscreen', + '.a-price-strike .a-offscreen', + '#priceblock_dealprice', '#priceblock_saleprice', + '.a-price-was', '.a-price-save' + ]) + + for selector in special_offer_selectors: + try: + elements = soup.select(selector) + for element in elements: + price_text = element.get_text(strip=True) + if '£' in price_text: + price = self._parse_uk_price(price_text, detect_special_offers=True, element=element) + if price: + special_prices.append((price, selector)) + except Exception as e: + logger.debug(f"Error with special offer selector {selector}: {e}") + + return special_prices + def _extract_jjfoodservice_data(self, soup: BeautifulSoup) -> Dict[str, Any]: - """Extract data specifically from JJ Food Service.""" + """Extract data specifically from JJ Food Service - simplified approach.""" result = { 'price': None, 'title': None, @@ -50,43 +168,85 @@ class UKCateringScraper(PriceScraper): 'currency': 'GBP' } - # Try multiple selectors for price - price_selectors = [ - '.price', - '.product-price', - '[data-testid="price"]', - '.price-value', - '.current-price', - '.product-card-price', - 'span:contains("£")', - '.cost' + # First, try to find elements with Price in class name and extract delivery price + price_elements = soup.select('[class*="Price"]') + logger.debug(f"JJ Food Service: Found {len(price_elements)} price elements") + 
+ for element in price_elements: + text = element.get_text(strip=True) + logger.debug(f"JJ Food Service: Checking price element text: '{text[:100]}'") + + # Look for delivery price in concatenated strings like "Collection:£10.49£4.62 per kgDelivery:£11.79£5.19 per kg" + delivery_match = re.search(r'Delivery:£(\d{1,3}\.\d{2})', text, re.IGNORECASE) + if delivery_match: + price_val = float(delivery_match.group(1)) + result['price'] = price_val + logger.info(f"JJ Food Service: Found delivery price £{price_val} in price element") + # extract title + title_el = soup.select_one('h1') + if title_el: + result['title'] = title_el.get_text(strip=True) + return result + + # Second, attempt regex-based parsing of delivery price from raw page text + page_text = soup.get_text(separator=' ') + logger.debug(f"JJ Food Service page_text snippet: {page_text[:500]!r}") + + # Look for delivery price patterns in the text + if 'DELIVERY' in page_text or 'delivery' in page_text: + logger.debug(f"Found 'DELIVERY' in page text, looking for price patterns...") + delivery_section = page_text[page_text.lower().find('delivery'):page_text.lower().find('delivery')+100] + logger.debug(f"Delivery section: {delivery_section!r}") + + # Try multiple patterns for delivery price (based on actual HTML structure) + delivery_patterns = [ + r'Delivery:£(\d{1,3}\.\d{2})', # Delivery:£11.79 (actual format found) + r'DELIVERY:£(\d{1,3}\.\d{2})', # DELIVERY:£11.79 + r'delivery:£(\d{1,3}\.\d{2})', # delivery:£11.79 + r'DELIVERY:\s*£(\d{1,3}\.\d{2})', # DELIVERY: £11.79 (with space) + r'delivery:\s*£(\d{1,3}\.\d{2})', # delivery: £11.79 (with space) ] - for selector in price_selectors: + for pattern in delivery_patterns: + logger.debug(f"JJ Food Service: Trying pattern: {pattern}") + delivery_match = re.search(pattern, page_text, re.IGNORECASE) + if delivery_match: + price_val = float(delivery_match.group(1)) + result['price'] = price_val + logger.info(f"JJ Food Service: Parsed delivery price £{price_val} via regex pattern: {pattern}") + # extract title + title_el = soup.select_one('h1') + if title_el: + result['title'] = title_el.get_text(strip=True) + return result + else: + logger.debug(f"JJ Food Service: Pattern {pattern} did not match") + # Otherwise, try very specific selectors first - likely to contain prices + specific_selectors = [ + '.price-delivery', # Delivery price specifically + '.delivery-price', # Alternative delivery price + '.price', # General price class + ] + + for selector in specific_selectors: try: elements = soup.select(selector) for element in elements: price_text = element.get_text(strip=True) - price = self._parse_uk_price(price_text) - if price is not None: - result['price'] = price - logger.info(f"Successfully scraped jjfoodservice: £{price}") - break + # Only process short text snippets that likely contain just prices + if '£' in price_text and len(price_text) < 30: + price = self._parse_uk_price(price_text, prefer_delivery=True) + if price is not None: + result['price'] = price + logger.info(f"JJ Food Service: Found price £{price} with selector '{selector}' from text: '{price_text}'") + break if result['price'] is not None: break except Exception as e: - logger.debug(f"Error with JJ Food Service price selector {selector}: {e}") - - # Try to extract title - title_selectors = [ - 'h1', - '.product-title', - '.product-name', - '[data-testid="product-title"]', - '.product-card-title', - 'title' - ] + logger.debug(f"Error with JJ Food Service selector {selector}: {e}") + # Extract title + title_selectors = 
['h1', '.product-title', '.product-name'] for selector in title_selectors: try: element = soup.select_one(selector) @@ -96,61 +256,65 @@ class UKCateringScraper(PriceScraper): except Exception as e: logger.debug(f"Error with JJ Food Service title selector {selector}: {e}") - # Check availability - availability_indicators = [ - 'out of stock', - 'unavailable', - 'not available', - 'temporarily unavailable' - ] - - page_text = soup.get_text().lower() - for indicator in availability_indicators: - if indicator in page_text: - result['availability'] = False - break - return result def _extract_atoz_catering_data(self, soup: BeautifulSoup) -> Dict[str, Any]: - """Extract data specifically from A to Z Catering.""" - result = { - 'price': None, - 'title': None, - 'availability': True, - 'currency': 'GBP' - } + """Extract data specifically from A to Z Catering - prioritize delivery pricing using regex parse.""" + result = {'price': None, 'title': None, 'availability': True, 'currency': 'GBP'} + # First, attempt to parse delivery price directly from page text + page_text = soup.get_text(separator=' ') + delivery_match = re.search(r'Delivery:\s*£(\d{1,3}\.\d{2})', page_text) + if delivery_match: + price_val = float(delivery_match.group(1)) + result['price'] = price_val + logger.info(f"A to Z Catering: Parsed delivery price £{price_val} via regex") + # extract title + title_el = soup.select_one('h1') + if title_el: + result['title'] = title_el.get_text(strip=True) + return result - # A to Z Catering specific selectors - price_selectors = [ - '.price', - '.product-price', - '.delivery-price', - '.collection-price', - 'span:contains("£")', - '.price-value', - '.cost', - '.selling-price' - ] - - for selector in price_selectors: + # 1) Delivery-specific selectors + for selector in ['.delivery-price', '.price-delivery']: try: elements = soup.select(selector) for element in elements: - price_text = element.get_text(strip=True) - # Skip if it contains "delivery" or "collection" but no price - if ('delivery' in price_text.lower() or 'collection' in price_text.lower()) and '£' not in price_text: - continue - - price = self._parse_uk_price(price_text) + text = element.get_text(strip=True) + price = self._parse_uk_price(text, prefer_delivery=True) if price is not None: result['price'] = price - logger.info(f"Successfully scraped atoz_catering: £{price}") - break - if result['price'] is not None: - break + logger.info(f"A to Z Catering: Found delivery price £{price} from {selector}") + return result except Exception as e: - logger.debug(f"Error with A to Z price selector {selector}: {e}") + logger.debug(f"Error with A to Z delivery selector {selector}: {e}") + + # 2) Main offer selector (fallback to collection price) + for selector in ['.my-price.price-offer']: + try: + elements = soup.select(selector) + for element in elements: + text = element.get_text(strip=True) + price = self._parse_uk_price(text) + if price is not None: + result['price'] = price + logger.info(f"A to Z Catering: Found collection price £{price} from {selector}") + return result + except Exception as e: + logger.debug(f"Error with A to Z main selector {selector}: {e}") + + # 3) Fallback general selectors + for selector in ['.price', '.product-price']: + try: + elements = soup.select(selector) + for element in elements: + text = element.get_text(strip=True) + price = self._parse_uk_price(text) + if price is not None: + result['price'] = price + logger.info(f"A to Z Catering: Fallback parsed price £{price} from {selector}") + return result + 
except Exception as e: + logger.debug(f"Error with A to Z fallback selector {selector}: {e}") # Extract title title_selectors = [ @@ -197,7 +361,7 @@ class UKCateringScraper(PriceScraper): return result def _extract_amazon_uk_data(self, soup: BeautifulSoup) -> Dict[str, Any]: - """Extract data specifically from Amazon UK.""" + """Extract data specifically from Amazon UK with enhanced special pricing detection.""" result = { 'price': None, 'title': None, @@ -205,6 +369,15 @@ class UKCateringScraper(PriceScraper): 'currency': 'GBP' } + # First, check for special offer prices using enhanced detection + special_prices = self._find_special_offer_prices(soup, 'amazon_uk') + if special_prices: + # Use the lowest special offer price found + best_special_price = min(price for price, _ in special_prices) + result['price'] = best_special_price + logger.info(f"Successfully scraped amazon_uk special offer price: £{best_special_price}") + return result + # Amazon UK price selectors price_selectors = [ '.a-price-whole', @@ -222,7 +395,7 @@ class UKCateringScraper(PriceScraper): elements = soup.select(selector) for element in elements: price_text = element.get_text(strip=True) - price = self._parse_uk_price(price_text) + price = self._parse_uk_price(price_text, detect_special_offers=True, element=element) if price is not None: result['price'] = price break @@ -269,6 +442,122 @@ class UKCateringScraper(PriceScraper): return result + def _extract_generic_data(self, soup: BeautifulSoup, site_name: str) -> Dict[str, Any]: + """Generic data extraction for UK sites not specifically implemented.""" + result = { + 'price': None, + 'title': None, + 'availability': True, + 'currency': 'GBP' + } + + # Generic price selectors + price_selectors = [ + '.price', + '.product-price', + '[data-testid="price"]', + '.price-value', + '.current-price', + 'span:contains("£")', + '.cost', + '.selling-price' + ] + + for selector in price_selectors: + try: + elements = soup.select(selector) + for element in elements: + price_text = element.get_text(strip=True) + price = self._parse_uk_price(price_text) + if price is not None: + result['price'] = price + logger.info(f"Successfully scraped {site_name} generic price: £{price}") + break + if result['price'] is not None: + break + except Exception as e: + logger.debug(f"Error with generic price selector {selector}: {e}") + + # Generic title selectors + title_selectors = [ + 'h1', + '.product-title', + '.product-name', + '[data-testid="product-title"]', + 'title' + ] + + for selector in title_selectors: + try: + element = soup.select_one(selector) + if element: + result['title'] = element.get_text(strip=True) + break + except Exception as e: + logger.debug(f"Error with generic title selector {selector}: {e}") + + return result + + async def scrape_product_price(self, url: str, site_name: str = None) -> Dict[str, Any]: + """Scrape price for a single product from a URL using UK-specific logic.""" + result = { + 'success': False, + 'price': None, + 'currency': 'GBP', + 'title': None, + 'availability': None, + 'url': url, + 'error': None + } + + try: + # Validate that this is a supported UK site + if site_name not in ['jjfoodservice', 'atoz_catering', 'amazon_uk']: + result['error'] = f"Unsupported site for UK scraper: {site_name}" + return result + + # Check if site is enabled + if not self.config.is_site_enabled(site_name): + result['error'] = f"Site {site_name} is disabled" + return result + + # Fetch page content + html_content = await self._fetch_page(url) + if not html_content: + 
result['error'] = "Failed to fetch page content" + return result + + # Parse HTML + soup = BeautifulSoup(html_content, 'html.parser') + + # Route to appropriate extraction method + if site_name == 'jjfoodservice': + extracted_data = self._extract_jjfoodservice_data(soup) + elif site_name == 'atoz_catering': + extracted_data = self._extract_atoz_catering_data(soup) + elif site_name == 'amazon_uk': + extracted_data = self._extract_amazon_uk_data(soup) + else: + # Fallback to generic extraction + extracted_data = self._extract_generic_data(soup, site_name) + + if extracted_data['price'] is not None: + result.update({ + 'success': True, + 'price': extracted_data['price'], + 'title': extracted_data.get('title'), + 'availability': extracted_data.get('availability') + }) + logger.info(f"Successfully scraped {site_name}: £{extracted_data['price']}") + else: + result['error'] = "Could not extract price from page" + + except Exception as e: + logger.error(f"Error scraping {url}: {e}") + result['error'] = str(e) + + return result + async def scrape_product(self, product_data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: """Scrape prices for a product from all configured sites.""" results = {} diff --git a/src/uk_scraper_old.py b/src/uk_scraper_old.py deleted file mode 100644 index 9ab1bde..0000000 --- a/src/uk_scraper_old.py +++ /dev/null @@ -1,515 +0,0 @@ -""" -Specialized scrapers for UK catering supply sites -""" - -import re -import logging -from typing import Dict, Any, Optional -from bs4 import BeautifulSoup -from .scraper import PriceScraper - -logger = logging.getLogger(__name__) - - -class UKCateringScraper(PriceScraper): - """Specialized scraper for UK catering supply websites.""" - - def _parse_uk_price(self, price_text: str) -> Optional[float]: - """Parse UK price format with £ symbol.""" - if not price_text: - return None - - # Remove common text and normalize - price_text = price_text.lower() - price_text = re.sub(r'delivery:|collection:|was:|now:|offer:|from:', '', price_text) - - # Find price with £ symbol - price_match = re.search(r'£(\d+\.?\d*)', price_text) - if price_match: - try: - return float(price_match.group(1)) - except ValueError: - pass - - # Try without £ symbol but with decimal - price_match = re.search(r'(\d+\.\d{2})', price_text) - if price_match: - try: - return float(price_match.group(1)) - except ValueError: - pass - - return None - - def _extract_jjfoodservice_data(self, soup: BeautifulSoup) -> Dict[str, Any]: - """Extract data specifically from JJ Food Service.""" - result = { - 'price': None, - 'title': None, - 'availability': True, - 'currency': 'GBP' - } - - # Try multiple selectors for price - price_selectors = [ - '.price', - '.product-price', - '[data-testid="price"]', - '.price-value', - '.current-price', - '.product-card-price', - 'span:contains("£")', - '.cost' - ] - - for selector in price_selectors: - try: - elements = soup.select(selector) - for element in elements: - price_text = element.get_text(strip=True) - price = self._parse_uk_price(price_text) - if price is not None: - result['price'] = price - break - if result['price'] is not None: - break - except Exception as e: - logger.debug(f"Error with JJ Food Service price selector {selector}: {e}") - - # Try to extract title - title_selectors = [ - 'h1', - '.product-title', - '.product-name', - '[data-testid="product-title"]', - '.product-card-title', - 'title' - ] - - for selector in title_selectors: - try: - element = soup.select_one(selector) - if element: - result['title'] = 
element.get_text(strip=True) - break - except Exception as e: - logger.debug(f"Error with JJ Food Service title selector {selector}: {e}") - - # Check availability - availability_indicators = [ - 'out of stock', - 'unavailable', - 'not available', - 'sold out' - ] - - page_text = soup.get_text().lower() - for indicator in availability_indicators: - if indicator in page_text: - result['availability'] = False - break - - return result - - def _extract_atoz_data(self, soup: BeautifulSoup) -> Dict[str, Any]: - """Extract data specifically from A to Z Catering.""" - result = { - 'price': None, - 'title': None, - 'availability': True, - 'currency': 'GBP' - } - - # A to Z Catering shows prices like "Delivery:£X.XX Collection:£Y.YY" - # We'll prioritize the lower price (usually collection) - - price_text = soup.get_text() - - # Look for delivery and collection prices - delivery_match = re.search(r'delivery:?\s*£(\d+\.?\d*)', price_text, re.IGNORECASE) - collection_match = re.search(r'collection:?\s*£(\d+\.?\d*)', price_text, re.IGNORECASE) - - prices = [] - if delivery_match: - try: - prices.append(float(delivery_match.group(1))) - except ValueError: - pass - - if collection_match: - try: - prices.append(float(collection_match.group(1))) - except ValueError: - pass - - # If we found prices, use the lowest one - if prices: - result['price'] = min(prices) - else: - # Fallback to general price extraction - price_selectors = [ - '.price', - '.product-price', - 'span:contains("£")', - '.price-value' - ] - - for selector in price_selectors: - try: - elements = soup.select(selector) - for element in elements: - price_text = element.get_text(strip=True) - price = self._parse_uk_price(price_text) - if price is not None: - result['price'] = price - break - if result['price'] is not None: - break - except Exception as e: - logger.debug(f"Error with A to Z price selector {selector}: {e}") - - # Extract title - A to Z often has product names in links - title_selectors = [ - 'h1', - '.product-title', - '.product-name', - 'a[href*="/products/product/"]', - '.product-link', - 'title' - ] - - for selector in title_selectors: - try: - element = soup.select_one(selector) - if element: - title = element.get_text(strip=True) - # Clean up the title - if len(title) > 5 and 'A to Z' not in title: - result['title'] = title - break - except Exception as e: - logger.debug(f"Error with A to Z title selector {selector}: {e}") - - # Check availability - look for "Add To Basket" button - add_to_basket = soup.find(text=re.compile('Add To Basket', re.IGNORECASE)) - if not add_to_basket: - # Also check for out of stock indicators - out_of_stock_indicators = [ - 'out of stock', - 'unavailable', - 'not available', - 'sold out' - ] - - page_text = soup.get_text().lower() - for indicator in out_of_stock_indicators: - if indicator in page_text: - result['availability'] = False - break - - return result - - def _extract_amazon_uk_data(self, soup: BeautifulSoup) -> Dict[str, Any]: - """Extract data specifically from Amazon UK.""" - result = { - 'price': None, - 'title': None, - 'availability': True, - 'currency': 'GBP' - } - - # Amazon UK price selectors - price_selectors = [ - '.a-price-whole', - '.a-price .a-offscreen', - '.a-price-current .a-offscreen', - '#priceblock_dealprice', - '#priceblock_ourprice', - '.a-price-range', - '.a-price.a-text-price.a-size-medium.apexPriceToPay .a-offscreen' - ] - - for selector in price_selectors: - try: - elements = soup.select(selector) - for element in elements: - price_text = 
element.get_text(strip=True) - price = self._parse_uk_price(price_text) - if price is not None: - result['price'] = price - break - if result['price'] is not None: - break - except Exception as e: - logger.debug(f"Error with Amazon UK price selector {selector}: {e}") - - # Extract title - title_selectors = [ - '#productTitle', - '.product-title', - 'h1.a-size-large' - ] - - for selector in title_selectors: - try: - element = soup.select_one(selector) - if element: - result['title'] = element.get_text(strip=True) - break - except Exception as e: - logger.debug(f"Error with Amazon UK title selector {selector}: {e}") - - # Check availability - availability_text = soup.get_text().lower() - if any(phrase in availability_text for phrase in ['out of stock', 'currently unavailable', 'not available']): - result['availability'] = False - - return result - - def _extract_tesco_data(self, soup: BeautifulSoup) -> Dict[str, Any]: - """Extract data specifically from Tesco.""" - result = { - 'price': None, - 'title': None, - 'availability': True, - 'currency': 'GBP' - } - - # Tesco price selectors - price_selectors = [ - '.price-control-wrapper .value', - '.price-per-sellable-unit .value', - '.price-per-quantity-weight .value', - '[data-testid="price-current-value"]', - '.price-current', - '.product-price .price' - ] - - for selector in price_selectors: - try: - elements = soup.select(selector) - for element in elements: - price_text = element.get_text(strip=True) - price = self._parse_uk_price(price_text) - if price is not None: - result['price'] = price - break - if result['price'] is not None: - break - except Exception as e: - logger.debug(f"Error with Tesco price selector {selector}: {e}") - - # Extract title - title_selectors = [ - 'h1[data-testid="product-title"]', - '.product-details-tile h1', - '.product-title', - 'h1.product-name' - ] - - for selector in title_selectors: - try: - element = soup.select_one(selector) - if element: - result['title'] = element.get_text(strip=True) - break - except Exception as e: - logger.debug(f"Error with Tesco title selector {selector}: {e}") - - return result - - def _extract_sainsburys_data(self, soup: BeautifulSoup) -> Dict[str, Any]: - """Extract data specifically from Sainsburys.""" - result = { - 'price': None, - 'title': None, - 'availability': True, - 'currency': 'GBP' - } - - # Sainsburys price selectors - price_selectors = [ - '.pd__cost__current-price', - '.pd__cost .pd__cost__retail-price', - '.pricing__now-price', - '.product-price__current', - '[data-testid="pd-retail-price"]', - '.price-per-unit' - ] - - for selector in price_selectors: - try: - elements = soup.select(selector) - for element in elements: - price_text = element.get_text(strip=True) - price = self._parse_uk_price(price_text) - if price is not None: - result['price'] = price - break - if result['price'] is not None: - break - except Exception as e: - logger.debug(f"Error with Sainsburys price selector {selector}: {e}") - - # Extract title - title_selectors = [ - '.pd__header h1', - 'h1[data-testid="pd-product-name"]', - '.product-name', - '.pd__product-name' - ] - - for selector in title_selectors: - try: - element = soup.select_one(selector) - if element: - result['title'] = element.get_text(strip=True) - break - except Exception as e: - logger.debug(f"Error with Sainsburys title selector {selector}: {e}") - - return result - - def _extract_booker_data(self, soup: BeautifulSoup) -> Dict[str, Any]: - """Extract data specifically from Booker.""" - result = { - 'price': None, - 'title': 
None, - 'availability': True, - 'currency': 'GBP' - } - - # Booker price selectors - price_selectors = [ - '.price', - '.product-price', - '.price-current', - '.selling-price', - '[data-testid="price"]', - '.product-tile-price' - ] - - for selector in price_selectors: - try: - elements = soup.select(selector) - for element in elements: - price_text = element.get_text(strip=True) - price = self._parse_uk_price(price_text) - if price is not None: - result['price'] = price - break - if result['price'] is not None: - break - except Exception as e: - logger.debug(f"Error with Booker price selector {selector}: {e}") - - # Extract title - title_selectors = [ - 'h1', - '.product-title', - '.product-name', - '.product-description h1', - '[data-testid="product-title"]' - ] - - for selector in title_selectors: - try: - element = soup.select_one(selector) - if element: - result['title'] = element.get_text(strip=True) - break - except Exception as e: - logger.debug(f"Error with Booker title selector {selector}: {e}") - - return result - - async def scrape_product_price(self, url: str, site_name: str = None) -> Dict[str, Any]: - """Enhanced scraping for UK catering sites.""" - result = { - 'success': False, - 'price': None, - 'currency': 'GBP', - 'title': None, - 'availability': None, - 'url': url, - 'error': None - } - - try: - # Auto-detect site if not provided - if not site_name: - site_name = self._detect_site(url) - if not site_name: - result['error'] = "Could not detect site from URL" - return result - - # Check if site is enabled - if not self.config.is_site_enabled(site_name): - result['error'] = f"Site {site_name} is disabled" - return result - - # Fetch page content - html_content = await self._fetch_page(url) - if not html_content: - result['error'] = "Failed to fetch page content" - return result - - # Parse HTML - soup = BeautifulSoup(html_content, 'html.parser') - - # Use specialized extraction based on site - if site_name == 'jjfoodservice': - extracted_data = self._extract_jjfoodservice_data(soup) - elif site_name == 'atoz_catering': - extracted_data = self._extract_atoz_data(soup) - elif site_name == 'amazon_uk': - extracted_data = self._extract_amazon_uk_data(soup) - elif site_name == 'tesco': - extracted_data = self._extract_tesco_data(soup) - elif site_name == 'sainsburys': - extracted_data = self._extract_sainsburys_data(soup) - elif site_name == 'booker': - extracted_data = self._extract_booker_data(soup) - else: - # Fall back to general extraction - return await super().scrape_product_price(url, site_name) - - if extracted_data['price'] is None: - result['error'] = "Could not extract price from page" - return result - - result.update({ - 'success': True, - 'price': extracted_data['price'], - 'currency': extracted_data.get('currency', 'GBP'), - 'title': extracted_data.get('title'), - 'availability': extracted_data.get('availability', True) - }) - - logger.info(f"Successfully scraped {site_name}: £{extracted_data['price']}") - - except Exception as e: - logger.error(f"Error scraping {url}: {e}") - result['error'] = str(e) - - return result - - def _detect_site(self, url: str) -> Optional[str]: - """Detect which UK catering site this URL belongs to.""" - url_lower = url.lower() - - if 'jjfoodservice.com' in url_lower: - return 'jjfoodservice' - elif 'atoz-catering.co.uk' in url_lower: - return 'atoz_catering' - elif 'amazon.co.uk' in url_lower: - return 'amazon_uk' - elif 'tesco.com' in url_lower: - return 'tesco' - elif 'sainsburys.co.uk' in url_lower: - return 'sainsburys' - elif 
'booker.co.uk' in url_lower: - return 'booker' - - # Fall back to parent detection for other sites - return super()._detect_site(url) diff --git a/src/web_ui.py b/src/web_ui.py index abd6923..fc264ec 100644 --- a/src/web_ui.py +++ b/src/web_ui.py @@ -268,4 +268,70 @@ def create_app(): fig = go.Figure(data=traces, layout=layout) return json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder) + @app.route('/edit_product/', methods=['GET', 'POST']) + def edit_product(product_id): + """Edit an existing product.""" + product = db_manager.get_product(product_id) + if not product: + flash('Product not found.', 'error') + return redirect(url_for('index')) + + form = ProductForm() + + if form.validate_on_submit(): + urls = {} + if form.jjfoodservice_url.data: + urls['jjfoodservice'] = form.jjfoodservice_url.data + if form.atoz_catering_url.data: + urls['atoz_catering'] = form.atoz_catering_url.data + if form.amazon_uk_url.data: + urls['amazon_uk'] = form.amazon_uk_url.data + + if not urls: + flash('Please provide at least one URL to track.', 'error') + return render_template('edit_product.html', form=form, product=product) + + try: + db_manager.update_product( + product_id=product_id, + name=form.name.data, + description=form.description.data, + target_price=form.target_price.data, + urls=urls + ) + flash(f'Product "{form.name.data}" updated successfully!', 'success') + return redirect(url_for('product_detail', product_id=product_id)) + except Exception as e: + flash(f'Error updating product: {str(e)}', 'error') + + # Pre-populate form with existing data + if request.method == 'GET': + form.name.data = product['name'] + form.description.data = product['description'] + form.target_price.data = product['target_price'] + + # URLs are already parsed as a dictionary by the database method + urls = product['urls'] if product['urls'] else {} + form.jjfoodservice_url.data = urls.get('jjfoodservice', '') + form.atoz_catering_url.data = urls.get('atoz_catering', '') + form.amazon_uk_url.data = urls.get('amazon_uk', '') + + return render_template('edit_product.html', form=form, product=product) + + @app.route('/delete_product/', methods=['POST']) + def delete_product(product_id): + """Delete a product.""" + product = db_manager.get_product(product_id) + if not product: + flash('Product not found.', 'error') + return redirect(url_for('index')) + + try: + db_manager.delete_product(product_id) + flash(f'Product "{product["name"]}" deleted successfully!', 'success') + except Exception as e: + flash(f'Error deleting product: {str(e)}', 'error') + + return redirect(url_for('index')) + return app diff --git a/templates/add_product.html b/templates/add_product.html index ac43c38..d796296 100644 --- a/templates/add_product.html +++ b/templates/add_product.html @@ -123,7 +123,8 @@
  • Make sure URLs point to the specific product page
  • Test URLs in your browser first to ensure they work
- • Some sites may block automated requests - we'll handle this gracefully
+ • The system will automatically prioritize delivery prices over collection prices
+ • For JJ Food Service and A to Z Catering, ensure you can see delivery pricing on the page
  • For best results, use direct product page URLs
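The tips above assume each pasted URL can be routed to the correct site-specific scraper. A minimal Python sketch of that routing by hostname follows; the `SITE_HOSTS` mapping and `detect_site` helper are illustrative names, not the project's actual identifiers.

```python
from typing import Optional
from urllib.parse import urlparse

# Hypothetical mapping from hostnames to the site keys used in the config.
SITE_HOSTS = {
    "jjfoodservice.com": "jjfoodservice",
    "atoz-catering.co.uk": "atoz_catering",
    "amazon.co.uk": "amazon_uk",
}

def detect_site(url: str) -> Optional[str]:
    """Return the configured site key for a product URL, or None if unknown."""
    host = urlparse(url).netloc.lower()
    for domain, site_key in SITE_HOSTS.items():
        if host == domain or host.endswith("." + domain):
            return site_key
    return None

print(detect_site("https://www.jjfoodservice.com/product/London-Enfield/BAC002/"))  # jjfoodservice
```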
@@ -154,13 +155,15 @@
JJ Food Service

Navigate to the specific product page on JJ Food Service and copy the URL.
- Make sure you're logged in for accurate pricing.
+ Make sure you're logged in for accurate pricing. The system will automatically
+ prioritize delivery prices over collection prices.

A to Z Catering

Go to the product page on A to Z Catering and copy the URL.
URLs typically contain "/products/product/" followed by the product name.
+ The system will automatically capture delivery pricing when available.

@@ -170,10 +173,11 @@ The URL should contain "/dp/" followed by the product identifier.

- Note
+ Delivery Pricing Priority

- We focus on UK catering supply websites that work well with automated price tracking.
- This provides reliable price monitoring for your business needs.
+ For JJ Food Service and A to Z Catering, the system automatically prioritizes
+ delivery prices over collection prices. This ensures you're tracking the
+ most relevant pricing for delivered goods to your business.

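For reference, the delivery-over-collection rule described in this template text can be reproduced with a small regex pass over the page text. The sketch below is a simplified stand-in (assuming the "Delivery:"/"Collection:" labels these sites display), not the scraper's actual `_parse_uk_price` implementation.

```python
import re
from typing import Optional

def pick_uk_price(page_text: str) -> Optional[float]:
    """Prefer the delivery price when both delivery and collection prices appear."""
    delivery = re.search(r'delivery:?\s*£(\d{1,3}(?:\.\d{2})?)', page_text, re.IGNORECASE)
    collection = re.search(r'collection:?\s*£(\d{1,3}(?:\.\d{2})?)', page_text, re.IGNORECASE)

    if delivery:
        return float(delivery.group(1))
    if collection:
        return float(collection.group(1))

    # Fall back to the first bare £ amount on the page.
    any_price = re.search(r'£(\d{1,3}(?:\.\d{2})?)', page_text)
    return float(any_price.group(1)) if any_price else None

print(pick_uk_price("Collection: £10.49  Delivery: £11.79"))  # 11.79
```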
diff --git a/templates/edit_product.html b/templates/edit_product.html
new file mode 100644
index 0000000..accccd8
--- /dev/null
+++ b/templates/edit_product.html
@@ -0,0 +1,190 @@
+{% extends "base.html" %}
+
+{% block title %}Edit Product - Price Tracker{% endblock %}
+
+{% block content %}
+
+
+
+
+

+ Edit Product: {{ product.name }} +

+
+
+
+ {{ form.hidden_tag() }} + +
+
+ {{ form.name.label(class="form-label fw-bold") }} + {{ form.name(class="form-control form-control-lg") }} + {% if form.name.errors %} +
+ {% for error in form.name.errors %} +
{{ error }}
+ {% endfor %} +
+ {% endif %} +
+
+ {{ form.target_price.label(class="form-label fw-bold") }} +
+ £ + {{ form.target_price(class="form-control form-control-lg") }} +
+ {% if form.target_price.errors %} +
+ {% for error in form.target_price.errors %} +
{{ error }}
+ {% endfor %} +
+ {% endif %} + Optional: Alert when price drops below this +
+
+ +
+ {{ form.description.label(class="form-label fw-bold") }} + {{ form.description(class="form-control", rows="3") }} + {% if form.description.errors %} +
+ {% for error in form.description.errors %} +
{{ error }}
+ {% endfor %} +
+ {% endif %} +
+ +
+
+ Store URLs +
+

Add URLs from the stores you want to track. At least one URL is required.

+ +
+
+ {{ form.jjfoodservice_url.label(class="form-label fw-bold") }} +
+ + + + {{ form.jjfoodservice_url(class="form-control", placeholder="https://www.jjfoodservice.com/...") }} +
+ {% if form.jjfoodservice_url.errors %} +
+ {% for error in form.jjfoodservice_url.errors %} +
{{ error }}
+ {% endfor %} +
+ {% endif %} +
+ +
+ {{ form.atoz_catering_url.label(class="form-label fw-bold") }} +
+ + + + {{ form.atoz_catering_url(class="form-control", placeholder="https://www.atoz-catering.co.uk/...") }} +
+ {% if form.atoz_catering_url.errors %} +
+ {% for error in form.atoz_catering_url.errors %} +
{{ error }}
+ {% endfor %} +
+ {% endif %} +
+
+ +
+
+ {{ form.amazon_uk_url.label(class="form-label fw-bold") }} +
+ + + + {{ form.amazon_uk_url(class="form-control", placeholder="https://www.amazon.co.uk/...") }} +
+ {% if form.amazon_uk_url.errors %} +
+ {% for error in form.amazon_uk_url.errors %} +
{{ error }}
+ {% endfor %} +
+ {% endif %} +
+
+ +
+ +
+
+ + + Cancel + +
+ + +
+ +
+
+
+ + +
+
+
+
+ How to find product URLs +
+
    +
+  • JJ Food Service: Search for your product and copy the URL from the product page
+  • A to Z Catering: Navigate to the specific product and copy the URL
+  • Amazon UK: Find the product and copy the URL (we'll extract the essential part)
+
+
+
+
+
+
+
+ + + + +{% endblock %} diff --git a/templates/index.html b/templates/index.html index 49a183a..53eac9e 100644 --- a/templates/index.html +++ b/templates/index.html @@ -97,6 +97,16 @@ Scrape Now +
+ + Edit + + +
@@ -181,4 +191,58 @@ {% endif %} + + + + + {% endblock %} diff --git a/templates/product_detail.html b/templates/product_detail.html index 676c128..e1de46e 100644 --- a/templates/product_detail.html +++ b/templates/product_detail.html @@ -14,6 +14,16 @@ + + Edit + + Back to Dashboard @@ -222,6 +232,35 @@ {% endif %} + + + {% endblock %} {% block scripts %} @@ -231,4 +270,20 @@ Plotly.newPlot('priceChart', chartData.data, chartData.layout, {responsive: true}); {% endif %} + + {% endblock %} diff --git a/test_actual_scraper.py b/test_actual_scraper.py new file mode 100644 index 0000000..b5d3631 --- /dev/null +++ b/test_actual_scraper.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +import asyncio +import sys +import os + +# Add the src directory to the path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) + +from uk_scraper import scrape_jj_foodservice + +async def test_actual_scraper(): + url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/" + + print(f"Testing actual scraper with URL: {url}") + print("=" * 60) + + try: + result = await scrape_jj_foodservice(url) + print(f"Scraper result: {result}") + + if result: + print(f"✅ Name: {result.get('name', 'Not found')}") + print(f"✅ Collection Price: £{result.get('collection_price', 'Not found')}") + print(f"✅ Delivery Price: £{result.get('delivery_price', 'Not found')}") + print(f"✅ Image URL: {result.get('image_url', 'Not found')}") + else: + print("❌ Scraper returned None") + + except Exception as e: + print(f"❌ Error occurred: {e}") + import traceback + traceback.print_exc() + +if __name__ == "__main__": + asyncio.run(test_actual_scraper()) diff --git a/test_jj_detailed.py b/test_jj_detailed.py new file mode 100644 index 0000000..eda891a --- /dev/null +++ b/test_jj_detailed.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +import asyncio +import aiohttp +import re +from bs4 import BeautifulSoup + +async def test_jj_patterns(): + url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/" + + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + html = await response.text() + + print(f"HTML content length: {len(html)}") + + # Look for various keywords + keywords = ['DELIVERY', 'delivery', 'COLLECTION', 'collection', '£10.49', '£11.79', '10.49', '11.79'] + + for keyword in keywords: + if keyword in html: + print(f"'{keyword}' FOUND in HTML") + # Find context around the keyword + index = html.find(keyword) + start = max(0, index - 100) + end = min(len(html), index + 100) + context = html[start:end] + print(f"Context: ...{context}...") + print() + else: + print(f"'{keyword}' NOT found in HTML") + + # Look for any price-like patterns + price_patterns = re.findall(r'£?(\d{1,3}\.\d{2})', html) + print(f"\nAll price patterns found: {price_patterns}") + + # Try to find price elements using BeautifulSoup + soup = BeautifulSoup(html, 'html.parser') + + # Look for specific CSS classes that might contain prices + price_selectors = [ + '.price', '.product-price', '.delivery-price', '.price-delivery', + '[class*="price"]', '[class*="Price"]' + ] + + for selector in price_selectors: + elements = soup.select(selector) + if elements: + print(f"\nFound elements with selector '{selector}':") + for elem in elements[:5]: # Show first 5 + print(f" - {elem.get_text(strip=True)}") + +if __name__ == "__main__": + asyncio.run(test_jj_patterns()) diff --git a/test_jj_simple.py b/test_jj_simple.py new file mode 100644 index 0000000..7b60a6f --- /dev/null +++ b/test_jj_simple.py @@ -0,0 +1,54 @@ 
+#!/usr/bin/env python3 +""" +Simple test to debug JJ Food Service scraping +""" + +import asyncio +import sys +import os +sys.path.append(os.path.dirname(__file__)) + +from src.uk_scraper import UKCateringScraper +from src.config import Config +import logging + +# Set up verbose logging +logging.basicConfig(level=logging.DEBUG, format='%(levelname)s - %(message)s') + +async def test_jj_scraping(): + config = Config() + + async with UKCateringScraper(config) as scraper: + url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/" + + print(f"Testing URL: {url}") + + # Get the raw HTML content + html_content = await scraper._fetch_page(url) + + if html_content: + print(f"HTML content length: {len(html_content)}") + print("First 500 characters of HTML:") + print(html_content[:500]) + print("\n" + "="*50 + "\n") + + # Look for delivery text + if 'DELIVERY' in html_content: + print("Found 'DELIVERY' in HTML content") + # Find the context around DELIVERY + delivery_pos = html_content.find('DELIVERY') + context = html_content[delivery_pos:delivery_pos+100] + print(f"Context around DELIVERY: {context}") + else: + print("'DELIVERY' not found in HTML content") + + # Look for any price patterns + import re + price_matches = re.findall(r'£(\d{1,3}(?:\.\d{2})?)', html_content) + print(f"All price patterns found: {price_matches}") + + else: + print("Failed to fetch HTML content") + +if __name__ == "__main__": + asyncio.run(test_jj_scraping()) diff --git a/test_regex_patterns.py b/test_regex_patterns.py new file mode 100644 index 0000000..04cc0c9 --- /dev/null +++ b/test_regex_patterns.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +""" +Test the exact regex patterns against the actual HTML content +""" + +import re +import asyncio +import aiohttp +from bs4 import BeautifulSoup + +async def test_jj_patterns(): + url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/" + + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + html_content = await response.text() + + soup = BeautifulSoup(html_content, 'html.parser') + page_text = soup.get_text(separator=' ') + + print(f"Page text length: {len(page_text)}") + + # Find the section with delivery info + delivery_start = page_text.lower().find('delivery') + if delivery_start >= 0: + delivery_section = page_text[delivery_start:delivery_start+200] + print(f"Delivery section: {delivery_section!r}") + + # Test the exact patterns + delivery_patterns = [ + r'Delivery:£(\d{1,3}\.\d{2})', # Delivery:£11.79 + r'DELIVERY:£(\d{1,3}\.\d{2})', # DELIVERY:£11.79 + r'delivery:£(\d{1,3}\.\d{2})', # delivery:£11.79 + r'DELIVERY:\s*£(\d{1,3}\.\d{2})', # DELIVERY: £11.79 + r'delivery:\s*£(\d{1,3}\.\d{2})', # delivery: £11.79 + ] + + for pattern in delivery_patterns: + match = re.search(pattern, page_text, re.IGNORECASE) + if match: + print(f"✅ Pattern '{pattern}' matched! 
Price: £{match.group(1)}") + return float(match.group(1)) + else: + print(f"❌ Pattern '{pattern}' did not match") + + print("No delivery patterns matched!") + return None + +if __name__ == "__main__": + result = asyncio.run(test_jj_patterns()) + print(f"Final result: {result}") diff --git a/test_scraper.py b/test_scraper.py new file mode 100644 index 0000000..37f29b2 --- /dev/null +++ b/test_scraper.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +""" +Test script to debug scraping issues for JJ Food Service and A to Z Catering +""" + +import sys +import os +import asyncio +sys.path.append(os.path.join(os.path.dirname(__file__))) + +from src.uk_scraper import UKCateringScraper +from src.config import Config +import logging + +# Set up logging +logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s') + +async def test_scraping(): + config = Config() + + async with UKCateringScraper(config) as scraper: + # Test URLs that were problematic + test_urls = [ + "https://www.jjfoodservice.com/catering-products/confectionery-and-snacks/chocolate/cadbury-dairy-milk-chocolate-bar-110g", + "https://www.atozcatering.co.uk/catering-equipment/refrigeration/prep-fridges/polar-single-door-prep-counter-fridge-240ltr", + "https://www.atozcatering.co.uk/catering-equipment/cooking-equipment/fryers/buffalo-single-tank-induction-fryer-5ltr" + ] + + for url in test_urls: + print(f"\n{'='*80}") + print(f"Testing URL: {url}") + print(f"{'='*80}") + + try: + result = await scraper.scrape_product(url) + if result: + print(f"Success! Result: {result}") + else: + print("Failed to scrape product") + except Exception as e: + print(f"Error: {e}") + import traceback + traceback.print_exc() + +if __name__ == "__main__": + asyncio.run(test_scraping()) diff --git a/test_special_pricing.py b/test_special_pricing.py new file mode 100644 index 0000000..e9904a9 --- /dev/null +++ b/test_special_pricing.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +""" +Test script for special pricing detection in UK scraper. +This script tests various special pricing scenarios to ensure the enhanced detection works correctly. +""" + +import sys +import os +import asyncio +import logging +from bs4 import BeautifulSoup + +# Add the src directory to the path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) + +from uk_scraper import UKCateringScraper +from config import Config + +# Set up logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def create_test_html_scenarios(): + """Create test HTML scenarios for different special pricing patterns.""" + + scenarios = { + 'strikethrough_pricing': """ +
+ £15.99 + £12.99 +
+ """, + + 'was_now_pricing': """ +
+ Was £20.50, now £17.25 +
+ """, + + 'offer_label_pricing': """ +
+ SPECIAL OFFER + £8.99 +
+ """, + + 'delivery_special_pricing': """ +
+

Delivery: £25.00 £19.99

+
+ """, + + 'multiple_prices_no_context': """ +
+ £15.99 + £12.99 +
+ """, + + 'amazon_deal_pricing': """ +
+ £29.99 + £24.99 +
+ """, + + 'jj_member_pricing': """ +
+ £18.50 + Member price: £15.25 +
+ """, + + 'atoz_h3_delivery': """ +

Delivery: Was £22.00 Now £18.50

+ """, + + 'percentage_discount': """ +
+ 20% OFF + RRP £25.00 + £20.00 +
+ """ + } + + return scenarios + + +async def test_special_pricing_scenarios(): + """Test the special pricing detection with various scenarios.""" + + # Initialize the scraper + config = Config() + scraper = UKCateringScraper(config) + + scenarios = create_test_html_scenarios() + + print("Testing Special Pricing Detection") + print("=" * 50) + + for scenario_name, html_content in scenarios.items(): + print(f"\nTesting: {scenario_name}") + print("-" * 30) + + # Parse the HTML + soup = BeautifulSoup(html_content, 'html.parser') + + # Test with different sites + for site_name in ['jjfoodservice', 'atoz_catering', 'amazon_uk']: + print(f"\n {site_name}:") + + try: + # Test special offer detection + special_prices = scraper._find_special_offer_prices(soup, site_name) + if special_prices: + best_price = min(price for price, _ in special_prices) + print(f" ✓ Special offers found: {special_prices}") + print(f" ✓ Best price: £{best_price}") + else: + print(f" ✗ No special offers detected") + + # Test the extraction methods + if site_name == 'jjfoodservice': + result = scraper._extract_jjfoodservice_data(soup) + elif site_name == 'atoz_catering': + result = scraper._extract_atoz_catering_data(soup) + elif site_name == 'amazon_uk': + result = scraper._extract_amazon_uk_data(soup) + + if result['price']: + print(f" ✓ Extracted price: £{result['price']}") + else: + print(f" ✗ No price extracted") + + except Exception as e: + print(f" ✗ Error: {e}") + + +def test_parse_uk_price_functionality(): + """Test the enhanced _parse_uk_price function.""" + + config = Config() + scraper = UKCateringScraper(config) + + print("\n\nTesting _parse_uk_price Functionality") + print("=" * 50) + + test_cases = [ + ("£15.99", False, False, 15.99), + ("Was £20.00 Now £15.99", False, True, 15.99), + ("£25.50 £19.99", False, True, 19.99), + ("Delivery: £12.50", True, False, 12.50), + ("Collection: £10.00 Delivery: £12.50", True, False, 12.50), + ("RRP £30.00 Sale £24.99", False, True, 24.99), + ("Save £5.00! Was £25.00 Now £20.00", False, True, 20.00), + ] + + for i, (price_text, prefer_delivery, detect_special, expected) in enumerate(test_cases, 1): + print(f"\nTest {i}: '{price_text}'") + print(f" prefer_delivery={prefer_delivery}, detect_special={detect_special}") + + # Create a mock element for testing + mock_html = f"{price_text}" + mock_element = BeautifulSoup(mock_html, 'html.parser').find('span') + + result = scraper._parse_uk_price( + price_text, + prefer_delivery=prefer_delivery, + detect_special_offers=detect_special, + element=mock_element + ) + + if result == expected: + print(f" ✓ Result: £{result} (Expected: £{expected})") + else: + print(f" ✗ Result: £{result} (Expected: £{expected})") + + +def test_special_pricing_context(): + """Test the special pricing context detection.""" + + config = Config() + scraper = UKCateringScraper(config) + + print("\n\nTesting Special Pricing Context Detection") + print("=" * 50) + + context_test_cases = [ + ('
£20.00£15.99
', 'strikethrough'), + ('
Was £25.00 Now £19.99
', 'was_now'), + ('
£12.99
', 'offer_label'), + ('
£18.00£14.99
', 'inline_strikethrough'), + ] + + for i, (html_content, test_type) in enumerate(context_test_cases, 1): + print(f"\nTest {i}: {test_type}") + print(f" HTML: {html_content}") + + soup = BeautifulSoup(html_content, 'html.parser') + element = soup.find(['span', 'div']) + + if element: + context = scraper._extract_special_pricing_context(element) + print(f" ✓ Context: {context}") + else: + print(f" ✗ No element found") + + +if __name__ == "__main__": + print("UK Scraper Special Pricing Test Suite") + print("=" * 60) + + # Test the price parsing functionality + test_parse_uk_price_functionality() + + # Test special pricing context detection + test_special_pricing_context() + + # Test full scenarios + asyncio.run(test_special_pricing_scenarios()) + + print("\n" + "=" * 60) + print("Test suite completed!") diff --git a/validate_fix.py b/validate_fix.py new file mode 100644 index 0000000..4e908c7 --- /dev/null +++ b/validate_fix.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +""" +Quick validation that the A to Z Catering pricing is working correctly +""" + +import sys +import os +import asyncio + +# Add the src directory to the path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) + +async def validate_atoz_pricing(): + """Test the A to Z Catering pricing fix.""" + + try: + from uk_scraper import UKCateringScraper + from config import Config + + print("Testing A to Z Catering pricing fix...") + print("=" * 50) + + config = Config() + scraper = UKCateringScraper(config) + + # Test the problematic URL + url = 'https://www.atoz-catering.co.uk/products/product/coca-cola-cans--coke-gb---24' + + print(f"Testing URL: {url}") + print("Expected price: £12.99 (not £1.39)") + print("Testing...") + + result = await scraper.scrape_product_price(url, 'atoz_catering') + + print(f"\nResults:") + print(f"Success: {result['success']}") + + if result['success'] and result['price']: + price = result['price'] + print(f"Price found: £{price}") + + if price == 12.99: + print("✅ FIXED! Correct price detected (£12.99)") + elif price == 1.39: + print("❌ STILL BROKEN! Wrong price detected (£1.39)") + else: + print(f"⚠️ Different price detected: £{price}") + else: + print(f"❌ Failed to scrape: {result.get('error', 'Unknown error')}") + + except Exception as e: + print(f"❌ Error: {e}") + import traceback + traceback.print_exc() + +if __name__ == "__main__": + asyncio.run(validate_atoz_pricing())
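The special-pricing scenarios exercised above (strikethrough, was/now, offer labels, RRP/sale) all reduce to the same question: when several £ amounts appear together, which one is actually being charged? A hedged sketch of that selection rule follows; it is illustrative only and not the scraper's `_parse_uk_price` implementation.

```python
import re
from typing import Optional

def current_offer_price(price_text: str) -> Optional[float]:
    """Pick the 'now' price from was/now or multi-price text.

    Simplifying assumption: when several £ amounts appear, the last one
    quoted is the current offer, which matches the was/now, RRP/sale and
    strikethrough layouts in the test scenarios above.
    """
    # Prefer an explicit "now £X.XX" if present.
    now = re.search(r'now\s*£(\d{1,3}(?:\.\d{2})?)', price_text, re.IGNORECASE)
    if now:
        return float(now.group(1))

    prices = [float(p) for p in re.findall(r'£(\d{1,3}(?:\.\d{2})?)', price_text)]
    return prices[-1] if prices else None

print(current_offer_price("Was £20.50, now £17.25"))  # 17.25
print(current_offer_price("RRP £25.00 Sale £20.00"))  # 20.0
```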