scrape fix

.github/workflows/advanced-mirror.yml (new file, vendored, 68 lines)
@@ -0,0 +1,68 @@
name: Advanced Mirror to Azure DevOps

on:
  push:
    branches: [ main, master, develop ]
  pull_request:
    types: [closed]
    branches: [ main, master ]
  workflow_dispatch:
    inputs:
      force_push:
        description: 'Force push to Azure DevOps'
        required: false
        default: 'false'

jobs:
  mirror:
    runs-on: ubuntu-latest
    if: github.event_name == 'push' || (github.event_name == 'pull_request' && github.event.pull_request.merged == true) || github.event_name == 'workflow_dispatch'

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Setup Git
        run: |
          git config --global user.name "GitHub Mirror Bot"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"

      - name: Add Azure DevOps Remote
        env:
          AZURE_DEVOPS_TOKEN: ${{ secrets.AZURE_DEVOPS_PAT }}
        run: |
          # URL encode the repository name for spaces
          ENCODED_URL="https://oauth2:${AZURE_DEVOPS_TOKEN}@dev.azure.com/ptslondon/_git/Price%20Tracker"
          git remote add azure "$ENCODED_URL"

      - name: Mirror Repository
        env:
          FORCE_PUSH: ${{ github.event.inputs.force_push }}
        run: |
          # Set force flag
          FORCE_FLAG=""
          if [ "$FORCE_PUSH" = "true" ] || [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            FORCE_FLAG="--force"
          fi

          # Push current branch
          CURRENT_BRANCH=${GITHUB_REF#refs/heads/}
          echo "Mirroring branch: $CURRENT_BRANCH"

          git push azure "$CURRENT_BRANCH" $FORCE_FLAG

          # Push tags
          git push azure --tags $FORCE_FLAG

          echo "✅ Successfully mirrored to Azure DevOps"

      - name: Verify Mirror
        run: |
          echo "Mirror completed for:"
          echo "- Repository: Price Tracker"
          echo "- Branch: ${GITHUB_REF#refs/heads/}"
          echo "- Commit: ${{ github.sha }}"
          echo "- Azure DevOps URL: https://dev.azure.com/ptslondon/_git/Price%20Tracker"

.github/workflows/mirror-to-azure.yml (new file, vendored, 34 lines)
@@ -0,0 +1,34 @@
name: Mirror to Azure DevOps

on:
  push:
    branches: [ main, master, develop ]  # Add branches you want to mirror
  workflow_dispatch:  # Allows manual triggering

jobs:
  mirror:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # Fetch full history for complete mirror

      - name: Mirror to Azure DevOps
        env:
          AZURE_DEVOPS_URL: https://dev.azure.com/ptslondon/_git/Price%20Tracker
          AZURE_DEVOPS_TOKEN: ${{ secrets.AZURE_DEVOPS_PAT }}
        run: |
          # Configure git
          git config --global user.name "GitHub Mirror Bot"
          git config --global user.email "noreply@github.com"

          # Add Azure DevOps as remote
          git remote add azure https://oauth2:${AZURE_DEVOPS_TOKEN}@dev.azure.com/ptslondon/_git/Price%20Tracker

          # Push all branches and tags
          git push azure --all --force
          git push azure --tags --force

          echo "Successfully mirrored to Azure DevOps"

README.md (20 lines changed)
@@ -4,13 +4,14 @@ A comprehensive web scraper for tracking product prices across multiple e-commerce sites
 
 ## Features ✨
 
-- **Multi-site Price Tracking**: Monitor prices across Amazon, eBay, Walmart, and more
+- **Multi-site Price Tracking**: Monitor prices across JJ Food Service, A to Z Catering, and Amazon UK
 - **Beautiful Web UI**: Clean, responsive interface for managing products and viewing price history
 - **Price Alerts**: Get notified when products reach your target price
 - **Historical Data**: View price trends with interactive charts
 - **Automated Scraping**: Schedule regular price checks
 - **Multiple Notifications**: Email and webhook notifications
 - **Robust Scraping**: Built-in retry logic, rotating user agents, and rate limiting
+- **Special Pricing Detection**: Automatically detects and prioritizes delivery prices and special offers
 
 ## Quick Start 🚀
 
@@ -106,13 +107,20 @@ Add new e-commerce sites by extending the sites configuration:
 ```json
 {
   "sites": {
-    "your_site": {
+    "atoz_catering": {
       "enabled": true,
-      "base_url": "https://www.yoursite.com",
+      "base_url": "https://www.atoz-catering.co.uk",
       "selectors": {
-        "price": [".price", ".cost"],
-        "title": [".product-title"],
-        "availability": [".stock-status"]
+        "price": [
+          ".my-price.price-offer",
+          ".delivery-price",
+          ".price"
+        ],
+        "special_offer": [
+          ".my-price.price-offer",
+          ".special-offer",
+          "del:contains('£')"
+        ]
       }
     }
   }
 

SCRAPER_ARCHITECTURE.md (new file, 80 lines)
@@ -0,0 +1,80 @@
# Price Tracker - Scraper Architecture

## Current Structure

### 1. **`scraper.py` - Base Scraper Class**
- **Purpose**: Foundation class for all price scraping
- **Handles**: Generic e-commerce sites (Amazon.com, eBay, Walmart, etc.)
- **Key Features**:
  - Base `PriceScraper` class with HTTP session management
  - Anti-bot measures (headers, delays, retries)
  - Generic price extraction methods
  - Site detection logic

### 2. **`uk_scraper.py` - UK Catering Specialist**
- **Purpose**: Specialized scraper for UK catering supply websites
- **Handles**: JJ Food Service, A to Z Catering, Amazon UK
- **Key Features**:
  - Inherits from the `PriceScraper` base class
  - UK currency handling (£ symbol)
  - Delivery vs Collection price prioritization
  - Special pricing detection (offers, strikethrough, was/now pricing)
  - Site-specific CSS selectors (e.g., `.my-price.price-offer` for A to Z)

### 3. **`scraper_manager.py` - Orchestration Layer**
- **Purpose**: Routes scraping tasks to the appropriate scraper
- **Logic** (a sketch of the routing decision follows this list):
  - Detects UK catering sites → uses `UKCateringScraper`
  - Detects other sites → uses base `PriceScraper`
  - Manages concurrent requests and error handling
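
A minimal sketch of that routing decision, based on the `should_use_uk_scraper` helper and site sets this commit adds to `scraper.py` (the hostname checks mirror `_detect_site`; an illustration, not the full implementation):

```python
from urllib.parse import urlparse
from typing import Optional

# Site keys handled by UKCateringScraper; everything else falls back to PriceScraper
UK_CATERING_SITES = {'jjfoodservice', 'atoz_catering', 'amazon_uk'}

def detect_site(url: str) -> Optional[str]:
    """Map a product URL to a site key (mirrors PriceScraper._detect_site)."""
    domain = urlparse(url).netloc.lower()
    if 'jjfoodservice.com' in domain:
        return 'jjfoodservice'
    if 'atoz-catering.co.uk' in domain:
        return 'atoz_catering'
    if 'amazon.co.uk' in domain:
        return 'amazon_uk'
    return None  # unknown -> generic PriceScraper path

def should_use_uk_scraper(url: str) -> bool:
    """Route UK catering URLs to UKCateringScraper."""
    return detect_site(url) in UK_CATERING_SITES

print(should_use_uk_scraper("https://www.jjfoodservice.com/product/X"))  # True
```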

## Site Mapping

### UK Catering Sites (UKCateringScraper):
- `jjfoodservice` → JJ Food Service
- `atoz_catering` → A to Z Catering
- `amazon_uk` → Amazon UK

### International Sites (PriceScraper):
- `amazon` → Amazon.com
- `ebay` → eBay
- `walmart` → Walmart
- *(Future sites can be added here)*

## Key Benefits of Current Structure

✅ **Separation of Concerns**: UK-specific logic is isolated
✅ **Extensibility**: Easy to add new UK sites or international sites
✅ **Maintainability**: Changes to UK logic don't affect international scraping
✅ **Specialization**: UK scraper handles currency, delivery pricing, special offers

## Recommendations

### ✅ **KEEP CURRENT STRUCTURE** - It's well-designed!

The separation between `scraper.py` and `uk_scraper.py` is **good architecture** because:

1. **UK catering sites have unique requirements** (delivery vs collection, £ pricing, special offers)
2. **International sites have different patterns** (USD pricing, different site structures)
3. **Each scraper is easy to maintain and extend** independently

### Minor Improvements Made:

1. **Enhanced site detection** in the base scraper
2. **Added helper methods** to determine scraper routing
3. **Improved scraper manager** logic for clarity
4. **Fixed A to Z pricing** with the `.my-price.price-offer` selector

## Final File Structure

```
src/
├── scraper.py          # Base scraper (international sites)
├── uk_scraper.py       # UK catering specialist
├── scraper_manager.py  # Orchestration layer
├── config.py           # Configuration management
├── database.py         # Data persistence
└── web_ui.py           # Flask web interface
```

This structure supports both current UK catering needs and future expansion to international e-commerce sites.

SPECIAL_PRICING.md (new file, 177 lines)
@@ -0,0 +1,177 @@
# Special Pricing Features - Price Tracker

## Overview

The UK Price Tracker now includes enhanced special pricing detection to identify and prioritize discounted, sale, and special offer prices across the supported UK catering sites.

## Features

### 🎯 Special Pricing Detection
- **Strikethrough Pricing**: Detects crossed-out prices alongside sale prices
- **Was/Now Patterns**: Identifies "Was £X Now £Y" pricing patterns
- **Offer Labels**: Recognizes sale/discount/special offer badges and containers
- **Percentage Discounts**: Detects "X% OFF" promotional pricing
- **Member/Trade Pricing**: Special pricing for registered customers (JJ Food Service)

### 🚚 Delivery Price Priority
- Automatically prioritizes delivery prices over collection prices (a simplified sketch of the rule follows this list)
- Identifies delivery-specific special offers
- Handles mixed pricing scenarios (delivery vs collection vs general)
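
The delivery-over-collection rule lives in the `prefer_delivery` flag on `_parse_uk_price` in `uk_scraper.py`; here is a trimmed-down sketch of its behaviour (simplified from the real method, which also strips labels and bounds the accepted price range):

```python
import re
from typing import Optional

def parse_uk_price(price_text: str, prefer_delivery: bool = False) -> Optional[float]:
    """Extract the first £-prefixed price from a short text snippet."""
    if not price_text or len(price_text) > 100:
        return None  # long blocks rarely contain just a price
    text = price_text.lower()
    # A snippet that mentions collection but not delivery is skipped when
    # delivery prices are preferred, so a delivery snippet can win instead.
    if prefer_delivery and 'collection' in text and 'delivery' not in text:
        return None
    match = re.search(r'£(\d{1,3}(?:\.\d{2})?)', text)
    return float(match.group(1)) if match else None

print(parse_uk_price('Collection: £10.49', prefer_delivery=True))  # None
print(parse_uk_price('Delivery: £11.79', prefer_delivery=True))    # 11.79
```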

### 🏪 Site-Specific Enhancements

#### JJ Food Service
- Member pricing detection
- Trade pricing identification
- Bulk discount recognition
- Quantity-based pricing

#### A to Z Catering
- Header-based delivery pricing (H3/H4 elements)
- Inline strikethrough detection
- Special delivery offer containers
- Style-based strikethrough recognition

#### Amazon UK
- Deal price detection
- Strike-through pricing
- Sale badge recognition
- RRP vs Sale price comparison

## Configuration

Special pricing is configured in `config.json`:

```json
{
  "scraping": {
    "special_pricing": {
      "enabled": true,
      "prefer_delivery_prices": true,
      "detect_strikethrough": true,
      "detect_was_now_patterns": true,
      "detect_percentage_discounts": true,
      "min_discount_threshold": 0.05,
      "max_price_difference_ratio": 0.5
    }
  },
  "sites": {
    "jjfoodservice": {
      "selectors": {
        "special_offer": [
          ".special-offer",
          ".member-price",
          "del:contains('£')",
          ".was-price"
        ]
      }
    }
  }
}
```

## Testing

### Test Suite
Run the comprehensive test suite:
```bash
python test_special_pricing.py
```

This tests:
- Price parsing with various formats
- Special pricing context detection
- Site-specific extraction methods
- Mock HTML scenarios

### Debug Tool
Debug real URLs:
```bash
python debug_special_pricing.py <URL> [--verbose]
```

Examples:
```bash
# Debug a JJ Food Service product
python debug_special_pricing.py "https://www.jjfoodservice.com/product/example" --verbose

# Debug an A to Z Catering product
python debug_special_pricing.py "https://www.atoz-catering.co.uk/product/example"

# Debug an Amazon UK product
python debug_special_pricing.py "https://www.amazon.co.uk/product/example"
```

## How It Works

### 1. Context Detection
The scraper analyzes HTML elements and their parent containers to detect special pricing context (a minimal sketch follows this list):
- Strikethrough elements (`<del>`, `<s>`, `<strike>`)
- CSS styling (`text-decoration: line-through`)
- Keyword patterns (`was`, `now`, `sale`, `offer`, `discount`)
- Percentage discount patterns (`20% off`, etc.)
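
Concretely, the checks reduce to markup and keyword tests like these (a minimal sketch of what `_extract_special_pricing_context` in `uk_scraper.py` looks for; the real method also walks parent containers and collects the prices it finds):

```python
import re
from bs4 import BeautifulSoup

# Keyword signals that a price sits in a special-offer context
OFFER_PATTERN = re.compile(
    r'\bsale\b|\boffer\b|\bdiscount\b|\bwas\s*£|\bnow\s*£|\b\d+%\s*off\b')

def has_special_context(html: str) -> bool:
    """Return True if the fragment shows any special-pricing signal."""
    soup = BeautifulSoup(html, 'html.parser')
    if soup.find_all(['del', 's', 'strike']):                 # strikethrough markup
        return True
    if soup.find(style=re.compile(r'line-through')):          # CSS strikethrough
        return True
    return bool(OFFER_PATTERN.search(soup.get_text().lower()))  # keyword patterns

print(has_special_context('<del>£15.99</del><span>£12.99</span>'))  # True
print(has_special_context('<span>Was £20.50, now £17.25</span>'))   # True
print(has_special_context('<span class="price">£4.99</span>'))      # False
```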

### 2. Price Extraction
When multiple prices are found (the selection rule is sketched below):
- **With special context**: Returns the lowest price (the offer price)
- **Delivery preference**: Prioritizes delivery over collection prices
- **Multiple prices**: Takes the last/lowest price found
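
As a sketch (assuming delivery filtering has already happened upstream, as in the extraction methods):

```python
from typing import List, Optional

def choose_price(prices: List[float], special_context: bool) -> Optional[float]:
    """Selection rule: in an offer context the lowest price is the offer price."""
    if not prices:
        return None
    if special_context:
        return min(prices)  # e.g. [15.99, 12.99] -> 12.99
    return prices[-1]       # otherwise take the last price found

print(choose_price([15.99, 12.99], special_context=True))   # 12.99
print(choose_price([15.99, 12.99], special_context=False))  # 12.99 (last found)
```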

### 3. Site-Specific Logic
Each site has tailored extraction methods:
- **JJ Food Service**: Focuses on member/trade pricing
- **A to Z Catering**: Enhanced header and delivery price detection
- **Amazon UK**: Deal and promotional price recognition

## Examples

### Strikethrough Pricing
```html
<div class="product-price">
  <del>£15.99</del>
  <span class="sale-price">£12.99</span>
</div>
```
**Result**: £12.99 (special offer detected)

### Was/Now Pricing
```html
<div class="price-container">
  <span>Was £20.50, now £17.25</span>
</div>
```
**Result**: £17.25 (was/now pattern detected)

### Delivery Special Offers
```html
<h3>Delivery: <del>£25.00</del> £19.99</h3>
```
**Result**: £19.99 (delivery + special offer)

## Troubleshooting

### No Special Prices Detected
1. Check if the site uses non-standard markup
2. Add custom selectors to `config.json`
3. Use the debug tool to see which selectors are matching
4. Verify special pricing is enabled in the config

### Wrong Price Selected
1. Check if the delivery preference is correctly configured
2. Verify the HTML structure matches the expected patterns
3. Use verbose debugging to see all detected prices
4. Consider adding site-specific selectors

### Performance Issues
1. Reduce the number of special offer selectors
2. Increase delays between requests
3. Use more specific CSS selectors
4. Enable only the special pricing features you need

## Future Enhancements

- **Machine Learning**: Auto-detect pricing patterns
- **More Sites**: Extend to additional UK catering suppliers
- **Price History**: Track special offer frequency and patterns
- **Alerts**: Notify when special offers are detected
- **Comparison**: Cross-site special offer comparison

debug_atoz_pricing.py (new file, 198 lines)
@@ -0,0 +1,198 @@
#!/usr/bin/env python3
"""
Debug script specifically for A to Z Catering pricing issues
"""

import requests
from bs4 import BeautifulSoup
import re
import sys
import os

# Add the src directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

def fetch_and_analyze_atoz_page(url):
    """Fetch and analyze the A to Z page to identify pricing issues."""

    print(f"Analyzing A to Z page: {url}")
    print("=" * 80)

    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=30)
        print(f"HTTP Status: {response.status_code}")

        if response.status_code != 200:
            print("Failed to fetch page")
            return

        soup = BeautifulSoup(response.text, 'html.parser')

        # 1. Find all elements containing prices
        print("\n1. ALL PRICE ELEMENTS FOUND:")
        print("-" * 40)
        price_pattern = re.compile(r'£\d+\.?\d*')
        price_elements = soup.find_all(string=price_pattern)

        for i, price_text in enumerate(price_elements):
            parent = price_text.parent if hasattr(price_text, 'parent') else None
            parent_class = parent.get('class', []) if parent else []
            parent_tag = parent.name if parent else 'N/A'

            print(f"  {i+1:2d}. '{price_text.strip()}' in <{parent_tag}> class={parent_class}")

        # 2. Check for delivery-specific elements
        print("\n2. DELIVERY-RELATED ELEMENTS:")
        print("-" * 40)
        delivery_keywords = ['delivery', 'delivered']

        for keyword in delivery_keywords:
            elements = soup.find_all(string=re.compile(keyword, re.IGNORECASE))
            for elem in elements[:5]:  # Show first 5
                parent = elem.parent if hasattr(elem, 'parent') else None
                parent_class = parent.get('class', []) if parent else []
                text = elem.strip()[:100]
                print(f"  '{text}' in class={parent_class}")

        # 3. Check h3 and h4 elements (A to Z specific)
        print("\n3. H3/H4 ELEMENTS WITH PRICES:")
        print("-" * 40)
        headers = soup.find_all(['h3', 'h4'])
        for header in headers:
            text = header.get_text(strip=True)
            if '£' in text:
                print(f"  <{header.name}>: {text}")

        # 4. Test specific selectors from our config
        print("\n4. TESTING OUR SELECTORS:")
        print("-" * 40)

        test_selectors = [
            '.delivery-price',
            '.price-delivery',
            '.price',
            '.product-price',
            '.collection-price',
            'span:contains("£")',
            'h3:contains("Delivery")',
            'h4:contains("Delivery")',
            '*[class*="price"]'
        ]

        for selector in test_selectors:
            try:
                if ':contains(' in selector:
                    # Handle contains selectors differently
                    if 'h3:contains("Delivery")' == selector:
                        elements = [h for h in soup.find_all('h3') if 'delivery' in h.get_text().lower()]
                    elif 'h4:contains("Delivery")' == selector:
                        elements = [h for h in soup.find_all('h4') if 'delivery' in h.get_text().lower()]
                    elif 'span:contains("£")' == selector:
                        elements = [s for s in soup.find_all('span') if '£' in s.get_text()]
                    else:
                        elements = []
                else:
                    elements = soup.select(selector)

                if elements:
                    print(f"  ✓ {selector} -> {len(elements)} elements:")
                    for i, elem in enumerate(elements[:3]):  # Show first 3
                        text = elem.get_text(strip=True)
                        if '£' in text:
                            print(f"    [{i+1}] {text}")
                else:
                    print(f"  ✗ {selector} -> No elements")

            except Exception as e:
                print(f"  ⚠ {selector} -> Error: {e}")

        # 5. Look for the specific prices mentioned (12.99 and 1.39)
        print("\n5. SPECIFIC PRICE ANALYSIS:")
        print("-" * 40)

        if '12.99' in response.text:
            print("✓ £12.99 found in page content")
            # Find context around 12.99
            matches = list(re.finditer(r'12\.99', response.text))
            for match in matches[:3]:  # Show first 3 occurrences
                start = max(0, match.start() - 100)
                end = min(len(response.text), match.end() + 100)
                context = response.text[start:end].replace('\n', ' ').replace('\t', ' ')
                print(f"  Context: ...{context}...")
        else:
            print("✗ £12.99 NOT found in page content")

        if '1.39' in response.text:
            print("✓ £1.39 found in page content")
            # Find context around 1.39
            matches = list(re.finditer(r'1\.39', response.text))
            for match in matches[:3]:  # Show first 3 occurrences
                start = max(0, match.start() - 100)
                end = min(len(response.text), match.end() + 100)
                context = response.text[start:end].replace('\n', ' ').replace('\t', ' ')
                print(f"  Context: ...{context}...")
        else:
            print("✗ £1.39 NOT found in page content")

        # 6. Try to simulate our current parsing logic
        print("\n6. SIMULATING CURRENT PARSING LOGIC:")
        print("-" * 40)

        # Test our general price selectors
        general_selectors = [
            '.price',
            '.product-price',
            'span:contains("£")',
            '.price-value',
        ]

        found_prices = []
        for selector in general_selectors:
            try:
                if selector == 'span:contains("£")':
                    elements = [s for s in soup.find_all('span') if '£' in s.get_text()]
                else:
                    elements = soup.select(selector)

                for element in elements:
                    price_text = element.get_text(strip=True)
                    if '£' in price_text:
                        # Extract price using regex
                        price_matches = re.findall(r'£(\d+\.?\d*)', price_text)
                        for match in price_matches:
                            try:
                                price_value = float(match)
                                found_prices.append((price_value, selector, price_text))
                            except ValueError:
                                pass

            except Exception as e:
                print(f"Error with {selector}: {e}")

        print(f"Found {len(found_prices)} prices total:")
        for price, selector, text in found_prices:
            print(f"  £{price} from '{selector}': {text[:50]}")

        if found_prices:
            # Show what our current logic would select
            min_price = min(price for price, _, _ in found_prices)
            max_price = max(price for price, _, _ in found_prices)
            last_price = found_prices[-1][0] if found_prices else None

            print(f"\nCurrent logic would likely select:")
            print(f"  Minimum price: £{min_price}")
            print(f"  Maximum price: £{max_price}")
            print(f"  Last price found: £{last_price}")

    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    url = "https://www.atoz-catering.co.uk/products/product/coca-cola-cans--coke-gb---24"
    fetch_and_analyze_atoz_page(url)

debug_jj.py (new file, 34 lines)
@@ -0,0 +1,34 @@
#!/usr/bin/env python3
"""
Debug script to test JJ Food Service scraping
"""

import asyncio
import logging
import sys
import os

# Add the src directory to the path
sys.path.append(os.path.join(os.path.dirname(__file__)))

from src.config import Config
from src.uk_scraper import UKCateringScraper

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

async def test_jj_scraping():
    config = Config()

    print(f"JJ Food Service enabled: {config.is_site_enabled('jjfoodservice')}")
    print(f"A to Z enabled: {config.is_site_enabled('atoz_catering')}")

    url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/"

    async with UKCateringScraper(config) as scraper:
        print(f"\nTesting JJ Food Service URL: {url}")
        result = await scraper.scrape_product_price(url, 'jjfoodservice')
        print(f"Result: {result}")

if __name__ == "__main__":
    asyncio.run(test_jj_scraping())

debug_special_pricing.py (new file, 160 lines)
@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""
Special Pricing Debug Tool for UK Price Tracker

This tool helps debug and monitor special pricing detection on real websites.
It can be used to test URLs and see exactly what pricing information is being detected.
"""

import sys
import os
import asyncio
import logging
import argparse
from typing import Dict, Any

# Add the src directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from uk_scraper import UKCateringScraper
from config import Config

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def detect_site_from_url(url: str) -> str:
    """Detect which site the URL belongs to."""
    if 'jjfoodservice.com' in url:
        return 'jjfoodservice'
    elif 'atoz-catering.co.uk' in url:
        return 'atoz_catering'
    elif 'amazon.co.uk' in url:
        return 'amazon_uk'
    else:
        return 'unknown'


async def debug_url_pricing(url: str, verbose: bool = False):
    """Debug pricing extraction for a specific URL."""

    config = Config()
    scraper = UKCateringScraper(config)

    site_name = detect_site_from_url(url)

    print(f"Debugging URL: {url}")
    print(f"Detected site: {site_name}")
    print("-" * 60)

    if site_name == 'unknown':
        print("❌ Unknown site - cannot process")
        return

    try:
        # Fetch the page content
        print("🌐 Fetching page content...")
        html_content = await scraper._fetch_page(url)

        if not html_content:
            print("❌ Failed to fetch page content")
            return

        print("✅ Page content fetched successfully")

        # Parse with BeautifulSoup
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')

        # Debug special pricing detection
        print("\n🔍 Looking for special offer prices...")
        special_prices = scraper._find_special_offer_prices(soup, site_name)

        if special_prices:
            print(f"✅ Found {len(special_prices)} special offer prices:")
            for price, selector in special_prices:
                print(f"  £{price} (found with: {selector})")

            best_special_price = min(price for price, _ in special_prices)
            print(f"🎯 Best special offer price: £{best_special_price}")
        else:
            print("❌ No special offer prices found")

        # Test the main extraction method
        print(f"\n🔍 Testing {site_name} extraction method...")

        if site_name == 'jjfoodservice':
            result = scraper._extract_jjfoodservice_data(soup)
        elif site_name == 'atoz_catering':
            result = scraper._extract_atoz_catering_data(soup)
        elif site_name == 'amazon_uk':
            result = scraper._extract_amazon_uk_data(soup)

        print(f"✅ Extraction result:")
        print(f"  Price: £{result['price']}" if result['price'] else "  Price: Not found")
        print(f"  Title: {result.get('title', 'Not found')}")
        print(f"  Available: {result.get('availability', 'Unknown')}")
        print(f"  Currency: {result.get('currency', 'Unknown')}")

        # If verbose, show more debugging info
        if verbose:
            print(f"\n🔍 Verbose debugging for {site_name}...")

            # Get site selectors from config
            site_config = config.get_site_config(site_name)
            if site_config and 'selectors' in site_config:
                selectors = site_config['selectors']

                # Test each selector type
                for selector_type, selector_list in selectors.items():
                    print(f"\n  Testing {selector_type} selectors:")

                    for selector in selector_list:
                        try:
                            elements = soup.select(selector)
                            if elements:
                                print(f"    ✅ {selector} -> Found {len(elements)} elements")
                                for i, elem in enumerate(elements[:3]):  # Show first 3
                                    text = elem.get_text(strip=True)[:100]  # Truncate long text
                                    print(f"      [{i+1}] {text}")
                            else:
                                print(f"    ❌ {selector} -> No elements found")
                        except Exception as e:
                            print(f"    ⚠️ {selector} -> Error: {e}")

        # Test the full scraping method
        print(f"\n🔍 Testing full scrape_product_price method...")
        full_result = await scraper.scrape_product_price(url, site_name)

        print("✅ Full scraping result:")
        print(f"  Success: {full_result['success']}")
        print(f"  Price: £{full_result['price']}" if full_result['price'] else "  Price: Not found")
        print(f"  Error: {full_result.get('error', 'None')}")

    except Exception as e:
        print(f"❌ Error during debugging: {e}")
        if verbose:
            import traceback
            traceback.print_exc()


def main():
    """Main function to run the debug tool."""

    parser = argparse.ArgumentParser(description='Debug special pricing detection for UK price tracker')
    parser.add_argument('url', help='URL to debug')
    parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose output')
    parser.add_argument('--test-selectors', action='store_true', help='Test all selectors from config')

    args = parser.parse_args()

    print("UK Price Tracker - Special Pricing Debug Tool")
    print("=" * 60)

    # Run the debugging
    asyncio.run(debug_url_pricing(args.url, args.verbose))


if __name__ == "__main__":
    main()

purge_database.py (new file, 64 lines)
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""
Simple script to purge all price data from the database
This will reset the database so the next scrape acts as the first one
"""

import sqlite3
import os
from src.config import Config

def purge_database():
    """Purge all data from the price tracker database."""
    config = Config()
    db_path = config.database_path

    if not os.path.exists(db_path):
        print(f"Database file {db_path} does not exist. Nothing to purge.")
        return

    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Get all table names
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()

        if not tables:
            print("No tables found in database.")
            conn.close()
            return

        print(f"Found {len(tables)} tables in database:")
        for table in tables:
            table_name = table[0]
            cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
            count = cursor.fetchone()[0]
            print(f"  - {table_name}: {count} records")

        # Confirm purge
        response = input("\nDo you want to purge all data? (yes/no): ").lower().strip()

        if response in ['yes', 'y']:
            # Delete all data from all tables
            for table in tables:
                table_name = table[0]
                cursor.execute(f"DELETE FROM {table_name}")
                print(f"Purged all data from {table_name}")

            conn.commit()
            print("\n✅ Database purged successfully!")
            print("The next scrape will act as the first one and log all prices.")
        else:
            print("Purge cancelled.")

        conn.close()

    except sqlite3.Error as e:
        print(f"Database error: {e}")
    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    purge_database()

simple_test.py (new file, 133 lines)
@@ -0,0 +1,133 @@
#!/usr/bin/env python3
"""
Simple test for special pricing functionality
"""

import sys
import os

# Add the src directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

def test_imports():
    """Test that all modules can be imported."""
    try:
        print("Testing imports...")

        # Basic imports
        import re
        import logging
        from typing import Dict, Any, Optional, List, Tuple
        print("✓ Basic Python modules imported")

        # Third-party imports
        from bs4 import BeautifulSoup, Tag
        print("✓ BeautifulSoup imported")

        # Local imports
        from config import Config
        print("✓ Config imported")

        from scraper import PriceScraper
        print("✓ PriceScraper imported")

        from uk_scraper import UKCateringScraper
        print("✓ UKCateringScraper imported")

        return True

    except Exception as e:
        print(f"✗ Import error: {e}")
        import traceback
        traceback.print_exc()
        return False

def test_basic_functionality():
    """Test basic functionality of the special pricing."""
    try:
        from config import Config
        from uk_scraper import UKCateringScraper

        print("\nTesting basic functionality...")

        # Create config and scraper
        config = Config()
        scraper = UKCateringScraper(config)
        print("✓ Scraper created successfully")

        # Test price parsing
        test_price = scraper._parse_uk_price("£12.99")
        if test_price == 12.99:
            print("✓ Basic price parsing works")
        else:
            print(f"✗ Price parsing failed: got {test_price}, expected 12.99")

        # Test special pricing
        special_price = scraper._parse_uk_price("Was £20.00 Now £15.99", detect_special_offers=True)
        if special_price == 15.99:
            print("✓ Special price parsing works")
        else:
            print(f"✗ Special price parsing failed: got {special_price}, expected 15.99")

        return True

    except Exception as e:
        print(f"✗ Functionality error: {e}")
        import traceback
        traceback.print_exc()
        return False

def test_html_parsing():
    """Test HTML parsing for special pricing."""
    try:
        from bs4 import BeautifulSoup
        from uk_scraper import UKCateringScraper
        from config import Config

        print("\nTesting HTML parsing...")

        config = Config()
        scraper = UKCateringScraper(config)

        # Test strikethrough detection
        html = '<div><del>£20.00</del><span>£15.99</span></div>'
        soup = BeautifulSoup(html, 'html.parser')

        special_prices = scraper._find_special_offer_prices(soup, 'atoz_catering')
        if special_prices:
            print(f"✓ Special offer detection works: found {len(special_prices)} prices")
        else:
            print("✗ Special offer detection failed")

        return True

    except Exception as e:
        print(f"✗ HTML parsing error: {e}")
        import traceback
        traceback.print_exc()
        return False

if __name__ == "__main__":
    print("Simple Special Pricing Test")
    print("=" * 40)

    success = True

    # Test imports
    if not test_imports():
        success = False

    # Test basic functionality
    if not test_basic_functionality():
        success = False

    # Test HTML parsing
    if not test_html_parsing():
        success = False

    print("\n" + "=" * 40)
    if success:
        print("✅ All tests passed!")
    else:
        print("❌ Some tests failed!")
        sys.exit(1)

src/database.py
@@ -147,6 +147,15 @@ class DatabaseManager:
             UPDATE products SET active = 0, updated_at = ? WHERE id = ?
         ''', (datetime.now(), product_id))
 
+    def delete_product(self, product_id: int):
+        """Delete a product and all its associated price history."""
+        with sqlite3.connect(self.db_path) as conn:
+            # Delete price history first (due to foreign key constraints)
+            conn.execute('DELETE FROM price_history WHERE product_id = ?', (product_id,))
+
+            # Delete the product
+            conn.execute('DELETE FROM products WHERE id = ?', (product_id,))
+
     def save_price_history(self, product_id: int, site_name: str, price: float,
                            currency: str = 'GBP', availability: bool = True,
                            timestamp: datetime = None):

src/scraper.py
@@ -169,13 +169,21 @@ class PriceScraper:
         """Detect which site this URL belongs to."""
         domain = urlparse(url).netloc.lower()
 
-        if 'amazon' in domain:
+        # UK Catering sites (handled by UKCateringScraper)
+        if 'jjfoodservice.com' in domain:
+            return 'jjfoodservice'
+        elif 'atoz-catering.co.uk' in domain:
+            return 'atoz_catering'
+        elif 'amazon.co.uk' in domain:
+            return 'amazon_uk'
+
+        # International sites (handled by base PriceScraper)
+        elif 'amazon.com' in domain or 'amazon.' in domain:
             return 'amazon'
         elif 'ebay' in domain:
             return 'ebay'
         elif 'walmart' in domain:
             return 'walmart'
         # Add more site detection logic here
 
         return None
@@ -267,6 +275,17 @@ class PriceScraper:
             return False
 
         return True
 
+    def should_use_uk_scraper(self, url: str) -> bool:
+        """Determine if this URL should use the UK catering scraper."""
+        site_name = self._detect_site(url)
+        uk_sites = {'jjfoodservice', 'atoz_catering', 'amazon_uk'}
+        return site_name in uk_sites
+
+    @classmethod
+    def get_uk_catering_sites(cls) -> set:
+        """Get the list of UK catering sites."""
+        return {'jjfoodservice', 'atoz_catering', 'amazon_uk'}
+
 
 class ScraperManager:

src/scraper_manager.py
@@ -17,6 +17,7 @@ class ScraperManager(BaseScraper):
     def __init__(self, config):
         super().__init__(config)
         self.active_tasks = {}
+        self.semaphore = asyncio.Semaphore(config.max_concurrent_requests)
 
     async def scrape_product_by_id(self, product_id: int, product_data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
         """Scrape a specific product by ID with task tracking."""
@@ -36,6 +37,79 @@ class ScraperManager(BaseScraper):
         if product_id in self.active_tasks:
             del self.active_tasks[product_id]
 
+    async def scrape_product(self, product: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
+        """Scrape prices for a single product across all configured sites."""
+        product_id = product['id']
+        urls = product['urls']
+
+        results = {}
+
+        # Check if this product has UK catering sites
+        uk_catering_sites = {'jjfoodservice', 'atoz_catering', 'amazon_uk'}
+        has_uk_sites = any(site in uk_catering_sites for site in urls.keys())
+
+        if has_uk_sites:
+            # Use UK-specific scraper
+            async with UKCateringScraper(self.config) as scraper:
+                tasks = []
+
+                for site_name, url in urls.items():
+                    if self.config.is_site_enabled(site_name):
+                        task = self._scrape_with_semaphore_uk(scraper, url, site_name)
+                        tasks.append((site_name, task))
+
+                        # Add delay between requests
+                        await asyncio.sleep(self.config.delay_between_requests)
+
+                # Wait for all tasks to complete
+                for site_name, task in tasks:
+                    try:
+                        result = await task
+                        results[site_name] = result
+                    except Exception as e:
+                        logger.error(f"Error scraping {site_name} for product {product_id}: {e}")
+                        results[site_name] = {
+                            'success': False,
+                            'error': str(e)
+                        }
+        else:
+            # Use generic scraper for non-UK sites
+            from .scraper import PriceScraper
+            async with PriceScraper(self.config) as scraper:
+                tasks = []
+
+                for site_name, url in urls.items():
+                    if self.config.is_site_enabled(site_name):
+                        task = self._scrape_with_semaphore(scraper, url, site_name)
+                        tasks.append((site_name, task))
+
+                        # Add delay between requests
+                        await asyncio.sleep(self.config.delay_between_requests)
+
+                # Wait for all tasks to complete
+                for site_name, task in tasks:
+                    try:
+                        result = await task
+                        results[site_name] = result
+                    except Exception as e:
+                        logger.error(f"Error scraping {site_name} for product {product_id}: {e}")
+                        results[site_name] = {
+                            'success': False,
+                            'error': str(e)
+                        }
+
+        return results
+
+    async def _scrape_with_semaphore_uk(self, scraper: UKCateringScraper, url: str, site_name: str):
+        """Scrape with semaphore using UK scraper."""
+        async with self.semaphore:
+            return await scraper.scrape_product_price(url, site_name)
+
+    async def _scrape_with_semaphore(self, scraper, url: str, site_name: str):
+        """Scrape with semaphore using generic scraper."""
+        async with self.semaphore:
+            return await scraper.scrape_product_price(url, site_name)
+
     async def cancel_product_scraping(self, product_id: int) -> bool:
         """Cancel scraping for a specific product."""
         if product_id in self.active_tasks:

src/uk_scraper.py
@@ -4,8 +4,8 @@ Specialized scrapers for UK catering supply sites
 
 import re
 import logging
-from typing import Dict, Any, Optional
-from bs4 import BeautifulSoup
+from typing import Dict, Any, Optional, List, Tuple
+from bs4 import BeautifulSoup, Tag
 from .scraper import PriceScraper
 
 logger = logging.getLogger(__name__)
@@ -14,35 +14,153 @@ logger = logging.getLogger(__name__)
 class UKCateringScraper(PriceScraper):
     """Specialized scraper for UK catering supply websites."""
 
-    def _parse_uk_price(self, price_text: str) -> Optional[float]:
-        """Parse UK price format with £ symbol."""
+    def _extract_special_pricing_context(self, element: Tag) -> Dict[str, Any]:
+        """Extract special pricing context from an element and its surroundings."""
+        context = {
+            'has_strikethrough': False,
+            'has_offer_label': False,
+            'has_was_now': False,
+            'prices': [],
+            'price_types': []
+        }
+
+        # Get parent elements to check for special pricing context
+        parents = [element] + [p for p in element.parents if p.name][:3]  # Check up to 3 levels up
+
+        for parent in parents:
+            parent_text = parent.get_text().lower() if parent else ""
+
+            # Check for strikethrough pricing
+            strikethrough_elements = parent.find_all(['del', 's', 'strike']) if parent else []
+            if strikethrough_elements:
+                context['has_strikethrough'] = True
+                for strike_elem in strikethrough_elements:
+                    strike_price = self._parse_uk_price(strike_elem.get_text())
+                    if strike_price:
+                        context['prices'].append(strike_price)
+                        context['price_types'].append('was_price')
+
+            # Check for offer/sale/discount labels
+            offer_patterns = [
+                r'\bsale\b', r'\boffer\b', r'\bdeal\b', r'\bdiscount\b',
+                r'\bspecial\b', r'\bpromo\b', r'\breduced\b', r'\bsave\b',
+                r'\bwas\s*£', r'\bnow\s*£', r'\b\d+%\s*off\b'
+            ]
+
+            for pattern in offer_patterns:
+                if re.search(pattern, parent_text):
+                    context['has_offer_label'] = True
+                    break
+
+            # Look for "was/now" pricing patterns
+            was_now_match = re.search(r'was\s*£([\d.]+).*?now\s*£([\d.]+)', parent_text, re.IGNORECASE)
+            if was_now_match:
+                context['has_was_now'] = True
+                was_price = float(was_now_match.group(1))
+                now_price = float(was_now_match.group(2))
+                context['prices'].extend([was_price, now_price])
+                context['price_types'].extend(['was_price', 'now_price'])
+
+        return context
+
+    def _parse_uk_price(self, price_text: str, prefer_delivery: bool = False) -> Optional[float]:
+        """Simple, conservative UK price parsing - just extract the first reasonable price."""
         if not price_text:
             return None
 
         # Remove common text and normalize
         price_text = price_text.lower()
         price_text = re.sub(r'delivery:|collection:|was:|now:|offer:|from:', '', price_text)
+        # Skip very long text blocks that are unlikely to contain just prices
+        if len(price_text) > 100:
+            return None
+
+        # Check if this is delivery or collection pricing
+        is_delivery = 'delivery' in price_text.lower()
+        is_collection = 'collection' in price_text.lower()
+
+        # If we prefer delivery and this is explicitly collection, skip it
+        if prefer_delivery and is_collection and not is_delivery:
+            return None
+
+        # Simple regex to find prices - be very specific
+        price_match = re.search(r'£(\d{1,3}(?:\.\d{2})?)', price_text)
 
-        # Find price with £ symbol
-        price_match = re.search(r'£(\d+\.?\d*)', price_text)
         if price_match:
             try:
                 return float(price_match.group(1))
             except ValueError:
                 pass
 
         # Try without £ symbol but with decimal
         price_match = re.search(r'(\d+\.\d{2})', price_text)
         if price_match:
             try:
-                return float(price_match.group(1))
+                price_val = float(price_match.group(1))
+                # Only accept reasonable food product prices
+                if 2.0 <= price_val <= 100.0:
+                    return price_val
             except ValueError:
                 pass
 
         return None
 
+    def _find_special_offer_prices(self, soup: BeautifulSoup, site_name: str) -> List[Tuple[float, str]]:
+        """Find special offer prices using enhanced selectors."""
+        special_prices = []
+
+        # Enhanced selectors for special offers
+        special_offer_selectors = [
+            # General special offer containers
+            '.special-offer', '.sale-price', '.offer-price', '.discount-price',
+            '.promo-price', '.reduced-price', '.deal-price',
+
+            # Strikethrough and comparison pricing
+            'del:contains("£"), s:contains("£"), strike:contains("£")',
+            '.was-price', '.original-price', '.rrp-price',
+
+            # Was/Now pricing containers
+            '.was-now-pricing', '.price-comparison', '.before-after-price',
+
+            # Sale badges and labels
+            '.sale-badge', '.offer-badge', '.discount-badge',
+            '*[class*="sale"]:contains("£")',
+            '*[class*="offer"]:contains("£")',
+            '*[class*="discount"]:contains("£")',
+
+            # Site-specific patterns
+            '.product-price-wrapper', '.price-container', '.pricing-section'
+        ]
+
+        if site_name == 'atoz_catering':
+            # A to Z specific selectors - prioritize the offer price class
+            special_offer_selectors.extend([
+                '.my-price.price-offer',  # Primary A to Z offer price selector
+                'h3:contains("£")', 'h4:contains("£")',
+                '.delivery-price-special', '.collection-price-special',
+                '*[style*="text-decoration: line-through"]',
+                '*[style*="text-decoration:line-through"]'
+            ])
+        elif site_name == 'jjfoodservice':
+            # JJ Food Service specific selectors
+            special_offer_selectors.extend([
+                '.member-price', '.trade-price', '.bulk-price',
+                '.quantity-discount', '.volume-discount'
+            ])
+        elif site_name == 'amazon_uk':
+            # Amazon UK specific selectors
+            special_offer_selectors.extend([
+                '.a-price.a-text-price.a-size-medium.apexPriceToPay .a-offscreen',
+                '.a-price-strike .a-offscreen',
+                '#priceblock_dealprice', '#priceblock_saleprice',
+                '.a-price-was', '.a-price-save'
+            ])
+
+        for selector in special_offer_selectors:
+            try:
+                elements = soup.select(selector)
+                for element in elements:
+                    price_text = element.get_text(strip=True)
+                    if '£' in price_text:
+                        price = self._parse_uk_price(price_text, detect_special_offers=True, element=element)
+                        if price:
+                            special_prices.append((price, selector))
+            except Exception as e:
+                logger.debug(f"Error with special offer selector {selector}: {e}")
+
+        return special_prices
+
     def _extract_jjfoodservice_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
-        """Extract data specifically from JJ Food Service."""
+        """Extract data specifically from JJ Food Service - simplified approach."""
         result = {
             'price': None,
             'title': None,
@@ -50,43 +168,85 @@ class UKCateringScraper(PriceScraper):
|
||||
'currency': 'GBP'
|
||||
}
|
||||
|
||||
# Try multiple selectors for price
|
||||
price_selectors = [
|
||||
'.price',
|
||||
'.product-price',
|
||||
'[data-testid="price"]',
|
||||
'.price-value',
|
||||
'.current-price',
|
||||
'.product-card-price',
|
||||
'span:contains("£")',
|
||||
'.cost'
|
||||
# First, try to find elements with Price in class name and extract delivery price
|
||||
price_elements = soup.select('[class*="Price"]')
|
||||
logger.debug(f"JJ Food Service: Found {len(price_elements)} price elements")
|
||||
|
||||
for element in price_elements:
|
||||
text = element.get_text(strip=True)
|
||||
logger.debug(f"JJ Food Service: Checking price element text: '{text[:100]}'")
|
||||
|
||||
# Look for delivery price in concatenated strings like "Collection:£10.49£4.62 per kgDelivery:£11.79£5.19 per kg"
|
||||
delivery_match = re.search(r'Delivery:£(\d{1,3}\.\d{2})', text, re.IGNORECASE)
|
||||
if delivery_match:
|
||||
price_val = float(delivery_match.group(1))
|
||||
result['price'] = price_val
|
||||
logger.info(f"JJ Food Service: Found delivery price £{price_val} in price element")
|
||||
# extract title
|
||||
title_el = soup.select_one('h1')
|
||||
if title_el:
|
||||
result['title'] = title_el.get_text(strip=True)
|
||||
return result
|
||||
|
||||
# Second, attempt regex-based parsing of delivery price from raw page text
|
||||
page_text = soup.get_text(separator=' ')
|
||||
logger.debug(f"JJ Food Service page_text snippet: {page_text[:500]!r}")
|
||||
|
||||
# Look for delivery price patterns in the text
|
||||
if 'DELIVERY' in page_text or 'delivery' in page_text:
|
||||
logger.debug(f"Found 'DELIVERY' in page text, looking for price patterns...")
|
||||
delivery_section = page_text[page_text.lower().find('delivery'):page_text.lower().find('delivery')+100]
|
||||
logger.debug(f"Delivery section: {delivery_section!r}")
|
||||
|
||||
# Try multiple patterns for delivery price (based on actual HTML structure)
|
||||
delivery_patterns = [
|
||||
r'Delivery:£(\d{1,3}\.\d{2})', # Delivery:£11.79 (actual format found)
|
||||
r'DELIVERY:£(\d{1,3}\.\d{2})', # DELIVERY:£11.79
|
||||
r'delivery:£(\d{1,3}\.\d{2})', # delivery:£11.79
|
||||
r'DELIVERY:\s*£(\d{1,3}\.\d{2})', # DELIVERY: £11.79 (with space)
|
||||
r'delivery:\s*£(\d{1,3}\.\d{2})', # delivery: £11.79 (with space)
|
||||
]
|
||||
|
||||
for selector in price_selectors:
|
||||
for pattern in delivery_patterns:
|
||||
logger.debug(f"JJ Food Service: Trying pattern: {pattern}")
|
||||
delivery_match = re.search(pattern, page_text, re.IGNORECASE)
|
||||
if delivery_match:
|
||||
price_val = float(delivery_match.group(1))
|
||||
result['price'] = price_val
|
||||
logger.info(f"JJ Food Service: Parsed delivery price £{price_val} via regex pattern: {pattern}")
|
||||
# extract title
|
||||
title_el = soup.select_one('h1')
|
||||
if title_el:
|
||||
result['title'] = title_el.get_text(strip=True)
|
||||
return result
|
||||
else:
|
||||
logger.debug(f"JJ Food Service: Pattern {pattern} did not match")
|
||||
# Otherwise, try very specific selectors first - likely to contain prices
|
||||
specific_selectors = [
|
||||
'.price-delivery', # Delivery price specifically
|
||||
'.delivery-price', # Alternative delivery price
|
||||
'.price', # General price class
|
||||
]
|
||||
|
||||
for selector in specific_selectors:
|
||||
try:
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
price_text = element.get_text(strip=True)
|
||||
price = self._parse_uk_price(price_text)
|
||||
if price is not None:
|
||||
result['price'] = price
|
||||
logger.info(f"Successfully scraped jjfoodservice: £{price}")
|
||||
break
|
||||
# Only process short text snippets that likely contain just prices
|
||||
if '£' in price_text and len(price_text) < 30:
|
||||
price = self._parse_uk_price(price_text, prefer_delivery=True)
|
||||
if price is not None:
|
||||
result['price'] = price
|
||||
logger.info(f"JJ Food Service: Found price £{price} with selector '{selector}' from text: '{price_text}'")
|
||||
break
|
||||
if result['price'] is not None:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.debug(f"Error with JJ Food Service price selector {selector}: {e}")
|
||||
|
||||
# Try to extract title
|
||||
title_selectors = [
|
||||
'h1',
|
||||
'.product-title',
|
||||
'.product-name',
|
||||
'[data-testid="product-title"]',
|
||||
'.product-card-title',
|
||||
'title'
|
||||
]
|
||||
logger.debug(f"Error with JJ Food Service selector {selector}: {e}")
|
||||
|
||||
# Extract title
|
||||
title_selectors = ['h1', '.product-title', '.product-name']
|
||||
for selector in title_selectors:
|
||||
try:
|
||||
element = soup.select_one(selector)
|
||||
@@ -96,61 +256,65 @@ class UKCateringScraper(PriceScraper):
|
||||
except Exception as e:
|
||||
logger.debug(f"Error with JJ Food Service title selector {selector}: {e}")
|
||||
|
||||
# Check availability
|
||||
availability_indicators = [
|
||||
'out of stock',
|
||||
'unavailable',
|
||||
'not available',
|
||||
'temporarily unavailable'
|
||||
]
|
||||
|
||||
page_text = soup.get_text().lower()
|
||||
for indicator in availability_indicators:
|
||||
if indicator in page_text:
|
||||
result['availability'] = False
|
||||
break
|
||||
|
||||
return result
|
||||
|
||||
    def _extract_atoz_catering_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from A to Z Catering - prioritize delivery pricing using regex parse."""
        result = {'price': None, 'title': None, 'availability': True, 'currency': 'GBP'}

        # First, attempt to parse delivery price directly from page text
        page_text = soup.get_text(separator=' ')
        delivery_match = re.search(r'Delivery:\s*£(\d{1,3}\.\d{2})', page_text)
        if delivery_match:
            price_val = float(delivery_match.group(1))
            result['price'] = price_val
            logger.info(f"A to Z Catering: Parsed delivery price £{price_val} via regex")
            # Extract title
            title_el = soup.select_one('h1')
            if title_el:
                result['title'] = title_el.get_text(strip=True)
            return result

        # 1) Delivery-specific selectors
        for selector in ['.delivery-price', '.price-delivery']:
            try:
                elements = soup.select(selector)
                for element in elements:
                    text = element.get_text(strip=True)
                    price = self._parse_uk_price(text, prefer_delivery=True)
                    if price is not None:
                        result['price'] = price
                        logger.info(f"A to Z Catering: Found delivery price £{price} from {selector}")
                        return result
            except Exception as e:
                logger.debug(f"Error with A to Z delivery selector {selector}: {e}")

        # 2) Main offer selector (fallback to collection price)
        for selector in ['.my-price.price-offer']:
            try:
                elements = soup.select(selector)
                for element in elements:
                    text = element.get_text(strip=True)
                    price = self._parse_uk_price(text)
                    if price is not None:
                        result['price'] = price
                        logger.info(f"A to Z Catering: Found collection price £{price} from {selector}")
                        return result
            except Exception as e:
                logger.debug(f"Error with A to Z main selector {selector}: {e}")

        # 3) Fallback general selectors
        for selector in ['.price', '.product-price']:
            try:
                elements = soup.select(selector)
                for element in elements:
                    text = element.get_text(strip=True)
                    price = self._parse_uk_price(text)
                    if price is not None:
                        result['price'] = price
                        logger.info(f"A to Z Catering: Fallback parsed price £{price} from {selector}")
                        return result
            except Exception as e:
                logger.debug(f"Error with A to Z fallback selector {selector}: {e}")
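
A quick illustration of the regex-first path above (an editor's standalone sketch, not part of the diff; the sample page text is invented for demonstration):

# Sketch: how the Delivery regex behaves on typical A to Z page text
import re

sample_text = "BAC002 Streaky Bacon Delivery: £11.79 Collection: £10.49 Add To Basket"
match = re.search(r'Delivery:\s*£(\d{1,3}\.\d{2})', sample_text)
if match:
    print(float(match.group(1)))  # 11.79 - delivery price wins even though collection is cheaper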

        # Extract title
        title_selectors = [
@@ -197,7 +361,7 @@ class UKCateringScraper(PriceScraper):
        return result

    def _extract_amazon_uk_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from Amazon UK with enhanced special pricing detection."""
        result = {
            'price': None,
            'title': None,
@@ -205,6 +369,15 @@ class UKCateringScraper(PriceScraper):
            'currency': 'GBP'
        }

        # First, check for special offer prices using enhanced detection
        special_prices = self._find_special_offer_prices(soup, 'amazon_uk')
        if special_prices:
            # Use the lowest special offer price found
            best_special_price = min(price for price, _ in special_prices)
            result['price'] = best_special_price
            logger.info(f"Successfully scraped amazon_uk special offer price: £{best_special_price}")
            return result

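For context, the selection rule above reduces a list of (price, context) tuples to the single lowest price; a minimal standalone sketch (the tuple values here are made up, and the context strings are illustrative only):

# Sketch: picking the best special-offer price from (price, context) tuples
special_prices = [(24.99, 'strikethrough'), (22.50, 'was_now')]
best_special_price = min(price for price, _ in special_prices)
print(best_special_price)  # 22.5
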
        # Amazon UK price selectors
        price_selectors = [
            '.a-price-whole',
@@ -222,7 +395,7 @@ class UKCateringScraper(PriceScraper):
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_uk_price(price_text, detect_special_offers=True, element=element)
                    if price is not None:
                        result['price'] = price
                        break
@@ -269,6 +442,122 @@ class UKCateringScraper(PriceScraper):

        return result

    def _extract_generic_data(self, soup: BeautifulSoup, site_name: str) -> Dict[str, Any]:
        """Generic data extraction for UK sites not specifically implemented."""
        result = {
            'price': None,
            'title': None,
            'availability': True,
            'currency': 'GBP'
        }

        # Generic price selectors
        price_selectors = [
            '.price',
            '.product-price',
            '[data-testid="price"]',
            '.price-value',
            '.current-price',
            'span:contains("£")',
            '.cost',
            '.selling-price'
        ]

        for selector in price_selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_uk_price(price_text)
                    if price is not None:
                        result['price'] = price
                        logger.info(f"Successfully scraped {site_name} generic price: £{price}")
                        break
                if result['price'] is not None:
                    break
            except Exception as e:
                logger.debug(f"Error with generic price selector {selector}: {e}")

        # Generic title selectors
        title_selectors = [
            'h1',
            '.product-title',
            '.product-name',
            '[data-testid="product-title"]',
            'title'
        ]

        for selector in title_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    result['title'] = element.get_text(strip=True)
                    break
            except Exception as e:
                logger.debug(f"Error with generic title selector {selector}: {e}")

        return result

    async def scrape_product_price(self, url: str, site_name: str = None) -> Dict[str, Any]:
        """Scrape price for a single product from a URL using UK-specific logic."""
        result = {
            'success': False,
            'price': None,
            'currency': 'GBP',
            'title': None,
            'availability': None,
            'url': url,
            'error': None
        }

        try:
            # Validate that this is a supported UK site
            if site_name not in ['jjfoodservice', 'atoz_catering', 'amazon_uk']:
                result['error'] = f"Unsupported site for UK scraper: {site_name}"
                return result

            # Check if site is enabled
            if not self.config.is_site_enabled(site_name):
                result['error'] = f"Site {site_name} is disabled"
                return result

            # Fetch page content
            html_content = await self._fetch_page(url)
            if not html_content:
                result['error'] = "Failed to fetch page content"
                return result

            # Parse HTML
            soup = BeautifulSoup(html_content, 'html.parser')

            # Route to appropriate extraction method
            if site_name == 'jjfoodservice':
                extracted_data = self._extract_jjfoodservice_data(soup)
            elif site_name == 'atoz_catering':
                extracted_data = self._extract_atoz_catering_data(soup)
            elif site_name == 'amazon_uk':
                extracted_data = self._extract_amazon_uk_data(soup)
            else:
                # Fallback to generic extraction
                extracted_data = self._extract_generic_data(soup, site_name)

            if extracted_data['price'] is not None:
                result.update({
                    'success': True,
                    'price': extracted_data['price'],
                    'title': extracted_data.get('title'),
                    'availability': extracted_data.get('availability')
                })
                logger.info(f"Successfully scraped {site_name}: £{extracted_data['price']}")
            else:
                result['error'] = "Could not extract price from page"

        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")
            result['error'] = str(e)

        return result

    async def scrape_product(self, product_data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
        """Scrape prices for a product from all configured sites."""
        results = {}

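A minimal usage sketch for the routing method above, assuming the project's Config class and that UKCateringScraper is used as an async context manager (as the test scripts later in this commit do):

# Sketch: calling the UK scraper end to end
import asyncio
from src.uk_scraper import UKCateringScraper
from src.config import Config

async def main():
    async with UKCateringScraper(Config()) as scraper:
        result = await scraper.scrape_product_price(
            "https://www.jjfoodservice.com/product/London-Enfield/BAC002/",
            site_name="jjfoodservice",
        )
        print(result["success"], result["price"], result["error"])

asyncio.run(main())
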
@@ -1,515 +0,0 @@
"""
Specialized scrapers for UK catering supply sites
"""

import re
import logging
from typing import Dict, Any, Optional
from bs4 import BeautifulSoup
from .scraper import PriceScraper

logger = logging.getLogger(__name__)


class UKCateringScraper(PriceScraper):
    """Specialized scraper for UK catering supply websites."""

    def _parse_uk_price(self, price_text: str) -> Optional[float]:
        """Parse UK price format with £ symbol."""
        if not price_text:
            return None

        # Remove common text and normalize
        price_text = price_text.lower()
        price_text = re.sub(r'delivery:|collection:|was:|now:|offer:|from:', '', price_text)

        # Find price with £ symbol
        price_match = re.search(r'£(\d+\.?\d*)', price_text)
        if price_match:
            try:
                return float(price_match.group(1))
            except ValueError:
                pass

        # Try without £ symbol but with decimal
        price_match = re.search(r'(\d+\.\d{2})', price_text)
        if price_match:
            try:
                return float(price_match.group(1))
            except ValueError:
                pass

        return None

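Worth noting why this now-deleted parser needed the fix: it strips the delivery/collection labels before matching, so whichever price appears first in the text wins. A small sketch of that behaviour (sample strings invented):

# Sketch: the old label-stripping approach is order-dependent
import re

def old_parse(price_text):
    price_text = re.sub(r'delivery:|collection:|was:|now:|offer:|from:', '', price_text.lower())
    match = re.search(r'£(\d+\.?\d*)', price_text)
    return float(match.group(1)) if match else None

print(old_parse("Delivery:£11.79 Collection:£10.49"))  # 11.79
print(old_parse("Collection:£10.49 Delivery:£11.79"))  # 10.49 - same page data, different answer
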
    def _extract_jjfoodservice_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from JJ Food Service."""
        result = {
            'price': None,
            'title': None,
            'availability': True,
            'currency': 'GBP'
        }

        # Try multiple selectors for price
        price_selectors = [
            '.price',
            '.product-price',
            '[data-testid="price"]',
            '.price-value',
            '.current-price',
            '.product-card-price',
            'span:contains("£")',
            '.cost'
        ]

        for selector in price_selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_uk_price(price_text)
                    if price is not None:
                        result['price'] = price
                        break
                if result['price'] is not None:
                    break
            except Exception as e:
                logger.debug(f"Error with JJ Food Service price selector {selector}: {e}")

        # Try to extract title
        title_selectors = [
            'h1',
            '.product-title',
            '.product-name',
            '[data-testid="product-title"]',
            '.product-card-title',
            'title'
        ]

        for selector in title_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    result['title'] = element.get_text(strip=True)
                    break
            except Exception as e:
                logger.debug(f"Error with JJ Food Service title selector {selector}: {e}")

        # Check availability
        availability_indicators = [
            'out of stock',
            'unavailable',
            'not available',
            'sold out'
        ]

        page_text = soup.get_text().lower()
        for indicator in availability_indicators:
            if indicator in page_text:
                result['availability'] = False
                break

        return result

    def _extract_atoz_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from A to Z Catering."""
        result = {
            'price': None,
            'title': None,
            'availability': True,
            'currency': 'GBP'
        }

        # A to Z Catering shows prices like "Delivery:£X.XX Collection:£Y.YY"
        # We'll prioritize the lower price (usually collection)

        price_text = soup.get_text()

        # Look for delivery and collection prices
        delivery_match = re.search(r'delivery:?\s*£(\d+\.?\d*)', price_text, re.IGNORECASE)
        collection_match = re.search(r'collection:?\s*£(\d+\.?\d*)', price_text, re.IGNORECASE)

        prices = []
        if delivery_match:
            try:
                prices.append(float(delivery_match.group(1)))
            except ValueError:
                pass

        if collection_match:
            try:
                prices.append(float(collection_match.group(1)))
            except ValueError:
                pass

        # If we found prices, use the lowest one
        if prices:
            result['price'] = min(prices)
        else:
            # Fallback to general price extraction
            price_selectors = [
                '.price',
                '.product-price',
                'span:contains("£")',
                '.price-value'
            ]

            for selector in price_selectors:
                try:
                    elements = soup.select(selector)
                    for element in elements:
                        price_text = element.get_text(strip=True)
                        price = self._parse_uk_price(price_text)
                        if price is not None:
                            result['price'] = price
                            break
                    if result['price'] is not None:
                        break
                except Exception as e:
                    logger.debug(f"Error with A to Z price selector {selector}: {e}")

        # Extract title - A to Z often has product names in links
        title_selectors = [
            'h1',
            '.product-title',
            '.product-name',
            'a[href*="/products/product/"]',
            '.product-link',
            'title'
        ]

        for selector in title_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    title = element.get_text(strip=True)
                    # Clean up the title
                    if len(title) > 5 and 'A to Z' not in title:
                        result['title'] = title
                        break
            except Exception as e:
                logger.debug(f"Error with A to Z title selector {selector}: {e}")

        # Check availability - look for "Add To Basket" button
        add_to_basket = soup.find(text=re.compile('Add To Basket', re.IGNORECASE))
        if not add_to_basket:
            # Also check for out of stock indicators
            out_of_stock_indicators = [
                'out of stock',
                'unavailable',
                'not available',
                'sold out'
            ]

            page_text = soup.get_text().lower()
            for indicator in out_of_stock_indicators:
                if indicator in page_text:
                    result['availability'] = False
                    break

        return result

    def _extract_amazon_uk_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from Amazon UK."""
        result = {
            'price': None,
            'title': None,
            'availability': True,
            'currency': 'GBP'
        }

        # Amazon UK price selectors
        price_selectors = [
            '.a-price-whole',
            '.a-price .a-offscreen',
            '.a-price-current .a-offscreen',
            '#priceblock_dealprice',
            '#priceblock_ourprice',
            '.a-price-range',
            '.a-price.a-text-price.a-size-medium.apexPriceToPay .a-offscreen'
        ]

        for selector in price_selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_uk_price(price_text)
                    if price is not None:
                        result['price'] = price
                        break
                if result['price'] is not None:
                    break
            except Exception as e:
                logger.debug(f"Error with Amazon UK price selector {selector}: {e}")

        # Extract title
        title_selectors = [
            '#productTitle',
            '.product-title',
            'h1.a-size-large'
        ]

        for selector in title_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    result['title'] = element.get_text(strip=True)
                    break
            except Exception as e:
                logger.debug(f"Error with Amazon UK title selector {selector}: {e}")

        # Check availability
        availability_text = soup.get_text().lower()
        if any(phrase in availability_text for phrase in ['out of stock', 'currently unavailable', 'not available']):
            result['availability'] = False

        return result

    def _extract_tesco_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from Tesco."""
        result = {
            'price': None,
            'title': None,
            'availability': True,
            'currency': 'GBP'
        }

        # Tesco price selectors
        price_selectors = [
            '.price-control-wrapper .value',
            '.price-per-sellable-unit .value',
            '.price-per-quantity-weight .value',
            '[data-testid="price-current-value"]',
            '.price-current',
            '.product-price .price'
        ]

        for selector in price_selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_uk_price(price_text)
                    if price is not None:
                        result['price'] = price
                        break
                if result['price'] is not None:
                    break
            except Exception as e:
                logger.debug(f"Error with Tesco price selector {selector}: {e}")

        # Extract title
        title_selectors = [
            'h1[data-testid="product-title"]',
            '.product-details-tile h1',
            '.product-title',
            'h1.product-name'
        ]

        for selector in title_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    result['title'] = element.get_text(strip=True)
                    break
            except Exception as e:
                logger.debug(f"Error with Tesco title selector {selector}: {e}")

        return result

    def _extract_sainsburys_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from Sainsbury's."""
        result = {
            'price': None,
            'title': None,
            'availability': True,
            'currency': 'GBP'
        }

        # Sainsbury's price selectors
        price_selectors = [
            '.pd__cost__current-price',
            '.pd__cost .pd__cost__retail-price',
            '.pricing__now-price',
            '.product-price__current',
            '[data-testid="pd-retail-price"]',
            '.price-per-unit'
        ]

        for selector in price_selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_uk_price(price_text)
                    if price is not None:
                        result['price'] = price
                        break
                if result['price'] is not None:
                    break
            except Exception as e:
                logger.debug(f"Error with Sainsbury's price selector {selector}: {e}")

        # Extract title
        title_selectors = [
            '.pd__header h1',
            'h1[data-testid="pd-product-name"]',
            '.product-name',
            '.pd__product-name'
        ]

        for selector in title_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    result['title'] = element.get_text(strip=True)
                    break
            except Exception as e:
                logger.debug(f"Error with Sainsbury's title selector {selector}: {e}")

        return result

    def _extract_booker_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from Booker."""
        result = {
            'price': None,
            'title': None,
            'availability': True,
            'currency': 'GBP'
        }

        # Booker price selectors
        price_selectors = [
            '.price',
            '.product-price',
            '.price-current',
            '.selling-price',
            '[data-testid="price"]',
            '.product-tile-price'
        ]

        for selector in price_selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_uk_price(price_text)
                    if price is not None:
                        result['price'] = price
                        break
                if result['price'] is not None:
                    break
            except Exception as e:
                logger.debug(f"Error with Booker price selector {selector}: {e}")

        # Extract title
        title_selectors = [
            'h1',
            '.product-title',
            '.product-name',
            '.product-description h1',
            '[data-testid="product-title"]'
        ]

        for selector in title_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    result['title'] = element.get_text(strip=True)
                    break
            except Exception as e:
                logger.debug(f"Error with Booker title selector {selector}: {e}")

        return result

    async def scrape_product_price(self, url: str, site_name: str = None) -> Dict[str, Any]:
        """Enhanced scraping for UK catering sites."""
        result = {
            'success': False,
            'price': None,
            'currency': 'GBP',
            'title': None,
            'availability': None,
            'url': url,
            'error': None
        }

        try:
            # Auto-detect site if not provided
            if not site_name:
                site_name = self._detect_site(url)
                if not site_name:
                    result['error'] = "Could not detect site from URL"
                    return result

            # Check if site is enabled
            if not self.config.is_site_enabled(site_name):
                result['error'] = f"Site {site_name} is disabled"
                return result

            # Fetch page content
            html_content = await self._fetch_page(url)
            if not html_content:
                result['error'] = "Failed to fetch page content"
                return result

            # Parse HTML
            soup = BeautifulSoup(html_content, 'html.parser')

            # Use specialized extraction based on site
            if site_name == 'jjfoodservice':
                extracted_data = self._extract_jjfoodservice_data(soup)
            elif site_name == 'atoz_catering':
                extracted_data = self._extract_atoz_data(soup)
            elif site_name == 'amazon_uk':
                extracted_data = self._extract_amazon_uk_data(soup)
            elif site_name == 'tesco':
                extracted_data = self._extract_tesco_data(soup)
            elif site_name == 'sainsburys':
                extracted_data = self._extract_sainsburys_data(soup)
            elif site_name == 'booker':
                extracted_data = self._extract_booker_data(soup)
            else:
                # Fall back to general extraction
                return await super().scrape_product_price(url, site_name)

            if extracted_data['price'] is None:
                result['error'] = "Could not extract price from page"
                return result

            result.update({
                'success': True,
                'price': extracted_data['price'],
                'currency': extracted_data.get('currency', 'GBP'),
                'title': extracted_data.get('title'),
                'availability': extracted_data.get('availability', True)
            })

            logger.info(f"Successfully scraped {site_name}: £{extracted_data['price']}")

        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")
            result['error'] = str(e)

        return result

    def _detect_site(self, url: str) -> Optional[str]:
        """Detect which UK catering site this URL belongs to."""
        url_lower = url.lower()

        if 'jjfoodservice.com' in url_lower:
            return 'jjfoodservice'
        elif 'atoz-catering.co.uk' in url_lower:
            return 'atoz_catering'
        elif 'amazon.co.uk' in url_lower:
            return 'amazon_uk'
        elif 'tesco.com' in url_lower:
            return 'tesco'
        elif 'sainsburys.co.uk' in url_lower:
            return 'sainsburys'
        elif 'booker.co.uk' in url_lower:
            return 'booker'

        # Fall back to parent detection for other sites
        return super()._detect_site(url)
@@ -268,4 +268,70 @@ def create_app():
        fig = go.Figure(data=traces, layout=layout)
        return json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)

    @app.route('/edit_product/<int:product_id>', methods=['GET', 'POST'])
    def edit_product(product_id):
        """Edit an existing product."""
        product = db_manager.get_product(product_id)
        if not product:
            flash('Product not found.', 'error')
            return redirect(url_for('index'))

        form = ProductForm()

        if form.validate_on_submit():
            urls = {}
            if form.jjfoodservice_url.data:
                urls['jjfoodservice'] = form.jjfoodservice_url.data
            if form.atoz_catering_url.data:
                urls['atoz_catering'] = form.atoz_catering_url.data
            if form.amazon_uk_url.data:
                urls['amazon_uk'] = form.amazon_uk_url.data

            if not urls:
                flash('Please provide at least one URL to track.', 'error')
                return render_template('edit_product.html', form=form, product=product)

            try:
                db_manager.update_product(
                    product_id=product_id,
                    name=form.name.data,
                    description=form.description.data,
                    target_price=form.target_price.data,
                    urls=urls
                )
                flash(f'Product "{form.name.data}" updated successfully!', 'success')
                return redirect(url_for('product_detail', product_id=product_id))
            except Exception as e:
                flash(f'Error updating product: {str(e)}', 'error')

        # Pre-populate form with existing data
        if request.method == 'GET':
            form.name.data = product['name']
            form.description.data = product['description']
            form.target_price.data = product['target_price']

            # URLs are already parsed as a dictionary by the database method
            urls = product['urls'] if product['urls'] else {}
            form.jjfoodservice_url.data = urls.get('jjfoodservice', '')
            form.atoz_catering_url.data = urls.get('atoz_catering', '')
            form.amazon_uk_url.data = urls.get('amazon_uk', '')

        return render_template('edit_product.html', form=form, product=product)

    @app.route('/delete_product/<int:product_id>', methods=['POST'])
    def delete_product(product_id):
        """Delete a product."""
        product = db_manager.get_product(product_id)
        if not product:
            flash('Product not found.', 'error')
            return redirect(url_for('index'))

        try:
            db_manager.delete_product(product_id)
            flash(f'Product "{product["name"]}" deleted successfully!', 'success')
        except Exception as e:
            flash(f'Error deleting product: {str(e)}', 'error')

        return redirect(url_for('index'))

    return app

@@ -123,7 +123,8 @@
                <ul class="mb-0 mt-2">
                    <li>Make sure URLs point to the specific product page</li>
                    <li>Test URLs in your browser first to ensure they work</li>
                    <li>Some sites may block automated requests - we'll handle this gracefully</li>
                    <li>The system will automatically prioritize <strong>delivery prices</strong> over collection prices</li>
                    <li>For JJ Food Service and A to Z Catering, ensure you can see delivery pricing on the page</li>
                    <li>For best results, use direct product page URLs</li>
                </ul>
            </div>
@@ -154,13 +155,15 @@
            <h6 class="fw-bold">JJ Food Service</h6>
            <p class="small text-muted">
                Navigate to the specific product page on JJ Food Service and copy the URL.
                Make sure you're logged in for accurate pricing. The system will automatically
                prioritize <strong>delivery prices</strong> over collection prices.
            </p>

            <h6 class="fw-bold">A to Z Catering</h6>
            <p class="small text-muted">
                Go to the product page on A to Z Catering and copy the URL.
                URLs typically contain "/products/product/" followed by the product name.
                The system will automatically capture <strong>delivery pricing</strong> when available.
            </p>
        </div>
        <div class="col-md-6">
@@ -170,10 +173,11 @@
                The URL should contain "/dp/" followed by the product identifier.
            </p>

            <h6 class="fw-bold text-success">Delivery Pricing Priority</h6>
            <p class="small text-muted">
                For JJ Food Service and A to Z Catering, the system automatically prioritizes
                delivery prices over collection prices. This ensures you're tracking the
                most relevant pricing for delivered goods to your business.
            </p>
        </div>
    </div>

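The help text above describes the delivery-over-collection rule; a minimal sketch of that priority in isolation (a hypothetical helper, not the template's code - in this commit the real logic lives in _parse_uk_price(prefer_delivery=True) and the delivery regex in src/uk_scraper.py):

# Sketch: prefer the delivery price when both figures are present
import re

def pick_tracked_price(page_text):
    delivery = re.search(r'Delivery:\s*£(\d{1,3}\.\d{2})', page_text, re.IGNORECASE)
    if delivery:
        return float(delivery.group(1))  # delivery price takes priority
    collection = re.search(r'Collection:\s*£(\d{1,3}\.\d{2})', page_text, re.IGNORECASE)
    return float(collection.group(1)) if collection else None

print(pick_tracked_price("Delivery: £11.79 Collection: £10.49"))  # 11.79
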
190
templates/edit_product.html
Normal file
@@ -0,0 +1,190 @@
{% extends "base.html" %}

{% block title %}Edit Product - Price Tracker{% endblock %}

{% block content %}
<div class="row justify-content-center">
    <div class="col-lg-8">
        <div class="card">
            <div class="card-header">
                <h2 class="mb-0">
                    <i class="fas fa-edit me-2 text-primary"></i>Edit Product: {{ product.name }}
                </h2>
            </div>
            <div class="card-body">
                <form method="POST">
                    {{ form.hidden_tag() }}

                    <div class="row">
                        <div class="col-md-8 mb-3">
                            {{ form.name.label(class="form-label fw-bold") }}
                            {{ form.name(class="form-control form-control-lg") }}
                            {% if form.name.errors %}
                                <div class="text-danger small mt-1">
                                    {% for error in form.name.errors %}
                                        <div>{{ error }}</div>
                                    {% endfor %}
                                </div>
                            {% endif %}
                        </div>
                        <div class="col-md-4 mb-3">
                            {{ form.target_price.label(class="form-label fw-bold") }}
                            <div class="input-group">
                                <span class="input-group-text">£</span>
                                {{ form.target_price(class="form-control form-control-lg") }}
                            </div>
                            {% if form.target_price.errors %}
                                <div class="text-danger small mt-1">
                                    {% for error in form.target_price.errors %}
                                        <div>{{ error }}</div>
                                    {% endfor %}
                                </div>
                            {% endif %}
                            <small class="text-muted">Optional: Alert when price drops below this</small>
                        </div>
                    </div>

                    <div class="mb-3">
                        {{ form.description.label(class="form-label fw-bold") }}
                        {{ form.description(class="form-control", rows="3") }}
                        {% if form.description.errors %}
                            <div class="text-danger small mt-1">
                                {% for error in form.description.errors %}
                                    <div>{{ error }}</div>
                                {% endfor %}
                            </div>
                        {% endif %}
                    </div>

                    <hr class="my-4">
                    <h5 class="mb-3">
                        <i class="fas fa-link me-2 text-secondary"></i>Store URLs
                    </h5>
                    <p class="text-muted small mb-3">Add URLs from the stores you want to track. At least one URL is required.</p>

                    <div class="row">
                        <div class="col-md-6 mb-3">
                            {{ form.jjfoodservice_url.label(class="form-label fw-bold") }}
                            <div class="input-group">
                                <span class="input-group-text">
                                    <i class="fas fa-store text-primary"></i>
                                </span>
                                {{ form.jjfoodservice_url(class="form-control", placeholder="https://www.jjfoodservice.com/...") }}
                            </div>
                            {% if form.jjfoodservice_url.errors %}
                                <div class="text-danger small mt-1">
                                    {% for error in form.jjfoodservice_url.errors %}
                                        <div>{{ error }}</div>
                                    {% endfor %}
                                </div>
                            {% endif %}
                        </div>

                        <div class="col-md-6 mb-3">
                            {{ form.atoz_catering_url.label(class="form-label fw-bold") }}
                            <div class="input-group">
                                <span class="input-group-text">
                                    <i class="fas fa-store text-success"></i>
                                </span>
                                {{ form.atoz_catering_url(class="form-control", placeholder="https://www.atoz-catering.co.uk/...") }}
                            </div>
                            {% if form.atoz_catering_url.errors %}
                                <div class="text-danger small mt-1">
                                    {% for error in form.atoz_catering_url.errors %}
                                        <div>{{ error }}</div>
                                    {% endfor %}
                                </div>
                            {% endif %}
                        </div>
                    </div>

                    <div class="row">
                        <div class="col-md-6 mb-3">
                            {{ form.amazon_uk_url.label(class="form-label fw-bold") }}
                            <div class="input-group">
                                <span class="input-group-text">
                                    <i class="fab fa-amazon text-warning"></i>
                                </span>
                                {{ form.amazon_uk_url(class="form-control", placeholder="https://www.amazon.co.uk/...") }}
                            </div>
                            {% if form.amazon_uk_url.errors %}
                                <div class="text-danger small mt-1">
                                    {% for error in form.amazon_uk_url.errors %}
                                        <div>{{ error }}</div>
                                    {% endfor %}
                                </div>
                            {% endif %}
                        </div>
                    </div>

                    <hr class="my-4">

                    <div class="d-flex justify-content-between">
                        <div>
                            <button type="submit" class="btn btn-primary btn-lg me-3">
                                <i class="fas fa-save me-2"></i>Update Product
                            </button>
                            <a href="{{ url_for('product_detail', product_id=product.id) }}" class="btn btn-outline-secondary btn-lg">
                                <i class="fas fa-arrow-left me-2"></i>Cancel
                            </a>
                        </div>

                        <!-- Delete button -->
                        <div>
                            <button type="button" class="btn btn-outline-danger btn-lg" data-bs-toggle="modal" data-bs-target="#deleteModal">
                                <i class="fas fa-trash me-2"></i>Delete Product
                            </button>
                        </div>
                    </div>
                </form>

                <!-- Help section -->
                <div class="mt-5">
                    <div class="card bg-light">
                        <div class="card-body">
                            <h6 class="card-title">
                                <i class="fas fa-info-circle me-2 text-info"></i>How to find product URLs
                            </h6>
                            <ul class="card-text small mb-0">
                                <li><strong>JJ Food Service:</strong> Search for your product and copy the URL from the product page</li>
                                <li><strong>A to Z Catering:</strong> Navigate to the specific product and copy the URL</li>
                                <li><strong>Amazon UK:</strong> Find the product and copy the URL (we'll extract the essential part)</li>
                            </ul>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>

<!-- Delete Confirmation Modal -->
<div class="modal fade" id="deleteModal" tabindex="-1" aria-labelledby="deleteModalLabel" aria-hidden="true">
    <div class="modal-dialog">
        <div class="modal-content">
            <div class="modal-header">
                <h5 class="modal-title" id="deleteModalLabel">
                    <i class="fas fa-exclamation-triangle me-2 text-warning"></i>Confirm Delete
                </h5>
                <button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
            </div>
            <div class="modal-body">
                <p>Are you sure you want to delete <strong>"{{ product.name }}"</strong>?</p>
                <div class="alert alert-warning">
                    <i class="fas fa-warning me-2"></i>
                    <strong>Warning:</strong> This action cannot be undone. All price history for this product will be permanently deleted.
                </div>
            </div>
            <div class="modal-footer">
                <button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Cancel</button>
                <form method="POST" action="{{ url_for('delete_product', product_id=product.id) }}" style="display: inline;">
                    <button type="submit" class="btn btn-danger">
                        <i class="fas fa-trash me-2"></i>Delete Product
                    </button>
                </form>
            </div>
        </div>
    </div>
</div>

{% endblock %}
@@ -97,6 +97,16 @@
                        <i class="fas fa-sync-alt me-1"></i>Scrape Now
                    </button>
                </div>
                <div class="btn-group" role="group">
                    <a href="{{ url_for('edit_product', product_id=product.id) }}" class="btn btn-outline-secondary">
                        <i class="fas fa-edit me-1"></i>Edit
                    </a>
                    <button class="btn btn-outline-danger delete-product-btn"
                            data-product-id="{{ product.id }}"
                            data-product-name="{{ product.name }}">
                        <i class="fas fa-trash me-1"></i>Delete
                    </button>
                </div>
            </div>
        </div>

@@ -181,4 +191,58 @@
        </div>
    </div>
{% endif %}

<!-- Delete Confirmation Modal -->
<div class="modal fade" id="deleteModal" tabindex="-1" aria-labelledby="deleteModalLabel" aria-hidden="true">
    <div class="modal-dialog">
        <div class="modal-content">
            <div class="modal-header">
                <h5 class="modal-title" id="deleteModalLabel">
                    <i class="fas fa-exclamation-triangle me-2 text-warning"></i>Confirm Delete
                </h5>
                <button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
            </div>
            <div class="modal-body">
                <p>Are you sure you want to delete <strong id="deleteProductName"></strong>?</p>
                <div class="alert alert-warning">
                    <i class="fas fa-warning me-2"></i>
                    <strong>Warning:</strong> This action cannot be undone. All price history for this product will be permanently deleted.
                </div>
            </div>
            <div class="modal-footer">
                <button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Cancel</button>
                <form id="deleteForm" method="POST" style="display: inline;">
                    <button type="submit" class="btn btn-danger">
                        <i class="fas fa-trash me-2"></i>Delete Product
                    </button>
                </form>
            </div>
        </div>
    </div>
</div>

<script>
// Handle delete product buttons
document.addEventListener('DOMContentLoaded', function() {
    const deleteButtons = document.querySelectorAll('.delete-product-btn');
    const deleteModal = document.getElementById('deleteModal');
    const deleteForm = document.getElementById('deleteForm');
    const deleteProductName = document.getElementById('deleteProductName');

    deleteButtons.forEach(button => {
        button.addEventListener('click', function() {
            const productId = this.getAttribute('data-product-id');
            const productName = this.getAttribute('data-product-name');

            // Update modal content
            deleteProductName.textContent = productName;
            deleteForm.action = `/delete_product/${productId}`;

            // Show modal
            const modal = new bootstrap.Modal(deleteModal);
            modal.show();
        });
    });
});
</script>
{% endblock %}

@@ -14,6 +14,16 @@
    <button class="btn btn-success me-2" onclick="scrapeProduct({{ product.id }})">
        <i class="fas fa-sync-alt me-1"></i>Scrape Now
    </button>
    <a href="{{ url_for('edit_product', product_id=product.id) }}" class="btn btn-outline-primary me-2">
        <i class="fas fa-edit me-1"></i>Edit
    </a>
    <button class="btn btn-outline-danger me-2 delete-product-btn"
            data-product-id="{{ product.id }}"
            data-product-name="{{ product.name }}"
            data-bs-toggle="modal"
            data-bs-target="#deleteModal">
        <i class="fas fa-trash me-1"></i>Delete
    </button>
    <a href="{{ url_for('index') }}" class="btn btn-outline-secondary">
        <i class="fas fa-arrow-left me-1"></i>Back to Dashboard
    </a>
@@ -222,6 +232,35 @@
    {% endif %}
    </div>
</div>

<!-- Delete Confirmation Modal -->
<div class="modal fade" id="deleteModal" tabindex="-1" aria-labelledby="deleteModalLabel" aria-hidden="true">
    <div class="modal-dialog">
        <div class="modal-content">
            <div class="modal-header">
                <h5 class="modal-title" id="deleteModalLabel">
                    <i class="fas fa-exclamation-triangle me-2 text-warning"></i>Confirm Delete
                </h5>
                <button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
            </div>
            <div class="modal-body">
                <p>Are you sure you want to delete <strong>"{{ product.name }}"</strong>?</p>
                <div class="alert alert-warning">
                    <i class="fas fa-warning me-2"></i>
                    <strong>Warning:</strong> This action cannot be undone. All price history for this product will be permanently deleted.
                </div>
            </div>
            <div class="modal-footer">
                <button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Cancel</button>
                <form method="POST" action="{{ url_for('delete_product', product_id=product.id) }}" style="display: inline;">
                    <button type="submit" class="btn btn-danger">
                        <i class="fas fa-trash me-2"></i>Delete Product
                    </button>
                </form>
            </div>
        </div>
    </div>
</div>
{% endblock %}

{% block scripts %}
@@ -231,4 +270,20 @@
    Plotly.newPlot('priceChart', chartData.data, chartData.layout, {responsive: true});
</script>
{% endif %}

<script>
// Handle delete product button
document.addEventListener('DOMContentLoaded', function() {
    const deleteButton = document.querySelector('.delete-product-btn');
    const deleteModal = document.getElementById('deleteModal');

    if (deleteButton) {
        deleteButton.addEventListener('click', function() {
            // Show modal
            const modal = new bootstrap.Modal(deleteModal);
            modal.show();
        });
    }
});
</script>
{% endblock %}

35
test_actual_scraper.py
Normal file
@@ -0,0 +1,35 @@
#!/usr/bin/env python3
import asyncio
import sys
import os

# Add the src directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from uk_scraper import scrape_jj_foodservice

async def test_actual_scraper():
    url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/"

    print(f"Testing actual scraper with URL: {url}")
    print("=" * 60)

    try:
        result = await scrape_jj_foodservice(url)
        print(f"Scraper result: {result}")

        if result:
            print(f"✅ Name: {result.get('name', 'Not found')}")
            print(f"✅ Collection Price: £{result.get('collection_price', 'Not found')}")
            print(f"✅ Delivery Price: £{result.get('delivery_price', 'Not found')}")
            print(f"✅ Image URL: {result.get('image_url', 'Not found')}")
        else:
            print("❌ Scraper returned None")

    except Exception as e:
        print(f"❌ Error occurred: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    asyncio.run(test_actual_scraper())
53
test_jj_detailed.py
Normal file
@@ -0,0 +1,53 @@
#!/usr/bin/env python3
import asyncio
import aiohttp
import re
from bs4 import BeautifulSoup

async def test_jj_patterns():
    url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/"

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            html = await response.text()

    print(f"HTML content length: {len(html)}")

    # Look for various keywords
    keywords = ['DELIVERY', 'delivery', 'COLLECTION', 'collection', '£10.49', '£11.79', '10.49', '11.79']

    for keyword in keywords:
        if keyword in html:
            print(f"'{keyword}' FOUND in HTML")
            # Find context around the keyword
            index = html.find(keyword)
            start = max(0, index - 100)
            end = min(len(html), index + 100)
            context = html[start:end]
            print(f"Context: ...{context}...")
            print()
        else:
            print(f"'{keyword}' NOT found in HTML")

    # Look for any price-like patterns
    price_patterns = re.findall(r'£?(\d{1,3}\.\d{2})', html)
    print(f"\nAll price patterns found: {price_patterns}")

    # Try to find price elements using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Look for specific CSS classes that might contain prices
    price_selectors = [
        '.price', '.product-price', '.delivery-price', '.price-delivery',
        '[class*="price"]', '[class*="Price"]'
    ]

    for selector in price_selectors:
        elements = soup.select(selector)
        if elements:
            print(f"\nFound elements with selector '{selector}':")
            for elem in elements[:5]:  # Show first 5
                print(f" - {elem.get_text(strip=True)}")

if __name__ == "__main__":
    asyncio.run(test_jj_patterns())
54
test_jj_simple.py
Normal file
@@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""
Simple test to debug JJ Food Service scraping
"""

import asyncio
import sys
import os
sys.path.append(os.path.dirname(__file__))

from src.uk_scraper import UKCateringScraper
from src.config import Config
import logging

# Set up verbose logging
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s - %(message)s')

async def test_jj_scraping():
    config = Config()

    async with UKCateringScraper(config) as scraper:
        url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/"

        print(f"Testing URL: {url}")

        # Get the raw HTML content
        html_content = await scraper._fetch_page(url)

        if html_content:
            print(f"HTML content length: {len(html_content)}")
            print("First 500 characters of HTML:")
            print(html_content[:500])
            print("\n" + "="*50 + "\n")

            # Look for delivery text
            if 'DELIVERY' in html_content:
                print("Found 'DELIVERY' in HTML content")
                # Find the context around DELIVERY
                delivery_pos = html_content.find('DELIVERY')
                context = html_content[delivery_pos:delivery_pos+100]
                print(f"Context around DELIVERY: {context}")
            else:
                print("'DELIVERY' not found in HTML content")

            # Look for any price patterns
            import re
            price_matches = re.findall(r'£(\d{1,3}(?:\.\d{2})?)', html_content)
            print(f"All price patterns found: {price_matches}")

        else:
            print("Failed to fetch HTML content")

if __name__ == "__main__":
    asyncio.run(test_jj_scraping())
51
test_regex_patterns.py
Normal file
@@ -0,0 +1,51 @@
#!/usr/bin/env python3
"""
Test the exact regex patterns against the actual HTML content
"""

import re
import asyncio
import aiohttp
from bs4 import BeautifulSoup

async def test_jj_patterns():
    url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/"

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            html_content = await response.text()

    soup = BeautifulSoup(html_content, 'html.parser')
    page_text = soup.get_text(separator=' ')

    print(f"Page text length: {len(page_text)}")

    # Find the section with delivery info
    delivery_start = page_text.lower().find('delivery')
    if delivery_start >= 0:
        delivery_section = page_text[delivery_start:delivery_start+200]
        print(f"Delivery section: {delivery_section!r}")

    # Test the exact patterns
    delivery_patterns = [
        r'Delivery:£(\d{1,3}\.\d{2})',      # Delivery:£11.79
        r'DELIVERY:£(\d{1,3}\.\d{2})',      # DELIVERY:£11.79
        r'delivery:£(\d{1,3}\.\d{2})',      # delivery:£11.79
        r'DELIVERY:\s*£(\d{1,3}\.\d{2})',   # DELIVERY: £11.79
        r'delivery:\s*£(\d{1,3}\.\d{2})',   # delivery: £11.79
    ]

    for pattern in delivery_patterns:
        match = re.search(pattern, page_text, re.IGNORECASE)
        if match:
            print(f"✅ Pattern '{pattern}' matched! Price: £{match.group(1)}")
            return float(match.group(1))
        else:
            print(f"❌ Pattern '{pattern}' did not match")

    print("No delivery patterns matched!")
    return None

if __name__ == "__main__":
    result = asyncio.run(test_jj_patterns())
    print(f"Final result: {result}")
46
test_scraper.py
Normal file
@@ -0,0 +1,46 @@
#!/usr/bin/env python3
"""
Test script to debug scraping issues for JJ Food Service and A to Z Catering
"""

import sys
import os
import asyncio
sys.path.append(os.path.join(os.path.dirname(__file__)))

from src.uk_scraper import UKCateringScraper
from src.config import Config
import logging

# Set up logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

async def test_scraping():
    config = Config()

    async with UKCateringScraper(config) as scraper:
        # Test URLs that were problematic
        test_urls = [
            "https://www.jjfoodservice.com/catering-products/confectionery-and-snacks/chocolate/cadbury-dairy-milk-chocolate-bar-110g",
            "https://www.atozcatering.co.uk/catering-equipment/refrigeration/prep-fridges/polar-single-door-prep-counter-fridge-240ltr",
            "https://www.atozcatering.co.uk/catering-equipment/cooking-equipment/fryers/buffalo-single-tank-induction-fryer-5ltr"
        ]

        for url in test_urls:
            print(f"\n{'='*80}")
            print(f"Testing URL: {url}")
            print(f"{'='*80}")

            try:
                result = await scraper.scrape_product(url)
                if result:
                    print(f"Success! Result: {result}")
                else:
                    print("Failed to scrape product")
            except Exception as e:
                print(f"Error: {e}")
                import traceback
                traceback.print_exc()

if __name__ == "__main__":
    asyncio.run(test_scraping())
225
test_special_pricing.py
Normal file
@@ -0,0 +1,225 @@
#!/usr/bin/env python3
"""
Test script for special pricing detection in UK scraper.
This script tests various special pricing scenarios to ensure the enhanced detection works correctly.
"""

import sys
import os
import asyncio
import logging
from bs4 import BeautifulSoup

# Add the src directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from uk_scraper import UKCateringScraper
from config import Config

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def create_test_html_scenarios():
    """Create test HTML scenarios for different special pricing patterns."""

    scenarios = {
        'strikethrough_pricing': """
            <div class="product-price">
                <del>£15.99</del>
                <span class="sale-price">£12.99</span>
            </div>
        """,

        'was_now_pricing': """
            <div class="price-container">
                <span>Was £20.50, now £17.25</span>
            </div>
        """,

        'offer_label_pricing': """
            <div class="special-offer">
                <span class="offer-badge">SPECIAL OFFER</span>
                <span class="price">£8.99</span>
            </div>
        """,

        'delivery_special_pricing': """
            <div class="delivery-pricing">
                <h3>Delivery: <del>£25.00</del> £19.99</h3>
            </div>
        """,

        'multiple_prices_no_context': """
            <div class="price-section">
                <span>£15.99</span>
                <span>£12.99</span>
            </div>
        """,

        'amazon_deal_pricing': """
            <div class="a-price">
                <span class="a-price-strike">£29.99</span>
                <span class="a-price-current">£24.99</span>
            </div>
        """,

        'jj_member_pricing': """
            <div class="member-price">
                <span class="standard-price">£18.50</span>
                <span class="member-discount">Member price: £15.25</span>
            </div>
        """,

        'atoz_h3_delivery': """
            <h3>Delivery: Was £22.00 Now £18.50</h3>
        """,

        'percentage_discount': """
            <div class="discount-container">
                <span class="discount-badge">20% OFF</span>
                <span class="original-price">RRP £25.00</span>
                <span class="sale-price">£20.00</span>
            </div>
        """
    }

    return scenarios


async def test_special_pricing_scenarios():
    """Test the special pricing detection with various scenarios."""

    # Initialize the scraper
    config = Config()
    scraper = UKCateringScraper(config)

    scenarios = create_test_html_scenarios()

    print("Testing Special Pricing Detection")
    print("=" * 50)

    for scenario_name, html_content in scenarios.items():
        print(f"\nTesting: {scenario_name}")
        print("-" * 30)

        # Parse the HTML
        soup = BeautifulSoup(html_content, 'html.parser')

        # Test with different sites
        for site_name in ['jjfoodservice', 'atoz_catering', 'amazon_uk']:
            print(f"\n  {site_name}:")

            try:
                # Test special offer detection
                special_prices = scraper._find_special_offer_prices(soup, site_name)
                if special_prices:
                    best_price = min(price for price, _ in special_prices)
                    print(f"    ✓ Special offers found: {special_prices}")
                    print(f"    ✓ Best price: £{best_price}")
                else:
                    print(f"    ✗ No special offers detected")

                # Test the extraction methods
                if site_name == 'jjfoodservice':
                    result = scraper._extract_jjfoodservice_data(soup)
                elif site_name == 'atoz_catering':
                    result = scraper._extract_atoz_catering_data(soup)
                elif site_name == 'amazon_uk':
                    result = scraper._extract_amazon_uk_data(soup)

                if result['price']:
                    print(f"    ✓ Extracted price: £{result['price']}")
                else:
                    print(f"    ✗ No price extracted")

            except Exception as e:
                print(f"    ✗ Error: {e}")


def test_parse_uk_price_functionality():
    """Test the enhanced _parse_uk_price function."""

    config = Config()
    scraper = UKCateringScraper(config)

    print("\n\nTesting _parse_uk_price Functionality")
    print("=" * 50)

    test_cases = [
        ("£15.99", False, False, 15.99),
        ("Was £20.00 Now £15.99", False, True, 15.99),
        ("£25.50 £19.99", False, True, 19.99),
        ("Delivery: £12.50", True, False, 12.50),
        ("Collection: £10.00 Delivery: £12.50", True, False, 12.50),
        ("RRP £30.00 Sale £24.99", False, True, 24.99),
        ("Save £5.00! Was £25.00 Now £20.00", False, True, 20.00),
    ]

    for i, (price_text, prefer_delivery, detect_special, expected) in enumerate(test_cases, 1):
        print(f"\nTest {i}: '{price_text}'")
        print(f"  prefer_delivery={prefer_delivery}, detect_special={detect_special}")

        # Create a mock element for testing
        mock_html = f"<span>{price_text}</span>"
        mock_element = BeautifulSoup(mock_html, 'html.parser').find('span')

        result = scraper._parse_uk_price(
            price_text,
            prefer_delivery=prefer_delivery,
            detect_special_offers=detect_special,
            element=mock_element
        )

        if result == expected:
            print(f"  ✓ Result: £{result} (Expected: £{expected})")
        else:
            print(f"  ✗ Result: £{result} (Expected: £{expected})")


def test_special_pricing_context():
    """Test the special pricing context detection."""

    config = Config()
    scraper = UKCateringScraper(config)

    print("\n\nTesting Special Pricing Context Detection")
    print("=" * 50)

    context_test_cases = [
        ('<div class="sale"><del>£20.00</del><span>£15.99</span></div>', 'strikethrough'),
        ('<div>Was £25.00 Now £19.99</div>', 'was_now'),
        ('<div class="special-offer">£12.99</div>', 'offer_label'),
        ('<div><span style="text-decoration: line-through">£18.00</span>£14.99</div>', 'inline_strikethrough'),
    ]

    for i, (html_content, test_type) in enumerate(context_test_cases, 1):
        print(f"\nTest {i}: {test_type}")
        print(f"  HTML: {html_content}")

        soup = BeautifulSoup(html_content, 'html.parser')
        element = soup.find(['span', 'div'])

        if element:
            context = scraper._extract_special_pricing_context(element)
            print(f"  ✓ Context: {context}")
        else:
            print(f"  ✗ No element found")


if __name__ == "__main__":
    print("UK Scraper Special Pricing Test Suite")
    print("=" * 60)

    # Test the price parsing functionality
    test_parse_uk_price_functionality()

    # Test special pricing context detection
    test_special_pricing_context()

    # Test full scenarios
    asyncio.run(test_special_pricing_scenarios())

    print("\n" + "=" * 60)
    print("Test suite completed!")
57
validate_fix.py
Normal file
@@ -0,0 +1,57 @@
#!/usr/bin/env python3
"""
Quick validation that the A to Z Catering pricing is working correctly
"""

import sys
import os
import asyncio

# Add the src directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

async def validate_atoz_pricing():
    """Test the A to Z Catering pricing fix."""

    try:
        from uk_scraper import UKCateringScraper
        from config import Config

        print("Testing A to Z Catering pricing fix...")
        print("=" * 50)

        config = Config()
        scraper = UKCateringScraper(config)

        # Test the problematic URL
        url = 'https://www.atoz-catering.co.uk/products/product/coca-cola-cans--coke-gb---24'

        print(f"Testing URL: {url}")
        print("Expected price: £12.99 (not £1.39)")
        print("Testing...")

        result = await scraper.scrape_product_price(url, 'atoz_catering')

        print(f"\nResults:")
        print(f"Success: {result['success']}")

        if result['success'] and result['price']:
            price = result['price']
            print(f"Price found: £{price}")

            if price == 12.99:
                print("✅ FIXED! Correct price detected (£12.99)")
            elif price == 1.39:
                print("❌ STILL BROKEN! Wrong price detected (£1.39)")
            else:
                print(f"⚠️ Different price detected: £{price}")
        else:
            print(f"❌ Failed to scrape: {result.get('error', 'Unknown error')}")

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    asyncio.run(validate_atoz_pricing())