scrape fix
.github/workflows/advanced-mirror.yml (vendored, new file, 68 lines)
@@ -0,0 +1,68 @@
name: Advanced Mirror to Azure DevOps

on:
  push:
    branches: [ main, master, develop ]
  pull_request:
    types: [closed]
    branches: [ main, master ]
  workflow_dispatch:
    inputs:
      force_push:
        description: 'Force push to Azure DevOps'
        required: false
        default: 'false'

jobs:
  mirror:
    runs-on: ubuntu-latest
    if: github.event_name == 'push' || (github.event_name == 'pull_request' && github.event.pull_request.merged == true) || github.event_name == 'workflow_dispatch'

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Setup Git
        run: |
          git config --global user.name "GitHub Mirror Bot"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"

      - name: Add Azure DevOps Remote
        env:
          AZURE_DEVOPS_TOKEN: ${{ secrets.AZURE_DEVOPS_PAT }}
        run: |
          # URL encode the repository name for spaces
          ENCODED_URL="https://oauth2:${AZURE_DEVOPS_TOKEN}@dev.azure.com/ptslondon/_git/Price%20Tracker"
          git remote add azure "$ENCODED_URL"

      - name: Mirror Repository
        env:
          FORCE_PUSH: ${{ github.event.inputs.force_push }}
        run: |
          # Set force flag
          FORCE_FLAG=""
          if [ "$FORCE_PUSH" = "true" ] || [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
            FORCE_FLAG="--force"
          fi

          # Push current branch
          CURRENT_BRANCH=${GITHUB_REF#refs/heads/}
          echo "Mirroring branch: $CURRENT_BRANCH"

          git push azure "$CURRENT_BRANCH" $FORCE_FLAG

          # Push tags
          git push azure --tags $FORCE_FLAG

          echo "✅ Successfully mirrored to Azure DevOps"

      - name: Verify Mirror
        run: |
          echo "Mirror completed for:"
          echo "- Repository: Price Tracker"
          echo "- Branch: ${GITHUB_REF#refs/heads/}"
          echo "- Commit: ${{ github.sha }}"
          echo "- Azure DevOps URL: https://dev.azure.com/ptslondon/_git/Price%20Tracker"
.github/workflows/mirror-to-azure.yml (vendored, new file, 34 lines)
@@ -0,0 +1,34 @@
name: Mirror to Azure DevOps

on:
  push:
    branches: [ main, master, develop ]  # Add branches you want to mirror
  workflow_dispatch:  # Allows manual triggering

jobs:
  mirror:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # Fetch full history for complete mirror

      - name: Mirror to Azure DevOps
        env:
          AZURE_DEVOPS_URL: https://dev.azure.com/ptslondon/_git/Price%20Tracker
          AZURE_DEVOPS_TOKEN: ${{ secrets.AZURE_DEVOPS_PAT }}
        run: |
          # Configure git
          git config --global user.name "GitHub Mirror Bot"
          git config --global user.email "noreply@github.com"

          # Add Azure DevOps as remote
          git remote add azure https://oauth2:${AZURE_DEVOPS_TOKEN}@dev.azure.com/ptslondon/_git/Price%20Tracker

          # Push all branches and tags
          git push azure --all --force
          git push azure --tags --force

          echo "Successfully mirrored to Azure DevOps"
README.md (20 lines changed)
@@ -4,13 +4,14 @@ A comprehensive web scraper for tracking product prices across multiple e-commerce sites
 ## Features ✨
 
-- **Multi-site Price Tracking**: Monitor prices across Amazon, eBay, Walmart, and more
+- **Multi-site Price Tracking**: Monitor prices across JJ Food Service, A to Z Catering, and Amazon UK
 - **Beautiful Web UI**: Clean, responsive interface for managing products and viewing price history
 - **Price Alerts**: Get notified when products reach your target price
 - **Historical Data**: View price trends with interactive charts
 - **Automated Scraping**: Schedule regular price checks
 - **Multiple Notifications**: Email and webhook notifications
 - **Robust Scraping**: Built-in retry logic, rotating user agents, and rate limiting
+- **Special Pricing Detection**: Automatically detects and prioritizes delivery prices and special offers
 
 ## Quick Start 🚀
@@ -106,13 +107,20 @@ Add new e-commerce sites by extending the sites configuration:
 ```json
 {
   "sites": {
-    "your_site": {
+    "atoz_catering": {
       "enabled": true,
-      "base_url": "https://www.yoursite.com",
+      "base_url": "https://www.atoz-catering.co.uk",
       "selectors": {
-        "price": [".price", ".cost"],
-        "title": [".product-title"],
-        "availability": [".stock-status"]
+        "price": [
+          ".my-price.price-offer",
+          ".delivery-price",
+          ".price"
+        ],
+        "special_offer": [
+          ".my-price.price-offer",
+          ".special-offer",
+          "del:contains('£')"
+        ]
       }
     }
   }
 }
 ```
SCRAPER_ARCHITECTURE.md (new file, 80 lines)
@@ -0,0 +1,80 @@
# Price Tracker - Scraper Architecture

## Current Structure

### 1. **`scraper.py` - Base Scraper Class**
- **Purpose**: Foundation class for all price scraping
- **Handles**: Generic e-commerce sites (Amazon.com, eBay, Walmart, etc.)
- **Key Features**:
  - Base `PriceScraper` class with HTTP session management
  - Anti-bot measures (headers, delays, retries)
  - Generic price extraction methods
  - Site detection logic

### 2. **`uk_scraper.py` - UK Catering Specialist**
- **Purpose**: Specialized scraper for UK catering supply websites
- **Handles**: JJ Food Service, A to Z Catering, Amazon UK
- **Key Features**:
  - Inherits from the `PriceScraper` base class
  - UK currency handling (£ symbol)
  - Delivery vs collection price prioritization
  - Special pricing detection (offers, strikethrough, was/now pricing)
  - Site-specific CSS selectors (e.g., `.my-price.price-offer` for A to Z)

### 3. **`scraper_manager.py` - Orchestration Layer**
- **Purpose**: Routes scraping tasks to the appropriate scraper (see the sketch below)
- **Logic**:
  - Detects UK catering sites → uses `UKCateringScraper`
  - Detects other sites → uses the base `PriceScraper`
  - Manages concurrent requests and error handling
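A minimal sketch of the routing decision, assuming the `should_use_uk_scraper` helper added to the base scraper in this commit (the surrounding setup is illustrative, not the manager's actual code):

```python
from src.config import Config
from src.scraper import PriceScraper
from src.uk_scraper import UKCateringScraper

config = Config()
base = PriceScraper(config)

url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/"
if base.should_use_uk_scraper(url):
    # jjfoodservice, atoz_catering and amazon_uk route to the UK specialist
    scraper_cls = UKCateringScraper
else:
    # everything else stays on the generic base scraper
    scraper_cls = PriceScraper
```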
## Site Mapping

### UK Catering Sites (UKCateringScraper):
- `jjfoodservice` → JJ Food Service
- `atoz_catering` → A to Z Catering
- `amazon_uk` → Amazon UK

### International Sites (PriceScraper):
- `amazon` → Amazon.com
- `ebay` → eBay
- `walmart` → Walmart
- *(Future sites can be added here)*
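The keys above are produced by `_detect_site` in the base scraper (extended in this commit); a rough spot-check of the expected mapping, reusing the `base` object from the earlier sketch:

```python
# Domain → site-key mapping, per the _detect_site branches in this commit
assert base._detect_site("https://www.jjfoodservice.com/product/X") == "jjfoodservice"
assert base._detect_site("https://www.atoz-catering.co.uk/products/Y") == "atoz_catering"
assert base._detect_site("https://www.amazon.co.uk/dp/Z") == "amazon_uk"
assert base._detect_site("https://www.amazon.com/dp/Z") == "amazon"
```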
## Key Benefits of Current Structure

✅ **Separation of Concerns**: UK-specific logic is isolated
✅ **Extensibility**: Easy to add new UK sites or international sites
✅ **Maintainability**: Changes to UK logic don't affect international scraping
✅ **Specialization**: UK scraper handles currency, delivery pricing, special offers

## Recommendations

### ✅ **KEEP CURRENT STRUCTURE** - It's well-designed!

The separation between `scraper.py` and `uk_scraper.py` is **good architecture** because:

1. **UK catering sites have unique requirements** (delivery vs collection, £ pricing, special offers)
2. **International sites have different patterns** (USD pricing, different site structures)
3. **Each scraper is easy to maintain and extend** independently

### Minor Improvements Made:

1. **Enhanced site detection** in the base scraper
2. **Added helper methods** to determine scraper routing
3. **Improved scraper manager logic** for clarity
4. **Fixed A to Z pricing** with the `.my-price.price-offer` selector

## Final File Structure

```
src/
├── scraper.py           # Base scraper (international sites)
├── uk_scraper.py        # UK catering specialist
├── scraper_manager.py   # Orchestration layer
├── config.py            # Configuration management
├── database.py          # Data persistence
└── web_ui.py            # Flask web interface
```

This structure supports both current UK catering needs and future expansion to international e-commerce sites.
SPECIAL_PRICING.md (new file, 177 lines)
@@ -0,0 +1,177 @@
# Special Pricing Features - Price Tracker

## Overview

The UK Price Tracker now includes enhanced special pricing detection capabilities to identify and prioritize discounted, sale, and special offer prices across supported UK catering sites.

## Features

### 🎯 Special Pricing Detection
- **Strikethrough Pricing**: Detects crossed-out prices with sale prices
- **Was/Now Patterns**: Identifies "Was £X Now £Y" pricing patterns
- **Offer Labels**: Recognizes sale/discount/special offer badges and containers
- **Percentage Discounts**: Detects "X% OFF" promotional pricing
- **Member/Trade Pricing**: Special pricing for registered customers (JJ Food Service)

### 🚚 Delivery Price Priority
- Automatically prioritizes delivery prices over collection prices
- Identifies delivery-specific special offers
- Handles mixed pricing scenarios (delivery vs collection vs general)
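A small illustration of the preference, using the `prefer_delivery` flag that the JJ Food Service extractor passes to `_parse_uk_price`:

```python
from src.config import Config
from src.uk_scraper import UKCateringScraper

scraper = UKCateringScraper(Config())

# Explicit collection-only text is skipped when delivery prices are preferred
print(scraper._parse_uk_price("Collection: £10.49", prefer_delivery=True))  # None
print(scraper._parse_uk_price("Delivery: £11.79", prefer_delivery=True))    # 11.79
```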
### 🏪 Site-Specific Enhancements

#### JJ Food Service
- Member pricing detection
- Trade pricing identification
- Bulk discount recognition
- Quantity-based pricing

#### A to Z Catering
- Header-based delivery pricing (H3/H4 elements)
- Inline strikethrough detection
- Special delivery offer containers
- Style-based strikethrough recognition

#### Amazon UK
- Deal price detection
- Strike-through pricing
- Sale badge recognition
- RRP vs Sale price comparison

## Configuration

Special pricing is configured in `config.json`:

```json
{
  "scraping": {
    "special_pricing": {
      "enabled": true,
      "prefer_delivery_prices": true,
      "detect_strikethrough": true,
      "detect_was_now_patterns": true,
      "detect_percentage_discounts": true,
      "min_discount_threshold": 0.05,
      "max_price_difference_ratio": 0.5
    }
  },
  "sites": {
    "jjfoodservice": {
      "selectors": {
        "special_offer": [
          ".special-offer",
          ".member-price",
          "del:contains('£')",
          ".was-price"
        ]
      }
    }
  }
}
```
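As an illustrative check, the per-site selector lists can be read back through the `Config` helper (`get_site_config` is the accessor the debug tool uses):

```python
from src.config import Config

config = Config()
site_config = config.get_site_config("jjfoodservice")
# Prints the special_offer selector list from the config.json shown above
print(site_config["selectors"]["special_offer"])
```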
## Testing

### Test Suite
Run the comprehensive test suite:
```bash
python test_special_pricing.py
```

This tests:
- Price parsing with various formats
- Special pricing context detection
- Site-specific extraction methods
- Mock HTML scenarios

### Debug Tool
Debug real URLs:
```bash
python debug_special_pricing.py <URL> [--verbose]
```

Examples:
```bash
# Debug a JJ Food Service product
python debug_special_pricing.py "https://www.jjfoodservice.com/product/example" --verbose

# Debug an A to Z Catering product
python debug_special_pricing.py "https://www.atoz-catering.co.uk/product/example"

# Debug an Amazon UK product
python debug_special_pricing.py "https://www.amazon.co.uk/product/example"
```
## How It Works

### 1. Context Detection
The scraper analyzes HTML elements and their parent containers to detect special pricing context:
- Strikethrough elements (`<del>`, `<s>`, `<strike>`)
- CSS styling (`text-decoration: line-through`)
- Keyword patterns (`was`, `now`, `sale`, `offer`, `discount`)
- Percentage discount patterns (`20% off`, etc.)
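A rough sketch of what the detection sees for a strikethrough snippet, using the `_extract_special_pricing_context` helper added to `uk_scraper.py` in this commit:

```python
from bs4 import BeautifulSoup
from src.config import Config
from src.uk_scraper import UKCateringScraper

scraper = UKCateringScraper(Config())
html = '<div class="product-price"><del>£15.99</del><span>£12.99</span></div>'
soup = BeautifulSoup(html, "html.parser")

# The helper inspects the element itself plus a few parent levels
context = scraper._extract_special_pricing_context(soup.find("span"))
print(context["has_strikethrough"])  # True - the <del> sibling marks a was-price
print(context["prices"])             # includes 15.99, tagged as a 'was_price'
```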
### 2. Price Extraction
When multiple prices are found:
- **With special context**: Returns the lowest price (the offer price)
- **Delivery preference**: Prioritizes delivery over collection prices
- **Multiple prices**: Takes the last/lowest price found
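For example, when `_find_special_offer_prices` returns several candidates, the debug tool picks the lowest one as the best offer; a sketch under that same assumption:

```python
# Candidates as (price, selector) pairs, as _find_special_offer_prices returns them
special_prices = [(15.99, ".was-price"), (12.99, ".sale-price")]

best_special_price = min(price for price, _ in special_prices)
print(best_special_price)  # 12.99 - the offer price wins
```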
### 3. Site-Specific Logic
Each site has tailored extraction methods:
- **JJ Food Service**: Focuses on member/trade pricing
- **A to Z Catering**: Enhanced header and delivery price detection
- **Amazon UK**: Deal and promotional price recognition

## Examples

### Strikethrough Pricing
```html
<div class="product-price">
  <del>£15.99</del>
  <span class="sale-price">£12.99</span>
</div>
```
**Result**: £12.99 (special offer detected)

### Was/Now Pricing
```html
<div class="price-container">
  <span>Was £20.50, now £17.25</span>
</div>
```
**Result**: £17.25 (was/now pattern detected)

### Delivery Special Offers
```html
<h3>Delivery: <del>£25.00</del> £19.99</h3>
```
**Result**: £19.99 (delivery + special offer)

## Troubleshooting

### No Special Prices Detected
1. Check whether the site uses non-standard markup
2. Add custom selectors to `config.json`
3. Use the debug tool to see which selectors are matching
4. Verify special pricing is enabled in the config

### Wrong Price Selected
1. Check that the delivery preference is correctly configured
2. Verify the HTML structure matches the expected patterns
3. Use verbose debugging to see all detected prices
4. Consider adding site-specific selectors

### Performance Issues
1. Reduce the number of special offer selectors
2. Increase delays between requests
3. Use more specific CSS selectors
4. Enable only the necessary special pricing features

## Future Enhancements

- **Machine Learning**: Auto-detect pricing patterns
- **More Sites**: Extend to additional UK catering suppliers
- **Price History**: Track special offer frequency and patterns
- **Alerts**: Notify when special offers are detected
- **Comparison**: Cross-site special offer comparison
debug_atoz_pricing.py (new file, 198 lines)
@@ -0,0 +1,198 @@
#!/usr/bin/env python3
"""
Debug script specifically for A to Z Catering pricing issues
"""

import requests
from bs4 import BeautifulSoup
import re
import sys
import os

# Add the src directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))


def fetch_and_analyze_atoz_page(url):
    """Fetch and analyze the A to Z page to identify pricing issues."""

    print(f"Analyzing A to Z page: {url}")
    print("=" * 80)

    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=30)
        print(f"HTTP Status: {response.status_code}")

        if response.status_code != 200:
            print("Failed to fetch page")
            return

        soup = BeautifulSoup(response.text, 'html.parser')

        # 1. Find all elements containing prices
        print("\n1. ALL PRICE ELEMENTS FOUND:")
        print("-" * 40)
        price_pattern = re.compile(r'£\d+\.?\d*')
        price_elements = soup.find_all(string=price_pattern)

        for i, price_text in enumerate(price_elements):
            parent = price_text.parent if hasattr(price_text, 'parent') else None
            parent_class = parent.get('class', []) if parent else []
            parent_tag = parent.name if parent else 'N/A'

            print(f"  {i+1:2d}. '{price_text.strip()}' in <{parent_tag}> class={parent_class}")

        # 2. Check for delivery-specific elements
        print("\n2. DELIVERY-RELATED ELEMENTS:")
        print("-" * 40)
        delivery_keywords = ['delivery', 'delivered']

        for keyword in delivery_keywords:
            elements = soup.find_all(string=re.compile(keyword, re.IGNORECASE))
            for elem in elements[:5]:  # Show first 5
                parent = elem.parent if hasattr(elem, 'parent') else None
                parent_class = parent.get('class', []) if parent else []
                text = elem.strip()[:100]
                print(f"  '{text}' in class={parent_class}")

        # 3. Check h3 and h4 elements (A to Z specific)
        print("\n3. H3/H4 ELEMENTS WITH PRICES:")
        print("-" * 40)
        headings = soup.find_all(['h3', 'h4'])  # h3/h4 headings (distinct from the HTTP request headers above)
        for heading in headings:
            text = heading.get_text(strip=True)
            if '£' in text:
                print(f"  <{heading.name}>: {text}")

        # 4. Test specific selectors from our config
        print("\n4. TESTING OUR SELECTORS:")
        print("-" * 40)

        test_selectors = [
            '.delivery-price',
            '.price-delivery',
            '.price',
            '.product-price',
            '.collection-price',
            'span:contains("£")',
            'h3:contains("Delivery")',
            'h4:contains("Delivery")',
            '*[class*="price"]'
        ]

        for selector in test_selectors:
            try:
                if ':contains(' in selector:
                    # Handle contains selectors differently (not valid CSS)
                    if 'h3:contains("Delivery")' == selector:
                        elements = [h for h in soup.find_all('h3') if 'delivery' in h.get_text().lower()]
                    elif 'h4:contains("Delivery")' == selector:
                        elements = [h for h in soup.find_all('h4') if 'delivery' in h.get_text().lower()]
                    elif 'span:contains("£")' == selector:
                        elements = [s for s in soup.find_all('span') if '£' in s.get_text()]
                    else:
                        elements = []
                else:
                    elements = soup.select(selector)

                if elements:
                    print(f"  ✓ {selector} -> {len(elements)} elements:")
                    for i, elem in enumerate(elements[:3]):  # Show first 3
                        text = elem.get_text(strip=True)
                        if '£' in text:
                            print(f"    [{i+1}] {text}")
                else:
                    print(f"  ✗ {selector} -> No elements")

            except Exception as e:
                print(f"  ⚠ {selector} -> Error: {e}")

        # 5. Look for the specific prices mentioned (12.99 and 1.39)
        print("\n5. SPECIFIC PRICE ANALYSIS:")
        print("-" * 40)

        if '12.99' in response.text:
            print("✓ £12.99 found in page content")
            # Find context around 12.99
            matches = list(re.finditer(r'12\.99', response.text))
            for match in matches[:3]:  # Show first 3 occurrences
                start = max(0, match.start() - 100)
                end = min(len(response.text), match.end() + 100)
                context = response.text[start:end].replace('\n', ' ').replace('\t', ' ')
                print(f"  Context: ...{context}...")
        else:
            print("✗ £12.99 NOT found in page content")

        if '1.39' in response.text:
            print("✓ £1.39 found in page content")
            # Find context around 1.39
            matches = list(re.finditer(r'1\.39', response.text))
            for match in matches[:3]:  # Show first 3 occurrences
                start = max(0, match.start() - 100)
                end = min(len(response.text), match.end() + 100)
                context = response.text[start:end].replace('\n', ' ').replace('\t', ' ')
                print(f"  Context: ...{context}...")
        else:
            print("✗ £1.39 NOT found in page content")

        # 6. Try to simulate our current parsing logic
        print("\n6. SIMULATING CURRENT PARSING LOGIC:")
        print("-" * 40)

        # Test our general price selectors
        general_selectors = [
            '.price',
            '.product-price',
            'span:contains("£")',
            '.price-value',
        ]

        found_prices = []
        for selector in general_selectors:
            try:
                if selector == 'span:contains("£")':
                    elements = [s for s in soup.find_all('span') if '£' in s.get_text()]
                else:
                    elements = soup.select(selector)

                for element in elements:
                    price_text = element.get_text(strip=True)
                    if '£' in price_text:
                        # Extract price using regex
                        price_matches = re.findall(r'£(\d+\.?\d*)', price_text)
                        for match in price_matches:
                            try:
                                price_value = float(match)
                                found_prices.append((price_value, selector, price_text))
                            except ValueError:
                                pass

            except Exception as e:
                print(f"Error with {selector}: {e}")

        print(f"Found {len(found_prices)} prices total:")
        for price, selector, text in found_prices:
            print(f"  £{price} from '{selector}': {text[:50]}")

        if found_prices:
            # Show what our current logic would select
            min_price = min(price for price, _, _ in found_prices)
            max_price = max(price for price, _, _ in found_prices)
            last_price = found_prices[-1][0] if found_prices else None

            print("\nCurrent logic would likely select:")
            print(f"  Minimum price: £{min_price}")
            print(f"  Maximum price: £{max_price}")
            print(f"  Last price found: £{last_price}")

    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    url = "https://www.atoz-catering.co.uk/products/product/coca-cola-cans--coke-gb---24"
    fetch_and_analyze_atoz_page(url)
debug_jj.py (new file, 34 lines)
@@ -0,0 +1,34 @@
#!/usr/bin/env python3
"""
Debug script to test JJ Food Service scraping
"""

import asyncio
import logging
import sys
import os

# Add this directory to the path so the src package is importable
sys.path.append(os.path.join(os.path.dirname(__file__)))

from src.config import Config
from src.uk_scraper import UKCateringScraper

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


async def test_jj_scraping():
    config = Config()

    print(f"JJ Food Service enabled: {config.is_site_enabled('jjfoodservice')}")
    print(f"A to Z enabled: {config.is_site_enabled('atoz_catering')}")

    url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/"

    async with UKCateringScraper(config) as scraper:
        print(f"\nTesting JJ Food Service URL: {url}")
        result = await scraper.scrape_product_price(url, 'jjfoodservice')
        print(f"Result: {result}")


if __name__ == "__main__":
    asyncio.run(test_jj_scraping())
debug_special_pricing.py (new file, 160 lines)
@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""
Special Pricing Debug Tool for UK Price Tracker

This tool helps debug and monitor special pricing detection on real websites.
It can be used to test URLs and see exactly what pricing information is being detected.
"""

import sys
import os
import asyncio
import logging
import argparse
from typing import Dict, Any

# Add the src directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from uk_scraper import UKCateringScraper
from config import Config

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def detect_site_from_url(url: str) -> str:
    """Detect which site the URL belongs to."""
    if 'jjfoodservice.com' in url:
        return 'jjfoodservice'
    elif 'atoz-catering.co.uk' in url:
        return 'atoz_catering'
    elif 'amazon.co.uk' in url:
        return 'amazon_uk'
    else:
        return 'unknown'


async def debug_url_pricing(url: str, verbose: bool = False):
    """Debug pricing extraction for a specific URL."""

    config = Config()
    scraper = UKCateringScraper(config)

    site_name = detect_site_from_url(url)

    print(f"Debugging URL: {url}")
    print(f"Detected site: {site_name}")
    print("-" * 60)

    if site_name == 'unknown':
        print("❌ Unknown site - cannot process")
        return

    try:
        # Fetch the page content
        print("🌐 Fetching page content...")
        html_content = await scraper._fetch_page(url)

        if not html_content:
            print("❌ Failed to fetch page content")
            return

        print("✅ Page content fetched successfully")

        # Parse with BeautifulSoup
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')

        # Debug special pricing detection
        print("\n🔍 Looking for special offer prices...")
        special_prices = scraper._find_special_offer_prices(soup, site_name)

        if special_prices:
            print(f"✅ Found {len(special_prices)} special offer prices:")
            for price, selector in special_prices:
                print(f"  £{price} (found with: {selector})")

            best_special_price = min(price for price, _ in special_prices)
            print(f"🎯 Best special offer price: £{best_special_price}")
        else:
            print("❌ No special offer prices found")

        # Test the main extraction method
        print(f"\n🔍 Testing {site_name} extraction method...")

        if site_name == 'jjfoodservice':
            result = scraper._extract_jjfoodservice_data(soup)
        elif site_name == 'atoz_catering':
            result = scraper._extract_atoz_catering_data(soup)
        elif site_name == 'amazon_uk':
            result = scraper._extract_amazon_uk_data(soup)

        print("✅ Extraction result:")
        print(f"  Price: £{result['price']}" if result['price'] else "  Price: Not found")
        print(f"  Title: {result.get('title', 'Not found')}")
        print(f"  Available: {result.get('availability', 'Unknown')}")
        print(f"  Currency: {result.get('currency', 'Unknown')}")

        # If verbose, show more debugging info
        if verbose:
            print(f"\n🔍 Verbose debugging for {site_name}...")

            # Get site selectors from config
            site_config = config.get_site_config(site_name)
            if site_config and 'selectors' in site_config:
                selectors = site_config['selectors']

                # Test each selector type
                for selector_type, selector_list in selectors.items():
                    print(f"\n  Testing {selector_type} selectors:")

                    for selector in selector_list:
                        try:
                            elements = soup.select(selector)
                            if elements:
                                print(f"    ✅ {selector} -> Found {len(elements)} elements")
                                for i, elem in enumerate(elements[:3]):  # Show first 3
                                    text = elem.get_text(strip=True)[:100]  # Truncate long text
                                    print(f"      [{i+1}] {text}")
                            else:
                                print(f"    ❌ {selector} -> No elements found")
                        except Exception as e:
                            print(f"    ⚠️ {selector} -> Error: {e}")

        # Test the full scraping method
        print("\n🔍 Testing full scrape_product_price method...")
        full_result = await scraper.scrape_product_price(url, site_name)

        print("✅ Full scraping result:")
        print(f"  Success: {full_result['success']}")
        print(f"  Price: £{full_result['price']}" if full_result['price'] else "  Price: Not found")
        print(f"  Error: {full_result.get('error', 'None')}")

    except Exception as e:
        print(f"❌ Error during debugging: {e}")
        if verbose:
            import traceback
            traceback.print_exc()


def main():
    """Main function to run the debug tool."""

    parser = argparse.ArgumentParser(description='Debug special pricing detection for UK price tracker')
    parser.add_argument('url', help='URL to debug')
    parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose output')
    parser.add_argument('--test-selectors', action='store_true', help='Test all selectors from config')

    args = parser.parse_args()

    print("UK Price Tracker - Special Pricing Debug Tool")
    print("=" * 60)

    # Run the debugging
    asyncio.run(debug_url_pricing(args.url, args.verbose))


if __name__ == "__main__":
    main()
purge_database.py (new file, 64 lines)
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""
Simple script to purge all price data from the database
This will reset the database so the next scrape acts as the first one
"""

import sqlite3
import os
from src.config import Config


def purge_database():
    """Purge all data from the price tracker database."""
    config = Config()
    db_path = config.database_path

    if not os.path.exists(db_path):
        print(f"Database file {db_path} does not exist. Nothing to purge.")
        return

    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Get all table names
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()

        if not tables:
            print("No tables found in database.")
            conn.close()
            return

        print(f"Found {len(tables)} tables in database:")
        for table in tables:
            table_name = table[0]
            cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
            count = cursor.fetchone()[0]
            print(f"  - {table_name}: {count} records")

        # Confirm purge
        response = input("\nDo you want to purge all data? (yes/no): ").lower().strip()

        if response in ['yes', 'y']:
            # Delete all data from all tables
            for table in tables:
                table_name = table[0]
                cursor.execute(f"DELETE FROM {table_name}")
                print(f"Purged all data from {table_name}")

            conn.commit()
            print("\n✅ Database purged successfully!")
            print("The next scrape will act as the first one and log all prices.")
        else:
            print("Purge cancelled.")

        conn.close()

    except sqlite3.Error as e:
        print(f"Database error: {e}")
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    purge_database()
simple_test.py (new file, 133 lines)
@@ -0,0 +1,133 @@
#!/usr/bin/env python3
"""
Simple test for special pricing functionality
"""

import sys
import os

# Add the src directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))


def test_imports():
    """Test that all modules can be imported."""
    try:
        print("Testing imports...")

        # Basic imports
        import re
        import logging
        from typing import Dict, Any, Optional, List, Tuple
        print("✓ Basic Python modules imported")

        # Third-party imports
        from bs4 import BeautifulSoup, Tag
        print("✓ BeautifulSoup imported")

        # Local imports
        from config import Config
        print("✓ Config imported")

        from scraper import PriceScraper
        print("✓ PriceScraper imported")

        from uk_scraper import UKCateringScraper
        print("✓ UKCateringScraper imported")

        return True

    except Exception as e:
        print(f"✗ Import error: {e}")
        import traceback
        traceback.print_exc()
        return False


def test_basic_functionality():
    """Test basic functionality of the special pricing."""
    try:
        from config import Config
        from uk_scraper import UKCateringScraper

        print("\nTesting basic functionality...")

        # Create config and scraper
        config = Config()
        scraper = UKCateringScraper(config)
        print("✓ Scraper created successfully")

        # Test price parsing
        test_price = scraper._parse_uk_price("£12.99")
        if test_price == 12.99:
            print("✓ Basic price parsing works")
        else:
            print(f"✗ Price parsing failed: got {test_price}, expected 12.99")

        # Test special pricing (note: the conservative parser takes no special-offer flag)
        special_price = scraper._parse_uk_price("Was £20.00 Now £15.99")
        if special_price == 15.99:
            print("✓ Special price parsing works")
        else:
            print(f"✗ Special price parsing failed: got {special_price}, expected 15.99")

        return True

    except Exception as e:
        print(f"✗ Functionality error: {e}")
        import traceback
        traceback.print_exc()
        return False


def test_html_parsing():
    """Test HTML parsing for special pricing."""
    try:
        from bs4 import BeautifulSoup
        from uk_scraper import UKCateringScraper
        from config import Config

        print("\nTesting HTML parsing...")

        config = Config()
        scraper = UKCateringScraper(config)

        # Test strikethrough detection
        html = '<div><del>£20.00</del><span>£15.99</span></div>'
        soup = BeautifulSoup(html, 'html.parser')

        special_prices = scraper._find_special_offer_prices(soup, 'atoz_catering')
        if special_prices:
            print(f"✓ Special offer detection works: found {len(special_prices)} prices")
        else:
            print("✗ Special offer detection failed")

        return True

    except Exception as e:
        print(f"✗ HTML parsing error: {e}")
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    print("Simple Special Pricing Test")
    print("=" * 40)

    success = True

    # Test imports
    if not test_imports():
        success = False

    # Test basic functionality
    if not test_basic_functionality():
        success = False

    # Test HTML parsing
    if not test_html_parsing():
        success = False

    print("\n" + "=" * 40)
    if success:
        print("✅ All tests passed!")
    else:
        print("❌ Some tests failed!")
        sys.exit(1)
src/database.py
@@ -147,6 +147,15 @@ class DatabaseManager:
             UPDATE products SET active = 0, updated_at = ? WHERE id = ?
         ''', (datetime.now(), product_id))
 
+    def delete_product(self, product_id: int):
+        """Delete a product and all its associated price history."""
+        with sqlite3.connect(self.db_path) as conn:
+            # Delete price history first (due to foreign key constraints)
+            conn.execute('DELETE FROM price_history WHERE product_id = ?', (product_id,))
+
+            # Delete the product
+            conn.execute('DELETE FROM products WHERE id = ?', (product_id,))
+
     def save_price_history(self, product_id: int, site_name: str, price: float,
                            currency: str = 'GBP', availability: bool = True,
                            timestamp: datetime = None):
src/scraper.py
@@ -169,13 +169,21 @@ class PriceScraper:
         """Detect which site this URL belongs to."""
         domain = urlparse(url).netloc.lower()
 
-        if 'amazon' in domain:
+        # UK catering sites (handled by UKCateringScraper)
+        if 'jjfoodservice.com' in domain:
+            return 'jjfoodservice'
+        elif 'atoz-catering.co.uk' in domain:
+            return 'atoz_catering'
+        elif 'amazon.co.uk' in domain:
+            return 'amazon_uk'
+
+        # International sites (handled by the base PriceScraper)
+        elif 'amazon.com' in domain or 'amazon.' in domain:
             return 'amazon'
         elif 'ebay' in domain:
             return 'ebay'
         elif 'walmart' in domain:
             return 'walmart'
-        # Add more site detection logic here
+
         return None
 
@@ -267,6 +275,17 @@ class PriceScraper:
             return False
 
         return True
 
+    def should_use_uk_scraper(self, url: str) -> bool:
+        """Determine if this URL should use the UK catering scraper."""
+        site_name = self._detect_site(url)
+        uk_sites = {'jjfoodservice', 'atoz_catering', 'amazon_uk'}
+        return site_name in uk_sites
+
+    @classmethod
+    def get_uk_catering_sites(cls) -> set:
+        """Get the list of UK catering sites."""
+        return {'jjfoodservice', 'atoz_catering', 'amazon_uk'}
+
+
 class ScraperManager:
src/scraper_manager.py
@@ -17,6 +17,7 @@ class ScraperManager(BaseScraper):
     def __init__(self, config):
         super().__init__(config)
         self.active_tasks = {}
+        self.semaphore = asyncio.Semaphore(config.max_concurrent_requests)
 
     async def scrape_product_by_id(self, product_id: int, product_data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
         """Scrape a specific product by ID with task tracking."""
@@ -36,6 +37,79 @@ class ScraperManager(BaseScraper):
         if product_id in self.active_tasks:
             del self.active_tasks[product_id]
 
+    async def scrape_product(self, product: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
+        """Scrape prices for a single product across all configured sites."""
+        product_id = product['id']
+        urls = product['urls']
+
+        results = {}
+
+        # Check if this product has UK catering sites
+        uk_catering_sites = {'jjfoodservice', 'atoz_catering', 'amazon_uk'}
+        has_uk_sites = any(site in uk_catering_sites for site in urls.keys())
+
+        if has_uk_sites:
+            # Use the UK-specific scraper
+            async with UKCateringScraper(self.config) as scraper:
+                tasks = []
+
+                for site_name, url in urls.items():
+                    if self.config.is_site_enabled(site_name):
+                        task = self._scrape_with_semaphore_uk(scraper, url, site_name)
+                        tasks.append((site_name, task))
+
+                        # Add delay between requests
+                        await asyncio.sleep(self.config.delay_between_requests)
+
+                # Wait for all tasks to complete
+                for site_name, task in tasks:
+                    try:
+                        result = await task
+                        results[site_name] = result
+                    except Exception as e:
+                        logger.error(f"Error scraping {site_name} for product {product_id}: {e}")
+                        results[site_name] = {
+                            'success': False,
+                            'error': str(e)
+                        }
+        else:
+            # Use the generic scraper for non-UK sites
+            from .scraper import PriceScraper
+            async with PriceScraper(self.config) as scraper:
+                tasks = []
+
+                for site_name, url in urls.items():
+                    if self.config.is_site_enabled(site_name):
+                        task = self._scrape_with_semaphore(scraper, url, site_name)
+                        tasks.append((site_name, task))
+
+                        # Add delay between requests
+                        await asyncio.sleep(self.config.delay_between_requests)
+
+                # Wait for all tasks to complete
+                for site_name, task in tasks:
+                    try:
+                        result = await task
+                        results[site_name] = result
+                    except Exception as e:
+                        logger.error(f"Error scraping {site_name} for product {product_id}: {e}")
+                        results[site_name] = {
+                            'success': False,
+                            'error': str(e)
+                        }
+
+        return results
+
+    async def _scrape_with_semaphore_uk(self, scraper: UKCateringScraper, url: str, site_name: str):
+        """Scrape with semaphore using the UK scraper."""
+        async with self.semaphore:
+            return await scraper.scrape_product_price(url, site_name)
+
+    async def _scrape_with_semaphore(self, scraper, url: str, site_name: str):
+        """Scrape with semaphore using the generic scraper."""
+        async with self.semaphore:
+            return await scraper.scrape_product_price(url, site_name)
+
     async def cancel_product_scraping(self, product_id: int) -> bool:
         """Cancel scraping for a specific product."""
         if product_id in self.active_tasks:
@@ -4,8 +4,8 @@ Specialized scrapers for UK catering supply sites
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
from typing import Dict, Any, Optional
|
from typing import Dict, Any, Optional, List, Tuple
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup, Tag
|
||||||
from .scraper import PriceScraper
|
from .scraper import PriceScraper
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -14,35 +14,153 @@ logger = logging.getLogger(__name__)
|
|||||||
class UKCateringScraper(PriceScraper):
|
class UKCateringScraper(PriceScraper):
|
||||||
"""Specialized scraper for UK catering supply websites."""
|
"""Specialized scraper for UK catering supply websites."""
|
||||||
|
|
||||||
def _parse_uk_price(self, price_text: str) -> Optional[float]:
|
def _extract_special_pricing_context(self, element: Tag) -> Dict[str, Any]:
|
||||||
"""Parse UK price format with £ symbol."""
|
"""Extract special pricing context from an element and its surroundings."""
|
||||||
|
context = {
|
||||||
|
'has_strikethrough': False,
|
||||||
|
'has_offer_label': False,
|
||||||
|
'has_was_now': False,
|
||||||
|
'prices': [],
|
||||||
|
'price_types': []
|
||||||
|
}
|
||||||
|
|
||||||
|
# Get parent elements to check for special pricing context
|
||||||
|
parents = [element] + [p for p in element.parents if p.name][:3] # Check up to 3 levels up
|
||||||
|
|
||||||
|
for parent in parents:
|
||||||
|
parent_text = parent.get_text().lower() if parent else ""
|
||||||
|
|
||||||
|
# Check for strikethrough pricing
|
||||||
|
strikethrough_elements = parent.find_all(['del', 's', 'strike']) if parent else []
|
||||||
|
if strikethrough_elements:
|
||||||
|
context['has_strikethrough'] = True
|
||||||
|
for strike_elem in strikethrough_elements:
|
||||||
|
strike_price = self._parse_uk_price(strike_elem.get_text())
|
||||||
|
if strike_price:
|
||||||
|
context['prices'].append(strike_price)
|
||||||
|
context['price_types'].append('was_price')
|
||||||
|
|
||||||
|
# Check for offer/sale/discount labels
|
||||||
|
offer_patterns = [
|
||||||
|
r'\bsale\b', r'\boffer\b', r'\bdeal\b', r'\bdiscount\b',
|
||||||
|
r'\bspecial\b', r'\bpromo\b', r'\breduced\b', r'\bsave\b',
|
||||||
|
r'\bwas\s*£', r'\bnow\s*£', r'\b\d+%\s*off\b'
|
||||||
|
]
|
||||||
|
|
||||||
|
for pattern in offer_patterns:
|
||||||
|
if re.search(pattern, parent_text):
|
||||||
|
context['has_offer_label'] = True
|
||||||
|
break
|
||||||
|
|
||||||
|
# Look for "was/now" pricing patterns
|
||||||
|
was_now_match = re.search(r'was\s*£([\d.]+).*?now\s*£([\d.]+)', parent_text, re.IGNORECASE)
|
||||||
|
if was_now_match:
|
||||||
|
context['has_was_now'] = True
|
||||||
|
was_price = float(was_now_match.group(1))
|
||||||
|
now_price = float(was_now_match.group(2))
|
||||||
|
context['prices'].extend([was_price, now_price])
|
||||||
|
context['price_types'].extend(['was_price', 'now_price'])
|
||||||
|
|
||||||
|
return context
|
||||||
|
|
||||||
|
def _parse_uk_price(self, price_text: str, prefer_delivery: bool = False) -> Optional[float]:
|
||||||
|
"""Simple, conservative UK price parsing - just extract the first reasonable price."""
|
||||||
if not price_text:
|
if not price_text:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Remove common text and normalize
|
# Skip very long text blocks that are unlikely to contain just prices
|
||||||
price_text = price_text.lower()
|
if len(price_text) > 100:
|
||||||
price_text = re.sub(r'delivery:|collection:|was:|now:|offer:|from:', '', price_text)
|
return None
|
||||||
|
|
||||||
|
# Check if this is delivery or collection pricing
|
||||||
|
is_delivery = 'delivery' in price_text.lower()
|
||||||
|
is_collection = 'collection' in price_text.lower()
|
||||||
|
|
||||||
|
# If we prefer delivery and this is explicitly collection, skip it
|
||||||
|
if prefer_delivery and is_collection and not is_delivery:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Simple regex to find prices - be very specific
|
||||||
|
price_match = re.search(r'£(\d{1,3}(?:\.\d{2})?)', price_text)
|
||||||
|
|
||||||
# Find price with £ symbol
|
|
||||||
price_match = re.search(r'£(\d+\.?\d*)', price_text)
|
|
||||||
if price_match:
|
if price_match:
|
||||||
try:
|
try:
|
||||||
return float(price_match.group(1))
|
price_val = float(price_match.group(1))
|
||||||
except ValueError:
|
# Only accept reasonable food product prices
|
||||||
pass
|
if 2.0 <= price_val <= 100.0:
|
||||||
|
return price_val
|
||||||
# Try without £ symbol but with decimal
|
|
||||||
price_match = re.search(r'(\d+\.\d{2})', price_text)
|
|
||||||
if price_match:
|
|
||||||
try:
|
|
||||||
return float(price_match.group(1))
|
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _find_special_offer_prices(self, soup: BeautifulSoup, site_name: str) -> List[Tuple[float, str]]:
|
||||||
|
"""Find special offer prices using enhanced selectors."""
|
||||||
|
special_prices = []
|
||||||
|
|
||||||
|
# Enhanced selectors for special offers
|
||||||
|
special_offer_selectors = [
|
||||||
|
# General special offer containers
|
||||||
|
'.special-offer', '.sale-price', '.offer-price', '.discount-price',
|
||||||
|
'.promo-price', '.reduced-price', '.deal-price',
|
||||||
|
|
||||||
|
# Strikethrough and comparison pricing
|
||||||
|
'del:contains("£"), s:contains("£"), strike:contains("£")',
|
||||||
|
'.was-price', '.original-price', '.rrp-price',
|
||||||
|
|
||||||
|
# Was/Now pricing containers
|
||||||
|
'.was-now-pricing', '.price-comparison', '.before-after-price',
|
||||||
|
|
||||||
|
# Sale badges and labels
|
||||||
|
'.sale-badge', '.offer-badge', '.discount-badge',
|
||||||
|
'*[class*="sale"]:contains("£")',
|
||||||
|
'*[class*="offer"]:contains("£")',
|
||||||
|
'*[class*="discount"]:contains("£")',
|
||||||
|
|
||||||
|
# Site-specific patterns
|
||||||
|
'.product-price-wrapper', '.price-container', '.pricing-section'
|
||||||
|
]
|
||||||
|
|
||||||
|
if site_name == 'atoz_catering':
|
||||||
|
# A to Z specific selectors - prioritize the offer price class
|
||||||
|
special_offer_selectors.extend([
|
||||||
|
'.my-price.price-offer', # Primary A to Z offer price selector
|
||||||
|
'h3:contains("£")', 'h4:contains("£")',
|
||||||
|
'.delivery-price-special', '.collection-price-special',
|
||||||
|
'*[style*="text-decoration: line-through"]',
|
||||||
|
'*[style*="text-decoration:line-through"]'
|
||||||
|
])
|
||||||
|
elif site_name == 'jjfoodservice':
|
||||||
|
# JJ Food Service specific selectors
|
||||||
|
special_offer_selectors.extend([
|
||||||
|
'.member-price', '.trade-price', '.bulk-price',
|
||||||
|
'.quantity-discount', '.volume-discount'
|
||||||
|
])
|
||||||
|
elif site_name == 'amazon_uk':
|
||||||
|
# Amazon UK specific selectors
|
||||||
|
special_offer_selectors.extend([
|
||||||
|
'.a-price.a-text-price.a-size-medium.apexPriceToPay .a-offscreen',
|
||||||
|
'.a-price-strike .a-offscreen',
|
||||||
|
'#priceblock_dealprice', '#priceblock_saleprice',
|
||||||
|
'.a-price-was', '.a-price-save'
|
||||||
|
])
|
||||||
|
|
||||||
|
for selector in special_offer_selectors:
|
||||||
|
try:
|
||||||
|
elements = soup.select(selector)
|
||||||
|
for element in elements:
|
||||||
|
price_text = element.get_text(strip=True)
|
||||||
|
if '£' in price_text:
|
||||||
|
price = self._parse_uk_price(price_text, detect_special_offers=True, element=element)
|
||||||
|
if price:
|
||||||
|
special_prices.append((price, selector))
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"Error with special offer selector {selector}: {e}")
|
||||||
|
|
||||||
|
return special_prices
|
||||||
|
|
||||||
    def _extract_jjfoodservice_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from JJ Food Service - simplified approach."""
        result = {
            'price': None,
            'title': None,
@@ -50,43 +168,85 @@ class UKCateringScraper(PriceScraper):
            'currency': 'GBP'
        }

        # First, try to find elements with Price in class name and extract delivery price
        price_elements = soup.select('[class*="Price"]')
        logger.debug(f"JJ Food Service: Found {len(price_elements)} price elements")

        for element in price_elements:
            text = element.get_text(strip=True)
            logger.debug(f"JJ Food Service: Checking price element text: '{text[:100]}'")

            # Look for delivery price in concatenated strings like
            # "Collection:£10.49£4.62 per kgDelivery:£11.79£5.19 per kg"
            delivery_match = re.search(r'Delivery:£(\d{1,3}\.\d{2})', text, re.IGNORECASE)
            if delivery_match:
                price_val = float(delivery_match.group(1))
                result['price'] = price_val
                logger.info(f"JJ Food Service: Found delivery price £{price_val} in price element")
                # extract title
                title_el = soup.select_one('h1')
                if title_el:
                    result['title'] = title_el.get_text(strip=True)
                return result

        # Second, attempt regex-based parsing of delivery price from raw page text
        page_text = soup.get_text(separator=' ')
        logger.debug(f"JJ Food Service page_text snippet: {page_text[:500]!r}")

        # Look for delivery price patterns in the text
        if 'DELIVERY' in page_text or 'delivery' in page_text:
            logger.debug("Found 'DELIVERY' in page text, looking for price patterns...")
            delivery_section = page_text[page_text.lower().find('delivery'):page_text.lower().find('delivery')+100]
            logger.debug(f"Delivery section: {delivery_section!r}")

            # Try multiple patterns for delivery price (based on actual HTML structure)
            delivery_patterns = [
                r'Delivery:£(\d{1,3}\.\d{2})',     # Delivery:£11.79 (actual format found)
                r'DELIVERY:£(\d{1,3}\.\d{2})',     # DELIVERY:£11.79
                r'delivery:£(\d{1,3}\.\d{2})',     # delivery:£11.79
                r'DELIVERY:\s*£(\d{1,3}\.\d{2})',  # DELIVERY: £11.79 (with space)
                r'delivery:\s*£(\d{1,3}\.\d{2})',  # delivery: £11.79 (with space)
            ]

            for pattern in delivery_patterns:
                logger.debug(f"JJ Food Service: Trying pattern: {pattern}")
                delivery_match = re.search(pattern, page_text, re.IGNORECASE)
                if delivery_match:
                    price_val = float(delivery_match.group(1))
                    result['price'] = price_val
                    logger.info(f"JJ Food Service: Parsed delivery price £{price_val} via regex pattern: {pattern}")
                    # extract title
                    title_el = soup.select_one('h1')
                    if title_el:
                        result['title'] = title_el.get_text(strip=True)
                    return result
                else:
                    logger.debug(f"JJ Food Service: Pattern {pattern} did not match")

        # Otherwise, try very specific selectors first - likely to contain prices
        specific_selectors = [
            '.price-delivery',  # Delivery price specifically
            '.delivery-price',  # Alternative delivery price
            '.price',           # General price class
        ]

        for selector in specific_selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    # Only process short text snippets that likely contain just prices
                    if '£' in price_text and len(price_text) < 30:
                        price = self._parse_uk_price(price_text, prefer_delivery=True)
                        if price is not None:
                            result['price'] = price
                            logger.info(f"JJ Food Service: Found price £{price} with selector '{selector}' from text: '{price_text}'")
                            break
                if result['price'] is not None:
                    break
            except Exception as e:
                logger.debug(f"Error with JJ Food Service selector {selector}: {e}")

        # Extract title
        title_selectors = ['h1', '.product-title', '.product-name']
        for selector in title_selectors:
            try:
                element = soup.select_one(selector)
@@ -96,61 +256,65 @@ class UKCateringScraper(PriceScraper):
            except Exception as e:
                logger.debug(f"Error with JJ Food Service title selector {selector}: {e}")

        return result

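As a quick illustration of the regex step above, run in isolation (the sample string is taken from the comment in the code, so no network access is needed):

    import re

    text = "Collection:£10.49£4.62 per kgDelivery:£11.79£5.19 per kg"
    match = re.search(r'Delivery:£(\d{1,3}\.\d{2})', text, re.IGNORECASE)
    if match:
        print(float(match.group(1)))  # 11.79 - the delivery price, not the per-kg rate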
    def _extract_atoz_catering_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from A to Z Catering - prioritize delivery pricing using regex parse."""
        result = {'price': None, 'title': None, 'availability': True, 'currency': 'GBP'}

        # First, attempt to parse delivery price directly from page text
        page_text = soup.get_text(separator=' ')
        delivery_match = re.search(r'Delivery:\s*£(\d{1,3}\.\d{2})', page_text)
        if delivery_match:
            price_val = float(delivery_match.group(1))
            result['price'] = price_val
            logger.info(f"A to Z Catering: Parsed delivery price £{price_val} via regex")
            # extract title
            title_el = soup.select_one('h1')
            if title_el:
                result['title'] = title_el.get_text(strip=True)
            return result

        # 1) Delivery-specific selectors
        for selector in ['.delivery-price', '.price-delivery']:
            try:
                elements = soup.select(selector)
                for element in elements:
                    text = element.get_text(strip=True)
                    price = self._parse_uk_price(text, prefer_delivery=True)
                    if price is not None:
                        result['price'] = price
                        logger.info(f"A to Z Catering: Found delivery price £{price} from {selector}")
                        return result
            except Exception as e:
                logger.debug(f"Error with A to Z delivery selector {selector}: {e}")

        # 2) Main offer selector (fallback to collection price)
        for selector in ['.my-price.price-offer']:
            try:
                elements = soup.select(selector)
                for element in elements:
                    text = element.get_text(strip=True)
                    price = self._parse_uk_price(text)
                    if price is not None:
                        result['price'] = price
                        logger.info(f"A to Z Catering: Found collection price £{price} from {selector}")
                        return result
            except Exception as e:
                logger.debug(f"Error with A to Z main selector {selector}: {e}")

        # 3) Fallback general selectors
        for selector in ['.price', '.product-price']:
            try:
                elements = soup.select(selector)
                for element in elements:
                    text = element.get_text(strip=True)
                    price = self._parse_uk_price(text)
                    if price is not None:
                        result['price'] = price
                        logger.info(f"A to Z Catering: Fallback parsed price £{price} from {selector}")
                        return result
            except Exception as e:
                logger.debug(f"Error with A to Z fallback selector {selector}: {e}")

        # Extract title
        title_selectors = [
@@ -197,7 +361,7 @@ class UKCateringScraper(PriceScraper):
        return result

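The A to Z path is ordered deliberately: the page-text regex wins outright, delivery-specific selectors come next, and collection/general selectors are last resorts. A small sketch of the first step, with a made-up page snippet (note the regex here is case-sensitive, unlike the JJ Food Service one):

    import re

    page_text = "Delivery: £5.19 Collection: £4.62"  # hypothetical A to Z page text
    match = re.search(r'Delivery:\s*£(\d{1,3}\.\d{2})', page_text)
    if match:
        print(float(match.group(1)))  # 5.19 - delivery is preferred over collection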
    def _extract_amazon_uk_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from Amazon UK with enhanced special pricing detection."""
        result = {
            'price': None,
            'title': None,
@@ -205,6 +369,15 @@ class UKCateringScraper(PriceScraper):
            'currency': 'GBP'
        }

        # First, check for special offer prices using enhanced detection
        special_prices = self._find_special_offer_prices(soup, 'amazon_uk')
        if special_prices:
            # Use the lowest special offer price found
            best_special_price = min(price for price, _ in special_prices)
            result['price'] = best_special_price
            logger.info(f"Successfully scraped amazon_uk special offer price: £{best_special_price}")
            return result

        # Amazon UK price selectors
        price_selectors = [
            '.a-price-whole',
@@ -222,7 +395,7 @@ class UKCateringScraper(PriceScraper):
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_uk_price(price_text, detect_special_offers=True, element=element)
                    if price is not None:
                        result['price'] = price
                        break
@@ -269,6 +442,122 @@ class UKCateringScraper(PriceScraper):

        return result

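Amazon renders its visible price in fragments but keeps a complete, screen-reader-friendly copy in the `.a-offscreen` span, which is why those selectors recur above. A self-contained sketch with simplified markup (the HTML here is illustrative, not a real Amazon page):

    from bs4 import BeautifulSoup

    html = '<span class="a-price"><span class="a-offscreen">£12.49</span></span>'
    soup = BeautifulSoup(html, 'html.parser')
    element = soup.select_one('.a-price .a-offscreen')
    print(element.get_text(strip=True))  # £12.49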
    def _extract_generic_data(self, soup: BeautifulSoup, site_name: str) -> Dict[str, Any]:
        """Generic data extraction for UK sites not specifically implemented."""
        result = {
            'price': None,
            'title': None,
            'availability': True,
            'currency': 'GBP'
        }

        # Generic price selectors
        price_selectors = [
            '.price',
            '.product-price',
            '[data-testid="price"]',
            '.price-value',
            '.current-price',
            'span:contains("£")',
            '.cost',
            '.selling-price'
        ]

        for selector in price_selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_uk_price(price_text)
                    if price is not None:
                        result['price'] = price
                        logger.info(f"Successfully scraped {site_name} generic price: £{price}")
                        break
                if result['price'] is not None:
                    break
            except Exception as e:
                logger.debug(f"Error with generic price selector {selector}: {e}")

        # Generic title selectors
        title_selectors = [
            'h1',
            '.product-title',
            '.product-name',
            '[data-testid="product-title"]',
            'title'
        ]

        for selector in title_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    result['title'] = element.get_text(strip=True)
                    break
            except Exception as e:
                logger.debug(f"Error with generic title selector {selector}: {e}")

        return result

    async def scrape_product_price(self, url: str, site_name: str = None) -> Dict[str, Any]:
        """Scrape price for a single product from a URL using UK-specific logic."""
        result = {
            'success': False,
            'price': None,
            'currency': 'GBP',
            'title': None,
            'availability': None,
            'url': url,
            'error': None
        }

        try:
            # Validate that this is a supported UK site
            if site_name not in ['jjfoodservice', 'atoz_catering', 'amazon_uk']:
                result['error'] = f"Unsupported site for UK scraper: {site_name}"
                return result

            # Check if site is enabled
            if not self.config.is_site_enabled(site_name):
                result['error'] = f"Site {site_name} is disabled"
                return result

            # Fetch page content
            html_content = await self._fetch_page(url)
            if not html_content:
                result['error'] = "Failed to fetch page content"
                return result

            # Parse HTML
            soup = BeautifulSoup(html_content, 'html.parser')

            # Route to appropriate extraction method
            if site_name == 'jjfoodservice':
                extracted_data = self._extract_jjfoodservice_data(soup)
            elif site_name == 'atoz_catering':
                extracted_data = self._extract_atoz_catering_data(soup)
            elif site_name == 'amazon_uk':
                extracted_data = self._extract_amazon_uk_data(soup)
            else:
                # Fallback to generic extraction
                extracted_data = self._extract_generic_data(soup, site_name)

            if extracted_data['price'] is not None:
                result.update({
                    'success': True,
                    'price': extracted_data['price'],
                    'title': extracted_data.get('title'),
                    'availability': extracted_data.get('availability')
                })
                logger.info(f"Successfully scraped {site_name}: £{extracted_data['price']}")
            else:
                result['error'] = "Could not extract price from page"

        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")
            result['error'] = str(e)

        return result

    async def scrape_product(self, product_data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
        """Scrape prices for a product from all configured sites."""
        results = {}
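Assuming the async context-manager usage that the test scripts later in this commit rely on, a call to the rewritten entry point above might look like this (the URL and Config come from those tests; the exact result values depend on the live page):

    import asyncio
    from src.uk_scraper import UKCateringScraper
    from src.config import Config

    async def main():
        config = Config()
        async with UKCateringScraper(config) as scraper:
            result = await scraper.scrape_product_price(
                "https://www.jjfoodservice.com/product/London-Enfield/BAC002/",
                site_name='jjfoodservice'
            )
            print(result)  # e.g. {'success': True, 'price': 11.79, ...} on a good run

    asyncio.run(main())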
@@ -1,515 +0,0 @@
"""
Specialized scrapers for UK catering supply sites
"""

import re
import logging
from typing import Dict, Any, Optional
from bs4 import BeautifulSoup
from .scraper import PriceScraper

logger = logging.getLogger(__name__)


class UKCateringScraper(PriceScraper):
    """Specialized scraper for UK catering supply websites."""

    def _parse_uk_price(self, price_text: str) -> Optional[float]:
        """Parse UK price format with £ symbol."""
        if not price_text:
            return None

        # Remove common text and normalize
        price_text = price_text.lower()
        price_text = re.sub(r'delivery:|collection:|was:|now:|offer:|from:', '', price_text)

        # Find price with £ symbol
        price_match = re.search(r'£(\d+\.?\d*)', price_text)
        if price_match:
            try:
                return float(price_match.group(1))
            except ValueError:
                pass

        # Try without £ symbol but with decimal
        price_match = re.search(r'(\d+\.\d{2})', price_text)
        if price_match:
            try:
                return float(price_match.group(1))
            except ValueError:
                pass

        return None

    def _extract_jjfoodservice_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from JJ Food Service."""
        result = {
            'price': None,
            'title': None,
            'availability': True,
            'currency': 'GBP'
        }

        # Try multiple selectors for price
        price_selectors = [
            '.price',
            '.product-price',
            '[data-testid="price"]',
            '.price-value',
            '.current-price',
            '.product-card-price',
            'span:contains("£")',
            '.cost'
        ]

        for selector in price_selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_uk_price(price_text)
                    if price is not None:
                        result['price'] = price
                        break
                if result['price'] is not None:
                    break
            except Exception as e:
                logger.debug(f"Error with JJ Food Service price selector {selector}: {e}")

        # Try to extract title
        title_selectors = [
            'h1',
            '.product-title',
            '.product-name',
            '[data-testid="product-title"]',
            '.product-card-title',
            'title'
        ]

        for selector in title_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    result['title'] = element.get_text(strip=True)
                    break
            except Exception as e:
                logger.debug(f"Error with JJ Food Service title selector {selector}: {e}")

        # Check availability
        availability_indicators = [
            'out of stock',
            'unavailable',
            'not available',
            'sold out'
        ]

        page_text = soup.get_text().lower()
        for indicator in availability_indicators:
            if indicator in page_text:
                result['availability'] = False
                break

        return result

    def _extract_atoz_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from A to Z Catering."""
        result = {
            'price': None,
            'title': None,
            'availability': True,
            'currency': 'GBP'
        }

        # A to Z Catering shows prices like "Delivery:£X.XX Collection:£Y.YY"
        # We'll prioritize the lower price (usually collection)

        price_text = soup.get_text()

        # Look for delivery and collection prices
        delivery_match = re.search(r'delivery:?\s*£(\d+\.?\d*)', price_text, re.IGNORECASE)
        collection_match = re.search(r'collection:?\s*£(\d+\.?\d*)', price_text, re.IGNORECASE)

        prices = []
        if delivery_match:
            try:
                prices.append(float(delivery_match.group(1)))
            except ValueError:
                pass

        if collection_match:
            try:
                prices.append(float(collection_match.group(1)))
            except ValueError:
                pass

        # If we found prices, use the lowest one
        if prices:
            result['price'] = min(prices)
        else:
            # Fallback to general price extraction
            price_selectors = [
                '.price',
                '.product-price',
                'span:contains("£")',
                '.price-value'
            ]

            for selector in price_selectors:
                try:
                    elements = soup.select(selector)
                    for element in elements:
                        price_text = element.get_text(strip=True)
                        price = self._parse_uk_price(price_text)
                        if price is not None:
                            result['price'] = price
                            break
                    if result['price'] is not None:
                        break
                except Exception as e:
                    logger.debug(f"Error with A to Z price selector {selector}: {e}")

        # Extract title - A to Z often has product names in links
        title_selectors = [
            'h1',
            '.product-title',
            '.product-name',
            'a[href*="/products/product/"]',
            '.product-link',
            'title'
        ]

        for selector in title_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    title = element.get_text(strip=True)
                    # Clean up the title
                    if len(title) > 5 and 'A to Z' not in title:
                        result['title'] = title
                        break
            except Exception as e:
                logger.debug(f"Error with A to Z title selector {selector}: {e}")

        # Check availability - look for "Add To Basket" button
        add_to_basket = soup.find(text=re.compile('Add To Basket', re.IGNORECASE))
        if not add_to_basket:
            # Also check for out of stock indicators
            out_of_stock_indicators = [
                'out of stock',
                'unavailable',
                'not available',
                'sold out'
            ]

            page_text = soup.get_text().lower()
            for indicator in out_of_stock_indicators:
                if indicator in page_text:
                    result['availability'] = False
                    break

        return result

    def _extract_amazon_uk_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from Amazon UK."""
        result = {
            'price': None,
            'title': None,
            'availability': True,
            'currency': 'GBP'
        }

        # Amazon UK price selectors
        price_selectors = [
            '.a-price-whole',
            '.a-price .a-offscreen',
            '.a-price-current .a-offscreen',
            '#priceblock_dealprice',
            '#priceblock_ourprice',
            '.a-price-range',
            '.a-price.a-text-price.a-size-medium.apexPriceToPay .a-offscreen'
        ]

        for selector in price_selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_uk_price(price_text)
                    if price is not None:
                        result['price'] = price
                        break
                if result['price'] is not None:
                    break
            except Exception as e:
                logger.debug(f"Error with Amazon UK price selector {selector}: {e}")

        # Extract title
        title_selectors = [
            '#productTitle',
            '.product-title',
            'h1.a-size-large'
        ]

        for selector in title_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    result['title'] = element.get_text(strip=True)
                    break
            except Exception as e:
                logger.debug(f"Error with Amazon UK title selector {selector}: {e}")

        # Check availability
        availability_text = soup.get_text().lower()
        if any(phrase in availability_text for phrase in ['out of stock', 'currently unavailable', 'not available']):
            result['availability'] = False

        return result

    def _extract_tesco_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from Tesco."""
        result = {
            'price': None,
            'title': None,
            'availability': True,
            'currency': 'GBP'
        }

        # Tesco price selectors
        price_selectors = [
            '.price-control-wrapper .value',
            '.price-per-sellable-unit .value',
            '.price-per-quantity-weight .value',
            '[data-testid="price-current-value"]',
            '.price-current',
            '.product-price .price'
        ]

        for selector in price_selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_uk_price(price_text)
                    if price is not None:
                        result['price'] = price
                        break
                if result['price'] is not None:
                    break
            except Exception as e:
                logger.debug(f"Error with Tesco price selector {selector}: {e}")

        # Extract title
        title_selectors = [
            'h1[data-testid="product-title"]',
            '.product-details-tile h1',
            '.product-title',
            'h1.product-name'
        ]

        for selector in title_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    result['title'] = element.get_text(strip=True)
                    break
            except Exception as e:
                logger.debug(f"Error with Tesco title selector {selector}: {e}")

        return result

    def _extract_sainsburys_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from Sainsburys."""
        result = {
            'price': None,
            'title': None,
            'availability': True,
            'currency': 'GBP'
        }

        # Sainsburys price selectors
        price_selectors = [
            '.pd__cost__current-price',
            '.pd__cost .pd__cost__retail-price',
            '.pricing__now-price',
            '.product-price__current',
            '[data-testid="pd-retail-price"]',
            '.price-per-unit'
        ]

        for selector in price_selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_uk_price(price_text)
                    if price is not None:
                        result['price'] = price
                        break
                if result['price'] is not None:
                    break
            except Exception as e:
                logger.debug(f"Error with Sainsburys price selector {selector}: {e}")

        # Extract title
        title_selectors = [
            '.pd__header h1',
            'h1[data-testid="pd-product-name"]',
            '.product-name',
            '.pd__product-name'
        ]

        for selector in title_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    result['title'] = element.get_text(strip=True)
                    break
            except Exception as e:
                logger.debug(f"Error with Sainsburys title selector {selector}: {e}")

        return result

    def _extract_booker_data(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Extract data specifically from Booker."""
        result = {
            'price': None,
            'title': None,
            'availability': True,
            'currency': 'GBP'
        }

        # Booker price selectors
        price_selectors = [
            '.price',
            '.product-price',
            '.price-current',
            '.selling-price',
            '[data-testid="price"]',
            '.product-tile-price'
        ]

        for selector in price_selectors:
            try:
                elements = soup.select(selector)
                for element in elements:
                    price_text = element.get_text(strip=True)
                    price = self._parse_uk_price(price_text)
                    if price is not None:
                        result['price'] = price
                        break
                if result['price'] is not None:
                    break
            except Exception as e:
                logger.debug(f"Error with Booker price selector {selector}: {e}")

        # Extract title
        title_selectors = [
            'h1',
            '.product-title',
            '.product-name',
            '.product-description h1',
            '[data-testid="product-title"]'
        ]

        for selector in title_selectors:
            try:
                element = soup.select_one(selector)
                if element:
                    result['title'] = element.get_text(strip=True)
                    break
            except Exception as e:
                logger.debug(f"Error with Booker title selector {selector}: {e}")

        return result

    async def scrape_product_price(self, url: str, site_name: str = None) -> Dict[str, Any]:
        """Enhanced scraping for UK catering sites."""
        result = {
            'success': False,
            'price': None,
            'currency': 'GBP',
            'title': None,
            'availability': None,
            'url': url,
            'error': None
        }

        try:
            # Auto-detect site if not provided
            if not site_name:
                site_name = self._detect_site(url)
                if not site_name:
                    result['error'] = "Could not detect site from URL"
                    return result

            # Check if site is enabled
            if not self.config.is_site_enabled(site_name):
                result['error'] = f"Site {site_name} is disabled"
                return result

            # Fetch page content
            html_content = await self._fetch_page(url)
            if not html_content:
                result['error'] = "Failed to fetch page content"
                return result

            # Parse HTML
            soup = BeautifulSoup(html_content, 'html.parser')

            # Use specialized extraction based on site
            if site_name == 'jjfoodservice':
                extracted_data = self._extract_jjfoodservice_data(soup)
            elif site_name == 'atoz_catering':
                extracted_data = self._extract_atoz_data(soup)
            elif site_name == 'amazon_uk':
                extracted_data = self._extract_amazon_uk_data(soup)
            elif site_name == 'tesco':
                extracted_data = self._extract_tesco_data(soup)
            elif site_name == 'sainsburys':
                extracted_data = self._extract_sainsburys_data(soup)
            elif site_name == 'booker':
                extracted_data = self._extract_booker_data(soup)
            else:
                # Fall back to general extraction
                return await super().scrape_product_price(url, site_name)

            if extracted_data['price'] is None:
                result['error'] = "Could not extract price from page"
                return result

            result.update({
                'success': True,
                'price': extracted_data['price'],
                'currency': extracted_data.get('currency', 'GBP'),
                'title': extracted_data.get('title'),
                'availability': extracted_data.get('availability', True)
            })

            logger.info(f"Successfully scraped {site_name}: £{extracted_data['price']}")

        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")
            result['error'] = str(e)

        return result

    def _detect_site(self, url: str) -> Optional[str]:
        """Detect which UK catering site this URL belongs to."""
        url_lower = url.lower()

        if 'jjfoodservice.com' in url_lower:
            return 'jjfoodservice'
        elif 'atoz-catering.co.uk' in url_lower:
            return 'atoz_catering'
        elif 'amazon.co.uk' in url_lower:
            return 'amazon_uk'
        elif 'tesco.com' in url_lower:
            return 'tesco'
        elif 'sainsburys.co.uk' in url_lower:
            return 'sainsburys'
        elif 'booker.co.uk' in url_lower:
            return 'booker'

        # Fall back to parent detection for other sites
        return super()._detect_site(url)
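For comparison with the new delivery-aware parsing, the removed `_parse_uk_price` above simply strips the labels and returns the first price it finds, so it cannot tell delivery from collection. A condensed sketch of that behaviour:

    import re

    def parse_uk_price(price_text):
        # Condensed version of the removed method
        price_text = re.sub(r'delivery:|collection:|was:|now:|offer:|from:', '', price_text.lower())
        match = re.search(r'£(\d+\.?\d*)', price_text)
        return float(match.group(1)) if match else None

    print(parse_uk_price("Collection:£10.49 Delivery:£11.79"))  # 10.49 - collection wins by position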
@@ -268,4 +268,70 @@ def create_app():
        fig = go.Figure(data=traces, layout=layout)
        return json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)

    @app.route('/edit_product/<int:product_id>', methods=['GET', 'POST'])
    def edit_product(product_id):
        """Edit an existing product."""
        product = db_manager.get_product(product_id)
        if not product:
            flash('Product not found.', 'error')
            return redirect(url_for('index'))

        form = ProductForm()

        if form.validate_on_submit():
            urls = {}
            if form.jjfoodservice_url.data:
                urls['jjfoodservice'] = form.jjfoodservice_url.data
            if form.atoz_catering_url.data:
                urls['atoz_catering'] = form.atoz_catering_url.data
            if form.amazon_uk_url.data:
                urls['amazon_uk'] = form.amazon_uk_url.data

            if not urls:
                flash('Please provide at least one URL to track.', 'error')
                return render_template('edit_product.html', form=form, product=product)

            try:
                db_manager.update_product(
                    product_id=product_id,
                    name=form.name.data,
                    description=form.description.data,
                    target_price=form.target_price.data,
                    urls=urls
                )
                flash(f'Product "{form.name.data}" updated successfully!', 'success')
                return redirect(url_for('product_detail', product_id=product_id))
            except Exception as e:
                flash(f'Error updating product: {str(e)}', 'error')

        # Pre-populate form with existing data
        if request.method == 'GET':
            form.name.data = product['name']
            form.description.data = product['description']
            form.target_price.data = product['target_price']

            # URLs are already parsed as a dictionary by the database method
            urls = product['urls'] if product['urls'] else {}
            form.jjfoodservice_url.data = urls.get('jjfoodservice', '')
            form.atoz_catering_url.data = urls.get('atoz_catering', '')
            form.amazon_uk_url.data = urls.get('amazon_uk', '')

        return render_template('edit_product.html', form=form, product=product)

    @app.route('/delete_product/<int:product_id>', methods=['POST'])
    def delete_product(product_id):
        """Delete a product."""
        product = db_manager.get_product(product_id)
        if not product:
            flash('Product not found.', 'error')
            return redirect(url_for('index'))

        try:
            db_manager.delete_product(product_id)
            flash(f'Product "{product["name"]}" deleted successfully!', 'success')
        except Exception as e:
            flash(f'Error deleting product: {str(e)}', 'error')

        return redirect(url_for('index'))

    return app
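A quick way to exercise the two new routes is Flask's built-in test client. A minimal sketch, assuming the factory is importable as `create_app` from an `app` module (the module name is an assumption, the routes are from the diff):

    from app import create_app  # assumed module name

    app = create_app()
    client = app.test_client()

    # Deleting requires POST; a GET would return 405
    resp = client.post('/delete_product/1', follow_redirects=True)
    print(resp.status_code)  # 200 after the redirect back to the index page

    # The edit form is served on GET
    resp = client.get('/edit_product/1')
    print(resp.status_code)  # 200 if product 1 exists, a redirect otherwise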
@@ -123,7 +123,8 @@
<ul class="mb-0 mt-2">
    <li>Make sure URLs point to the specific product page</li>
    <li>Test URLs in your browser first to ensure they work</li>
    <li>The system will automatically prioritize <strong>delivery prices</strong> over collection prices</li>
    <li>For JJ Food Service and A to Z Catering, ensure you can see delivery pricing on the page</li>
    <li>For best results, use direct product page URLs</li>
</ul>
</div>
@@ -154,13 +155,15 @@
<h6 class="fw-bold">JJ Food Service</h6>
<p class="small text-muted">
    Navigate to the specific product page on JJ Food Service and copy the URL.
    Make sure you're logged in for accurate pricing. The system will automatically
    prioritize <strong>delivery prices</strong> over collection prices.
</p>

<h6 class="fw-bold">A to Z Catering</h6>
<p class="small text-muted">
    Go to the product page on A to Z Catering and copy the URL.
    URLs typically contain "/products/product/" followed by the product name.
    The system will automatically capture <strong>delivery pricing</strong> when available.
</p>
</div>
<div class="col-md-6">
@@ -170,10 +173,11 @@
    The URL should contain "/dp/" followed by the product identifier.
</p>

<h6 class="fw-bold text-success">Delivery Pricing Priority</h6>
<p class="small text-muted">
    For JJ Food Service and A to Z Catering, the system automatically prioritizes
    delivery prices over collection prices. This ensures you're tracking the
    most relevant pricing for delivered goods to your business.
</p>
</div>
</div>
190
templates/edit_product.html
Normal file
@@ -0,0 +1,190 @@
{% extends "base.html" %}

{% block title %}Edit Product - Price Tracker{% endblock %}

{% block content %}
<div class="row justify-content-center">
    <div class="col-lg-8">
        <div class="card">
            <div class="card-header">
                <h2 class="mb-0">
                    <i class="fas fa-edit me-2 text-primary"></i>Edit Product: {{ product.name }}
                </h2>
            </div>
            <div class="card-body">
                <form method="POST">
                    {{ form.hidden_tag() }}

                    <div class="row">
                        <div class="col-md-8 mb-3">
                            {{ form.name.label(class="form-label fw-bold") }}
                            {{ form.name(class="form-control form-control-lg") }}
                            {% if form.name.errors %}
                            <div class="text-danger small mt-1">
                                {% for error in form.name.errors %}
                                <div>{{ error }}</div>
                                {% endfor %}
                            </div>
                            {% endif %}
                        </div>
                        <div class="col-md-4 mb-3">
                            {{ form.target_price.label(class="form-label fw-bold") }}
                            <div class="input-group">
                                <span class="input-group-text">£</span>
                                {{ form.target_price(class="form-control form-control-lg") }}
                            </div>
                            {% if form.target_price.errors %}
                            <div class="text-danger small mt-1">
                                {% for error in form.target_price.errors %}
                                <div>{{ error }}</div>
                                {% endfor %}
                            </div>
                            {% endif %}
                            <small class="text-muted">Optional: Alert when price drops below this</small>
                        </div>
                    </div>

                    <div class="mb-3">
                        {{ form.description.label(class="form-label fw-bold") }}
                        {{ form.description(class="form-control", rows="3") }}
                        {% if form.description.errors %}
                        <div class="text-danger small mt-1">
                            {% for error in form.description.errors %}
                            <div>{{ error }}</div>
                            {% endfor %}
                        </div>
                        {% endif %}
                    </div>

                    <hr class="my-4">
                    <h5 class="mb-3">
                        <i class="fas fa-link me-2 text-secondary"></i>Store URLs
                    </h5>
                    <p class="text-muted small mb-3">Add URLs from the stores you want to track. At least one URL is required.</p>

                    <div class="row">
                        <div class="col-md-6 mb-3">
                            {{ form.jjfoodservice_url.label(class="form-label fw-bold") }}
                            <div class="input-group">
                                <span class="input-group-text">
                                    <i class="fas fa-store text-primary"></i>
                                </span>
                                {{ form.jjfoodservice_url(class="form-control", placeholder="https://www.jjfoodservice.com/...") }}
                            </div>
                            {% if form.jjfoodservice_url.errors %}
                            <div class="text-danger small mt-1">
                                {% for error in form.jjfoodservice_url.errors %}
                                <div>{{ error }}</div>
                                {% endfor %}
                            </div>
                            {% endif %}
                        </div>

                        <div class="col-md-6 mb-3">
                            {{ form.atoz_catering_url.label(class="form-label fw-bold") }}
                            <div class="input-group">
                                <span class="input-group-text">
                                    <i class="fas fa-store text-success"></i>
                                </span>
                                {{ form.atoz_catering_url(class="form-control", placeholder="https://www.atoz-catering.co.uk/...") }}
                            </div>
                            {% if form.atoz_catering_url.errors %}
                            <div class="text-danger small mt-1">
                                {% for error in form.atoz_catering_url.errors %}
                                <div>{{ error }}</div>
                                {% endfor %}
                            </div>
                            {% endif %}
                        </div>
                    </div>

                    <div class="row">
                        <div class="col-md-6 mb-3">
                            {{ form.amazon_uk_url.label(class="form-label fw-bold") }}
                            <div class="input-group">
                                <span class="input-group-text">
                                    <i class="fab fa-amazon text-warning"></i>
                                </span>
                                {{ form.amazon_uk_url(class="form-control", placeholder="https://www.amazon.co.uk/...") }}
                            </div>
                            {% if form.amazon_uk_url.errors %}
                            <div class="text-danger small mt-1">
                                {% for error in form.amazon_uk_url.errors %}
                                <div>{{ error }}</div>
                                {% endfor %}
                            </div>
                            {% endif %}
                        </div>
                    </div>

                    <hr class="my-4">

                    <div class="d-flex justify-content-between">
                        <div>
                            <button type="submit" class="btn btn-primary btn-lg me-3">
                                <i class="fas fa-save me-2"></i>Update Product
                            </button>
                            <a href="{{ url_for('product_detail', product_id=product.id) }}" class="btn btn-outline-secondary btn-lg">
                                <i class="fas fa-arrow-left me-2"></i>Cancel
                            </a>
                        </div>

                        <!-- Delete button -->
                        <div>
                            <button type="button" class="btn btn-outline-danger btn-lg" data-bs-toggle="modal" data-bs-target="#deleteModal">
                                <i class="fas fa-trash me-2"></i>Delete Product
                            </button>
                        </div>
                    </div>
                </form>

                <!-- Help section -->
                <div class="mt-5">
                    <div class="card bg-light">
                        <div class="card-body">
                            <h6 class="card-title">
                                <i class="fas fa-info-circle me-2 text-info"></i>How to find product URLs
                            </h6>
                            <ul class="card-text small mb-0">
                                <li><strong>JJ Food Service:</strong> Search for your product and copy the URL from the product page</li>
                                <li><strong>A to Z Catering:</strong> Navigate to the specific product and copy the URL</li>
                                <li><strong>Amazon UK:</strong> Find the product and copy the URL (we'll extract the essential part)</li>
                            </ul>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>

<!-- Delete Confirmation Modal -->
<div class="modal fade" id="deleteModal" tabindex="-1" aria-labelledby="deleteModalLabel" aria-hidden="true">
    <div class="modal-dialog">
        <div class="modal-content">
            <div class="modal-header">
                <h5 class="modal-title" id="deleteModalLabel">
                    <i class="fas fa-exclamation-triangle me-2 text-warning"></i>Confirm Delete
                </h5>
                <button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
            </div>
            <div class="modal-body">
                <p>Are you sure you want to delete <strong>"{{ product.name }}"</strong>?</p>
                <div class="alert alert-warning">
                    <i class="fas fa-warning me-2"></i>
                    <strong>Warning:</strong> This action cannot be undone. All price history for this product will be permanently deleted.
                </div>
            </div>
            <div class="modal-footer">
                <button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Cancel</button>
                <form method="POST" action="{{ url_for('delete_product', product_id=product.id) }}" style="display: inline;">
                    <button type="submit" class="btn btn-danger">
                        <i class="fas fa-trash me-2"></i>Delete Product
                    </button>
                </form>
            </div>
        </div>
    </div>
</div>

{% endblock %}
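The template above binds six form fields whose WTForms definitions are not shown in this diff. A definition consistent with the template would look roughly like the sketch below; the field names are taken from the template, while the validators and labels are assumptions:

    from flask_wtf import FlaskForm
    from wtforms import StringField, TextAreaField, DecimalField
    from wtforms.validators import DataRequired, Optional, URL

    class ProductForm(FlaskForm):
        # Field names match those referenced in edit_product.html
        name = StringField('Product Name', validators=[DataRequired()])
        description = TextAreaField('Description', validators=[Optional()])
        target_price = DecimalField('Target Price', validators=[Optional()])
        jjfoodservice_url = StringField('JJ Food Service URL', validators=[Optional(), URL()])
        atoz_catering_url = StringField('A to Z Catering URL', validators=[Optional(), URL()])
        amazon_uk_url = StringField('Amazon UK URL', validators=[Optional(), URL()])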
@@ -97,6 +97,16 @@
<i class="fas fa-sync-alt me-1"></i>Scrape Now
</button>
</div>
<div class="btn-group" role="group">
    <a href="{{ url_for('edit_product', product_id=product.id) }}" class="btn btn-outline-secondary">
        <i class="fas fa-edit me-1"></i>Edit
    </a>
    <button class="btn btn-outline-danger delete-product-btn"
            data-product-id="{{ product.id }}"
            data-product-name="{{ product.name }}">
        <i class="fas fa-trash me-1"></i>Delete
    </button>
</div>
</div>
</div>

@@ -181,4 +191,58 @@
</div>
</div>
{% endif %}

<!-- Delete Confirmation Modal -->
<div class="modal fade" id="deleteModal" tabindex="-1" aria-labelledby="deleteModalLabel" aria-hidden="true">
    <div class="modal-dialog">
        <div class="modal-content">
            <div class="modal-header">
                <h5 class="modal-title" id="deleteModalLabel">
                    <i class="fas fa-exclamation-triangle me-2 text-warning"></i>Confirm Delete
                </h5>
                <button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
            </div>
            <div class="modal-body">
                <p>Are you sure you want to delete <strong id="deleteProductName"></strong>?</p>
                <div class="alert alert-warning">
                    <i class="fas fa-warning me-2"></i>
                    <strong>Warning:</strong> This action cannot be undone. All price history for this product will be permanently deleted.
                </div>
            </div>
            <div class="modal-footer">
                <button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Cancel</button>
                <form id="deleteForm" method="POST" style="display: inline;">
                    <button type="submit" class="btn btn-danger">
                        <i class="fas fa-trash me-2"></i>Delete Product
                    </button>
                </form>
            </div>
        </div>
    </div>
</div>

<script>
    // Handle delete product buttons
    document.addEventListener('DOMContentLoaded', function() {
        const deleteButtons = document.querySelectorAll('.delete-product-btn');
        const deleteModal = document.getElementById('deleteModal');
        const deleteForm = document.getElementById('deleteForm');
        const deleteProductName = document.getElementById('deleteProductName');

        deleteButtons.forEach(button => {
            button.addEventListener('click', function() {
                const productId = this.getAttribute('data-product-id');
                const productName = this.getAttribute('data-product-name');

                // Update modal content
                deleteProductName.textContent = productName;
                deleteForm.action = `/delete_product/${productId}`;

                // Show modal
                const modal = new bootstrap.Modal(deleteModal);
                modal.show();
            });
        });
    });
</script>
{% endblock %}
@@ -14,6 +14,16 @@
<button class="btn btn-success me-2" onclick="scrapeProduct({{ product.id }})">
    <i class="fas fa-sync-alt me-1"></i>Scrape Now
</button>
<a href="{{ url_for('edit_product', product_id=product.id) }}" class="btn btn-outline-primary me-2">
    <i class="fas fa-edit me-1"></i>Edit
</a>
<button class="btn btn-outline-danger me-2 delete-product-btn"
        data-product-id="{{ product.id }}"
        data-product-name="{{ product.name }}"
        data-bs-toggle="modal"
        data-bs-target="#deleteModal">
    <i class="fas fa-trash me-1"></i>Delete
</button>
<a href="{{ url_for('index') }}" class="btn btn-outline-secondary">
    <i class="fas fa-arrow-left me-1"></i>Back to Dashboard
</a>
@@ -222,6 +232,35 @@
{% endif %}
</div>
</div>

<!-- Delete Confirmation Modal -->
<div class="modal fade" id="deleteModal" tabindex="-1" aria-labelledby="deleteModalLabel" aria-hidden="true">
    <div class="modal-dialog">
        <div class="modal-content">
            <div class="modal-header">
                <h5 class="modal-title" id="deleteModalLabel">
                    <i class="fas fa-exclamation-triangle me-2 text-warning"></i>Confirm Delete
                </h5>
                <button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
            </div>
            <div class="modal-body">
                <p>Are you sure you want to delete <strong>"{{ product.name }}"</strong>?</p>
                <div class="alert alert-warning">
                    <i class="fas fa-warning me-2"></i>
                    <strong>Warning:</strong> This action cannot be undone. All price history for this product will be permanently deleted.
                </div>
            </div>
            <div class="modal-footer">
                <button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Cancel</button>
                <form method="POST" action="{{ url_for('delete_product', product_id=product.id) }}" style="display: inline;">
                    <button type="submit" class="btn btn-danger">
                        <i class="fas fa-trash me-2"></i>Delete Product
                    </button>
                </form>
            </div>
        </div>
    </div>
</div>
{% endblock %}

{% block scripts %}
@@ -231,4 +270,20 @@
    Plotly.newPlot('priceChart', chartData.data, chartData.layout, {responsive: true});
</script>
{% endif %}

<script>
    // Handle delete product button
    document.addEventListener('DOMContentLoaded', function() {
        const deleteButton = document.querySelector('.delete-product-btn');
        const deleteModal = document.getElementById('deleteModal');

        if (deleteButton) {
            deleteButton.addEventListener('click', function() {
                // Show modal
                const modal = new bootstrap.Modal(deleteModal);
                modal.show();
            });
        }
    });
</script>
{% endblock %}
35
test_actual_scraper.py
Normal file
@@ -0,0 +1,35 @@
#!/usr/bin/env python3
import asyncio
import sys
import os

# Add the src directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from uk_scraper import scrape_jj_foodservice

async def test_actual_scraper():
    url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/"

    print(f"Testing actual scraper with URL: {url}")
    print("=" * 60)

    try:
        result = await scrape_jj_foodservice(url)
        print(f"Scraper result: {result}")

        if result:
            print(f"✅ Name: {result.get('name', 'Not found')}")
            print(f"✅ Collection Price: £{result.get('collection_price', 'Not found')}")
            print(f"✅ Delivery Price: £{result.get('delivery_price', 'Not found')}")
            print(f"✅ Image URL: {result.get('image_url', 'Not found')}")
        else:
            print("❌ Scraper returned None")

    except Exception as e:
        print(f"❌ Error occurred: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    asyncio.run(test_actual_scraper())
53
test_jj_detailed.py
Normal file
53
test_jj_detailed.py
Normal file
@@ -0,0 +1,53 @@
#!/usr/bin/env python3
import asyncio
import aiohttp
import re
from bs4 import BeautifulSoup


async def test_jj_patterns():
    url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/"

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            html = await response.text()

    print(f"HTML content length: {len(html)}")

    # Look for various keywords
    keywords = ['DELIVERY', 'delivery', 'COLLECTION', 'collection', '£10.49', '£11.79', '10.49', '11.79']

    for keyword in keywords:
        if keyword in html:
            print(f"'{keyword}' FOUND in HTML")
            # Find context around the keyword
            index = html.find(keyword)
            start = max(0, index - 100)
            end = min(len(html), index + 100)
            context = html[start:end]
            print(f"Context: ...{context}...")
            print()
        else:
            print(f"'{keyword}' NOT found in HTML")

    # Look for any price-like patterns
    price_patterns = re.findall(r'£?(\d{1,3}\.\d{2})', html)
    print(f"\nAll price patterns found: {price_patterns}")

    # Try to find price elements using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Look for specific CSS classes that might contain prices
    price_selectors = [
        '.price', '.product-price', '.delivery-price', '.price-delivery',
        '[class*="price"]', '[class*="Price"]'
    ]

    for selector in price_selectors:
        elements = soup.select(selector)
        if elements:
            print(f"\nFound elements with selector '{selector}':")
            for elem in elements[:5]:  # Show first 5
                print(f"  - {elem.get_text(strip=True)}")


if __name__ == "__main__":
    asyncio.run(test_jj_patterns())
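Note: the loop above reports only the first occurrence of each keyword. A small stdlib-only helper (a sketch, not part of this commit) can collect a snippet around every occurrence:

def find_contexts(html, keyword, radius=100, limit=5):
    """Collect text snippets around every occurrence of keyword, not just the first."""
    contexts = []
    start = 0
    while len(contexts) < limit:
        idx = html.find(keyword, start)
        if idx == -1:
            break
        contexts.append(html[max(0, idx - radius):idx + radius])
        start = idx + len(keyword)
    return contexts

# made-up sample text, just to exercise the helper
sample = "COLLECTION: £10.49 per case ... DELIVERY: £11.79 per case ... DELIVERY slots"
for snippet in find_contexts(sample, "DELIVERY", radius=20):
    print(f"...{snippet}...")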
54
test_jj_simple.py
Normal file
@@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""
Simple test to debug JJ Food Service scraping
"""

import asyncio
import sys
import os
sys.path.append(os.path.dirname(__file__))

from src.uk_scraper import UKCateringScraper
from src.config import Config
import logging

# Set up verbose logging
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s - %(message)s')


async def test_jj_scraping():
    config = Config()

    async with UKCateringScraper(config) as scraper:
        url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/"

        print(f"Testing URL: {url}")

        # Get the raw HTML content
        html_content = await scraper._fetch_page(url)

        if html_content:
            print(f"HTML content length: {len(html_content)}")
            print("First 500 characters of HTML:")
            print(html_content[:500])
            print("\n" + "="*50 + "\n")

            # Look for delivery text
            if 'DELIVERY' in html_content:
                print("Found 'DELIVERY' in HTML content")
                # Find the context around DELIVERY
                delivery_pos = html_content.find('DELIVERY')
                context = html_content[delivery_pos:delivery_pos+100]
                print(f"Context around DELIVERY: {context}")
            else:
                print("'DELIVERY' not found in HTML content")

            # Look for any price patterns
            import re
            price_matches = re.findall(r'£(\d{1,3}(?:\.\d{2})?)', html_content)
            print(f"All price patterns found: {price_matches}")
        else:
            print("Failed to fetch HTML content")


if __name__ == "__main__":
    asyncio.run(test_jj_scraping())
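Note: this script's price regex makes the pence optional, unlike the stricter pence-required pattern used in test_jj_detailed.py. On promotional copy the loose form also picks up round discount amounts, as this hypothetical comparison shows:

import re

text = "Save £5! Was £25.00 Now £20.00"   # made-up sample string
strict = re.findall(r'£(\d{1,3}\.\d{2})', text)        # pence required
loose = re.findall(r'£(\d{1,3}(?:\.\d{2})?)', text)    # pence optional
print(strict)  # ['25.00', '20.00']
print(loose)   # ['5', '25.00', '20.00'] — the discount amount sneaks in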
51
test_regex_patterns.py
Normal file
@@ -0,0 +1,51 @@
#!/usr/bin/env python3
"""
Test the exact regex patterns against the actual HTML content
"""

import re
import asyncio
import aiohttp
from bs4 import BeautifulSoup


async def test_jj_patterns():
    url = "https://www.jjfoodservice.com/product/London-Enfield/BAC002/"

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            html_content = await response.text()

    soup = BeautifulSoup(html_content, 'html.parser')
    page_text = soup.get_text(separator=' ')

    print(f"Page text length: {len(page_text)}")

    # Find the section with delivery info
    delivery_start = page_text.lower().find('delivery')
    if delivery_start >= 0:
        delivery_section = page_text[delivery_start:delivery_start+200]
        print(f"Delivery section: {delivery_section!r}")

    # Test the exact patterns; note that re.IGNORECASE below already makes
    # the upper/lower-case variants redundant — they are kept for visibility
    delivery_patterns = [
        r'Delivery:£(\d{1,3}\.\d{2})',     # Delivery:£11.79
        r'DELIVERY:£(\d{1,3}\.\d{2})',     # DELIVERY:£11.79
        r'delivery:£(\d{1,3}\.\d{2})',     # delivery:£11.79
        r'DELIVERY:\s*£(\d{1,3}\.\d{2})',  # DELIVERY: £11.79
        r'delivery:\s*£(\d{1,3}\.\d{2})',  # delivery: £11.79
    ]

    for pattern in delivery_patterns:
        match = re.search(pattern, page_text, re.IGNORECASE)
        if match:
            print(f"✅ Pattern '{pattern}' matched! Price: £{match.group(1)}")
            return float(match.group(1))
        else:
            print(f"❌ Pattern '{pattern}' did not match")

    print("No delivery patterns matched!")
    return None


if __name__ == "__main__":
    result = asyncio.run(test_jj_patterns())
    print(f"Final result: {result}")
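Note: since the five variants differ only in case and optional whitespace, a single compiled pattern with re.IGNORECASE would cover them all — a sketch:

import re

# one pattern replaces all five case variants tried above
DELIVERY_RE = re.compile(r'delivery:\s*£(\d{1,3}\.\d{2})', re.IGNORECASE)

for text in ("Delivery:£11.79", "DELIVERY: £11.79", "delivery:  £11.79"):
    m = DELIVERY_RE.search(text)
    print(text, '->', m.group(1) if m else None)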
46
test_scraper.py
Normal file
@@ -0,0 +1,46 @@
#!/usr/bin/env python3
"""
Test script to debug scraping issues for JJ Food Service and A to Z Catering
"""

import sys
import os
import asyncio
sys.path.append(os.path.dirname(__file__))

from src.uk_scraper import UKCateringScraper
from src.config import Config
import logging

# Set up logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


async def test_scraping():
    config = Config()

    async with UKCateringScraper(config) as scraper:
        # Test URLs that were problematic
        test_urls = [
            "https://www.jjfoodservice.com/catering-products/confectionery-and-snacks/chocolate/cadbury-dairy-milk-chocolate-bar-110g",
            "https://www.atozcatering.co.uk/catering-equipment/refrigeration/prep-fridges/polar-single-door-prep-counter-fridge-240ltr",
            "https://www.atozcatering.co.uk/catering-equipment/cooking-equipment/fryers/buffalo-single-tank-induction-fryer-5ltr"
        ]

        for url in test_urls:
            print(f"\n{'='*80}")
            print(f"Testing URL: {url}")
            print(f"{'='*80}")

            try:
                result = await scraper.scrape_product(url)
                if result:
                    print(f"Success! Result: {result}")
                else:
                    print("Failed to scrape product")
            except Exception as e:
                print(f"Error: {e}")
                import traceback
                traceback.print_exc()


if __name__ == "__main__":
    asyncio.run(test_scraping())
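Note: test_scraper.py works through the three URLs sequentially. If UKCateringScraper.scrape_product is safe to call concurrently (an assumption, not verified here), asyncio.gather can batch the calls; the stand-in coroutine below just returns dummy data:

import asyncio

async def scrape_product(url):
    # placeholder standing in for UKCateringScraper.scrape_product
    await asyncio.sleep(0.1)
    return {'url': url, 'price': 9.99}   # dummy data

async def main():
    urls = ['https://example.com/a', 'https://example.com/b', 'https://example.com/c']
    # return_exceptions=True keeps one failing URL from aborting the whole batch
    results = await asyncio.gather(*(scrape_product(u) for u in urls),
                                   return_exceptions=True)
    for url, res in zip(urls, results):
        print(url, '->', res)

asyncio.run(main())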
225
test_special_pricing.py
Normal file
@@ -0,0 +1,225 @@
#!/usr/bin/env python3
"""
Test script for special pricing detection in UK scraper.
This script tests various special pricing scenarios to ensure the enhanced detection works correctly.
"""

import sys
import os
import asyncio
import logging
from bs4 import BeautifulSoup

# Add the src directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from uk_scraper import UKCateringScraper
from config import Config

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def create_test_html_scenarios():
    """Create test HTML scenarios for different special pricing patterns."""

    scenarios = {
        'strikethrough_pricing': """
            <div class="product-price">
                <del>£15.99</del>
                <span class="sale-price">£12.99</span>
            </div>
        """,

        'was_now_pricing': """
            <div class="price-container">
                <span>Was £20.50, now £17.25</span>
            </div>
        """,

        'offer_label_pricing': """
            <div class="special-offer">
                <span class="offer-badge">SPECIAL OFFER</span>
                <span class="price">£8.99</span>
            </div>
        """,

        'delivery_special_pricing': """
            <div class="delivery-pricing">
                <h3>Delivery: <del>£25.00</del> £19.99</h3>
            </div>
        """,

        'multiple_prices_no_context': """
            <div class="price-section">
                <span>£15.99</span>
                <span>£12.99</span>
            </div>
        """,

        'amazon_deal_pricing': """
            <div class="a-price">
                <span class="a-price-strike">£29.99</span>
                <span class="a-price-current">£24.99</span>
            </div>
        """,

        'jj_member_pricing': """
            <div class="member-price">
                <span class="standard-price">£18.50</span>
                <span class="member-discount">Member price: £15.25</span>
            </div>
        """,

        'atoz_h3_delivery': """
            <h3>Delivery: Was £22.00 Now £18.50</h3>
        """,

        'percentage_discount': """
            <div class="discount-container">
                <span class="discount-badge">20% OFF</span>
                <span class="original-price">RRP £25.00</span>
                <span class="sale-price">£20.00</span>
            </div>
        """
    }

    return scenarios


async def test_special_pricing_scenarios():
    """Test the special pricing detection with various scenarios."""

    # Initialize the scraper
    config = Config()
    scraper = UKCateringScraper(config)

    scenarios = create_test_html_scenarios()

    print("Testing Special Pricing Detection")
    print("=" * 50)

    for scenario_name, html_content in scenarios.items():
        print(f"\nTesting: {scenario_name}")
        print("-" * 30)

        # Parse the HTML
        soup = BeautifulSoup(html_content, 'html.parser')

        # Test with different sites
        for site_name in ['jjfoodservice', 'atoz_catering', 'amazon_uk']:
            print(f"\n  {site_name}:")

            try:
                # Test special offer detection
                special_prices = scraper._find_special_offer_prices(soup, site_name)
                if special_prices:
                    best_price = min(price for price, _ in special_prices)
                    print(f"    ✓ Special offers found: {special_prices}")
                    print(f"    ✓ Best price: £{best_price}")
                else:
                    print("    ✗ No special offers detected")

                # Test the extraction methods
                if site_name == 'jjfoodservice':
                    result = scraper._extract_jjfoodservice_data(soup)
                elif site_name == 'atoz_catering':
                    result = scraper._extract_atoz_catering_data(soup)
                elif site_name == 'amazon_uk':
                    result = scraper._extract_amazon_uk_data(soup)

                if result['price']:
                    print(f"    ✓ Extracted price: £{result['price']}")
                else:
                    print("    ✗ No price extracted")

            except Exception as e:
                print(f"    ✗ Error: {e}")


def test_parse_uk_price_functionality():
    """Test the enhanced _parse_uk_price function."""

    config = Config()
    scraper = UKCateringScraper(config)

    print("\n\nTesting _parse_uk_price Functionality")
    print("=" * 50)

    test_cases = [
        ("£15.99", False, False, 15.99),
        ("Was £20.00 Now £15.99", False, True, 15.99),
        ("£25.50 £19.99", False, True, 19.99),
        ("Delivery: £12.50", True, False, 12.50),
        ("Collection: £10.00 Delivery: £12.50", True, False, 12.50),
        ("RRP £30.00 Sale £24.99", False, True, 24.99),
        ("Save £5.00! Was £25.00 Now £20.00", False, True, 20.00),
    ]

    for i, (price_text, prefer_delivery, detect_special, expected) in enumerate(test_cases, 1):
        print(f"\nTest {i}: '{price_text}'")
        print(f"  prefer_delivery={prefer_delivery}, detect_special={detect_special}")

        # Create a mock element for testing
        mock_html = f"<span>{price_text}</span>"
        mock_element = BeautifulSoup(mock_html, 'html.parser').find('span')

        result = scraper._parse_uk_price(
            price_text,
            prefer_delivery=prefer_delivery,
            detect_special_offers=detect_special,
            element=mock_element
        )

        if result == expected:
            print(f"  ✓ Result: £{result} (Expected: £{expected})")
        else:
            print(f"  ✗ Result: £{result} (Expected: £{expected})")


def test_special_pricing_context():
    """Test the special pricing context detection."""

    config = Config()
    scraper = UKCateringScraper(config)

    print("\n\nTesting Special Pricing Context Detection")
    print("=" * 50)

    context_test_cases = [
        ('<div class="sale"><del>£20.00</del><span>£15.99</span></div>', 'strikethrough'),
        ('<div>Was £25.00 Now £19.99</div>', 'was_now'),
        ('<div class="special-offer">£12.99</div>', 'offer_label'),
        ('<div><span style="text-decoration: line-through">£18.00</span>£14.99</div>', 'inline_strikethrough'),
    ]

    for i, (html_content, test_type) in enumerate(context_test_cases, 1):
        print(f"\nTest {i}: {test_type}")
        print(f"  HTML: {html_content}")

        soup = BeautifulSoup(html_content, 'html.parser')
        element = soup.find(['span', 'div'])

        if element:
            context = scraper._extract_special_pricing_context(element)
            print(f"  ✓ Context: {context}")
        else:
            print("  ✗ No element found")


if __name__ == "__main__":
    print("UK Scraper Special Pricing Test Suite")
    print("=" * 60)

    # Test the price parsing functionality
    test_parse_uk_price_functionality()

    # Test special pricing context detection
    test_special_pricing_context()

    # Test full scenarios
    asyncio.run(test_special_pricing_scenarios())

    print("\n" + "=" * 60)
    print("Test suite completed!")
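Note: the expected values in test_cases encode a simple rule — prefer the price introduced by "Now"/"Sale", otherwise take the last price in the string. A toy reimplementation reproduces the non-delivery cases (the real _parse_uk_price in src/uk_scraper.py is presumably richer):

import re

PRICE_RE = re.compile(r'£(\d{1,3}\.\d{2})')

def parse_special_price(text):
    """Prefer the price after 'Now'/'Sale'; otherwise fall back to the last price."""
    m = re.search(r'(?:now|sale)\s*£(\d{1,3}\.\d{2})', text, re.IGNORECASE)
    if m:
        return float(m.group(1))
    prices = PRICE_RE.findall(text)
    return float(prices[-1]) if prices else None

assert parse_special_price("£15.99") == 15.99
assert parse_special_price("Was £20.00 Now £15.99") == 15.99
assert parse_special_price("£25.50 £19.99") == 19.99
assert parse_special_price("RRP £30.00 Sale £24.99") == 24.99
assert parse_special_price("Save £5.00! Was £25.00 Now £20.00") == 20.00
print("toy parser agrees with the expected values above")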
57
validate_fix.py
Normal file
@@ -0,0 +1,57 @@
#!/usr/bin/env python3
"""
Quick validation that the A to Z Catering pricing is working correctly
"""

import sys
import os
import asyncio

# Add the src directory to the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))


async def validate_atoz_pricing():
    """Test the A to Z Catering pricing fix."""

    try:
        from uk_scraper import UKCateringScraper
        from config import Config

        print("Testing A to Z Catering pricing fix...")
        print("=" * 50)

        config = Config()
        scraper = UKCateringScraper(config)

        # Test the problematic URL
        url = 'https://www.atoz-catering.co.uk/products/product/coca-cola-cans--coke-gb---24'

        print(f"Testing URL: {url}")
        print("Expected price: £12.99 (not £1.39)")
        print("Testing...")

        result = await scraper.scrape_product_price(url, 'atoz_catering')

        print("\nResults:")
        print(f"Success: {result['success']}")

        if result['success'] and result['price']:
            price = result['price']
            print(f"Price found: £{price}")

            if price == 12.99:
                print("✅ FIXED! Correct price detected (£12.99)")
            elif price == 1.39:
                print("❌ STILL BROKEN! Wrong price detected (£1.39)")
            else:
                print(f"⚠️ Different price detected: £{price}")
        else:
            print(f"❌ Failed to scrape: {result.get('error', 'Unknown error')}")

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    asyncio.run(validate_atoz_pricing())
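Note: as written, validate_fix.py always exits 0, even when the wrong price is detected. If it is ever wired into CI, mapping the outcome to an exit code would let the pipeline fail on a regression — a sketch with hypothetical names:

import sys

EXPECTED = 12.99  # the corrected case price asserted by validate_fix.py

def exit_code_for(price):
    """Return 0 when the expected price is seen, 1 otherwise, so CI can gate on it."""
    return 0 if price == EXPECTED else 1

if __name__ == "__main__":
    sys.exit(exit_code_for(12.99))  # replace the literal with the scraped price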