#!/usr/bin/env python3
"""
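Scrape metadata from Leggett (frenchestateagents.com) property detail pages.

Uses Playwright (Cloudflare blocks plain requests). Extracts: title, price,
bedrooms, land size, building size, description, coordinates, region, and
postal code / city / department.

Usage:
    python3 enrich_leggett.py                # Scrape all missing
    python3 enrich_leggett.py --limit 3      # Test on 3 properties
    python3 enrich_leggett.py --dry-run      # Show what would be scraped
    python3 enrich_leggett.py --force        # Re-scrape already enriched
    python3 enrich_leggett.py --coords-only  # Only those missing a postal code
    python3 enrich_leggett.py --headless     # Run headless (may fail on Cloudflare)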
"""
import argparse
import asyncio
import re
import time

from store import load, persist, upsert, is_active, short_url, browser_page, wait_for_cloudflare


def needs_enrichment(prop, force=False, coords_only=False):
    """Decide whether a stored property should be (re-)scraped.

    Only active Leggett listings are ever eligible. Among those:

    * ``force`` selects every listing;
    * otherwise ``coords_only`` selects listings without a ``postal_code``
      (the postal code is what geocoding needs);
    * otherwise a listing is selected unless it already has both ``price``
      and ``bedrooms``, which keeps repeated runs idempotent.
    """
    eligible = prop.get('source') == 'leggett' and is_active(prop)
    if not eligible:
        return False

    if force:
        return True

    if coords_only:
        return not prop.get('postal_code')

    has_core_fields = prop.get('price') and prop.get('bedrooms')
    return not has_core_fields


async def scrape_property(page, url):
    """Navigate to a Leggett property page and extract metadata.

    Args:
        page: Browser page object exposing async ``goto``,
            ``wait_for_timeout`` and ``evaluate`` (as yielded by
            ``browser_page``).
        url: Property detail page URL.

    Returns:
        A ``(fields, status)`` tuple. On success ``fields`` is the dict built
        by the in-page script (any of: title, price, lat, lon, region_text,
        description, bedrooms, land_size_m2, building_size_m2, postal_code,
        city, department) and ``status`` is ``'ok'``. On failure ``fields`` is
        ``None`` and ``status`` is a short reason; exceptions are caught and
        reported as their message truncated to 100 characters.
    """
    try:
        await page.goto(url, wait_until='domcontentloaded', timeout=30000)
        # Wait for page to settle (Cloudflare challenge)
        await page.wait_for_timeout(3000)

        if not await wait_for_cloudflare(page):
            return None, 'Cloudflare blocked'

        # The extraction runs inside the page. Backslashes are doubled because
        # this is a regular (non-raw) Python string holding JS regex literals.
        fields = await page.evaluate("""() => {
            const result = {};

            // Title
            const h1 = document.querySelector('h1');
            if (h1) result.title = h1.textContent.trim();

            // Price — extract euro amount only (page shows both EUR and GBP)
            const priceEl = document.querySelector('.price .new-price, .price, .detail-price');
            if (priceEl) {
                const text = priceEl.textContent;
                const euroMatch = text.match(/€\\s*([\\d,.]+)/);
                if (euroMatch) {
                    const priceText = euroMatch[1].replace(/[^0-9]/g, '');
                    if (priceText) result.price = parseInt(priceText);
                }
            }

            // Coordinates from map
            const mapImg = document.querySelector('img.map-region, img[data-lat]');
            if (mapImg) {
                const lat = mapImg.dataset?.lat || mapImg.getAttribute('data-lat');
                const lon = mapImg.dataset?.lon || mapImg.getAttribute('data-lon');
                if (lat && lon) {
                    result.lat = parseFloat(lat);
                    result.lon = parseFloat(lon);
                }
            }

            // Also check for map iframe or OpenStreetMap embed
            // NOTE(review): takes the first two decimals in src as lat, lon.
            // Some embed URL formats (e.g. an OSM bbox= parameter) list
            // longitude first — verify against a live page.
            const iframe = document.querySelector('iframe[src*="openstreetmap"], iframe[src*="google.com/maps"]');
            if (iframe && !result.lat) {
                const src = iframe.getAttribute('src') || '';
                const coordMatch = src.match(/[-+]?\\d+\\.\\d+/g);
                if (coordMatch && coordMatch.length >= 2) {
                    result.lat = parseFloat(coordMatch[0]);
                    result.lon = parseFloat(coordMatch[1]);
                }
            }

            // Region/location
            const locEl = document.querySelector('.locations .primary, .breadcrumb, .detail-location');
            if (locEl) result.region_text = locEl.textContent.trim();

            // Description
            const descEl = document.querySelector('.description, .property-description, .detail-description, #description');
            if (descEl) result.description = descEl.textContent.trim().slice(0, 2000);

            // Feature list (bedrooms, land, building size)
            const features = document.querySelectorAll('.features li, .detail-features li, .key-features li, .property-features li, .summary-list li, ul.list-unstyled li');
            for (const li of features) {
                const text = li.textContent.trim().toLowerCase();

                // Bedrooms
                const bedMatch = text.match(/(\\d+)\\s*(?:bed|chambre|slaap)/);
                if (bedMatch) result.bedrooms = parseInt(bedMatch[1]);

                // Land size. The unit is captured so the hectare conversion
                // applies only when the matched figure itself is in hectares,
                // not whenever the word appears somewhere in the item.
                const landMatch = text.match(/(\\d[\\d.,]*)\\s*(m²|m2|sqm|hectare).*(?:land|terrain|ground|plot)/i)
                    || text.match(/(?:land|terrain|ground|plot).*?(\\d[\\d.,]*)\\s*(m²|m2|sqm|hectare)/i);
                if (landMatch) {
                    // /,/g strips every thousands separator; a string pattern
                    // would only remove the first one.
                    let val = parseFloat(landMatch[1].replace(/,/g, ''));
                    if (landMatch[2] === 'hectare') val *= 10000;
                    result.land_size_m2 = val;
                }

                // Also try standalone area patterns
                const areaMatch = text.match(/(\\d[\\d.,]*)\\s*(?:m²|m2|sqm)/);
                if (areaMatch && !result.land_size_m2 && !result.building_size_m2) {
                    const val = parseFloat(areaMatch[1].replace(/,/g, ''));
                    // Guess: >= 1000 = land, otherwise building
                    if (val >= 1000) result.land_size_m2 = val;
                    else result.building_size_m2 = val;
                }

                // Building size
                const buildMatch = text.match(/(\\d[\\d.,]*)\\s*(?:m²|m2|sqm).*(?:habitable|living|floor|build)/i)
                    || text.match(/(?:habitable|living|floor|build).*?(\\d[\\d.,]*)\\s*(?:m²|m2|sqm)/i);
                if (buildMatch) result.building_size_m2 = parseFloat(buildMatch[1].replace(/,/g, ''));
            }

            // Fallback: scan full page text for bedroom count
            if (!result.bedrooms) {
                const body = document.body.textContent;
                const bedFallback = body.match(/(\\d+)\\s*(?:bedroom|bed\\b)/i);
                if (bedFallback) result.bedrooms = parseInt(bedFallback[1]);
            }

            // Extract postal code + city from JSON-LD or Google Maps link
            const jsonlds = document.querySelectorAll('script[type="application/ld+json"]');
            for (const s of jsonlds) {
                try {
                    const data = JSON.parse(s.textContent);
                    if (data['@type'] === 'RealEstateListing' && data.address) {
                        const addr = data.address;
                        if (addr.postalCode) result.postal_code = addr.postalCode;
                        if (addr.addressLocality) result.city = addr.addressLocality;
                        if (addr.addressRegion) result.department = addr.addressRegion;
                    }
                } catch {}
            }

            // Fallback: extract from Google Maps link
            if (!result.postal_code) {
                const mapLink = document.querySelector('a[href*="maps.google.com"]');
                if (mapLink) {
                    const href = mapLink.getAttribute('href') || '';
                    const qMatch = href.match(/[?&]q=([^&]+)/);
                    if (qMatch) {
                        const parts = decodeURIComponent(qMatch[1]).replace(/\\+/g, ' ').trim();
                        const pcMatch = parts.match(/(\\d{5})/);
                        if (pcMatch) result.postal_code = pcMatch[1];
                        // City is usually after the postal code
                        const cityMatch = parts.match(/\\d{5}\\s+(.+)/);
                        if (cityMatch && !result.city) result.city = cityMatch[1];
                    }
                }
            }

            return result;
        }""")

        # A page that yields neither a price nor a title is treated as a miss.
        if not fields or (not fields.get('price') and not fields.get('title')):
            return None, 'No data extracted'

        return fields, 'ok'

    except Exception as e:
        # Deliberate best-effort: one bad page must not abort the whole run,
        # so the failure is reported to the caller as a short status string.
        return None, str(e)[:100]


async def main():
    """CLI entry point: select Leggett properties, scrape them, save results.

    Loads the store, picks the URLs for which ``needs_enrichment`` is true,
    scrapes each one through a single shared browser page, upserts whatever
    fields were extracted, and persists the store if anything was enriched.

    The summary and the persist step run in a ``finally`` block so that
    results already scraped survive an interruption or a browser failure
    partway through a long run.
    """
    parser = argparse.ArgumentParser(description='Enrich Leggett property metadata')
    parser.add_argument('--limit', type=int, default=0, help='Max properties to scrape (0=all)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be scraped')
    parser.add_argument('--headless', action='store_true', help='Run headless (may fail on Cloudflare)')
    parser.add_argument('--force', action='store_true', help='Re-scrape already enriched')
    parser.add_argument('--coords-only', action='store_true', help='Only re-scrape properties missing postal code')
    args = parser.parse_args()

    store = load()
    candidates = [url for url, p in store.items() if needs_enrichment(p, args.force, args.coords_only)]

    # Only a positive limit truncates; a negative value would otherwise
    # silently slice items off the end of the list.
    if args.limit > 0:
        candidates = candidates[:args.limit]

    print(f"Leggett properties needing enrichment: {len(candidates)}")

    if args.dry_run:
        for url in candidates:
            print(f"  {url}")
        return

    if not candidates:
        print("Nothing to do.")
        return

    enriched = 0
    failed = 0
    total = len(candidates)

    try:
        async with browser_page(headless=args.headless) as page:
            for i, url in enumerate(candidates):
                # Flush so the progress prefix is visible while the page loads.
                print(f"  [{i+1}/{total}] {short_url(url)}...", end=' ', flush=True)

                fields, status = await scrape_property(page, url)

                if fields:
                    # Clean up: only upsert fields we actually extracted.
                    # Truthiness check: zero/empty values count as not extracted.
                    clean = {}
                    # 'Just a moment...' is compared against the scraped title
                    # so that exact placeholder value is never stored.
                    if fields.get('title') and fields['title'] != 'Just a moment...':
                        clean['title'] = fields['title']
                    for key in ['price', 'bedrooms', 'land_size_m2', 'building_size_m2',
                                'lat', 'lon', 'description', 'postal_code', 'city', 'department']:
                        if fields.get(key):
                            clean[key] = fields[key]
                    if fields.get('region_text'):
                        clean['location'] = fields['region_text']
                        clean['location_source'] = 'leggett_scrape'

                    if clean:
                        upsert(store, url, clean)
                        enriched += 1
                        extracted = ', '.join(f"{k}={v}" for k, v in clean.items()
                                              if k not in ('title', 'description'))[:80]
                        print(f"OK ({extracted})")
                    else:
                        failed += 1
                        print("empty")
                else:
                    failed += 1
                    print(f"FAIL ({status})")

                # Rate limit between requests; no need to wait after the last one.
                if i + 1 < total:
                    await page.wait_for_timeout(2000)
    finally:
        print(f"\nEnriched {enriched}/{total} ({failed} failed)")

        # Save partial progress even if the loop was interrupted.
        if enriched > 0:
            persist(store)


# Script entry point: run the async CLI only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    asyncio.run(main())
