#!/usr/bin/env python3
"""
Extract breadcrumbs specifically for properties missing coordinates
"""

import asyncio
from playwright.async_api import async_playwright
import json
import pandas as pd

async def extract_breadcrumb(page, url):
    """Return a location string scraped from the property page at *url*.

    Three sources are tried in order, and the first non-empty one wins:

    1. The anchor texts inside the first breadcrumb element found, joined
       with " > " (generic entries such as "Home" are dropped).
    2. The ``content`` of the ``og:locality`` meta tag.
    3. The text after the last " - " in the ``og:title`` meta tag.

    Returns ``None`` when no source yields a value. Any exception raised
    while loading or querying the page is printed (truncated to 100
    characters) and also results in ``None``.
    """
    container_selectors = (
        ".breadcrumb-container",
        "nav[aria-label='breadcrumb']",
        ".breadcrumb",
    )
    generic_labels = ['home', 'properstar', 'properties']

    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=30000)
        # Fixed pause after the DOM is ready, before querying the page.
        await page.wait_for_timeout(2000)

        # Probe the candidate selectors in order and stop at the first hit.
        container = None
        for selector in container_selectors:
            container = await page.query_selector(selector)
            if container:
                break

        if container:
            trail = []
            for anchor in await container.query_selector_all("a"):
                label = (await anchor.inner_text()).strip()
                if not label or label.lower() in generic_labels:
                    continue
                trail.append(label)

            if trail:
                return " > ".join(trail)

        # Fallback 1: locality advertised through a meta tag.
        locality_tag = await page.query_selector("meta[property='og:locality']")
        locality = (
            await locality_tag.get_attribute("content") if locality_tag else None
        )
        if locality:
            return locality

        # Fallback 2: trailing segment of an og:title shaped like "... - <place>".
        title_tag = await page.query_selector("meta[property='og:title']")
        if not title_tag:
            return None

        og_title = await title_tag.get_attribute("content")
        if og_title and " - " in og_title:
            return og_title.split(" - ")[-1].strip()

        return None

    except Exception as e:
        print(f"  ⚠️ Error: {str(e)[:100]}")
        return None

def _save_properties(properties, path='enriched_data.json'):
    """Write the full property list to *path* as indented JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(properties, f, indent=2)


def _load_url_csv(path='extracted_property_urls.csv'):
    """Return the URL CSV as a DataFrame, or None when it cannot be used."""
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        # The CSV is optional: with no file there is nothing to update.
        return None
    except Exception as e:
        print(f"   ⚠️ Could not update CSV: {e}")
        return None
    if 'URL' not in df.columns:
        print("   ⚠️ Could not update CSV: no 'URL' column")
        return None
    return df


def _append_csv_row(df, prop, url, breadcrumb, path='extracted_property_urls.csv'):
    """Append one property row to *df*, persist it to *path*, return the new frame."""
    new_row = pd.DataFrame([{
        'URL': url,
        'Locatie': prop.get('location', 'Unknown'),
        'Prijs': prop.get('price', ''),
        'Breadcrumb': breadcrumb
    }])
    df = pd.concat([df, new_row], ignore_index=True)
    try:
        df.to_csv(path, index=False)
    except Exception as e:
        # The row stays in memory, so the next successful write includes it.
        print(f"   ⚠️ Could not update CSV: {e}")
    return df


async def main():
    """Scrape breadcrumbs for active properties that lack coordinates.

    Reads ``enriched_data.json``, selects records whose ``status`` is
    ``'Active'`` and whose ``lat``/``lon`` are not both truthy, and visits
    each record's ``url`` with a headless Chromium page. A breadcrumb that
    is found is stored under the record's ``'breadcrumb'`` key. If
    ``extracted_property_urls.csv`` exists, URLs not yet listed there are
    appended to it.

    The JSON file is rewritten every 5 properties and once more when the
    loop ends. That last save sits in a ``finally`` block, so breadcrumbs
    collected before an error or interruption are still written out.
    Returns ``None``.
    """
    print("="*70)
    print("EXTRACT BREADCRUMBS FOR MISSING PROPERTIES")
    print("="*70)

    # Load enriched_data.json (JSON text is UTF-8; do not rely on the
    # platform default encoding).
    with open('enriched_data.json', 'r', encoding='utf-8') as f:
        properties = json.load(f)

    # Find properties without coordinates.
    # NOTE(review): a lat or lon of exactly 0 also counts as missing here;
    # confirm whether 0/0 is used as a "not geocoded" sentinel in the data.
    active = [p for p in properties if p.get('status') == 'Active']
    missing = [p for p in active if not (p.get('lat') and p.get('lon'))]

    print(f"\n📊 Active properties: {len(active)}")
    print(f"📍 Missing coordinates: {len(missing)}\n")

    if not missing:
        print("✅ All properties have coordinates!")
        return

    # Read the optional CSV once instead of once per property, and keep a
    # set of its URLs for cheap membership tests.
    csv_df = _load_url_csv()
    known_urls = set(csv_df['URL']) if csv_df is not None else set()

    updated = 0

    try:
        # Extract breadcrumbs
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context()
                page = await context.new_page()

                for i, prop in enumerate(missing, 1):
                    url = prop.get('url')
                    # `or` also covers a title stored as null/empty, which
                    # the .get() default alone would not.
                    title = str(prop.get('title') or 'No title')[:60]

                    print(f"[{i}/{len(missing)}] {title}...")
                    print(f"   URL: {url}")

                    if not url:
                        # Nothing to navigate to; skip the request and the delay.
                        print("   ⚠️ No URL, skipping")
                    else:
                        breadcrumb = await extract_breadcrumb(page, url)

                        if breadcrumb:
                            print(f"   ✅ {breadcrumb}")
                            prop['breadcrumb'] = breadcrumb
                            updated += 1

                            # Also add to CSV if it exists
                            if csv_df is not None and url not in known_urls:
                                csv_df = _append_csv_row(csv_df, prop, url, breadcrumb)
                                known_urls.add(url)
                        else:
                            print(f"   ❌ No breadcrumb found")

                    # Save progress every 5 properties
                    if i % 5 == 0:
                        _save_properties(properties)
                        print(f"   💾 Progress saved")

                    if url:
                        await asyncio.sleep(1)  # Rate limiting
            finally:
                # Close the browser even when the loop raised.
                await browser.close()
    finally:
        # Final save; also runs on failure so collected breadcrumbs persist.
        _save_properties(properties)

    print("\n" + "="*70)
    print("📊 RESULTS")
    print("="*70)
    print(f"✅ Extracted breadcrumbs: {updated}/{len(missing)}")
    print(f"\n🔄 Next step: Run sync_and_geocode_missing.py to geocode these properties")

if __name__ == "__main__":
    # Script entry point: run the async workflow to completion.
    asyncio.run(main())
