#!/usr/bin/env python3
"""
Fix missing breadcrumbs by extracting them for properties in analysis_output.csv
that are not in extracted_property_urls.csv
"""
import asyncio
import json
import re

import pandas as pd
from playwright.async_api import async_playwright

# Breadcrumb entries treated as navigation words rather than location levels
# (compared case-insensitively).
_BREADCRUMB_NOISE = frozenset({'home', 'properstar', 'properties', 'huis', 'boerderij'})

# Selectors tried in order until one matches a breadcrumb element.
_BREADCRUMB_SELECTORS = (
    "nav[aria-label='Breadcrumb']",
    ".breadcrumb-container",
    ".breadcrumb",
)

# Most specific separator first: every title containing ' te koop in ' also
# contains ' in ', so trying ' in ' first would split too early whenever an
# earlier " in " appears in the title.
_TITLE_SEPARATORS = (' te koop in ', ' in ', ' - ')

# Only the keyword prefix is case-insensitive. A pattern-wide re.IGNORECASE
# would also relax the [A-Z] anchor and let lowercase prose such as
# "located in the middle of town, close to ..." match as a location.
_LOCATION_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"(?i:Located in|Location:|Locatie:)\s*([A-Z][a-zA-Z\s-]+(?:,\s*[A-Z][a-zA-Z\s-]+){1,3})",
        r"(?i:te koop in|for sale in)\s*([A-Z][a-zA-Z\s-]+(?:,\s*[A-Z][a-zA-Z\s-]+){1,3})",
    )
)


def _clean_breadcrumb(text):
    """Turn raw breadcrumb text (one entry per line) into 'A > B > C', or None."""
    parts = [line.strip() for line in text.split('\n') if line.strip()]
    parts = [p for p in parts if p.lower() not in _BREADCRUMB_NOISE]
    return " > ".join(parts) if parts else None


def _split_location(text):
    """Join a comma-separated location in reversed order with ' > ', or None.

    Pieces of two characters or fewer are discarded.
    """
    parts = [p.strip() for p in text.split(',')]
    parts = [p for p in parts if len(p) > 2]
    return " > ".join(reversed(parts)) if parts else None


def _location_from_title(title_text):
    """Extract a location from the text following a known separator in the title."""
    if " in " not in title_text and " te koop " not in title_text:
        return None
    for separator in _TITLE_SEPARATORS:
        if separator in title_text:
            location = _split_location(title_text.split(separator, 1)[1])
            if location:
                return location
    return None


def _json_ld_nodes(data):
    """Yield every dict node of a parsed JSON-LD value (object, array or @graph)."""
    if isinstance(data, list):
        for item in data:
            yield from _json_ld_nodes(item)
    elif isinstance(data, dict):
        yield data
        graph = data.get('@graph')
        if isinstance(graph, list):
            yield from _json_ld_nodes(graph)


def _address_field(addr, key):
    """Return addr[key] as a non-empty stripped string, or None."""
    value = addr.get(key)
    if isinstance(value, dict):
        # Nested object form, e.g. {"@type": "Country", "name": "NL"}.
        value = value.get('name')
    if isinstance(value, str) and value.strip():
        return value.strip()
    return None


def _location_from_json_ld(json_text):
    """Build 'Country > Region > Locality' from the first node with an address dict."""
    try:
        data = json.loads(json_text)
    except ValueError:
        # Malformed JSON in one script tag must not abort the other fallbacks.
        return None
    for node in _json_ld_nodes(data):
        addr = node.get('address')
        if not isinstance(addr, dict):
            continue
        parts = [
            _address_field(addr, key)
            for key in ('addressCountry', 'addressRegion', 'addressLocality')
        ]
        parts = [p for p in parts if p]
        if parts:
            return " > ".join(parts)
    return None


def _location_from_body_text(page_text):
    """Search free text for a 'Located in X, Y' style phrase; return the location or None."""
    for pattern in _LOCATION_PATTERNS:
        match = pattern.search(page_text)
        if match:
            location = _split_location(match.group(1))
            if location:
                return location
    return None


async def extract_breadcrumb(page, url):
    """Extract a breadcrumb-style location string from a single property page.

    Navigates ``page`` to ``url`` and tries, in order: a breadcrumb element,
    the ``og:locality`` meta tag, the page title, JSON-LD address data, and
    finally a keyword search over the body text. The first source that yields
    a value wins.

    Args:
        page: Page-like object providing the async methods ``goto``,
            ``wait_for_timeout``, ``query_selector``, ``query_selector_all``
            and ``inner_text``.
        url: Address of the property page to load.

    Returns:
        (breadcrumb_string, is_404): ``breadcrumb_string`` is None when no
        source produced a location. ``is_404`` is True only when the response
        status is 404 or 410. Any ``Exception`` while loading or querying is
        printed and reported as ``(None, False)``.
    """
    try:
        response = await page.goto(url, wait_until="domcontentloaded", timeout=30000)

        # 404 Not Found and 410 Gone are both reported as a removed page.
        if response and response.status in (404, 410):
            return None, True

        await page.wait_for_timeout(2000)

        breadcrumb = None
        for selector in _BREADCRUMB_SELECTORS:
            breadcrumb = await page.query_selector(selector)
            if breadcrumb:
                break

        if breadcrumb:
            location = _clean_breadcrumb(await breadcrumb.inner_text())
            if location:
                return location, False

        # Fallback 1: location from the og:locality meta tag.
        location_meta = await page.query_selector("meta[property='og:locality']")
        if location_meta:
            location = await location_meta.get_attribute("content")
            if location:
                return location, False

        # Fallback 2: location from the page title.
        title_element = await page.query_selector("title")
        if title_element:
            location = _location_from_title(await title_element.inner_text())
            if location:
                return location, False

        # Fallback 3: structured data (JSON-LD). Every script tag is inspected,
        # not just the first one. `except Exception` keeps this best-effort
        # while letting task cancellation and Ctrl-C propagate, which a bare
        # `except:` would swallow.
        try:
            scripts = await page.query_selector_all("script[type='application/ld+json']")
            for script in scripts:
                location = _location_from_json_ld(await script.inner_text())
                if location:
                    return location, False
        except Exception as e:
            print(f"  ⚠️ JSON-LD lookup failed: {str(e)[:80]}")

        # Fallback 4: location keywords in the visible page text.
        try:
            location = _location_from_body_text(await page.inner_text("body"))
            if location:
                return location, False
        except Exception as e:
            print(f"  ⚠️ Page text lookup failed: {str(e)[:80]}")

        return None, False

    except Exception as e:
        print(f"  ⚠️ Error: {str(e)[:80]}")
        return None, False

async def main():
    """Fetch breadcrumbs for properties missing from extracted_property_urls.csv.

    Reads ``analysis_output.csv``, determines which of its URLs are absent
    from ``extracted_property_urls.csv``, visits each of those pages with a
    headless Chromium browser, and writes the existing rows plus one new row
    per visited URL back to ``extracted_property_urls.csv``. A new row is
    written even when no breadcrumb was found; ``Status_404`` records whether
    the page answered 404/410.

    Rows whose URL is blank are skipped, and a URL that appears more than once
    in ``analysis_output.csv`` is visited only once (first row wins).

    Raises:
        FileNotFoundError: if ``analysis_output.csv`` does not exist.
    """
    # Load all properties from analysis_output.csv
    analysis_df = pd.read_csv('analysis_output.csv')
    print(f"📊 Total properties in analysis: {len(analysis_df)}")

    # Load existing breadcrumbs
    try:
        breadcrumb_df = pd.read_csv('extracted_property_urls.csv')
    except FileNotFoundError:
        breadcrumb_df = pd.DataFrame(columns=['URL', 'Locatie', 'Prijs', 'Breadcrumb'])
        existing_urls = set()
        print("📊 No existing breadcrumbs file found, creating new one")
    else:
        existing_urls = set(breadcrumb_df['URL'].dropna())
        print(f"📊 Properties with breadcrumbs: {len(existing_urls)}")

    # Blank URLs cannot be visited (and would crash on url.split below), and
    # duplicates would be fetched and appended twice, so drop both up front.
    candidates = analysis_df.dropna(subset=['URL']).drop_duplicates(subset='URL')
    pending = candidates[~candidates['URL'].isin(existing_urls)]
    total_missing = len(pending)

    print(f"📊 Properties missing breadcrumbs: {total_missing}")

    if pending.empty:
        print("✅ All properties have breadcrumbs!")
        return

    new_rows = []
    success_count = 0
    page_404_count = 0

    # Extract breadcrumbs for missing properties
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            # Iterating the filtered rows directly avoids re-scanning the whole
            # analysis frame for every URL.
            for i, (_, prop) in enumerate(pending.iterrows(), 1):
                url = prop['URL']
                print(f"[{i}/{total_missing}] {url.split('/')[-1]}")

                breadcrumb, is_404 = await extract_breadcrumb(page, url)

                # Carry location and price over from analysis_output if present
                locatie = prop.get('Locatie')
                prijs = prop.get('Prijs')

                new_rows.append({
                    'URL': url,
                    'Locatie': locatie if pd.notna(locatie) else None,
                    'Prijs': prijs if pd.notna(prijs) else None,
                    'Breadcrumb': breadcrumb,
                    'Status_404': is_404,
                })

                if is_404:
                    print("  🚫 404 Page Not Found (property removed)")
                    page_404_count += 1
                elif breadcrumb:
                    print(f"  ✅ {breadcrumb}")
                    success_count += 1
                else:
                    print("  ❌ No breadcrumb found")

                await page.wait_for_timeout(500)  # Rate limiting
        finally:
            # Close the browser even if the loop is interrupted.
            await browser.close()

    # Append new rows to existing breadcrumb data
    new_df = pd.DataFrame(new_rows)
    combined_df = pd.concat([breadcrumb_df, new_df], ignore_index=True)

    # Save combined data
    combined_df.to_csv('extracted_property_urls.csv', index=False)

    rule = '=' * 70
    print(f"\n{rule}")
    print("✅ BREADCRUMB FIX COMPLETE")
    print(rule)
    print(f"Successfully extracted: {success_count}/{total_missing}")
    if page_404_count > 0:
        print(f"404 pages (removed properties): {page_404_count}")
        print("   These should be removed from analysis_output.csv")
    print(f"Total properties with breadcrumbs: {len(combined_df)}")
    print("\n💡 Next steps:")
    if page_404_count > 0:
        print("   1. Run: python3 check_availability.py --remove-404  # Remove 404 pages")
        print("   2. Run: python3 geocode_with_breadcrumbs.py")
        print("   3. Run: python3 parse_criteria.py")
    else:
        print("   1. Run: python3 geocode_with_breadcrumbs.py")
        print("   2. Run: python3 parse_criteria.py")

# Script entry point: run the async workflow only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    asyncio.run(main())
