#!/usr/bin/env python3
"""
Extract breadcrumb location data from Properstar property detail pages
This provides better location context than just the listing page location
"""

import asyncio
import json
import re
import time
from pathlib import Path

import pandas as pd
from playwright.async_api import async_playwright

# Selectors tried in order to locate the breadcrumb container.
_BREADCRUMB_SELECTORS = (
    ".breadcrumb-container",
    "nav[aria-label='breadcrumb']",
    ".breadcrumb",
)

# Breadcrumb link texts that are site navigation rather than location levels.
_NON_LOCATION_CRUMBS = frozenset({'home', 'properstar', 'properties'})

# Title separators, most specific first, so that "... te koop in X" is not
# split at an earlier, unrelated " in ".
_TITLE_SEPARATORS = (' te koop in ', ' in ', ' - ')

# A single place name: starts with a capital letter (ASCII or Latin-1
# accented) and continues with letters, spaces, apostrophes or hyphens.
# Newlines are deliberately excluded so a match cannot run into the next
# line of the page text.
_PLACE_NAME = r"[A-Z\u00C0-\u00D6\u00D8-\u00DE](?:[^\W\d_]|[ \t'-])+"
# Two to four comma-separated place names, e.g. "City, Region, Country".
_PLACE_LIST = rf"({_PLACE_NAME}(?:,[ \t]*{_PLACE_NAME}){{1,3}})"
# Only the keyword is case-insensitive; applying IGNORECASE to the whole
# pattern would disable the capital-letter requirement on place names.
_LOCATION_PATTERNS = (
    re.compile(rf"(?i:Located in|Location:|Locatie:)\s*{_PLACE_LIST}"),
    re.compile(rf"(?i:te koop in|for sale in)\s*{_PLACE_LIST}"),
)


def _to_hierarchy(location_text):
    """Turn "City, Region, Country" into "Country > Region > City" (or None)."""
    parts = [p.strip() for p in location_text.split(',')]
    # Fragments of one or two characters are treated as noise.
    parts = [p for p in parts if len(p) > 2]
    return " > ".join(reversed(parts)) if parts else None


def _location_from_title(title_text):
    """Parse a location out of a page title such as "Farm for sale in Normandy, France"."""
    if " in " not in title_text and " te koop " not in title_text:
        return None
    for separator in _TITLE_SEPARATORS:
        if separator in title_text:
            location = _to_hierarchy(title_text.split(separator, 1)[1])
            if location:
                return location
    return None


def _json_ld_nodes(data):
    """Return the top-level JSON-LD objects: a bare dict, list items, or "@graph" members."""
    candidates = data if isinstance(data, list) else [data]
    nodes = []
    for candidate in candidates:
        if isinstance(candidate, dict):
            nodes.append(candidate)
            graph = candidate.get('@graph')
            if isinstance(graph, list):
                nodes.extend(item for item in graph if isinstance(item, dict))
    return nodes


def _location_from_address(address):
    """Build "Country > Region > Locality" from a JSON-LD address object (or None)."""
    if not isinstance(address, dict):
        return None
    parts = []
    for key in ('addressCountry', 'addressRegion', 'addressLocality'):
        value = address.get(key)
        # A field may be a nested object such as {"@type": "Country", "name": "Spain"}.
        if isinstance(value, dict):
            value = value.get('name')
        if isinstance(value, str) and value.strip():
            parts.append(value.strip())
    return " > ".join(parts) if parts else None


def _location_from_json_ld(json_text):
    """Parse one JSON-LD script body and return its first usable address (or None)."""
    try:
        data = json.loads(json_text)
    except ValueError:
        # Malformed JSON-LD is common in the wild; treat it as "no data".
        return None
    for node in _json_ld_nodes(data):
        location = _location_from_address(node.get('address'))
        if location:
            return location
    return None


def _location_from_text(page_text):
    """Find a "Located in X, Y" / "for sale in X, Y" style phrase in visible page text."""
    for pattern in _LOCATION_PATTERNS:
        match = pattern.search(page_text)
        if match:
            location = _to_hierarchy(match.group(1))
            if location:
                return location
    return None


async def extract_breadcrumb(page, url):
    """Extract breadcrumb from a single property page

    Navigates ``page`` to ``url`` and tries, in order: the breadcrumb links,
    the ``og:locality`` meta tag, the page title, JSON-LD address data, and
    finally location phrases in the visible body text.

    Args:
        page: Playwright page object used for navigation and DOM queries.
        url: Address of the property detail page.

    Returns: (breadcrumb_string, is_404)
    - breadcrumb_string: The extracted breadcrumb or None
    - is_404: True if page returned 404 or 410, False otherwise

    Navigation and DOM errors are reported on stdout and yield (None, False);
    they are never raised to the caller.
    """
    try:
        response = await page.goto(url, wait_until="domcontentloaded", timeout=30000)

        # 404 (not found) and 410 (gone - permanently removed) both mean the
        # listing no longer exists.
        if response and response.status in (404, 410):
            return None, True

        # Give client-side rendering a moment to finish.
        await page.wait_for_timeout(2000)

        breadcrumb = None
        for selector in _BREADCRUMB_SELECTORS:
            breadcrumb = await page.query_selector(selector)
            if breadcrumb:
                break

        if breadcrumb:
            breadcrumb_parts = []
            for link in await breadcrumb.query_selector_all("a"):
                text = (await link.inner_text()).strip()
                if text and text.lower() not in _NON_LOCATION_CRUMBS:
                    breadcrumb_parts.append(text)

            if breadcrumb_parts:
                # Return as hierarchical string: "Spain > Galicia > Lugo > Monforte de Lemos"
                return " > ".join(breadcrumb_parts), False

        # Fallback 1: try to get location from meta tags
        location_meta = await page.query_selector("meta[property='og:locality']")
        if location_meta:
            location = await location_meta.get_attribute("content")
            # Whitespace-only content must not count as a breadcrumb.
            if location and location.strip():
                return location.strip(), False

        # Fallback 2: Extract location from page title
        title_element = await page.query_selector("title")
        if title_element:
            location = _location_from_title(await title_element.inner_text())
            if location:
                return location, False

        # Fallback 3: Look for location in structured data (JSON-LD).
        # Best effort: a failure here must not prevent fallback 4. Only
        # Exception is caught so task cancellation still propagates.
        try:
            scripts = await page.query_selector_all("script[type='application/ld+json']")
            for script in scripts:
                location = _location_from_json_ld(await script.inner_text())
                if location:
                    return location, False
        except Exception as e:
            print(f"  ⚠️ JSON-LD lookup failed: {str(e)[:100]}")

        # Fallback 4: Search for location keywords in page text
        try:
            page_text = await page.inner_text("body")
        except Exception as e:
            print(f"  ⚠️ Could not read page text: {str(e)[:100]}")
            page_text = ""
        location = _location_from_text(page_text)
        if location:
            return location, False

        return None, False

    except Exception as e:
        print(f"  ⚠️ Error extracting breadcrumb: {str(e)[:100]}")
        return None, False

def _load_source_properties():
    """Load the table of properties to process.

    Prefers analysis_output.csv (needs a 'URL' column). Falls back to
    extracted_property_urls.csv, whose URL column may be named either
    'Property URL' or 'URL' (this script renames it to 'URL' when saving).

    Returns:
        A DataFrame with a 'URL' column, or None (after printing the reason)
        when no usable source exists.
    """
    if Path("analysis_output.csv").exists():
        try:
            source_df = pd.read_csv('analysis_output.csv')
        except Exception as e:
            print(f"⚠️ Could not read analysis_output.csv: {e}")
        else:
            if 'URL' in source_df.columns:
                print(f"📊 Found {len(source_df)} properties in analysis_output.csv")
                return source_df
            # Do not stop here: the fallback file may still be usable.
            print("⚠️ analysis_output.csv has no URL column")

    if Path("extracted_property_urls.csv").exists():
        try:
            df_urls = pd.read_csv("extracted_property_urls.csv")
        except Exception as e:
            print(f"❌ Could not read extracted_property_urls.csv: {e}")
            return None
        url_column = next(
            (name for name in ('URL', 'Property URL') if name in df_urls.columns),
            None,
        )
        if url_column is not None:
            source_df = pd.DataFrame({
                'URL': df_urls[url_column],
                'Locatie': df_urls.get('Locatie', None),
                'Prijs': df_urls.get('Prijs', None)
            })
            print(f"📊 Fallback: Found {len(source_df)} properties in extracted_property_urls.csv")
            return source_df

    print("❌ No valid source with URL column found (analysis_output.csv or extracted_property_urls.csv)")
    return None


async def extract_all_breadcrumbs():
    """Extract breadcrumbs for all properties

    Merges the source property list into extracted_property_urls.csv, visits
    every property that has no breadcrumb yet, and records the result in the
    'Breadcrumb' and 'Status_404' columns. Progress is written back to
    extracted_property_urls.csv after every 10 processed properties and once
    more when the run ends, including when it ends with an error or
    interruption. A summary is printed at the end of a completed run.
    """
    source_df = _load_source_properties()
    if source_df is None:
        return

    # Load existing breadcrumbs if they exist
    try:
        df = pd.read_csv('extracted_property_urls.csv')
        # Normalize URL column
        if 'URL' not in df.columns and 'Property URL' in df.columns:
            df = df.rename(columns={'Property URL': 'URL'})
        print(f"📊 Existing breadcrumbs: {len(df)} properties")
    except FileNotFoundError:
        # Create new DataFrame with columns from source
        df = pd.DataFrame(columns=['URL', 'Locatie', 'Prijs', 'Breadcrumb'])
        print("📊 No existing breadcrumbs file, creating new one")

    if 'URL' not in df.columns:
        print("❌ No URL column in extracted_property_urls.csv (or fallback). Cannot proceed.")
        return

    # Merge source properties with existing breadcrumbs so every source
    # property is included. Rows without a URL are ignored and each URL is
    # added once (first occurrence wins).
    existing_urls = set(df['URL'])
    candidates = source_df[source_df['URL'].notna()].drop_duplicates(subset='URL')
    new_props = candidates[~candidates['URL'].isin(existing_urls)]
    if not new_props.empty:
        print(f"📊 Adding {len(new_props)} new properties from analysis_output.csv")
        new_rows = []
        for _, prop in new_props.iterrows():
            locatie = prop.get('Locatie')
            prijs = prop.get('Prijs')
            new_rows.append({
                'URL': prop['URL'],
                'Locatie': locatie if pd.notna(locatie) else None,
                'Prijs': prijs if pd.notna(prijs) else None,
                'Breadcrumb': None
            })
        df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

    print(f"📊 Total properties to process: {len(df)}")

    # Check if Breadcrumb column exists
    if 'Breadcrumb' not in df.columns:
        df['Breadcrumb'] = None
    # An all-empty column is read back from CSV as float64; strings are
    # written into it below, so make sure it can hold them.
    df['Breadcrumb'] = df['Breadcrumb'].astype(object)

    # Add Status_404 column to track dead pages
    if 'Status_404' not in df.columns:
        df['Status_404'] = False

    updated_count = 0
    failed_count = 0
    page_404_count = 0
    processed_count = 0

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context()
                page = await context.new_page()

                for idx, row in df.iterrows():
                    url = row['URL']

                    # Skip if already has breadcrumb
                    if pd.notna(row.get('Breadcrumb')):
                        continue

                    print(f"\n[{idx+1}/{len(df)}] {url}")

                    breadcrumb, is_404 = await extract_breadcrumb(page, url)

                    if is_404:
                        print(f"  🚫 404 Page Not Found (property removed)")
                        df.at[idx, 'Status_404'] = True
                        page_404_count += 1
                        failed_count += 1
                    elif breadcrumb:
                        print(f"  ✅ {breadcrumb}")
                        df.at[idx, 'Breadcrumb'] = breadcrumb
                        df.at[idx, 'Status_404'] = False
                        updated_count += 1
                    else:
                        print(f"  ⚠️ No breadcrumb found")
                        df.at[idx, 'Status_404'] = False
                        failed_count += 1

                    # Save progress every 10 processed properties. Counting
                    # processed rows (not the row index) keeps the interval
                    # steady when rows are skipped.
                    processed_count += 1
                    if processed_count % 10 == 0:
                        df.to_csv('extracted_property_urls.csv', index=False)
                        print(f"  💾 Progress saved")

                    # Rate limiting
                    await asyncio.sleep(1)
            finally:
                await browser.close()
    finally:
        # Final save - also runs on error or interruption so that work done
        # since the last checkpoint is not lost.
        df.to_csv('extracted_property_urls.csv', index=False)

    # Count how many have breadcrumbs now
    with_breadcrumbs = df['Breadcrumb'].notna().sum()
    without_breadcrumbs = len(df) - with_breadcrumbs
    # Guard against an empty table (no properties at all).
    percent = with_breadcrumbs * 100 // len(df) if len(df) else 0

    print("\n" + "="*70)
    print(f"✅ BREADCRUMB EXTRACTION COMPLETE")
    print("="*70)
    print(f"Properties in source (analysis_output.csv): {len(source_df)}")
    print(f"Properties in extracted_property_urls.csv: {len(df)}")
    print(f"Already had breadcrumbs: {len(df) - updated_count - failed_count}")
    print(f"Successfully extracted: {updated_count}")
    print(f"Failed: {failed_count}")
    if page_404_count > 0:
        print(f"404 pages (removed properties): {page_404_count}")
        print(f"   These should be removed from analysis_output.csv")
    print(f"Total with breadcrumbs: {with_breadcrumbs}/{len(df)} ({percent}%)")
    if without_breadcrumbs > 0:
        print(f"⚠️  Still missing breadcrumbs: {without_breadcrumbs}")
        print(f"   Run: python3 fix_missing_breadcrumbs.py to retry")
        print(f"\n💡 To remove 404 pages from system:")
        print(f"   Run: python3 check_availability.py --remove-404")

if __name__ == "__main__":
    # Script entry point: build the top-level coroutine and drive it to
    # completion on a fresh event loop.
    main_coroutine = extract_all_breadcrumbs()
    asyncio.run(main_coroutine)
