#!/usr/bin/env python3
"""
Geocode properties using breadcrumb data from property detail pages
This provides the most accurate location information
"""

import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import time

def geocode_location(location_string, geolocator):
    """Geocode a location string, preferring results inside Europe.

    Two lookups are attempted: first one restricted to a European bounding
    box, then (only if that yields nothing) one where the same box is merely
    a ranking hint, so places outside the box can still be found.

    Args:
        location_string: Free-text place query. Empty/whitespace-only values
            and the literal "Unknown" are not sent to the geocoder.
        geolocator: Object exposing a geopy-style ``geocode()`` method.

    Returns:
        A ``(latitude, longitude, address)`` tuple, or ``(None, None, None)``
        when there is nothing to look up, nothing was found, or the geocoder
        timed out / reported a service error (the error is printed).
    """
    if not location_string or location_string == "Unknown":
        return None, None, None
    if isinstance(location_string, str) and not location_string.strip():
        return None, None, None

    # geopy reads each viewbox corner as (latitude, longitude). The previous
    # value had the pairs written as (lon, lat), which pointed the bounded
    # search at lat -10..40 / lon 35..70 instead of Europe.
    europe_viewbox = [(35, -10), (70, 40)]  # lat 35..70, lon -10..40

    try:
        # First pass: only accept matches inside the European box.
        location = geolocator.geocode(
            location_string,
            exactly_one=True,
            viewbox=europe_viewbox,
            bounded=True,
            timeout=10,
        )

        if not location:
            # Second pass: without bounded=True the viewbox is only a
            # preference, so out-of-box places can still be returned.
            location = geolocator.geocode(
                location_string,
                exactly_one=True,
                viewbox=europe_viewbox,
                timeout=10,
            )
    except (GeocoderTimedOut, GeocoderServiceError) as e:
        print(f"  ⚠️ Geocoding error: {str(e)[:50]}")
        return None, None, None

    if location:
        return location.latitude, location.longitude, location.address

    return None, None, None

def parse_breadcrumb_smart(breadcrumb):
    """Turn a ``>``-separated breadcrumb into geocoding queries.

    The breadcrumb is split on ``>``, segments are stripped and empty ones
    dropped, and a trailing property-type segment (Dutch or English) is
    removed. The remaining segments are combined into queries ordered from
    most to least specific, treating the first segment as the country and
    the last as the city.

    Args:
        breadcrumb: Breadcrumb text such as
            ``"France > Charente-Maritime > Saintes > Huis"``.

    Returns:
        A list of query strings; empty when the input is not a non-empty
        string or nothing but a property type remains.
    """
    # Non-string input (None, pandas NaN, ...) has nothing to parse.
    if not isinstance(breadcrumb, str) or not breadcrumb:
        return []

    # Property types to remove (in various languages)
    property_types = (
        # Dutch
        'Huis', 'Boerderij', 'Vrijstaand huis', 'Huis in dorp',
        'Villa', 'Appartement', 'Grond', 'Perceel',
        'Vakantiehuisje', 'Vastgoed', 'Agrarische exploitatie',
        'Terrein', 'Woning',
        # English
        'House', 'Farm', 'Farmhouse', 'Plot', 'Land',
        'Property', 'Estate', 'Cottage', 'Agricultural holding',
    )
    type_suffixes = tuple(f' {prop_type}' for prop_type in property_types)

    # Drop empty segments (e.g. from a trailing or doubled '>') so they do
    # not turn into queries like ", France" or "".
    parts = [segment.strip() for segment in breadcrumb.split('>')]
    parts = [segment for segment in parts if segment]
    if not parts:
        return []

    # Remove the last segment when it is, or ends with, a property type.
    last_part = parts[-1]
    if last_part in property_types or last_part.endswith(type_suffixes):
        parts = parts[:-1]

    if not parts:
        return []

    country, city = parts[0], parts[-1]
    queries = []

    # Strategy 1: City, Region, Country (e.g., "Saintes, Charente-Maritime, France")
    if len(parts) >= 3:
        queries.append(f"{city}, {parts[-2]}, {country}")

    # Strategy 2: City, Country (e.g., "Saintes, France")
    if len(parts) >= 2:
        queries.append(f"{city}, {country}")

    # Strategy 3: Just city (e.g., "Saintes")
    queries.append(city)

    return queries

def _has_coordinates(lat, lon):
    """Return True when both values are set; 0.0 is a legitimate coordinate."""
    return lat is not None and lon is not None


def _city_from_breadcrumb(breadcrumb):
    """Pick the breadcrumb segment that is stored as ``ExtractedLocation``."""
    parts = [p.strip() for p in breadcrumb.split('>')]
    # NOTE(review): this takes the second-to-last segment, i.e. it assumes the
    # last segment is a property type. For a breadcrumb without a trailing
    # property type this differs from the segment parse_breadcrumb_smart
    # queries - confirm which one is intended.
    city = parts[-2] if len(parts) >= 2 else parts[-1]
    property_types = {'Huis', 'Boerderij', 'Vrijstaand huis', 'Huis in dorp', 'Villa', 'Appartement'}
    if city in property_types:
        # Step one level further up when the chosen segment is itself a type.
        if len(parts) >= 3:
            city = parts[-3]
        elif len(parts) < 2:
            city = breadcrumb
    return city


def _geocode_breadcrumb(breadcrumb, geolocator, pause):
    """Try each breadcrumb-derived query in order; return (lat, lon, address)."""
    queries = parse_breadcrumb_smart(breadcrumb)
    for attempt, query in enumerate(queries, start=1):
        print(f"  🔍 Try {attempt}/{len(queries)}: {query}")
        lat, lon, address = geocode_location(query, geolocator)
        if _has_coordinates(lat, lon):
            print(f"  ✅ {address[:100] if address else ''}")
            print(f"  📍 {lat}, {lon}")
            return lat, lon, address
        time.sleep(pause)  # Pause between attempts
    return None, None, None


def main():
    """Fill in missing coordinates in ``analysis_output.csv``.

    Reads ``extracted_property_urls.csv`` (URL -> breadcrumb) and
    ``analysis_output.csv``. For every row lacking ``Latitude``/``Longitude``
    the breadcrumb is geocoded first; if that yields nothing, the ``Locatie``
    column is geocoded instead. Successful rows get ``Latitude``,
    ``Longitude``, ``ExtractedLocation`` and ``LocationSource`` set. The file
    is rewritten in place after every 10 successes and once at the end, and
    a summary is printed. Returns early (after printing a message) when
    either input file is missing.
    """
    output_csv = 'analysis_output.csv'
    # Nominatim's public usage policy allows at most one request per second.
    request_pause = 1.0

    # Load property data
    try:
        urls_df = pd.read_csv('extracted_property_urls.csv')
    except FileNotFoundError:
        print("❌ extracted_property_urls.csv not found!")
        return

    try:
        df = pd.read_csv(output_csv)
    except FileNotFoundError:
        print("❌ analysis_output.csv not found!")
        return

    print(f"📊 Found {len(df)} properties in analysis_output.csv")
    print(f"📊 Found {len(urls_df)} properties in extracted_property_urls.csv")

    # Merge breadcrumb data
    if 'Breadcrumb' in urls_df.columns:
        breadcrumb_map = dict(zip(urls_df['URL'], urls_df['Breadcrumb']))
        print(f"✅ Found breadcrumb data for {urls_df['Breadcrumb'].notna().sum()} properties")
    else:
        breadcrumb_map = {}
        print("⚠️ No breadcrumb data found - run extract_breadcrumbs.py first")

    # A text column that is entirely empty in the CSV loads as float64;
    # cast it to object so writing strings into it does not depend on
    # pandas' deprecated silent upcasting.
    for column in ('ExtractedLocation', 'LocationSource'):
        if column in df.columns and df[column].dtype != object:
            df[column] = df[column].astype(object)

    geolocator = Nominatim(user_agent="farmmatch_geocoder")

    geocoded_count = 0
    failed_count = 0
    skipped_count = 0

    for idx, row in df.iterrows():
        url = row['URL']

        # Skip if already has coordinates
        if pd.notna(row.get('Latitude')) and pd.notna(row.get('Longitude')):
            skipped_count += 1
            continue

        print(f"\n[{idx+1}/{len(df)}] {url}")

        lat = lon = None
        extracted = source = None

        # Priority 1: Use breadcrumb if available
        breadcrumb = breadcrumb_map.get(url)
        if isinstance(breadcrumb, str) and breadcrumb.strip():
            print(f"  📍 Breadcrumb: {breadcrumb}")
            lat, lon, _ = _geocode_breadcrumb(breadcrumb, geolocator, request_pause)
            if _has_coordinates(lat, lon):
                extracted = _city_from_breadcrumb(breadcrumb)
                source = 'breadcrumb'

        # Priority 2: Use location from favorites scraper
        if source is None:
            location = row.get('Locatie')
            if pd.notna(location) and location != "Unknown":
                print(f"  📍 Location: {location}")
                lat, lon, address = geocode_location(location, geolocator)
                if _has_coordinates(lat, lon):
                    print(f"  ✅ {address[:100] if address else ''}")
                    extracted = location
                    source = 'favorites'
                else:
                    # Keep the request pacing even when the lookup failed.
                    time.sleep(request_pause)

        if source is None:
            print("  ⚠️ No location data available")
            failed_count += 1
            continue

        df.at[idx, 'Latitude'] = lat
        df.at[idx, 'Longitude'] = lon
        df.at[idx, 'ExtractedLocation'] = extracted
        df.at[idx, 'LocationSource'] = source
        geocoded_count += 1

        # Save progress every 10 properties
        if geocoded_count % 10 == 0:
            df.to_csv(output_csv, index=False)
            if source == 'breadcrumb':
                print(f"  💾 Progress saved ({geocoded_count} geocoded)")
            else:
                print("  💾 Progress saved")

        time.sleep(request_pause)  # Rate limiting

    # Final save
    df.to_csv(output_csv, index=False)

    print("\n" + "="*70)
    print("✅ GEOCODING WITH BREADCRUMBS COMPLETE")
    print("="*70)
    print(f"Successfully geocoded: {geocoded_count}")
    print(f"Already had coordinates: {skipped_count}")
    print(f"Failed (no location data): {failed_count}")
    print(f"Total: {len(df)}")

# Run the geocoding pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
