#!/usr/bin/env python3
"""
Sync breadcrumbs from CSV to enriched_data.json and geocode missing properties
"""

import json
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import time

# Europe bounding box in decimal degrees (a coarse rectangle used as a
# sanity check on geocoder results, not a precise border).
EUROPE_BBOX = {
    'lat_min': 35.0,
    'lat_max': 72.0,
    'lon_min': -10.0,
    'lon_max': 40.0
}

def is_in_europe(lat, lon):
    """Check if coordinates fall inside the EUROPE_BBOX rectangle.

    Args:
        lat: Latitude in decimal degrees (number or numeric string).
        lon: Longitude in decimal degrees (number or numeric string).

    Returns:
        True when both values are valid numbers within the inclusive
        bounds of EUROPE_BBOX; False otherwise, including when either
        value is None, empty, non-numeric or NaN.
    """
    # Validate by conversion rather than truthiness: 0.0 is a legitimate
    # coordinate (the prime meridian lies inside the box), so `not lon`
    # must not be used to detect a missing value.
    try:
        lat = float(lat)
        lon = float(lon)
    except (TypeError, ValueError):
        return False

    # NaN fails every comparison, so it is rejected here as well.
    return (EUROPE_BBOX['lat_min'] <= lat <= EUROPE_BBOX['lat_max'] and
            EUROPE_BBOX['lon_min'] <= lon <= EUROPE_BBOX['lon_max'])

def parse_breadcrumb(breadcrumb):
    """Split a '>'-separated breadcrumb into its location components.

    Each component is whitespace-stripped; empty components and known
    property-type labels (matched case-sensitively) are dropped.

    Returns:
        A list of the remaining components in their original order
        (possibly empty), or None when the breadcrumb is falsy or NaN.
    """
    # Guard clauses: nothing to parse for falsy or missing (NaN) input.
    if not breadcrumb:
        return None
    if pd.isna(breadcrumb):
        return None

    # Labels describing the kind of property rather than a place.
    ignored_labels = frozenset((
        'Huis', 'Boerderij', 'Villa', 'House', 'Farm', 'Casa', 'Maison',
        'Finca', 'Terreno', 'Land', 'Property', 'Cottage', 'Estate',
    ))

    kept = []
    for raw_piece in str(breadcrumb).split('>'):
        piece = raw_piece.strip()
        if not piece:
            continue
        if piece in ignored_labels:
            continue
        kept.append(piece)

    return kept

def geocode_with_fallback(geolocator, breadcrumb, retries=3):
    """Geocode a breadcrumb, falling back to progressively broader queries.

    The breadcrumb is parsed into parts and queried as the last three
    parts, then the last two, then the last one. The first result that
    lies inside Europe is returned.

    Args:
        geolocator: Object exposing ``geocode(query, exactly_one=...,
            timeout=...)`` (a geopy geocoder in this script).
        breadcrumb: Raw '>'-separated breadcrumb string.
        retries: Maximum number of tries per query when the geocoder
            times out or reports a service error.

    Returns:
        ``(lat, lon, query_text)`` for the first accepted result, or
        ``(None, None, None)`` when no query produced a usable location.
    """
    parts = parse_breadcrumb(breadcrumb)
    if not parts:
        return None, None, None

    # Most specific query first, then progressively more general ones.
    attempts = [', '.join(parts[-n:]) for n in (3, 2, 1) if len(parts) >= n]

    for attempt_text in attempts:
        for retry in range(retries):
            try:
                time.sleep(1)  # Rate limiting
                location = geolocator.geocode(attempt_text, exactly_one=True, timeout=10)
            except (GeocoderTimedOut, GeocoderServiceError) as e:
                # Transient failure: retry the same query until the
                # retry budget is exhausted.
                if retry < retries - 1:
                    print(f"   ⚠️ Timeout, retrying ({retry + 1}/{retries})...")
                    time.sleep(2)
                    continue
                print(f"   ❌ Geocoding error: {e}")
                break
            except Exception as e:
                # Best-effort: an unexpected failure abandons this query
                # but still lets the broader fallbacks run.
                print(f"   ❌ Unexpected error: {e}")
                break

            if not location:
                # The service answered with no match. Re-sending the same
                # query would only burn requests, so move to the next,
                # more general query.
                break

            lat, lon = location.latitude, location.longitude

            # Validate it's in Europe
            if is_in_europe(lat, lon):
                print(f"   ✅ Geocoded '{attempt_text}' → ({lat:.6f}, {lon:.6f})")
                return lat, lon, attempt_text

            print(f"   ❌ Rejected '{attempt_text}' → outside Europe ({lat:.6f}, {lon:.6f})")
            break  # Don't retry if outside Europe

    return None, None, None

def _save_properties(properties, path='enriched_data.json'):
    """Write the full property list to *path* as indented JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(properties, f, indent=2)

def main():
    """Sync breadcrumbs from the CSV into enriched_data.json and geocode.

    Loads ``extracted_property_urls.csv`` and ``enriched_data.json`` from
    the current directory. For every property with status 'Active' that
    lacks coordinates, it copies the breadcrumb matched by URL, tries to
    geocode it, and stores the result on the property. Progress is written
    back to ``enriched_data.json`` every 5 properties and once at the end.
    Returns early (after printing a message) if either input is missing.
    """
    print("="*70)
    print("SYNC BREADCRUMBS AND GEOCODE MISSING PROPERTIES")
    print("="*70)

    # Load CSV with breadcrumbs
    try:
        df_csv = pd.read_csv('extracted_property_urls.csv')
        print(f"\n✅ Loaded {len(df_csv)} properties from CSV")
    except FileNotFoundError:
        print("❌ extracted_property_urls.csv not found!")
        return

    # Load enriched_data.json
    try:
        with open('enriched_data.json', 'r', encoding='utf-8') as f:
            properties = json.load(f)
        print(f"✅ Loaded {len(properties)} properties from enriched_data.json")
    except FileNotFoundError:
        print("❌ enriched_data.json not found!")
        return

    # Create URL to breadcrumb mapping from CSV. The column check is done
    # once up front instead of per row; later duplicates of a URL win.
    if {'URL', 'Breadcrumb'} <= set(df_csv.columns):
        url_to_breadcrumb = {
            url: crumb
            for url, crumb in zip(df_csv['URL'], df_csv['Breadcrumb'])
            if pd.notna(crumb)
        }
    else:
        print("⚠️ CSV is missing the 'URL' and/or 'Breadcrumb' column")
        url_to_breadcrumb = {}

    print(f"✅ Found {len(url_to_breadcrumb)} breadcrumbs in CSV\n")

    # Initialize geocoder
    geolocator = Nominatim(user_agent="farmmatch_sync_geocoder")

    # Process properties
    # NOTE(review): the truthiness test below also counts 0 / 0.0 as
    # "missing"; kept as-is because stored data may use 0 as a placeholder
    # -- confirm against the producer of enriched_data.json.
    active = [p for p in properties if p.get('status') == 'Active']
    missing_coords = [p for p in active if not (p.get('lat') and p.get('lon'))]

    print(f"📊 Active properties: {len(active)}")
    print(f"📍 Missing coordinates: {len(missing_coords)}\n")

    updated = 0
    failed = 0

    for i, prop in enumerate(missing_coords, 1):
        url = prop.get('url')
        # `or` covers both an absent key and an explicit null title,
        # which would otherwise fail on slicing.
        title = str(prop.get('title') or 'No title')[:60]

        print(f"\n[{i}/{len(missing_coords)}] {title}...")
        print(f"   URL: {url}")

        # Sync breadcrumb from CSV
        if url in url_to_breadcrumb:
            breadcrumb = url_to_breadcrumb[url]
            prop['breadcrumb'] = breadcrumb
            print(f"   📋 Breadcrumb: {breadcrumb}")

            # Try to geocode
            lat, lon, geocoded_text = geocode_with_fallback(geolocator, breadcrumb)

            # The geocoder signals failure with None; compare explicitly so
            # a valid coordinate of exactly 0.0 is not discarded.
            if lat is not None and lon is not None:
                prop['lat'] = lat
                prop['lon'] = lon
                prop['geocoding_confidence'] = 'medium'
                prop['geocoded_from'] = geocoded_text
                updated += 1
            else:
                print("   ❌ Could not geocode")
                failed += 1
        else:
            print("   ⚠️ No breadcrumb in CSV")
            failed += 1

        # Save progress every 5 properties
        if i % 5 == 0:
            _save_properties(properties)
            print("   💾 Progress saved")

    # Final save
    _save_properties(properties)

    # Count once and reuse for both summary lines.
    with_coords = sum(1 for p in active if p.get('lat') and p.get('lon'))

    print("\n" + "="*70)
    print("📊 RESULTS")
    print("="*70)
    print(f"✅ Successfully geocoded: {updated}")
    print(f"❌ Failed: {failed}")
    print(f"📈 New coverage: {with_coords}/{len(active)}")

    # Calculate percentage
    coverage = (with_coords / len(active) * 100) if active else 0
    print(f"📍 Coverage: {coverage:.1f}%")

# Run the sync only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
