#!/usr/bin/env python3
"""
Validate Coordinates Against Breadcrumb Data
Prevents wrong geocoding by checking if coordinates match expected country/region
"""
import pandas as pd
import sys

# Rectangular latitude/longitude ranges used to sanity-check geocoding results.
# Each row: (breadcrumb labels, English name, latitude range, longitude range).
# A country is listed under every label it may appear as (Dutch and English).
# NOTE(review): each country is a single box — presumably overseas or island
# territories outside the box would be reported as errors; confirm with data.
_COUNTRY_BOXES = (
    (('Frankrijk', 'France'), 'France', (41.0, 51.5), (-5.5, 10.0)),
    (('Spanje', 'Spain'), 'Spain', (35.0, 44.0), (-10.0, 5.0)),
    (('Portugal',), 'Portugal', (36.5, 42.5), (-10.0, -6.0)),
    (('Italië', 'Italy'), 'Italy', (35.0, 47.5), (6.0, 19.0)),
    (('Griekenland', 'Greece'), 'Greece', (34.5, 42.0), (19.0, 29.0)),
    (('Nederland', 'Netherlands'), 'Netherlands', (50.5, 54.0), (3.0, 7.5)),
    (('België', 'Belgium'), 'Belgium', (49.5, 51.5), (2.5, 6.5)),
    (('Duitsland', 'Germany'), 'Germany', (47.0, 55.5), (5.5, 15.5)),
)

# Lookup keyed by breadcrumb label; every label gets its own entry of the form
# {'lat': (min, max), 'lon': (min, max), 'name': <English name>}.
COUNTRY_BOUNDS = {
    label: {'lat': lat_range, 'lon': lon_range, 'name': english_name}
    for labels, english_name, lat_range, lon_range in _COUNTRY_BOXES
    for label in labels
}

def get_country_from_breadcrumb(breadcrumb):
    """Extract the country label from a breadcrumb string.

    The country is taken to be the first segment of a '>'-separated trail,
    e.g. ``'Frankrijk > Provence > Var'`` yields ``'Frankrijk'``.

    Args:
        breadcrumb: Breadcrumb text, or a missing value (None / NaN).

    Returns:
        The first segment with surrounding whitespace removed, or None when
        the breadcrumb is missing, is not a string, or has an empty first
        segment.
    """
    # Missing CSV cells arrive as float NaN / None; anything that is not text
    # cannot carry a country label, so treat it the same as "missing".
    if not isinstance(breadcrumb, str):
        return None

    # Split on the bare '>' and strip afterwards so trails written without
    # surrounding spaces ('France>Provence') work like 'France > Provence'.
    country = breadcrumb.partition('>')[0].strip()

    # An empty first segment is not a usable country name.
    return country or None

def validate_coordinate(lat, lon, country):
    """Report whether a coordinate pair lies inside a country's bounding box.

    Returns a ``(verdict, message)`` tuple. ``verdict`` is ``None`` when no
    check could be made (latitude, longitude or country missing, or the
    country has no entry in ``COUNTRY_BOUNDS``), ``True`` when both values
    fall within the inclusive ranges, and ``False`` otherwise.
    """
    nothing_to_check = pd.isna(lat) or pd.isna(lon) or not country
    if nothing_to_check:
        return None, "No data to validate"

    box = COUNTRY_BOUNDS.get(country)
    if box is None:
        return None, f"Unknown country: {country}"

    lat_range, lon_range = box['lat'], box['lon']
    lat_low, lat_high = lat_range
    lon_low, lon_high = lon_range

    # Both range checks are inclusive at either end.
    inside_lat = lat_low <= lat <= lat_high
    inside_lon = lon_low <= lon <= lon_high

    if not (inside_lat and inside_lon):
        return False, f"❌ Coordinates OUTSIDE {box['name']} (expected lat {lat_range}, lon {lon_range})"
    return True, f"✅ Coordinates in {box['name']}"

def main():
    """Validate geocoded coordinates against the country in each breadcrumb.

    Reads ``analysis_output.csv`` (columns used: URL, Latitude, Longitude) and
    ``extracted_property_urls.csv`` (columns used: URL, Breadcrumb), prints a
    report, and writes ``coordinate_validation_errors.csv`` when at least one
    property lies outside the bounds of its breadcrumb country.

    Properties whose breadcrumb country has no known bounds cannot be checked;
    they are reported separately and are NOT counted as valid.

    Returns:
        1 if any property has out-of-bounds coordinates, otherwise 0.
    """
    # Load data
    df_analysis = pd.read_csv('analysis_output.csv')
    df_breadcrumbs = pd.read_csv('extracted_property_urls.csv')

    print("="*70)
    print("🔍 COORDINATE VALIDATION")
    print("="*70)
    print(f"Checking {len(df_analysis)} properties against breadcrumb data\n")

    # Build the URL -> breadcrumb lookup once instead of scanning the whole
    # breadcrumb frame for every property. keep='first' matches taking the
    # first matching row; rows without a URL can never match and are dropped.
    breadcrumb_by_url = (
        df_breadcrumbs.dropna(subset=['URL'])
        .drop_duplicates(subset='URL', keep='first')
        .set_index('URL')['Breadcrumb']
        .to_dict()
    )

    errors = []
    warnings = []
    validated = 0

    for _, row in df_analysis.iterrows():
        url = row['URL']
        lat = row.get('Latitude')
        lon = row.get('Longitude')

        # Skip if there is no URL to look up or no coordinates to check
        if pd.isna(url) or pd.isna(lat) or pd.isna(lon):
            continue

        # Get breadcrumb
        if url not in breadcrumb_by_url:
            continue
        breadcrumb = breadcrumb_by_url[url]

        country = get_country_from_breadcrumb(breadcrumb)
        if not country:
            continue

        # Validate
        is_valid, message = validate_coordinate(lat, lon, country)

        if is_valid is None:
            # No verdict (e.g. country without known bounds): record it as a
            # warning rather than silently counting it as a valid coordinate.
            warnings.append({'url': url, 'country': country, 'message': message})
            continue

        validated += 1
        if is_valid:
            continue

        # Last path segment of the URL; trailing slashes are ignored so the
        # id is not empty for URLs ending in '/'.
        prop_id = str(url).rstrip('/').split('/')[-1]
        errors.append({
            'property_id': prop_id,
            'url': url,
            'lat': lat,
            'lon': lon,
            'expected_country': country,
            'breadcrumb': breadcrumb,
            'message': message
        })
        print(f"❌ {prop_id}: {message}")
        print(f"   Breadcrumb: {breadcrumb}")
        print(f"   Coordinates: ({lat:.4f}, {lon:.4f})")
        print()

    # Summary
    print("="*70)
    print("📊 VALIDATION SUMMARY")
    print("="*70)
    print(f"Properties validated: {validated}")
    print(f"✅ Valid coordinates: {validated - len(errors)}")
    print(f"❌ Invalid coordinates: {len(errors)}")

    if warnings:
        unchecked = sorted({str(item['country']) for item in warnings})
        print(f"⚠️  Not validated (no bounds for country): {len(warnings)}")
        print(f"   Countries: {', '.join(unchecked)}")

    if errors:
        print(f"\n⚠️  {len(errors)} properties have coordinates outside their expected country!")
        print(f"   These need to be re-geocoded or manually corrected.")
        print(f"\n   To fix automatically:")
        print(f"   1. Run: python3 bulletproof_geocoding.py --force-regeocode")
        print(f"   2. Or fix manually in analysis_output.csv")

        # Save error report
        error_df = pd.DataFrame(errors)
        error_df.to_csv('coordinate_validation_errors.csv', index=False)
        print(f"\n   📄 Error report saved to: coordinate_validation_errors.csv")

        return 1  # Exit with error code

    print(f"\n✅ All coordinates validated successfully!")
    return 0

# Run the validation when executed as a script; main()'s return value (1 when
# out-of-bounds coordinates were found, 0 otherwise) becomes the exit status.
if __name__ == '__main__':
    sys.exit(main())
