#!/usr/bin/env python3
"""
Bulletproof Geocoding System for FarmMatch

This system prevents geocoding errors through multiple validation layers:
1. Breadcrumb validation - Remove property types before geocoding
2. Geographic validation - Reject coordinates outside Europe
3. Country validation - Verify coordinates match expected country
4. Fallback strategy - Multiple attempts with increasing generality
5. Quality scoring - Track geocoding confidence
"""

import pandas as pd
import json
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import time
from pathlib import Path

# Strict latitude/longitude rectangle used to decide whether a coordinate
# counts as "inside Europe". Values are in decimal degrees.
EUROPE_BBOX = dict(
    lat_min=35.0,    # southern edge (roughly Crete)
    lat_max=72.0,    # northern edge (roughly North Cape)
    lon_min=-10.0,   # western edge (roughly Portugal)
    lon_max=40.0,    # eastern edge (into Russia)
)

# Country names as they appear in breadcrumbs, mapped to their English names.
# Names missing from this table are used unchanged by lookups that fall back
# to the original value.
COUNTRY_MAPPINGS = {
    # Western and southern Europe
    "Frankrijk": "France",
    "Spanje": "Spain",
    "Italië": "Italy",
    "Portugal": "Portugal",
    "Griekenland": "Greece",
    "Duitsland": "Germany",
    "België": "Belgium",
    "Nederland": "Netherlands",
    "Oostenrijk": "Austria",
    "Zwitserland": "Switzerland",
    # Central and eastern Europe
    "Kroatië": "Croatia",
    "Slovenië": "Slovenia",
    "Bulgarije": "Bulgaria",
    "Roemenië": "Romania",
    "Tsjechië": "Czech Republic",
    "Slowakije": "Slovakia",
    "Hongarije": "Hungary",
    "Polen": "Poland",
    # Ireland and the nations of Great Britain
    "Ierland": "Ireland",
    "Schotland": "Scotland",
    "Engeland": "England",
    "Wales": "Wales",
}

# Words that describe a kind of property rather than a place. A breadcrumb
# segment equal to one of these must never be sent to the geocoder.
# Each word is listed once, under the first language it belongs to; words
# shared between languages are noted on the later group.
PROPERTY_TYPES = {
    # Dutch
    "Huis", "Boerderij", "Vrijstaand huis", "Huis in dorp",
    "Villa", "Appartement", "Grond", "Perceel", "Vakantiehuisje",
    "Vastgoed", "Agrarische exploitatie", "Terrein", "Woning",
    "Landhuis", "Bouwgrond", "Eengezinswoning", "Herenhuis",
    "Boerderijtje", "Landgoed", "Cottage", "Chalet", "Finca",
    # English ("Villa" and "Cottage" are already listed above)
    "House", "Farm", "Farmhouse", "Plot", "Land",
    "Property", "Estate", "Agricultural holding",
    "Detached house", "Village house", "Holiday cottage",
    "Real estate", "Agricultural property", "Terrain",
    "Country house", "Building plot", "Manor house", "Mansion",
    # Spanish ("Finca" is already listed above)
    "Casa", "Cortijo", "Parcela", "Terreno",
    # French ("Terrain" is already listed above)
    "Maison", "Ferme", "Propriété",
    # Italian ("Casa" and "Terreno" are already listed above)
    "Casale", "Proprietà",
    # Portuguese ("Casa" and "Terreno" are already listed above)
    "Quinta", "Propriedade",
}

class BulletproofGeocoder:
    """Geocoder with multiple validation layers.

    Wraps a geopy ``Nominatim`` client and only accepts a result when it lies
    inside ``EUROPE_BBOX`` and, if an expected country is supplied, when that
    country appears in the returned address.

    Attributes:
        geolocator: The ``Nominatim`` client used for every lookup.
        stats: Outcome counters. They are incremented per event (for example
            once per rejected query), not once per property, so a single
            property can bump more than one counter.
    """

    def __init__(self):
        self.geolocator = Nominatim(user_agent="farmmatch_bulletproof_geocoder")
        self.stats = {
            'success': 0,
            'failed_property_type': 0,
            'failed_outside_europe': 0,
            'failed_country_mismatch': 0,
            'failed_no_breadcrumb': 0,
            'failed_geocoding_error': 0
        }

    def is_property_type(self, text):
        """Check if text is a property type (should NOT be geocoded).

        The comparison ignores case and surrounding whitespace and requires
        the whole text to equal an entry of ``PROPERTY_TYPES``.

        Returns:
            True if ``text`` is a known property type, False otherwise
            (including for empty or ``None`` input).
        """
        if not text:
            return False
        text_clean = text.strip().lower()
        return any(text_clean == ptype.lower() for ptype in PROPERTY_TYPES)

    def parse_breadcrumb(self, breadcrumb):
        """Parse a ``>``-separated breadcrumb into location components.

        Args:
            breadcrumb: Text such as ``"Country > Region > City > Type"``.

        Returns:
            A dict with keys ``country``, ``region`` (may be None), ``city``
            and ``original``, or None when the breadcrumb is missing, has
            fewer than two usable segments, or its last location segment is
            itself a property type.
        """
        if not breadcrumb or pd.isna(breadcrumb):
            return None

        # Split and clean; empty segments (e.g. from a trailing '>') carry no
        # location information and must not end up as the "city".
        parts = [p.strip() for p in str(breadcrumb).split('>')]
        parts = [p for p in parts if p]
        if len(parts) < 2:
            return None

        # Remove property type from end
        if self.is_property_type(parts[-1]):
            parts = parts[:-1]

        if len(parts) < 2:
            return None

        # Extract components
        country = parts[0]  # First part is always country
        city = parts[-1]    # Last part (after removing property type) is city
        region = parts[-2] if len(parts) >= 3 else None

        # Validate: city should NOT be a property type
        if self.is_property_type(city):
            return None

        # Validate: region should NOT be a property type; step one segment
        # further left when it is and such a segment exists.
        if region and self.is_property_type(region):
            region = parts[-3] if len(parts) >= 4 else None

        return {
            'country': country,
            'region': region,
            'city': city,
            'original': breadcrumb
        }

    def is_in_europe(self, lat, lon):
        """Check if coordinates are within the Europe bounding box.

        Returns False when either coordinate is None.
        """
        if lat is None or lon is None:
            return False
        return (EUROPE_BBOX['lat_min'] <= lat <= EUROPE_BBOX['lat_max'] and
                EUROPE_BBOX['lon_min'] <= lon <= EUROPE_BBOX['lon_max'])

    def geocode_with_validation(self, query, expected_country=None):
        """Geocode a query and validate the result.

        A search bounded to the Europe box is tried first, then an unbounded
        search. The result must lie inside ``EUROPE_BBOX`` and, when
        ``expected_country`` is given, the address must contain either that
        name or its English form from ``COUNTRY_MAPPINGS``.

        Args:
            query: Free-text location query.
            expected_country: Country name as found in the breadcrumb, or
                None to skip the country check.

        Returns:
            ``(lat, lon, address, confidence)`` on success, where confidence
            is 0.95 when the country was verified and 0.7 otherwise, or
            ``(None, None, None, 0)`` when nothing acceptable was found or
            the geocoding service raised an error.
        """
        try:
            # geopy interprets point tuples as (latitude, longitude), so the
            # box corners must be given in that order.
            europe_viewbox = [
                (EUROPE_BBOX['lat_min'], EUROPE_BBOX['lon_min']),
                (EUROPE_BBOX['lat_max'], EUROPE_BBOX['lon_max'])
            ]

            # Request English place names so the English country names in
            # COUNTRY_MAPPINGS can actually appear in the returned address.
            # Try bounded search first (Europe only)
            location = self.geolocator.geocode(
                query,
                exactly_one=True,
                viewbox=europe_viewbox,
                bounded=True,
                language='en',
                timeout=10
            )

            if not location:
                # Try unbounded but still validate
                location = self.geolocator.geocode(
                    query, exactly_one=True, language='en', timeout=10
                )

            if not location:
                return None, None, None, 0

            # Validation 1: Must be in Europe
            if not self.is_in_europe(location.latitude, location.longitude):
                print(f"      ❌ REJECTED: Outside Europe ({location.latitude}, {location.longitude})")
                self.stats['failed_outside_europe'] += 1
                return None, None, None, 0

            # Validation 2: Check country match if provided
            country_verified = False
            if expected_country:
                address_lower = location.address.lower()
                country_english = COUNTRY_MAPPINGS.get(expected_country, expected_country).lower()

                # Accept either the English name or the breadcrumb's own name.
                country_verified = (country_english in address_lower or
                                    expected_country.lower() in address_lower)
                if not country_verified:
                    print(f"      ❌ REJECTED: Country mismatch (expected {expected_country}, got {location.address})")
                    self.stats['failed_country_mismatch'] += 1
                    return None, None, None, 0

            # Confidence follows the validation outcome above, so a match via
            # the English mapping counts as verified too.
            # 0.95: country matches; 0.7: in Europe but no country verification.
            confidence = 0.95 if country_verified else 0.7

            return location.latitude, location.longitude, location.address, confidence

        except (GeocoderTimedOut, GeocoderServiceError) as e:
            print(f"      ⚠️ Geocoding error: {str(e)[:50]}")
            self.stats['failed_geocoding_error'] += 1
            return None, None, None, 0

    def geocode_property(self, url, breadcrumb, location_from_favorites=None):
        """
        Geocode a single property with bulletproof validation

        The breadcrumb is tried first with queries of decreasing specificity
        (city+region+country, city+country, city). If that yields nothing, or
        there is no breadcrumb, ``location_from_favorites`` is tried. An
        invalid breadcrumb returns a failure immediately without the
        favorites fallback.

        Returns dict with:
            - lat, lon: coordinates (or None)
            - address: full address
            - extracted_location: city or favorites text used, else 'Unknown'
            - location_source: 'breadcrumb', 'favorites', or 'failed'
            - confidence: 0.0-1.0
            - reason: why it succeeded/failed
        """
        # str()/rstrip keep the label usable for URLs with a trailing slash.
        print(f"\n🔍 Geocoding: {str(url).rstrip('/').split('/')[-1]}")

        result = {
            'lat': None,
            'lon': None,
            'address': None,
            'extracted_location': 'Unknown',
            'location_source': 'failed',
            'confidence': 0.0,
            'reason': 'No geocoding attempted'
        }

        # Strategy 1: Breadcrumb (highest priority)
        if breadcrumb and not pd.isna(breadcrumb):
            print(f"   📍 Breadcrumb: {breadcrumb}")

            parsed = self.parse_breadcrumb(breadcrumb)
            if not parsed:
                print(f"      ❌ Invalid breadcrumb (property type at end)")
                self.stats['failed_property_type'] += 1
                result['reason'] = 'Breadcrumb ends with property type'
                return result

            print(f"      Parsed: {parsed['city']}, {parsed['region'] or '[no region]'}, {parsed['country']}")

            # Try multiple query strategies
            queries = []

            # Query 1: City, Region, Country
            if parsed['region']:
                queries.append(f"{parsed['city']}, {parsed['region']}, {parsed['country']}")

            # Query 2: City, Country
            queries.append(f"{parsed['city']}, {parsed['country']}")

            # Query 3: Just City (least specific)
            queries.append(parsed['city'])

            # Try each query
            for i, query in enumerate(queries, 1):
                print(f"      Try {i}/{len(queries)}: {query}")
                lat, lon, address, confidence = self.geocode_with_validation(
                    query,
                    expected_country=parsed['country']
                )

                # Compare against None: 0.0 is a valid coordinate (the prime
                # meridian runs through Europe).
                if lat is not None and lon is not None:
                    print(f"      ✅ SUCCESS: {address[:80]}")
                    print(f"      📍 {lat}, {lon} (confidence: {confidence:.2f})")
                    result.update({
                        'lat': lat,
                        'lon': lon,
                        'address': address,
                        'extracted_location': parsed['city'],
                        'location_source': 'breadcrumb',
                        'confidence': confidence,
                        'reason': f'Geocoded with query: {query}'
                    })
                    self.stats['success'] += 1
                    return result

                time.sleep(0.5)  # Rate limiting between attempts

        else:
            print(f"   ⚠️ No breadcrumb data")
            self.stats['failed_no_breadcrumb'] += 1
            result['reason'] = 'No breadcrumb data available'

        # Strategy 2: Favorites location (fallback)
        # A missing CSV cell arrives as NaN, which is truthy; treat it as
        # absent instead of sending it to the geocoder as a query.
        if location_from_favorites is not None and pd.isna(location_from_favorites):
            location_from_favorites = None

        if location_from_favorites and location_from_favorites != "Unknown":
            print(f"   📍 Fallback to favorites location: {location_from_favorites}")
            lat, lon, address, confidence = self.geocode_with_validation(
                location_from_favorites
            )

            if lat is not None and lon is not None:
                print(f"      ✅ SUCCESS: {address[:80]}")
                result.update({
                    'lat': lat,
                    'lon': lon,
                    'address': address,
                    'extracted_location': location_from_favorites,
                    'location_source': 'favorites',
                    'confidence': confidence * 0.5,  # Lower confidence for favorites
                    'reason': 'Geocoded from favorites location'
                })
                self.stats['success'] += 1
                return result

        print(f"   ❌ FAILED: Could not geocode property")
        return result

    def print_stats(self):
        """Print geocoding statistics.

        The success rate is computed against the sum of all counters.
        """
        total = sum(self.stats.values())
        print("\n" + "="*70)
        print("📊 BULLETPROOF GEOCODING STATISTICS")
        print("="*70)
        print(f"✅ Successful: {self.stats['success']}")
        print(f"❌ Failed:")
        print(f"   - Property type in breadcrumb: {self.stats['failed_property_type']}")
        print(f"   - Outside Europe bbox: {self.stats['failed_outside_europe']}")
        print(f"   - Country mismatch: {self.stats['failed_country_mismatch']}")
        print(f"   - No breadcrumb data: {self.stats['failed_no_breadcrumb']}")
        print(f"   - Geocoding error: {self.stats['failed_geocoding_error']}")
        if total > 0:
            print(f"\n📈 Success rate: {self.stats['success']/total*100:.1f}%")
        else:
            print("\n📈 No geocoding attempts")

def main():
    """Run bulletproof geocoding on all properties.

    Reads ``analysis_output.csv`` and, if present,
    ``extracted_property_urls.csv`` (columns ``URL`` and ``Breadcrumb``) from
    the current directory. Every row whose coordinates are missing or outside
    ``EUROPE_BBOX`` is re-geocoded, and the result is written back to
    ``analysis_output.csv`` every 10 processed properties and once at the end.
    Returns early, without writing, when ``analysis_output.csv`` is missing.
    """

    print("="*70)
    print("🛡️ BULLETPROOF GEOCODING SYSTEM")
    print("="*70)

    # Load data
    try:
        df = pd.read_csv('analysis_output.csv')
    except FileNotFoundError:
        print("❌ analysis_output.csv not found!")
        return

    try:
        breadcrumbs_df = pd.read_csv('extracted_property_urls.csv')
        breadcrumb_map = dict(zip(breadcrumbs_df['URL'], breadcrumbs_df['Breadcrumb']))
        print(f"✅ Loaded {breadcrumbs_df['Breadcrumb'].notna().sum()} breadcrumbs")
    except FileNotFoundError:
        breadcrumb_map = {}
        print("⚠️ No breadcrumb data found - run extract_breadcrumbs.py first")

    geocoder = BulletproofGeocoder()

    # Filter: only geocode properties without coordinates OR with bad coordinates.
    # The two masks are disjoint so the counts printed below add up to the total.
    missing = df['Latitude'].isna() | df['Longitude'].isna()
    inside_europe = (
        df['Latitude'].between(EUROPE_BBOX['lat_min'], EUROPE_BBOX['lat_max']) &
        df['Longitude'].between(EUROPE_BBOX['lon_min'], EUROPE_BBOX['lon_max'])
    )
    outside = ~missing & ~inside_europe
    to_geocode = df[missing | outside]

    print(f"\n📊 Properties to geocode: {len(to_geocode)}/{len(df)}")
    print(f"   - Missing coordinates: {missing.sum()}")
    print(f"   - Outside Europe: {outside.sum()}")

    # Geocode each property
    for processed, (idx, row) in enumerate(to_geocode.iterrows(), 1):
        url = row['URL']
        breadcrumb = breadcrumb_map.get(url)

        # An empty CSV cell is read as NaN; pass None so it is not mistaken
        # for a real fallback location.
        location = row.get('Locatie')
        if location is not None and pd.isna(location):
            location = None

        result = geocoder.geocode_property(url, breadcrumb, location)

        # Update dataframe (0.0 is a valid coordinate, so compare against None)
        if result['lat'] is not None and result['lon'] is not None:
            df.at[idx, 'Latitude'] = result['lat']
            df.at[idx, 'Longitude'] = result['lon']
            df.at[idx, 'ExtractedLocation'] = result['extracted_location']
            df.at[idx, 'LocationSource'] = f"{result['location_source']}_{result['confidence']:.2f}"

        # Save progress every 10 properties. Count processed rows directly:
        # the geocoder's stats can move by zero or by several per property.
        if processed % 10 == 0:
            df.to_csv('analysis_output.csv', index=False)
            print(f"\n💾 Progress saved")

        time.sleep(1)  # Rate limiting

    # Final save
    df.to_csv('analysis_output.csv', index=False)

    geocoder.print_stats()

    print("\n✅ Bulletproof geocoding complete!")
    print("🔄 Run: python3 parse_criteria.py to update enriched_data.json")

# Run the geocoding pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
