#!/usr/bin/env python3
"""
Unified Geocoding Module for Paradisomatch

Combines all geocoding functionality into one place:
- Breadcrumb extraction from property pages
- Multiple geocoding strategies with fallbacks
- Europe validation
- Database integration
- Title-based geocoding as last resort
"""

import asyncio
from playwright.async_api import async_playwright
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import time
import re
from typing import Optional, Tuple, List
from db_manager import DatabaseManager

# Coarse latitude/longitude rectangle (decimal degrees) treated as "Europe".
# The place names are the rough landmarks each edge was chosen from.
EUROPE_BBOX = dict(
    lat_min=35.0,    # southern edge (Crete)
    lat_max=72.0,    # northern edge (North Cape)
    lon_min=-10.0,   # western edge (Portugal)
    lon_max=40.0,    # eastern edge (Russia)
)

# Words that describe a kind of property rather than a place.
# Matching is exact and case-sensitive; entries are listed alphabetically.
PROPERTY_TYPES = {
    'Appartement',
    'Boerderij',
    'Bouwgrond',
    'Casa',
    'Cottage',
    'Estate',
    'Farm',
    'Finca',
    'Grond',
    'House',
    'Huis',
    'Land',
    'Landhuis',
    'Maison',
    'Perceel',
    'Plot',
    'Property',
    'Terrain',
    'Terreno',
    'Vastgoed',
    'Villa',
    'Woning',
}


class UnifiedGeocoder:
    """Geocode property listings through a chain of fallback strategies.

    :meth:`geocode_property` tries, in order:

    1. a breadcrumb string supplied by the caller,
    2. a breadcrumb scraped from the property page with Playwright,
    3. location hints found in the listing title (last resort).

    A geocoding result is only accepted when :meth:`is_in_europe` approves it.

    Attributes:
        geolocator: geopy ``Nominatim`` client used for every lookup.
        db: ``DatabaseManager`` when a ``db_path`` was given, otherwise ``None``.
        stats: Per-property outcome counters printed by :meth:`print_stats`.
            Exactly one ``success_*`` / ``failed_*`` counter is incremented
            per :meth:`geocode_property` call, so they add up to ``total``.
    """

    # A capitalised place-name token: ASCII and Latin-1 accented letters plus
    # hyphens and apostrophes (e.g. "VAL-D'ANAST", "Málaga").
    _NAME = r"[A-ZÀ-ÖØ-Þ][A-Za-zÀ-ÖØ-öø-ÿ\-']+"
    # Stand-alone 5-digit postal code followed by a place name.
    _POSTAL_CODE_RE = re.compile(r"\b\d{5}\s+(" + _NAME + r")")
    # The word "in" (this also covers "koop in") followed by one or two name tokens.
    _IN_LOCATION_RE = re.compile(r"\bin\s+(" + _NAME + r"(?:\s+" + _NAME + r")?)")

    # Breadcrumb link texts that are site navigation, not locations (lower-case).
    _BREADCRUMB_SKIP = ('home', 'properstar', 'properties')

    # Internal failure reasons used to pick the right ``failed_*`` counter.
    _REASON_OUTSIDE = 'outside_europe'
    _REASON_ERROR = 'geocoding_error'

    def __init__(self, user_agent="paradisomatch_unified_geocoder", db_path=None):
        """Create the Nominatim client and, optionally, the database manager.

        Args:
            user_agent: User agent string passed to ``Nominatim``.
            db_path: Path handed to ``DatabaseManager``; when falsy no
                database is opened and ``self.db`` is ``None``.
        """
        self.geolocator = Nominatim(user_agent=user_agent)
        self.db = DatabaseManager(db_path) if db_path else None
        self.stats = {
            'total': 0,
            'success_breadcrumb': 0,
            'success_title': 0,
            'failed_no_breadcrumb': 0,
            'failed_outside_europe': 0,
            'failed_geocoding_error': 0
        }

    def is_in_europe(self, lat: Optional[float], lon: Optional[float]) -> bool:
        """Return True when (lat, lon) lies inside ``EUROPE_BBOX``.

        ``None`` for either coordinate yields False. A coordinate of exactly
        ``0.0`` is a real value (e.g. the Greenwich meridian) and is checked
        against the box like any other number.
        """
        if lat is None or lon is None:
            return False
        return (EUROPE_BBOX['lat_min'] <= lat <= EUROPE_BBOX['lat_max'] and
                EUROPE_BBOX['lon_min'] <= lon <= EUROPE_BBOX['lon_max'])

    def parse_breadcrumb(self, breadcrumb: str) -> Optional[List[str]]:
        """Split a ``"A > B > C"`` breadcrumb into location parts.

        Segments are stripped; empty segments and segments that exactly match
        an entry of ``PROPERTY_TYPES`` are dropped.

        Returns:
            The remaining parts in their original order, or ``None`` when the
            breadcrumb is empty or nothing is left after filtering.
        """
        if not breadcrumb:
            return None

        segments = (segment.strip() for segment in str(breadcrumb).split('>'))
        parts = [s for s in segments if s and s not in PROPERTY_TYPES]
        return parts or None

    def extract_location_from_title(self, title: str) -> Optional[str]:
        """Extract a geocodable location hint from a property title.

        Two patterns are tried in order:

        1. A stand-alone 5-digit postal code followed by a place name
           (``"35330 VAL-D'ANAST"``); the result is ``"<name>, France"``.
        2. The word ``in`` followed by one or two capitalised words
           (``"te koop in Santa Maria"``); the result is those words.

        Both patterns are anchored on word boundaries so that the letters
        "in" inside another word ("Cabin Near ...") or the tail of a longer
        digit run are not mistaken for a hint.

        Returns:
            The location text, or ``None`` when the title is empty or no
            pattern matches.
        """
        if not title:
            return None

        postal = self._POSTAL_CODE_RE.search(title)
        if postal:
            return f"{postal.group(1)}, France"

        worded = self._IN_LOCATION_RE.search(title)
        if worded:
            return worded.group(1)

        return None

    async def extract_breadcrumb_from_page(self, page, url: str) -> Optional[str]:
        """Load ``url`` in ``page`` and read a breadcrumb string from it.

        The first element matching one of the known breadcrumb selectors is
        used; the text of its links (minus navigation entries such as "Home")
        is joined with ``" > "``. When that yields nothing, the content of
        the ``og:locality`` meta tag is returned instead.

        Args:
            page: Playwright page object used for navigation and queries.
            url: Address of the property page.

        Returns:
            The breadcrumb or locality text, or ``None`` when neither is
            found or any error occurs (the error is printed, not raised).
        """
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=30000)
            # Fixed 2s pause after DOMContentLoaded before querying the page.
            await page.wait_for_timeout(2000)

            container = None
            for selector in (".breadcrumb-container",
                             "nav[aria-label='breadcrumb']",
                             ".breadcrumb"):
                container = await page.query_selector(selector)
                if container:
                    break

            if container:
                parts = []
                for link in await container.query_selector_all("a"):
                    text = (await link.inner_text()).strip()
                    if text and text.lower() not in self._BREADCRUMB_SKIP:
                        parts.append(text)

                if parts:
                    return " > ".join(parts)

            # Fallback: meta tags
            locality_meta = await page.query_selector("meta[property='og:locality']")
            if locality_meta:
                locality = await locality_meta.get_attribute("content")
                if locality:
                    return locality

            return None

        except Exception as e:
            # Best-effort scraping: report and let the caller fall back.
            print(f"  ⚠️ Error extracting breadcrumb: {str(e)[:100]}")
            return None

    def _geocode_query(self, query: str, retries: int = 3) -> Tuple[Optional[float], Optional[float], Optional[str]]:
        """Geocode one query string; return ``(lat, lon, failure_reason)``.

        Blocks the calling thread: sleeps 1s before every request (rate
        limiting) and 2s before retrying after a timeout/service error.
        Only those transient errors are retried; a "not found" answer or a
        result outside Europe is final for this query.

        ``failure_reason`` is ``None`` on success and when nothing was found,
        otherwise one of ``_REASON_OUTSIDE`` / ``_REASON_ERROR``.
        """
        for attempt in range(retries):
            try:
                time.sleep(1)  # Rate limiting
                location = self.geolocator.geocode(query, exactly_one=True, timeout=10)
            except (GeocoderTimedOut, GeocoderServiceError):
                if attempt < retries - 1:
                    time.sleep(2)
                    continue
                return None, None, self._REASON_ERROR
            except Exception as e:
                # Unexpected failure: report it instead of hiding it, then give up on this query.
                print(f"  ⚠️ Geocoding error for '{query}': {str(e)[:100]}")
                return None, None, self._REASON_ERROR

            if not location:
                return None, None, None

            lat, lon = location.latitude, location.longitude
            if self.is_in_europe(lat, lon):
                return lat, lon, None
            return None, None, self._REASON_OUTSIDE

        return None, None, None

    def _geocode_parts(self, location_parts: List[str], retries: int = 3) -> Tuple[Optional[float], Optional[float], Optional[str], Optional[str]]:
        """Geocode breadcrumb parts; return ``(lat, lon, query, failure_reason)``.

        Queries are built from the trailing parts, longest first: the last
        three joined by ", ", then the last two, then the last one alone.
        On failure the reason is ``_REASON_ERROR`` if any query errored, else
        ``_REASON_OUTSIDE`` if any result fell outside Europe, else ``None``.
        """
        if not location_parts:
            return None, None, None, None

        queries = [', '.join(location_parts[-count:])
                   for count in (3, 2, 1) if len(location_parts) >= count]

        reasons = set()
        for query in queries:
            lat, lon, reason = self._geocode_query(query, retries)
            if lat is not None and lon is not None:
                return lat, lon, query, None
            if reason:
                reasons.add(reason)

        if self._REASON_ERROR in reasons:
            return None, None, None, self._REASON_ERROR
        if self._REASON_OUTSIDE in reasons:
            return None, None, None, self._REASON_OUTSIDE
        return None, None, None, None

    def _geocode_title(self, title: str) -> Tuple[Optional[float], Optional[float], Optional[str], Optional[str]]:
        """Geocode a title hint; return ``(lat, lon, location_text, failure_reason)``."""
        location_text = self.extract_location_from_title(title)
        if not location_text:
            return None, None, None, None

        # Single attempt: the title strategy does not retry on errors.
        lat, lon, reason = self._geocode_query(location_text, retries=1)
        if lat is None or lon is None:
            return None, None, None, reason
        return lat, lon, location_text, None

    def geocode_with_fallback(self, location_parts: List[str], retries: int = 3) -> Tuple[Optional[float], Optional[float], Optional[str]]:
        """Geocode location parts, falling back from specific to general.

        Args:
            location_parts: Location names as returned by
                :meth:`parse_breadcrumb`.
            retries: Maximum requests per query when the geocoder times out
                or reports a service error.

        Returns:
            ``(lat, lon, query)`` for the first query that resolves to a
            point inside Europe, or ``(None, None, None)``.
        """
        lat, lon, query, _reason = self._geocode_parts(location_parts, retries)
        return lat, lon, query

    def geocode_from_breadcrumb(self, breadcrumb: str, confidence: str = 'medium') -> Tuple[Optional[float], Optional[float], Optional[str]]:
        """Parse a breadcrumb string and geocode it.

        Args:
            breadcrumb: Breadcrumb text such as ``"Country > Region > Town"``.
            confidence: Unused; kept so existing callers keep working.

        Returns:
            ``(lat, lon, query)`` on success, else ``(None, None, None)``.
        """
        parts = self.parse_breadcrumb(breadcrumb)
        if not parts:
            return None, None, None

        return self.geocode_with_fallback(parts)

    def geocode_from_title(self, title: str) -> Tuple[Optional[float], Optional[float], Optional[str]]:
        """Geocode using hints found in the title (last resort).

        Returns:
            ``(lat, lon, location_text)`` when the title contains a hint that
            resolves to a point inside Europe, else ``(None, None, None)``.
            Geocoder errors are not raised.
        """
        lat, lon, location_text, _reason = self._geocode_title(title)
        return lat, lon, location_text

    def _breadcrumb_result(self, breadcrumb: str, reasons: set) -> Optional[Tuple[float, float, str, str]]:
        """Geocode one breadcrumb; count a success or record why it failed."""
        parts = self.parse_breadcrumb(breadcrumb)
        if not parts:
            return None

        lat, lon, source, reason = self._geocode_parts(parts)
        if lat is None or lon is None:
            if reason:
                reasons.add(reason)
            return None

        self.stats['success_breadcrumb'] += 1
        return lat, lon, 'medium', f'breadcrumb:{source}'

    async def _scrape_breadcrumb(self, url: str) -> Optional[str]:
        """Open a headless Chromium, scrape the breadcrumb, always close the browser.

        Browser start-up problems are printed and reported as ``None`` so the
        caller can continue with the next strategy.
        """
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(headless=True)
                try:
                    context = await browser.new_context()
                    page = await context.new_page()
                    return await self.extract_breadcrumb_from_page(page, url)
                finally:
                    await browser.close()
        except Exception as e:
            print(f"  ⚠️ Browser error: {str(e)[:100]}")
            return None

    async def geocode_property(self, url: str, title: Optional[str] = None, breadcrumb: Optional[str] = None,
                               extract_breadcrumb: bool = True) -> Tuple[Optional[float], Optional[float], str, str]:
        """
        Geocode a property using all available strategies

        Args:
            url: Property page address (used only when scraping).
            title: Listing title used for the last-resort strategy.
            breadcrumb: Breadcrumb already known for the property, if any.
            extract_breadcrumb: When True, scrape the page for a breadcrumb
                if the supplied one did not produce coordinates.

        Returns: (lat, lon, confidence, source)
            ``confidence`` is ``'medium'`` for breadcrumb results, ``'low'``
            for title results and ``'none'`` on failure, in which case
            ``lat``/``lon`` are ``None`` and ``source`` is ``'failed'``.

        Updates ``self.stats``: ``total`` plus exactly one outcome counter.
        A failure counts as ``failed_geocoding_error`` if any lookup errored,
        else ``failed_outside_europe`` if any result was outside Europe, else
        ``failed_no_breadcrumb``.
        """
        self.stats['total'] += 1
        reasons = set()

        # Strategy 1: Use existing breadcrumb
        if breadcrumb:
            result = self._breadcrumb_result(breadcrumb, reasons)
            if result is not None:
                return result

        # Strategy 2: Extract breadcrumb from page
        if extract_breadcrumb:
            new_breadcrumb = await self._scrape_breadcrumb(url)
            if new_breadcrumb:
                result = self._breadcrumb_result(new_breadcrumb, reasons)
                if result is not None:
                    return result

        # Strategy 3: Use title hints (last resort)
        if title:
            lat, lon, source, reason = self._geocode_title(title)
            if lat is not None and lon is not None:
                self.stats['success_title'] += 1
                return lat, lon, 'low', f'title:{source}'
            if reason:
                reasons.add(reason)

        # Failed: attribute the failure to the most informative reason seen.
        if self._REASON_ERROR in reasons:
            self.stats['failed_geocoding_error'] += 1
        elif self._REASON_OUTSIDE in reasons:
            self.stats['failed_outside_europe'] += 1
        else:
            self.stats['failed_no_breadcrumb'] += 1
        return None, None, 'none', 'failed'

    async def geocode_all_missing(self, save_to_db: bool = True, export_json: bool = True):
        """Geocode every property the database reports as missing coordinates.

        Args:
            save_to_db: Write each successful result back via
                ``db.update_geocoding``.
            export_json: Call ``db.export_to_json`` afterwards (only when
                ``save_to_db`` is also True).

        Raises:
            ValueError: If the geocoder was created without a database.
        """
        if self.db is None:
            raise ValueError("DatabaseManager not initialized")

        missing = self.db.get_missing_geocoded_properties()
        print(f"\n📍 Properties to geocode: {len(missing)}\n")

        if not missing:
            print("✅ All properties have coordinates!")
            return

        for i, prop in enumerate(missing, 1):
            url = prop['url']
            # A stored title may be None; the full title goes to the geocoder,
            # only the progress line is shortened to 80 characters.
            title = prop.get('title') or ''
            breadcrumb = prop.get('breadcrumb')

            print(f"[{i}/{len(missing)}] {title[:80]}...")

            lat, lon, confidence, source = await self.geocode_property(
                url=url,
                title=title,
                breadcrumb=breadcrumb,
                extract_breadcrumb=True
            )

            if lat is not None and lon is not None:
                print(f"   ✅ ({lat:.6f}, {lon:.6f}) - {source}")

                if save_to_db:
                    self.db.update_geocoding(url, lat, lon, confidence, source)
                    print("   💾 Saved to database")
            else:
                print("   ❌ Could not geocode")

        # Export
        if export_json and save_to_db:
            print("\n📤 Exporting to JSON...")
            self.db.export_to_json()

        # Final stats
        self.print_stats()

    def print_stats(self):
        """Print the outcome counters and, when a database is attached, its coverage."""
        print("\n" + "="*70)
        print("📊 GEOCODING STATISTICS")
        print("="*70)
        print(f"Total attempts: {self.stats['total']}")
        print(f"✅ Success (breadcrumb): {self.stats['success_breadcrumb']}")
        print(f"✅ Success (title): {self.stats['success_title']}")
        print(f"❌ Failed (no breadcrumb): {self.stats['failed_no_breadcrumb']}")
        print(f"❌ Failed (outside Europe): {self.stats['failed_outside_europe']}")
        print(f"❌ Failed (geocoding error): {self.stats['failed_geocoding_error']}")

        if self.db is not None:
            geo_stats = self.db.get_geocoding_stats()
            print(f"\n📍 Current coverage: {geo_stats['geocoded']}/{geo_stats['total']} ({geo_stats['coverage_percent']}%)")

# CLI Interface
async def main():
    """Command-line entry point.

    ``--missing`` geocodes every property the database lists as missing
    coordinates; ``--url`` geocodes a single property page and prints the
    result. Without either flag the help text is printed and nothing else
    is created or opened.

    ``--no-db`` only disables *saving*: ``--missing`` still opens the
    database because that is where the list of properties comes from.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Unified Geocoding for Paradisomatch')
    parser.add_argument('--missing', action='store_true', help='Geocode all missing properties')
    parser.add_argument('--url', type=str, help='Geocode a specific property URL')
    parser.add_argument('--no-db', action='store_true', help="Don't save to database")
    parser.add_argument('--no-export', action='store_true', help="Don't export to JSON")

    args = parser.parse_args()

    if not (args.missing or args.url):
        parser.print_help()
        return

    # --missing must read from the database even when results are not saved.
    needs_db = args.missing or not args.no_db
    geocoder = UnifiedGeocoder(db_path='paradisomatch.db' if needs_db else None)

    if args.missing:
        # Geocode all missing
        await geocoder.geocode_all_missing(
            save_to_db=not args.no_db,
            export_json=not args.no_export
        )
        return

    # Geocode single property
    lat, lon, confidence, source = await geocoder.geocode_property(args.url)
    if lat is not None and lon is not None:
        print(f"✅ ({lat:.6f}, {lon:.6f}) - Confidence: {confidence}, Source: {source}")
    else:
        print("❌ Could not geocode")


# Start the async CLI on a fresh event loop only when this file is executed
# as a script; importing the module does not run it.
if __name__ == '__main__':
    asyncio.run(main())
