"""
Enhanced geocoding that extracts location from Properstar pages:
1. Municipality/council names from the page
2. Embedded map coordinates
3. Breadcrumb location data
4. Structured JSON-LD data
"""
import pandas as pd
import requests
import time
import json
import re
from bs4 import BeautifulSoup
from urllib.parse import quote, parse_qs, urlparse

# Google Maps embed URLs carry the position as ?q=lat,lon, ?ll=lat,lon or ?center=lat,lon
_MAPS_URL_COORDS = re.compile(r'[?&](q|ll|center)=(-?\d+\.?\d*),(-?\d+\.?\d*)')

# Inline-script patterns, tried in this order for every <script>:
# (compiled regex, value stored in coords['source'], label used in the log line)
_SCRIPT_COORD_PATTERNS = (
    # e.g.  lat: 40.123, lng: -0.456
    (re.compile(r'lat["\s:]+(-?\d+\.?\d*)[,\s]+["\s]*lng["\s:]+(-?\d+\.?\d*)'),
     'javascript_data', 'JavaScript'),
    # e.g.  "latitude": 40.123, "longitude": -0.456
    (re.compile(r'["\']latitude["\']:\s*(-?\d+\.?\d*)[,\s]+["\']longitude["\']:\s*(-?\d+\.?\d*)'),
     'json_data', 'JSON data'),
)


def _parse_coordinate(text):
    """Convert text to float; return None when it is missing or not numeric."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return None


def extract_map_coordinates(soup, url):
    """Extract coordinates from embedded Google Maps or similar

    Sources are tried in order and the first complete hit is returned:
      1. <iframe> elements whose src points at Google Maps
      2. inline <script> text containing lat/lng or latitude/longitude pairs
      3. <meta> tags: og:latitude + og:longitude, or geo.position ("lat;lon")

    Args:
        soup: parsed page; only its find_all() method is used.
        url: not used by this function; kept so existing callers keep working.

    Returns:
        dict with keys 'lat', 'lon' and 'source'. 'lat'/'lon' are floats or
        None. 'source' is one of 'google_maps_embed', 'javascript_data',
        'json_data', 'meta_geo', 'meta_og', or None when no complete
        coordinate pair was found.
    """
    coords = {'lat': None, 'lon': None, 'source': None}

    # Method 1: Look for Google Maps embed
    for iframe in soup.find_all('iframe'):
        src = iframe.get('src') or ''
        if 'google.com/maps' not in src and 'maps.google' not in src:
            continue
        match = _MAPS_URL_COORDS.search(src)
        if match:
            coords['lat'] = float(match.group(2))
            coords['lon'] = float(match.group(3))
            coords['source'] = 'google_maps_embed'
            print(f"  ✓ Found coordinates in Google Maps embed: ({coords['lat']:.4f}, {coords['lon']:.4f})")
            return coords

    # Method 2: Look for coordinates in JavaScript/data attributes
    for script in soup.find_all('script'):
        if not script.string:
            continue
        for pattern, source, label in _SCRIPT_COORD_PATTERNS:
            match = pattern.search(script.string)
            if match:
                coords['lat'] = float(match.group(1))
                coords['lon'] = float(match.group(2))
                coords['source'] = source
                print(f"  ✓ Found coordinates in {label}: ({coords['lat']:.4f}, {coords['lon']:.4f})")
                return coords

    # Method 3: Look in meta tags. Unparseable or missing values are skipped
    # instead of raising or being recorded as 0.0.
    for meta in soup.find_all('meta'):
        if meta.get('property') == 'og:latitude':
            value = _parse_coordinate(meta.get('content'))
            if value is not None:
                coords['lat'] = value
        elif meta.get('property') == 'og:longitude':
            value = _parse_coordinate(meta.get('content'))
            if value is not None:
                coords['lon'] = value
        elif meta.get('name') == 'geo.position':
            parts = (meta.get('content') or '').split(';')
            if len(parts) == 2:
                lat = _parse_coordinate(parts[0])
                lon = _parse_coordinate(parts[1])
                if lat is not None and lon is not None:
                    coords['lat'] = lat
                    coords['lon'] = lon
                    coords['source'] = 'meta_geo'

    # Compare against None: 0.0 is a valid latitude/longitude.
    if coords['lat'] is not None and coords['lon'] is not None:
        # The og:* tags do not label themselves, so tag the pair here.
        coords['source'] = coords['source'] or 'meta_og'
        print(f"  ✓ Found coordinates in meta tags: ({coords['lat']:.4f}, {coords['lon']:.4f})")

    return coords

# location_info field -> schema.org PostalAddress property read from JSON-LD
_JSONLD_ADDRESS_FIELDS = (
    ('municipality', 'addressLocality'),
    ('region', 'addressRegion'),
    ('country', 'addressCountry'),
    ('postal_code', 'postalCode'),
)


def _jsonld_nodes(data):
    """Yield every dict node of a parsed JSON-LD payload (object, list or @graph)."""
    if isinstance(data, dict):
        yield data
        data = data.get('@graph')
    if isinstance(data, list):
        for node in data:
            if isinstance(node, dict):
                yield node


def _jsonld_text(value):
    """Return a JSON-LD value as stripped text, unwrapping {"name": ...} objects."""
    if isinstance(value, dict):
        value = value.get('name')
    if isinstance(value, bool):
        return None
    if isinstance(value, (str, int)):
        return str(value).strip() or None
    return None


def extract_municipality_info(soup, url):
    """Extract municipality/council/region information

    Five sources are consulted in order:
      1. JSON-LD <script> blocks ("address" objects)
      2. a breadcrumb <ol> (its values overwrite what JSON-LD provided)
      3. the first <div> whose class contains "location"
      4. the first <h1> ("... in CityName")
      5. postal-code patterns in the page text
    Sources 3-5 only fill fields that are still empty.

    Args:
        soup: parsed page; find_all(), find() and get_text() are used.
        url: not used by this function; kept so existing callers keep working.

    Returns:
        dict with keys 'municipality', 'region', 'province', 'country' and
        'postal_code'; each value is a string or None. 'province' is never
        filled in by this function.
    """
    location_info = {
        'municipality': None,
        'region': None,
        'province': None,
        'country': None,
        'postal_code': None
    }

    # Method 1: Structured data (JSON-LD)
    for script in soup.find_all('script', type='application/ld+json'):
        try:
            data = json.loads(script.string)
        except (TypeError, ValueError):
            # script.string is None (TypeError) or the payload is not valid JSON
            continue
        for node in _jsonld_nodes(data):
            address = node.get('address')
            if not isinstance(address, dict):
                continue
            for field, key in _JSONLD_ADDRESS_FIELDS:
                text = _jsonld_text(address.get(key))
                if text:
                    location_info[field] = text

    # Method 2: Breadcrumb navigation
    breadcrumb = soup.find('ol', class_=re.compile('breadcrumb', re.I))
    if breadcrumb:
        items = breadcrumb.find_all('li')
        if len(items) >= 2:
            # Usually: Home > Country > Region > City
            locations = [item.get_text(strip=True) for item in items[1:]]  # Skip "Home"
            for field, value in zip(('country', 'region', 'municipality'), locations):
                if value:  # do not clobber earlier data with an empty crumb
                    location_info[field] = value

    # Method 3: Look for location in specific elements
    # Properstar often has: <div class="item-location">City, Province</div>
    location_div = soup.find('div', class_=re.compile('location', re.I))
    if location_div:
        parts = [p.strip() for p in location_div.get_text(strip=True).split(',')]
        if parts[0] and not location_info['municipality']:
            location_info['municipality'] = parts[0]
        if len(parts) >= 2 and parts[1] and not location_info['region']:
            location_info['region'] = parts[1]

    # Method 4: Extract from title or h1
    title = soup.find('h1')
    if title and not location_info['municipality']:
        # Look for patterns like "Property in CityName".
        # \b keeps the "in" inside words such as "Cabin" from matching.
        match = re.search(r'\bin\s+([A-Za-zÀ-ÿ\s-]+?)(?:,|\s+\d|\s*$)', title.get_text())
        if match:
            location_info['municipality'] = match.group(1).strip()

    # Method 5: Look for postal code patterns
    if not location_info['postal_code']:
        text = soup.get_text()
        # European postal code patterns.
        # NOTE(review): the 5-digit pattern matches any standalone 5-digit
        # number in the page text, so this is a best-effort heuristic.
        postal_patterns = [
            r'\b(\d{5})\b',  # 5-digit (Spain, France)
            r'\b([A-Z]{1,2}\d{1,2}\s?\d[A-Z]{2})\b',  # UK
            r'\b(\d{4}\s?[A-Z]{2})\b',  # Netherlands
        ]
        for pattern in postal_patterns:
            match = re.search(pattern, text)
            if match:
                location_info['postal_code'] = match.group(1)
                break

    return location_info

def build_location_string(location_info):
    """Join the extracted location fields into one search string.

    Non-empty fields are emitted in the order municipality, postal code,
    region, country and separated by ", ". Returns None when all four
    fields are empty. The 'province' field is not used.
    """
    ordered_keys = ('municipality', 'postal_code', 'region', 'country')
    present = [location_info[key] for key in ordered_keys if location_info[key]]

    if not present:
        return None
    return ', '.join(present)

def geocode_address(address):
    """Geocode an address using Nominatim

    Args:
        address: free-text location string. Anything that is not a
            non-blank string is treated as "no address".

    Returns:
        (lat, lon) as floats for the first Nominatim result, or (None, None)
        when there is no address, no result, or the lookup fails. Failures
        are printed, never raised.
    """
    if not isinstance(address, str) or not address.strip():
        return None, None

    try:
        # Let requests build and URL-encode the query string.
        response = requests.get(
            "https://nominatim.openstreetmap.org/search",
            params={'q': address.strip(), 'format': 'json', 'limit': 1},
            headers={'User-Agent': 'PropertyMapper/1.0'},
            timeout=10,
        )
        # Surface HTTP errors (e.g. rate limiting) as such instead of as a
        # confusing JSON decoding failure on the error page.
        response.raise_for_status()
        results = response.json()

        if results:
            lat = float(results[0]['lat'])
            lon = float(results[0]['lon'])
            print(f"  ✓ Geocoded via Nominatim: ({lat:.4f}, {lon:.4f})")
            return lat, lon
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as e:
        # Network/HTTP failures, invalid JSON, or an unexpected payload shape
        print(f"  ✗ Geocoding error: {e}")

    return None, None

def enrich_property_location(url, existing_lat=None, existing_lon=None):
    """
    Extract comprehensive location information from a Properstar property page
    Returns: (lat, lon, location_string, location_details_dict, source)

    The page is only fetched when existing_lat/existing_lon do not already
    form a complete pair; None and NaN both count as missing.

    source is one of:
      'existing'  - the supplied coordinates were returned unchanged
      <value of coords['source'] from extract_map_coordinates()>
                  - coordinates were found in the page itself
      'nominatim' - the extracted location string was geocoded
      'failed'    - the page was read but no coordinates could be determined
      'error'     - fetching or parsing raised; lat/lon/location_string are
                    None and the details dict is empty
    """
    # pd.notna() rejects both None and NaN. A plain truthiness test would
    # accept NaN (it is truthy) and reject a legitimate 0.0 coordinate.
    if pd.notna(existing_lat) and pd.notna(existing_lon):
        print("  → Already has coordinates, skipping")
        return existing_lat, existing_lon, None, {}, 'existing'

    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15"
        }
        response = requests.get(url, headers=headers, timeout=15)
        # Do not parse HTTP error pages as if they were listings.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Step 1: Try to extract coordinates from embedded map
        coords = extract_map_coordinates(soup, url)

        # The textual location is wanted on every path, so extract it once.
        location_info = extract_municipality_info(soup, url)
        location_string = build_location_string(location_info)

        if coords['lat'] is not None and coords['lon'] is not None:
            # We have direct coordinates!
            return coords['lat'], coords['lon'], location_string, location_info, coords['source']

        # Step 2: Geocode the extracted municipality information
        if location_string:
            print(f"  → Extracted location: {location_string}")
            lat, lon = geocode_address(location_string)
            if lat is not None and lon is not None:
                return lat, lon, location_string, location_info, 'nominatim'

        print("  ✗ Could not determine location")
        return None, None, location_string, location_info, 'failed'

    except Exception as e:
        # Per-page boundary: one bad page must not abort a batch run.
        print(f"  ✗ Error processing page: {e}")
        return None, None, None, {}, 'error'

def enhance_all_properties(csv_file="analysis_output.csv", output_json="enriched_data.json", limit=None):
    """
    Enhance all properties with better location data

    Reads csv_file, looks up every row that lacks a Latitude or Longitude via
    enrich_property_location(), and writes the result back to csv_file.
    Progress is also saved after every 10th row.

    Args:
        csv_file: CSV file to read and overwrite; must contain a 'URL' column.
        output_json: currently unused; kept so existing callers keep working.
        limit: maximum number of rows to look up in this run. None (the
            default) means no limit. Rows that already have coordinates do
            not count towards the limit.
    """
    print(f"📊 Reading {csv_file}...")
    df = pd.read_csv(csv_file)

    # Ensure columns exist
    text_columns = ['ExtractedLocation', 'Municipality', 'Region', 'Province',
                    'Country', 'PostalCode', 'LocationSource']
    for col in ['Latitude', 'Longitude'] + text_columns:
        if col not in df.columns:
            df[col] = None
    # An all-empty column is read back from CSV as float64; use object dtype
    # so strings can be assigned into it.
    for col in text_columns:
        df[col] = df[col].astype(object)

    total = len(df)
    if limit is not None:
        missing = int((df['Latitude'].isna() | df['Longitude'].isna()).sum())
        print(f"Found {missing} properties without coordinates")
        print(f"Will process {min(limit, missing)} of them")

    # loc_info key -> CSV column
    detail_columns = (
        ('municipality', 'Municipality'),
        ('region', 'Region'),
        ('province', 'Province'),
        ('country', 'Country'),
        ('postal_code', 'PostalCode'),
    )

    success_count = 0
    lookups = 0

    for position, (idx, row) in enumerate(df.iterrows(), start=1):
        lat, lon = row['Latitude'], row['Longitude']
        needs_lookup = pd.isna(lat) or pd.isna(lon)

        if needs_lookup and limit is not None and lookups >= limit:
            print(f"\n⏹ Limit of {limit} lookups reached, stopping")
            break

        print(f"\n🔍 [{position}/{total}] {row['URL']}")

        # Only process if we don't have coordinates yet
        if needs_lookup:
            lookups += 1
            # Do not forward the incomplete pair: it contains NaN, which must
            # not be mistaken for an existing coordinate.
            new_lat, new_lon, loc_string, loc_info, source = enrich_property_location(row['URL'])

            # notna() instead of truthiness: 0.0 is valid, NaN is not.
            if pd.notna(new_lat) and pd.notna(new_lon):
                df.at[idx, 'Latitude'] = new_lat
                df.at[idx, 'Longitude'] = new_lon
                df.at[idx, 'LocationSource'] = source
                success_count += 1

            if loc_string:
                df.at[idx, 'ExtractedLocation'] = loc_string

            if loc_info:
                for key, column in detail_columns:
                    if loc_info.get(key):
                        df.at[idx, column] = loc_info[key]

            # Rate limiting
            time.sleep(2)
        else:
            print(f"  ✓ Already has coordinates: ({lat:.4f}, {lon:.4f})")

        # Save progress every 10 properties
        if position % 10 == 0:
            df.to_csv(csv_file, index=False, encoding='utf-8')
            print(f"  💾 Progress saved ({success_count} new coordinates so far)")

    # Final save
    df.to_csv(csv_file, index=False, encoding='utf-8')

    geocoded = int(df['Latitude'].notna().sum())
    success_rate = geocoded / total * 100 if total else 0.0  # empty CSV: avoid 0/0

    print("\n" + "=" * 70)
    print("📊 ENRICHMENT SUMMARY")
    print("=" * 70)
    print(f"Total properties: {total}")
    print(f"New coordinates found: {success_count}")
    print(f"Total geocoded: {geocoded}")
    print(f"Success rate: {success_rate:.1f}%")

    # Show location source breakdown
    print("\n📍 Location Sources:")
    for source, count in df['LocationSource'].value_counts().items():
        if pd.notna(source):
            print(f"  {source}: {count} properties")

    # Show countries found
    print("\n🌍 Countries:")
    for country, count in df['Country'].value_counts().items():
        if pd.notna(country):
            print(f"  {country}: {count} properties")

    print("\n✅ Done! Updated " + csv_file)

if __name__ == "__main__":
    import sys

    # Optional first argument: how many properties without coordinates to
    # process in this run (0 or omitted = all of them)
    limit = None
    if len(sys.argv) > 1:
        try:
            limit = int(sys.argv[1])
        except ValueError:
            sys.exit(f"Usage: {sys.argv[0]} [max_properties] (got {sys.argv[1]!r})")
        if limit < 0:
            sys.exit("max_properties must not be negative")

    if limit:
        print(f"Processing first {limit} properties without coordinates...")

    enhance_all_properties(limit=limit or None)
