"""
Enhanced geocoding script that extracts location from property pages
"""
import pandas as pd
import requests
import time
import json
from urllib.parse import quote
from bs4 import BeautifulSoup
import re

def _iter_jsonld_nodes(data):
    """Yield every dict in a JSON-LD payload: top level, list items and ``@graph`` members."""
    if isinstance(data, list):
        for item in data:
            yield from _iter_jsonld_nodes(item)
    elif isinstance(data, dict):
        yield data
        # Many sites wrap their entities in a top-level {"@graph": [...]} container.
        yield from _iter_jsonld_nodes(data.get('@graph'))


def _location_from_jsonld_node(node):
    """Return "locality, region, country" built from a node's ``address`` dict, or None."""
    addr = node.get('address')
    if not isinstance(addr, dict):
        return None

    parts = []
    for key in ('addressLocality', 'addressRegion', 'addressCountry'):
        value = addr.get(key)
        # A part may itself be an object (e.g. a Country with a "name") rather than text.
        if isinstance(value, dict):
            value = value.get('name')
        if isinstance(value, str) and value.strip():
            parts.append(value.strip())
    return ', '.join(parts) or None


def extract_location_from_url(url):
    """Extract a human-readable location by scraping the property page.

    The page is fetched once and the following sources are tried in order,
    stopping at the first one that yields a location:

    1. ``application/ld+json`` structured data (an ``address`` object),
    2. the ``og:description`` meta tag (5-digit postal code followed by a name),
    3. the last items of an ``<ol class="breadcrumb">``,
    4. the first ``<h1>`` ("... in City, Country").

    Args:
        url: Address of the property page to download.

    Returns:
        The location string, or None when nothing was found or the request /
        parsing failed (the error is printed, never raised).
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15"
        }
        response = requests.get(url, headers=headers, timeout=10)
        # Do not mine 4xx/5xx error pages for locations; treat them as failures.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        location = None

        # Try structured data
        for script in soup.find_all('script', type='application/ld+json'):
            raw = script.string
            if not raw:
                continue
            try:
                data = json.loads(raw)
            except ValueError:
                # Malformed JSON-LD is common; skip this tag and try the next one.
                continue
            location = next(
                (
                    found
                    for found in map(_location_from_jsonld_node, _iter_jsonld_nodes(data))
                    if found
                ),
                None,
            )
            if location:
                break

        # Try meta tags
        if not location:
            desc_tag = soup.find('meta', {'property': 'og:description'})
            if desc_tag and desc_tag.get('content'):
                content = desc_tag['content']
                # Look for a postal code followed by a place name
                match = re.search(r'(\d{5})\s+([A-Za-zÀ-ÿ\s-]+)', content)
                if match:
                    location = f"{match.group(2).strip()}, {match.group(1)}"

        # Try breadcrumbs
        if not location:
            breadcrumb = soup.find('ol', {'class': 'breadcrumb'})
            if breadcrumb:
                items = breadcrumb.find_all('li')
                if len(items) >= 2:
                    # Usually the last items are the location
                    location_parts = [item.get_text(strip=True) for item in items[-3:]]
                    location = ', '.join(filter(None, location_parts))

        # Try h1 title
        if not location:
            h1 = soup.find('h1')
            if h1:
                text = h1.get_text()
                # "Property in City, Country"; \b keeps "in" from matching
                # inside another word (e.g. "Cabin ").
                match = re.search(r'\bin\s+([A-Za-zÀ-ÿ\s-]+,\s*[A-Za-zÀ-ÿ\s-]+)', text)
                if match:
                    location = match.group(1).strip()

        if location:
            print(f"  → Found location: {location}")
        return location

    except Exception as e:
        # Per-page best-effort boundary: one bad page must not abort the batch.
        print(f"  ✗ Error extracting location: {e}")
        return None

def geocode_address(address):
    """Geocode an address using the Nominatim search endpoint.

    Args:
        address: Free-form address or place name. None, NaN, empty and
            whitespace-only values are rejected without a network call.

    Returns:
        A ``(lat, lon)`` tuple of floats for the first result, or
        ``(None, None)`` when the input is unusable, nothing was found, or the
        request failed (the error is printed, never raised).
    """
    if not address or pd.isna(address):
        return None, None

    address = str(address).strip()
    if not address:
        # Whitespace-only input would otherwise be sent as an empty query.
        return None, None

    try:
        response = requests.get(
            "https://nominatim.openstreetmap.org/search",
            # Let requests do the query-string encoding.
            params={'q': address, 'format': 'json', 'limit': 1},
            headers={'User-Agent': 'PropertyMapper/1.0'},
            timeout=10,
        )
        # Surface rate-limit / block responses as errors instead of parsing them.
        response.raise_for_status()
        data = response.json()

        # A successful search returns a JSON list; anything else has no results.
        if isinstance(data, list) and data:
            lat = float(data[0]['lat'])
            lon = float(data[0]['lon'])
            print(f"  ✓ Geocoded: {address[:50]}... -> ({lat:.4f}, {lon:.4f})")
            return lat, lon

        print(f"  ✗ No geocoding results for: {address[:50]}...")
        return None, None

    except Exception as e:
        # Per-address best-effort boundary: one failed lookup must not abort the batch.
        print(f"  ✗ Geocoding error: {e}")
        return None, None

def _clean_text(value, default, max_len=None):
    """Return *value* as text (optionally truncated), or *default* when it is missing/NaN."""
    if value is None or pd.isna(value):
        return default
    text = str(value)
    return text if max_len is None else text[:max_len]


def _save_outputs(df, csv_file, properties, output_json):
    """Write the updated table back to the CSV and the collected records to the JSON file."""
    df.to_csv(csv_file, index=False, encoding='utf-8')
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(properties, f, ensure_ascii=False, indent=2)


def improve_geocoding(csv_file="analysis_output.csv", output_json="map_data.json", limit=None):
    """Fill in missing coordinates by scraping property pages, then export map data.

    For each row without both coordinates, the location is taken from the
    ``ExtractedLocation`` column or scraped from the row's ``URL``, and then
    geocoded. Results are written back into ``csv_file`` (columns
    ``Latitude``, ``Longitude``, ``ExtractedLocation``) and one record per
    processed row is written to ``output_json``. Both files are saved every 10
    processed rows and once more at the end.

    Args:
        csv_file: CSV to read and update in place; must contain a ``URL`` column.
        output_json: Path of the JSON file receiving the property records.
        limit: Maximum number of rows to process; None or 0 means all rows.
    """

    print(f"📊 Reading {csv_file}...")
    df = pd.read_csv(csv_file)

    for column in ('Latitude', 'Longitude', 'ExtractedLocation'):
        if column not in df.columns:
            df[column] = None
    # An all-empty column is loaded as float64; make it object so that storing
    # location strings into it is valid.
    df['ExtractedLocation'] = df['ExtractedLocation'].astype(object)

    properties = []
    processed = 0

    for idx, row in df.iterrows():
        if limit and processed >= limit:
            break

        print(f"\n🔍 [{processed + 1}/{len(df)}] {row['URL']}")

        lat, lon = row.get('Latitude'), row.get('Longitude')
        if pd.isna(lat) or pd.isna(lon):
            # Half a coordinate pair is unusable; treat the row as not geocoded.
            lat = lon = None
        # NaN is truthy, so normalise missing locations to None up front.
        location = _clean_text(row.get('ExtractedLocation'), None) or None

        # If we don't have coordinates, try to extract location
        if lat is None:
            if location is None:
                print("  → Extracting location from page...")
                location = extract_location_from_url(row['URL'])
                if location:
                    df.at[idx, 'ExtractedLocation'] = location
                time.sleep(2)  # Be nice to the server

            if location:
                print("  → Geocoding location...")
                lat, lon = geocode_address(location)
                # Compare against None: 0.0 is a valid latitude/longitude.
                if lat is not None and lon is not None:
                    df.at[idx, 'Latitude'] = lat
                    df.at[idx, 'Longitude'] = lon
                else:
                    lat = lon = None
                time.sleep(1.5)  # Rate limiting for Nominatim
        else:
            print(f"  ✓ Already geocoded: ({lat:.4f}, {lon:.4f})")

        # Create property object
        score = row.get('Gewogen Score')
        prop = {
            'url': row['URL'],
            'title': _clean_text(row.get('Titel'), 'Untitled', 100),
            'summary': _clean_text(row.get('Samenvatting'), '', 200),
            'score': float(score) if pd.notna(score) else 0,
            'location': location or 'Unknown',
            'lat': float(lat) if lat is not None else None,
            'lon': float(lon) if lon is not None else None,
            'analysis': _clean_text(row.get('GPT Analyse'), ''),
        }

        properties.append(prop)
        processed += 1

        # Save progress every 10 processed properties
        if processed % 10 == 0:
            _save_outputs(df, csv_file, properties, output_json)
            print(f"  💾 Progress saved")

    # Final save
    _save_outputs(df, csv_file, properties, output_json)

    geocoded = sum(1 for p in properties if p['lat'] is not None)
    print(f"\n✅ Complete!")
    print(f"📍 Geocoded: {geocoded}/{len(properties)} properties")
    print(f"💾 Saved to {output_json}")

if __name__ == "__main__":
    # Script entry point: an optional first CLI argument sets how many
    # properties to process; without it a small test batch of 20 is used.
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        limit = int(cli_args[0])
    else:
        limit = 20
    print(f"Processing first {limit} properties...")
    improve_geocoding(limit=limit)
