#!/usr/bin/env python3
"""
Improved Geocoding with Context Extraction
Uses GPT to extract better location context from property descriptions
"""
import json
import re
import requests
import time
from pathlib import Path
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if one is present) into the process
# environment so that OPENAI_API_KEY can be read on the next line.
load_dotenv()
# Module-level OpenAI client shared by the GPT extraction step. The key is read
# from the OPENAI_API_KEY environment variable (os.getenv yields None if unset).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Upper-case two-letter country codes accepted as "European" when validating
# geocoding results.
EUROPEAN_COUNTRIES = set(
    "ES FR PT IT GR DE NL BE AT CH "
    "HR PL CZ SK HU RO BG SI IE GB".split()
)

def extract_location_context_with_gpt(title, summary, analysis, current_location):
    """Use GPT to extract better location context from a property description.

    Args:
        title: Listing title (may be None).
        summary: Listing description; only its first 300 characters are sent.
        analysis: Analysis text; only its first 300 characters are sent.
        current_location: Raw value of the listing's location field.

    Returns:
        The dict parsed from the model's JSON reply. The prompt asks for the
        keys ``city``, ``region``, ``country`` and ``nearby_landmark``, with
        null for anything not found. Returns ``{}`` when the request fails or
        the reply is empty, is not valid JSON, or is not a JSON object.
    """
    # Records loaded from JSON can carry null fields; slicing None would raise
    # before the request is even attempted, so normalise the inputs first.
    title = title or ''
    summary = str(summary or '')
    analysis = str(analysis or '')
    current_location = current_location or ''

    prompt = f"""Extract the most specific location information from this property listing.

Current location field: "{current_location}"

Title: {title}

Description: {summary[:300]}

Analysis: {analysis[:300]}

Return ONLY a JSON object with:
- city: The city/town name mentioned
- region: The region/province/state if mentioned
- country: The country (use full name like "Spain", "France", "Portugal")
- nearby_landmark: Any nearby city or landmark mentioned with distance

Example:
{{"city": "Monforte de Lemos", "region": "Galicia", "country": "Spain", "nearby_landmark": "10 km from Monforte de Lemos"}}

If not found, use null. Return ONLY the JSON, no explanation."""

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You extract location data. Return only valid JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,
            max_tokens=150
        )

        # The message content may be None; treat that as an empty reply.
        result_text = (response.choices[0].message.content or '').strip()

        # Strip markdown code fences (``` or ```json) around the JSON payload.
        if result_text.startswith("```"):
            result_text = re.sub(
                r'```(?:json)?\n?', '', result_text, flags=re.IGNORECASE
            ).strip()

        parsed = json.loads(result_text)

    except Exception as e:
        # Best-effort step: any API or parsing failure degrades to "no context"
        # so the caller can still geocode from the raw location field.
        print(f"      ⚠️  GPT extraction failed: {e}")
        return {}

    # Valid JSON that is not an object (e.g. null or a list) cannot be queried
    # with .get() by callers, so treat it as a failed extraction as well.
    if not isinstance(parsed, dict):
        print(f"      ⚠️  GPT extraction failed: expected a JSON object, got {type(parsed).__name__}")
        return {}

    return parsed

def _clean_text(value):
    """Return *value* stripped of surrounding whitespace, or '' if it is not a string."""
    return value.strip() if isinstance(value, str) else ''


def build_geocoding_query(location_field, gpt_context):
    """Build the best geocoding query from the available data.

    Args:
        location_field: Raw location string of the listing (may be None/empty).
        gpt_context: Dict of GPT-extracted fields (``city``, ``region``,
            ``country``); None or a non-dict value is treated as empty.

    Returns:
        A ``(query, method)`` tuple where ``method`` is one of
        ``"gpt_context"``, ``"location_with_country"`` or ``"location_only"``,
        or ``(None, None)`` when there is nothing to geocode.
    """
    context = gpt_context if isinstance(gpt_context, dict) else {}
    city = _clean_text(context.get('city'))
    region = _clean_text(context.get('region'))
    country = _clean_text(context.get('country'))
    location = _clean_text(location_field)

    # Priority 1: Use GPT-extracted city (+ region) + country
    if city and country:
        parts = [city, region, country] if region else [city, country]
        return ", ".join(parts), "gpt_context"

    # Priority 2: Use location field + country if we have country
    if location and country:
        # Avoid producing "Galicia, Spain, Spain" when the location field
        # already names the country as one of its comma-separated parts.
        existing_parts = {part.strip().casefold() for part in location.split(',')}
        if country.casefold() in existing_parts:
            return location, "location_with_country"
        return f"{location}, {country}", "location_with_country"

    # Priority 3: Just location field
    if location:
        return location, "location_only"

    return None, None

def _pick_geocoding_result(results, expected_country_code=None):
    """Return the preferred European result from *results*, or None if there is none.

    A result whose country code equals *expected_country_code* wins; otherwise
    the first result located in one of EUROPEAN_COUNTRIES is used.
    """
    first_european = None
    for result in results:
        address = result.get('address') or {}
        country_code = (address.get('country_code') or '').upper()

        if country_code not in EUROPEAN_COUNTRIES:
            continue
        if not expected_country_code or country_code == expected_country_code:
            return result
        if first_european is None:
            first_european = result

    return first_european


def geocode_with_validation(query, expected_country_code=None):
    """Geocode *query* with Nominatim, preferring European results.

    Args:
        query: Free-form search string sent to the Nominatim search endpoint.
        expected_country_code: Optional upper-case two-letter country code; a
            result in that country is preferred over other European results.

    Returns:
        The ``(lat, lon, location_details)`` tuple produced by
        ``extract_location_data`` for the chosen result, or
        ``(None, None, {})`` when nothing was found or the request failed.
        If no result lies in a whitelisted European country, the first result
        is still returned after printing a warning.
    """
    url = "https://nominatim.openstreetmap.org/search"
    params = {
        'q': query,
        'format': 'json',
        'addressdetails': 1,
        'limit': 3
    }
    headers = {'User-Agent': 'FarmMatch/1.0'}

    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        # Surface HTTP errors (e.g. 403/429) explicitly instead of trying to
        # parse an error page as a result list.
        response.raise_for_status()
        results = response.json()

        if not results:
            return None, None, {}

        if not isinstance(results, list):
            print(f"      ❌ Geocoding error: unexpected response for: {query}")
            return None, None, {}

        chosen = _pick_geocoding_result(results, expected_country_code)

        if chosen is None:
            # If no European results, return first anyway but warn about it
            print(f"      ⚠️  No European results found for: {query}")
            chosen = results[0]
        elif expected_country_code:
            chosen_code = ((chosen.get('address') or {}).get('country_code') or '').upper()
            if chosen_code != expected_country_code:
                # No result in the expected country: fall back to the first
                # European one rather than to an arbitrary first result.
                print(f"      ⚠️  Expected country {expected_country_code}, got {chosen_code} for: {query}")

        return extract_location_data(chosen)

    except Exception as e:
        # Best-effort boundary: network, HTTP, JSON and malformed-result errors
        # all degrade to "not geocoded" for this query.
        print(f"      ❌ Geocoding error: {e}")
        return None, None, {}

def extract_location_data(result):
    """Turn one Nominatim search result into ``(lat, lon, details)``.

    ``lat``/``lon`` are the result's coordinates converted to float.
    ``details`` holds the address components (missing ones default to ''),
    the OSM identifiers, a derived ``locality`` and two admin-level fields.
    """
    latitude = float(result['lat'])
    longitude = float(result['lon'])
    addr = result.get('address', {})

    details = {
        'country': addr.get('country', ''),
        'country_code': addr.get('country_code', '').upper(),
    }

    for key in ('state', 'region'):
        details[key] = addr.get(key, '')

    # 'province' falls back to 'state_district' only when the key is absent.
    if 'province' in addr:
        details['province'] = addr['province']
    else:
        details['province'] = addr.get('state_district', '')

    plain_address_keys = (
        'district', 'county', 'municipality', 'city', 'town',
        'village', 'hamlet', 'suburb', 'postcode',
    )
    for key in plain_address_keys:
        details[key] = addr.get(key, '')

    for key in ('display_name', 'osm_type', 'osm_id', 'place_id'):
        details[key] = result.get(key, '')

    # Locality: first non-empty component, checked in this fixed order.
    locality_order = ('village', 'hamlet', 'town', 'city', 'municipality')
    details['locality'] = next(
        (addr[key] for key in locality_order if addr.get(key)),
        '',
    )

    # Admin levels: the second key is consulted only if the first is absent.
    if 'state' in addr:
        details['admin_level_1'] = addr['state']
    else:
        details['admin_level_1'] = addr.get('province', '')

    if 'county' in addr:
        details['admin_level_2'] = addr['county']
    else:
        details['admin_level_2'] = addr.get('district', '')

    return latitude, longitude, details

# Lower-cased country names (as GPT may return them) mapped to the upper-case
# two-letter codes used in EUROPEAN_COUNTRIES.
_COUNTRY_NAME_TO_CODE = {
    'spain': 'ES', 'españa': 'ES',
    'france': 'FR',
    'portugal': 'PT',
    'italy': 'IT', 'italia': 'IT',
    'greece': 'GR',
    'germany': 'DE', 'deutschland': 'DE',
    'netherlands': 'NL', 'the netherlands': 'NL',
    'belgium': 'BE',
    'austria': 'AT',
    'switzerland': 'CH',
    'croatia': 'HR',
    'poland': 'PL',
    'czech republic': 'CZ', 'czechia': 'CZ',
    'slovakia': 'SK',
    'hungary': 'HU',
    'romania': 'RO',
    'bulgaria': 'BG',
    'slovenia': 'SI',
    'ireland': 'IE',
    'united kingdom': 'GB', 'uk': 'GB', 'great britain': 'GB',
}


def improve_geocoding_for_property(prop):
    """Improve geocoding for a single property, updating *prop* in place.

    Extracts location context with GPT, builds a geocoding query, geocodes it
    and, on success, writes ``lat``, ``lon``, the location detail fields,
    ``geocoding_method`` and ``geocoding_improved`` into *prop*.

    Args:
        prop: Property dict; the keys ``title``, ``summary``, ``analysis`` and
            ``location`` are read.

    Returns:
        True if the property was geocoded and updated, False otherwise.
    """
    title = prop.get('title', '')
    summary = prop.get('summary', '')
    analysis = prop.get('analysis', '')
    current_location = prop.get('location', '')

    print("   Extracting location context with GPT...")
    gpt_context = extract_location_context_with_gpt(title, summary, analysis, current_location)

    # Anything other than a dict cannot be queried below; treat it as "no context".
    if not isinstance(gpt_context, dict):
        gpt_context = {}

    if gpt_context:
        # `or 'N/A'` also covers keys that are present but null.
        print(f"      GPT found: {gpt_context.get('city') or 'N/A'}, {gpt_context.get('country') or 'N/A'}")

    # Build best query
    query, method = build_geocoding_query(current_location, gpt_context)

    if not query:
        print("      ❌ No location data available")
        return False

    print(f"      Geocoding: \"{query}\" (method: {method})")

    # Get expected country code from GPT context (None when unknown/unmapped)
    expected_country = str(gpt_context.get('country') or '').strip().lower()
    expected_code = _COUNTRY_NAME_TO_CODE.get(expected_country)

    # Geocode with validation
    lat, lon, location_details = geocode_with_validation(query, expected_code)

    # Compare against None explicitly: 0.0 is a valid latitude/longitude and
    # must not be mistaken for a failed lookup.
    if lat is None or lon is None:
        print("      ❌ Geocoding failed")
        return False

    # Update property
    prop['lat'] = lat
    prop['lon'] = lon
    prop.update(location_details)
    prop['geocoding_method'] = method
    prop['geocoding_improved'] = True

    country = location_details.get('country', 'Unknown')
    locality = location_details.get('locality', 'Unknown')
    print(f"      ✅ {locality}, {country} ({lat:.4f}, {lon:.4f})")
    return True

def _save_properties(path, properties):
    """Write *properties* to *path* as JSON via a temporary file and a rename."""
    tmp_path = path.with_name(path.name + ".tmp")
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(properties, f, indent=2, ensure_ascii=False)
    # Renaming over the destination means an interrupted write can only damage
    # the temporary file, never the existing data file.
    tmp_path.replace(path)


def improve_all_geocoding(enriched_path="enriched_data.json", save_every=5, delay_seconds=2):
    """Improve geocoding for all properties with poor/missing location data.

    Loads the property list from *enriched_path*, re-geocodes every property
    that is not marked ``Removed`` and lacks a ``country`` or ``locality``
    value, and writes the updated list back to the same file.

    Args:
        enriched_path: Path of the JSON file holding the list of properties.
        save_every: Save progress after every this many processed properties
            (0 or None disables intermediate saves).
        delay_seconds: Pause between consecutive properties, for rate limiting.

    Returns:
        None. Progress and a final summary are printed to stdout.
    """
    enriched_file = Path(enriched_path)

    if not enriched_file.exists():
        print(f"❌ {enriched_file} not found!")
        return

    print("🌍 Improved Geocoding with GPT Context Extraction")
    print("=" * 70)

    with open(enriched_file, 'r', encoding='utf-8') as f:
        properties = json.load(f)

    # A property needs improvement when it is not removed and has either
    # no country data or no detailed location data (locality).
    needs_improvement = [
        prop for prop in properties
        if prop.get('status') != 'Removed'
        and (not prop.get('country') or not prop.get('locality'))
    ]
    total = len(needs_improvement)

    print(f"📊 Found {total} properties needing improved geocoding")
    print()

    improved = 0
    failed = 0

    try:
        for i, prop in enumerate(needs_improvement, 1):
            location = prop.get('location', 'Unknown')
            print(f"[{i}/{total}] {location}")

            if improve_geocoding_for_property(prop):
                improved += 1
            else:
                failed += 1

            # Periodic checkpoint
            if save_every and i % save_every == 0:
                _save_properties(enriched_file, properties)
                print("      💾 Progress saved")

            # Rate limiting (no need to wait after the last property)
            if delay_seconds and i < total:
                time.sleep(delay_seconds)
            print()
    finally:
        # Final save; also runs when the loop is interrupted so that work done
        # since the last checkpoint is not lost.
        _save_properties(enriched_file, properties)

    print("=" * 70)
    print("✅ IMPROVED GEOCODING COMPLETE")
    print("=" * 70)
    print(f"Successfully improved: {improved}")
    print(f"Failed: {failed}")
    print(f"Success rate: {(improved / max(1, total) * 100):.1f}%")

# Script entry point: run the full geocoding improvement pass when this file is
# executed directly (not when it is imported as a module).
if __name__ == "__main__":
    improve_all_geocoding()
