#!/usr/bin/env python3
"""
Geocode missing properties using simple location hints (town/region) to close coverage gaps.
- Uses Locatie column if present, otherwise parses the last part of the URL slug.
- Geocodes with Nominatim (France bias).
"""
import json
import re
import time
from pathlib import Path
from typing import Optional, Tuple

import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError

# Analysis table that main() reads and then rewrites in place.
# The path is relative, so it resolves against the current working directory.
ANALYSIS_CSV = Path("analysis_output.csv")
# Optional JSON file; when it exists, main() updates its entries from the CSV rows.
ENRICHED_JSON = Path("enriched_data.json")


def hint_from_url(url: str) -> Optional[str]:
    """Derive a free-text location hint from the last path segment of a URL.

    The segment after the final ``/`` is taken as the slug. If the slug
    contains ``-for-sale-``, only the text after its last occurrence is kept;
    otherwise the whole slug is used. Dashes are then turned into spaces.

    Args:
        url: Listing URL. Non-string values (e.g. ``None`` or a NaN read from
            a CSV cell) are accepted and yield ``None``.

    Returns:
        The hint text, or ``None`` when no usable text can be extracted.
    """
    if not isinstance(url, str):
        return None

    # Drop the fragment first (it may itself contain "?"), then the query
    # string, so neither ends up inside the location hint.
    path = url.partition("#")[0].partition("?")[0]
    slug = path.strip().rstrip("/").split("/")[-1]

    # str.split always returns at least one element, so [-1] is safe even
    # when the marker is absent (the whole slug is returned in that case).
    tail = slug.split("-for-sale-")[-1]
    hint = tail.replace("-", " ").strip()
    return hint or None


def geocode_hint(geolocator, hint: str, *, delay: float = 1.0) -> Optional[Tuple[float, float]]:
    """Geocode a free-text location hint, with ``", France"`` appended to the query.

    Args:
        geolocator: Object exposing a geopy-style
            ``geocode(query, exactly_one=..., timeout=...)`` method.
        hint: Town/region text to look up.
        delay: Seconds to sleep before the request so that sequential calls
            are throttled. Defaults to one second; values <= 0 disable the
            pause.

    Returns:
        ``(latitude, longitude)`` of the match, or ``None`` when the hint is
        blank, nothing was found, or the lookup raised an error.
    """
    query_hint = "" if hint is None else str(hint).strip()
    if not query_hint:
        # A blank hint would send the bare query ", France", which does not
        # describe any specific place; skip the request entirely.
        return None

    if delay > 0:
        time.sleep(delay)

    try:
        loc = geolocator.geocode(f"{query_hint}, France", exactly_one=True, timeout=10)
    except (GeocoderTimedOut, GeocoderServiceError) as exc:
        # Expected service-side failures: report and carry on (best effort).
        print(f"⚠️ Geocoder unavailable for '{query_hint}': {exc}")
        return None
    except Exception as exc:
        # Still best effort, but make the unexpected failure visible.
        print(f"⚠️ Unexpected geocoding error for '{query_hint}': {exc!r}")
        return None

    if loc:
        return (loc.latitude, loc.longitude)
    return None


def main():
    """Fill in missing coordinates in the analysis CSV and mirror them to JSON.

    Reads ``ANALYSIS_CSV``, geocodes every row that lacks a latitude/longitude
    pair, writes the CSV back in place, and then, if ``ENRICHED_JSON`` exists,
    copies coordinates and location names into its entries. Prints a message
    and returns early when the CSV is missing.
    """
    if not ANALYSIS_CSV.exists():
        print(f"❌ {ANALYSIS_CSV} not found.")
        return

    geolocator = Nominatim(user_agent="farmmatch_hint_geocoder")
    df = pd.read_csv(ANALYSIS_CSV)
    # Make sure the columns used below exist so row lookups never fail.
    for column in ("Latitude", "Longitude", "Locatie"):
        if column not in df.columns:
            df[column] = None

    updates = _fill_missing_coordinates(df, geolocator)
    df.to_csv(ANALYSIS_CSV, index=False, encoding="utf-8")

    # Update JSON if present
    if ENRICHED_JSON.exists():
        _sync_enriched_json(df)

    print(f"\n💾 Geocoded {updates} properties from hints.")
    print("Re-run risk_features.py and quality_gate.py to update coverage.")


def _fill_missing_coordinates(df, geolocator) -> int:
    """Geocode rows of *df* lacking coordinates, in place; return rows updated."""
    # hint -> coords. Only successful lookups are remembered, so a hint that
    # failed (e.g. on a timeout) is tried again for the next row that uses it.
    resolved = {}
    updates = 0
    for idx, row in df.iterrows():
        if pd.notna(row.get("Latitude")) and pd.notna(row.get("Longitude")):
            continue
        url = row.get("URL", "")
        loc_hint = row.get("Locatie")
        if not loc_hint or pd.isna(loc_hint) or loc_hint == "Unknown":
            loc_hint = hint_from_url(url)
        if not loc_hint:
            continue

        coords = resolved.get(loc_hint)
        if coords is None:
            coords = geocode_hint(geolocator, loc_hint)
            if coords:
                resolved[loc_hint] = coords
        if not coords:
            continue

        lat, lon = coords
        df.at[idx, "Latitude"] = lat
        df.at[idx, "Longitude"] = lon
        updates += 1
        print(f"✅ {url} -> ({lat:.5f}, {lon:.5f}) via '{loc_hint}'")
    return updates


def _sync_enriched_json(df) -> None:
    """Copy coordinates/location from *df* into ENRICHED_JSON entries by URL."""
    if "URL" not in df.columns:
        # Without URLs there is nothing to match entries on.
        print(f"⚠️ No URL column in {ANALYSIS_CSV}; {ENRICHED_JSON} left unchanged.")
        return

    data = json.loads(ENRICHED_JSON.read_text(encoding="utf-8"))
    by_url = {
        url: (lat, lon, loc)
        for url, lat, lon, loc in zip(
            df["URL"], df["Latitude"], df["Longitude"], df["Locatie"]
        )
    }
    for prop in data:
        if not isinstance(prop, dict):
            continue
        url = prop.get("url")
        if not url or url not in by_url:
            continue
        lat, lon, loc = by_url[url]
        # pandas marks missing numbers as NaN, which is not None. Testing with
        # notna keeps NaN from overwriting existing coordinates and from being
        # serialized as the non-standard JSON token NaN.
        if pd.notna(lat):
            prop["lat"] = lat
        if pd.notna(lon):
            prop["lon"] = lon
        if loc and not pd.isna(loc) and (not prop.get("location") or prop.get("location") == "Unknown"):
            prop["location"] = loc
    ENRICHED_JSON.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")


# Run the geocoding pass only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
