#!/usr/bin/env python3
"""
Fill missing lat/lon for FrenchEstateAgents properties by scraping the detail pages.

Sources:
- <img class="map-region" data-lat=".." data-lon="..">
- Optional Google Maps link (used only as a location hint): a.btn-map[href*="maps.google.com/maps?q="]
- Region/town hints: .locations .primary, .card-footer .town

Updates:
- analysis_output.csv (Latitude/Longitude/Locatie)
- enriched_data.json (lat/lon/location)
"""
import json
import re
import sys
from pathlib import Path
from typing import Optional, Tuple

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Data files read and rewritten by this script. Both are relative paths, so
# they are resolved against the current working directory.
# CSV: one row per property; uses the URL, Latitude, Longitude and Locatie columns.
ANALYSIS_CSV = Path("analysis_output.csv")
# JSON: iterated as a sequence of property objects with url/lat/lon/location keys.
ENRICHED_JSON = Path("enriched_data.json")


def extract_from_page(html: str) -> Tuple[Optional[float], Optional[float], Optional[str]]:
    """Extract coordinates and a location hint from a property detail page.

    Args:
        html: Raw HTML of the detail page.

    Returns:
        A ``(lat, lon, location_hint)`` tuple.

        * ``lat`` / ``lon`` are parsed from the ``data-lat`` / ``data-lon``
          attributes of the first ``img.map-region`` element. Each is ``None``
          when its attribute is absent or empty; both are ``None`` when either
          attribute cannot be parsed as a float.
        * ``location_hint`` is the decoded ``q=`` parameter of the Google Maps
          button link, falling back to the text of ``.locations .primary`` and
          then ``.card-footer .town``. It is ``None`` when no source yields
          non-empty text.
    """
    soup = BeautifulSoup(html, "html.parser")

    # 1) data-lat/lon on img.map-region
    lat = lon = None
    img = soup.select_one("img.map-region")
    if img:
        raw_lat = img.get("data-lat")
        raw_lon = img.get("data-lon")
        try:
            lat = float(raw_lat) if raw_lat else None
            lon = float(raw_lon) if raw_lon else None
        except (TypeError, ValueError):
            # A half-parsed pair is useless, so discard both values.
            lat = lon = None

    # 2) maps link
    location_hint = None
    maps_link = soup.select_one('a.btn-map[href*="maps.google.com/maps?q="]')
    if maps_link:
        # The "?" must be escaped to match the literal query separator.
        match = re.search(r"maps\?q=([^&]+)", maps_link.get("href", ""))
        if match:
            # Turn "+" into spaces *before* percent-decoding so that an
            # encoded plus sign ("%2B") survives as a real "+".
            raw_query = match.group(1).replace("+", " ")
            location_hint = requests.utils.unquote(raw_query).strip() or None

    # 3) fallback location text, in order of preference
    for selector in (".locations .primary", ".card-footer .town"):
        if location_hint:
            break
        node = soup.select_one(selector)
        if node:
            location_hint = node.get_text(strip=True) or None

    return lat, lon, location_hint


def update_files(updates):
    """Write scraped coordinates and location hints back to the data files.

    Only missing values are filled in; existing values are never overwritten.
    Each output file is skipped when it does not exist.

    Args:
        updates: Mapping of property URL to a ``(lat, lon, location_hint)``
            tuple. ``None`` components are ignored.
    """
    _update_csv(updates)
    _update_json(updates)


def _update_csv(updates):
    """Fill empty Latitude/Longitude/Locatie cells of ANALYSIS_CSV from *updates*."""
    if not ANALYSIS_CSV.exists():
        return

    df = pd.read_csv(ANALYSIS_CSV)
    for column in ("Latitude", "Longitude", "Locatie"):
        if column not in df.columns:
            df[column] = None
    # A Locatie column that is entirely empty is read back as float64; cast it
    # to object so that storing a string does not rely on implicit upcasting.
    df["Locatie"] = df["Locatie"].astype(object)

    for idx, row in df.iterrows():
        url = row.get("URL")
        if url not in updates:
            continue
        lat, lon, loc = updates[url]
        if lat is not None and pd.isna(row.get("Latitude")):
            df.at[idx, "Latitude"] = lat
        if lon is not None and pd.isna(row.get("Longitude")):
            df.at[idx, "Longitude"] = lon
        current_loc = row.get("Locatie")
        if loc and (pd.isna(current_loc) or not current_loc):
            df.at[idx, "Locatie"] = loc

    df.to_csv(ANALYSIS_CSV, index=False, encoding="utf-8")


def _update_json(updates):
    """Fill missing lat/lon/location keys of ENRICHED_JSON entries from *updates*."""
    if not ENRICHED_JSON.exists():
        return

    data = json.loads(ENRICHED_JSON.read_text(encoding="utf-8"))
    for prop in data:
        # Skip anything that is not a property object instead of crashing on .get().
        if not isinstance(prop, dict):
            continue
        url = prop.get("url")
        if url not in updates:
            continue
        lat, lon, loc = updates[url]
        if lat is not None and prop.get("lat") is None:
            prop["lat"] = lat
        if lon is not None and prop.get("lon") is None:
            prop["lon"] = lon
        # "Unknown" is treated as a placeholder and may be replaced.
        if loc and (not prop.get("location") or prop.get("location") == "Unknown"):
            prop["location"] = loc

    ENRICHED_JSON.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")


def main():
    """Scrape coordinates for FrenchEstateAgents rows that lack a latitude.

    Reads ANALYSIS_CSV, selects rows whose URL contains
    "frenchestateagents.com" and whose Latitude is missing, fetches each
    distinct detail page once, and passes every page that yields both a
    latitude and a longitude to ``update_files``.

    Exits with status 1 when the CSV is missing or has no URL column.
    """
    if not ANALYSIS_CSV.exists():
        print(f"❌ {ANALYSIS_CSV} not found")
        sys.exit(1)

    df = pd.read_csv(ANALYSIS_CSV)
    if "URL" not in df.columns:
        print(f"❌ {ANALYSIS_CSV} has no URL column")
        sys.exit(1)

    # update_files() creates the Latitude column when it is absent, so a CSV
    # without one simply means every row is still missing coordinates.
    if "Latitude" in df.columns:
        missing_coords = df["Latitude"].isna()
    else:
        missing_coords = pd.Series(True, index=df.index)

    # Literal substring match (regex=False) so "." is not a wildcard; cast to
    # str so a URL column without any string values does not break ``.str``.
    is_target_site = df["URL"].astype(str).str.contains(
        "frenchestateagents.com", regex=False, na=False
    )

    targets = df[missing_coords & is_target_site]
    print(f"🔍 FrenchEstateAgents targets with missing coords: {len(targets)}")

    updates = {}
    # Updates are keyed by URL, so fetching a repeated URL again is wasted work.
    for url in targets["URL"].drop_duplicates():
        try:
            resp = requests.get(url, timeout=25, headers={"User-Agent": "Mozilla/5.0"})
            if resp.status_code != 200:
                print(f"  ⚠️ {url} -> HTTP {resp.status_code}")
                continue
            lat, lon, loc = extract_from_page(resp.text)
        except Exception as e:
            # Deliberately broad: one bad page must not abort the whole run.
            print(f"  ❌ Error fetching {url}: {e}")
            continue

        if lat is not None and lon is not None:
            updates[url] = (lat, lon, loc)
            print(f"  ✅ {url} -> ({lat:.5f}, {lon:.5f}) | {loc or 'n/a'}")
        else:
            print(f"  ❌ No coords in page for {url}")

    if updates:
        update_files(updates)
        print(f"\n💾 Applied {len(updates)} coordinate updates to CSV/JSON")
    else:
        print("ℹ️ No updates applied")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
