#!/usr/bin/env python3
"""
Fetch coordinates for FrenchEstateAgents properties with missing lat/lon using Playwright (headful).

Flow:
- Open one headful browser window (a single page reused for every URL) so you can solve the Cloudflare challenge once.
- For each target URL (missing coords, domain = frenchestateagents.com):
    - navigate
    - read img.map-region[data-lat][data-lon], .locations .primary (region/dep), .card-footer .town (town), a.btn-map href (Google Maps link, geocoded as a fallback)
- Update analysis_output.csv (Latitude/Longitude/Locatie) and enriched_data.json (lat/lon/location).

Usage:
  ../venv/bin/python3.14 fetch_coords_frenchestateagents.py

Note: Keep the browser window in front; solve any challenge on the first page, then the script will continue.
"""
import asyncio
import json
import re
import time
from pathlib import Path
from typing import Dict, Tuple
from urllib.parse import parse_qs, urlsplit

import pandas as pd
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
from geopy.geocoders import Nominatim
from playwright.async_api import async_playwright

# CSV (relative to the working directory) that is read to find target rows via
# its "URL" and "Latitude" columns and rewritten with Latitude/Longitude/Locatie.
ANALYSIS_CSV = Path("analysis_output.csv")
# JSON file (relative to the working directory) whose records are matched on
# their "url" key and updated with "lat", "lon" and "location".
ENRICHED_JSON = Path("enriched_data.json")


def load_targets() -> Dict[str, Dict]:
    """Collect the FrenchEstateAgents listing URLs that still lack a latitude.

    Reads ``ANALYSIS_CSV`` and keeps the rows whose ``Latitude`` is missing and
    whose ``URL`` contains the literal text ``frenchestateagents.com``.

    NOTE: only ``Latitude`` is inspected; a row that has a latitude but no
    longitude is not selected.

    Returns:
        A dict keyed by URL (duplicate URLs collapse into one entry, in file
        order), each mapped to a fresh empty dict.
    """
    df = pd.read_csv(ANALYSIS_CSV)
    missing_lat = df["Latitude"].isna()
    # regex=False: match the domain literally. As a regex, each "." would match
    # any character (e.g. "frenchestateagentsXcom").
    on_site = df["URL"].str.contains("frenchestateagents.com", na=False, regex=False)
    return {url: {} for url in df.loc[missing_lat & on_site, "URL"]}


# JavaScript evaluated inside each listing page: pulls the coordinates stored on
# the region map image, the region and town labels, and the Google Maps link.
_LISTING_JS = """
() => {
  const mapImg = document.querySelector('img.map-region');
  const lat = mapImg?.dataset?.lat || null;
  const lon = mapImg?.dataset?.lon || null;
  const region = document.querySelector('.locations .primary')?.textContent?.trim() || null;
  const town = document.querySelector('.card-footer .town')?.textContent?.trim() || null;
  const maps = document.querySelector('a.btn-map[href*="maps.google.com/maps?q="]')?.href || null;
  return {lat, lon, region, town, maps};
}
"""


def _parse_coord(raw):
    """Convert a scraped ``data-lat``/``data-lon`` value to float; None if absent or malformed."""
    if not raw:
        return None
    try:
        return float(raw)
    except (TypeError, ValueError):
        return None


def _maps_query(href):
    """Return the URL-decoded ``q`` parameter of a Google Maps link, or None if it has none."""
    values = parse_qs(urlsplit(href).query).get("q")
    if not values:
        return None
    return values[0].strip() or None


async def _geocode_hint(geolocator, hint):
    """Best-effort geocoding of *hint*; returns ``(lat, lon)`` or None on no match/failure."""
    try:
        # Pause before every request (presumably to respect the geocoding
        # service's rate limit); asyncio.sleep keeps the event loop responsive.
        await asyncio.sleep(1)
        # geocode() is a blocking network call, so run it in a worker thread.
        geo = await asyncio.to_thread(
            geolocator.geocode, hint, exactly_one=True, timeout=10
        )
    except (GeocoderTimedOut, GeocoderServiceError) as exc:
        print(f"   ⚠️ Geocoding failed for {hint!r}: {exc}")
        return None
    except Exception as exc:
        # Still best-effort: report the unexpected failure instead of hiding it.
        print(f"   ⚠️ Unexpected geocoding error for {hint!r}: {type(exc).__name__}: {exc}")
        return None
    if not geo:
        return None
    return geo.latitude, geo.longitude


async def fetch_coords(urls):
    """Visit each listing URL in a headful browser and collect its coordinates.

    For every URL the page is loaded, given four seconds to settle, and then
    scraped for ``data-lat``/``data-lon`` on ``img.map-region`` plus the town
    and region labels. If either coordinate is missing and the page has a
    Google Maps link, the link's ``q`` parameter is geocoded with Nominatim as
    a fallback.

    Args:
        urls: Sized sequence of listing URLs to visit, in order.

    Returns:
        Dict mapping each successfully processed URL to ``(lat, lon, loc)``.
        ``lat``/``lon`` are floats or None; ``loc`` is "town, region" (whichever
        parts exist), the geocoded query text, or None. URLs whose page raised
        an error are absent from the result.
    """
    updates = {}
    total = len(urls)
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()
            print("🌐 Browser opened. Solve any challenge on the first page, then let it continue.")
            geolocator = Nominatim(user_agent="farmmatch_fea_coords")

            for i, url in enumerate(urls, 1):
                print(f"[{i}/{total}] {url}")
                try:
                    await page.goto(url, wait_until="domcontentloaded", timeout=120000)
                    await page.wait_for_timeout(4000)
                    data = await page.evaluate(_LISTING_JS)

                    # A malformed attribute counts as "missing" so the maps
                    # fallback below still gets a chance.
                    lat = _parse_coord(data.get("lat"))
                    lon = _parse_coord(data.get("lon"))
                    loc_parts = [part for part in (data.get("town"), data.get("region")) if part]
                    loc = ", ".join(loc_parts) if loc_parts else None

                    # Fallback: geocode the maps link's query if no lat/lon
                    if (lat is None or lon is None) and data.get("maps"):
                        loc_hint = _maps_query(data["maps"])
                        coords = await _geocode_hint(geolocator, loc_hint) if loc_hint else None
                        if coords:
                            lat, lon = coords
                            if not loc:
                                loc = loc_hint

                    updates[url] = (lat, lon, loc)
                    if lat is not None and lon is not None:
                        print(f"   ✅ ({lat:.5f}, {lon:.5f}) | {loc or 'n/a'}")
                    else:
                        print("   ❌ No coords found on page or via maps link")
                except Exception as e:
                    # One broken page must not abort the whole run.
                    print(f"   ❌ Error: {e}")
        finally:
            # Close the window even if the run is cancelled or interrupted.
            await browser.close()
    return updates


def update_files(updates: Dict[str, Tuple]):
    """Write fetched coordinates and locations back to the CSV and JSON files.

    Each file is only touched if it exists. Coordinates overwrite the stored
    value whenever the fetched one is not None; the location is only filled in
    when the stored one is empty (or, in the JSON file, equal to "Unknown").

    Args:
        updates: Mapping of listing URL to ``(lat, lon, loc)`` where any of
            the three values may be None.
    """
    # CSV
    if ANALYSIS_CSV.exists():
        df = pd.read_csv(ANALYSIS_CSV)
        for col in ["Latitude", "Longitude", "Locatie"]:
            if col not in df.columns:
                df[col] = None
        # A completely empty "Locatie" column is parsed as float64; writing a
        # string into it is an incompatible-dtype assignment that pandas warns
        # about and is slated to reject. Store the column as generic objects.
        df["Locatie"] = df["Locatie"].astype(object)

        for idx, row in df.iterrows():
            entry = updates.get(row.get("URL"))
            if entry is None:
                continue
            lat, lon, loc = entry
            if lat is not None:
                df.at[idx, "Latitude"] = lat
            if lon is not None:
                df.at[idx, "Longitude"] = lon
            current_loc = row.get("Locatie")
            if loc and (pd.isna(current_loc) or not current_loc):
                df.at[idx, "Locatie"] = loc
        df.to_csv(ANALYSIS_CSV, index=False, encoding="utf-8")

    # JSON
    if ENRICHED_JSON.exists():
        data = json.loads(ENRICHED_JSON.read_text(encoding="utf-8"))
        for prop in data:
            entry = updates.get(prop.get("url"))
            if entry is None:
                continue
            lat, lon, loc = entry
            if lat is not None:
                prop["lat"] = lat
            if lon is not None:
                prop["lon"] = lon
            if loc and (not prop.get("location") or prop.get("location") == "Unknown"):
                prop["location"] = loc
        ENRICHED_JSON.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")


async def main():
    """Run the workflow: find targets, scrape them, persist and report the results."""
    targets = load_targets()
    if not targets:
        print("✅ No FrenchEstateAgents targets with missing coords.")
        return
    print(f"🔍 {len(targets)} FrenchEstateAgents properties need coords.")

    updates = await fetch_coords(list(targets))
    if not updates:
        print("ℹ️ No updates applied.")
        return

    update_files(updates)
    # `updates` also holds pages where no coordinates were found, so count only
    # the entries that actually carry both values.
    with_coords = sum(
        1 for lat, lon, _loc in updates.values() if lat is not None and lon is not None
    )
    print(f"💾 Applied {with_coords} coordinate updates ({len(updates)} pages processed).")


# Script entry point: run the async workflow only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    asyncio.run(main())
