#!/usr/bin/env python3
"""
Headful scraper to extract breadcrumb/location hints from Properstar pages that block simple requests.

Targets: properties missing Latitude/Longitude in analysis_output.csv and with Properstar URLs.
Extracts:
- breadcrumb text (nav[aria-label='breadcrumb'] or .breadcrumb-container/.breadcrumb)
- meta locality (og:locality)

Then geocodes the first hint that resolves (breadcrumb, then og:locality, then the row's existing
Locatie value) and updates analysis_output.csv / enriched_data.json.

Run from scraper/:
  ../venv/bin/python3.14 fetch_breadcrumb_properstar.py
"""
import asyncio
import json
import time
from pathlib import Path
from typing import Dict, Tuple, Optional

import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
from playwright.async_api import async_playwright

# Both paths are relative to the current working directory (the module
# docstring says to run the script from scraper/).
# CSV read by load_targets() to pick rows and rewritten by update_files().
ANALYSIS_CSV = Path("analysis_output.csv")
# JSON file read and rewritten by update_files(); skipped there if absent.
ENRICHED_JSON = Path("enriched_data.json")


def load_targets() -> pd.DataFrame:
    """Return the rows of the analysis CSV that still need coordinates.

    A row qualifies when both its ``Latitude`` and ``Longitude`` cells are
    missing and its ``URL`` matches "properstar". Rows with a missing URL
    never qualify (``na=False``).

    Returns:
        The matching subset of the frame read from ``ANALYSIS_CSV``, with the
        original row index preserved.
    """
    listings = pd.read_csv(ANALYSIS_CSV)
    lacks_coords = listings["Latitude"].isna() & listings["Longitude"].isna()
    is_properstar = listings["URL"].str.contains("properstar", na=False)
    return listings.loc[lacks_coords & is_properstar]


async def scrape_page(page, url: str) -> Dict[str, Optional[str]]:
    """Navigate *page* to *url* and read location hints from the DOM.

    Args:
        page: Object exposing the awaitable ``goto``, ``wait_for_timeout`` and
            ``evaluate`` methods (a Playwright page in ``main``).
        url: Address of the listing to open.

    Returns:
        Whatever the in-page script returns: an object with the keys
        ``breadcrumb`` (link/span texts of the first matching breadcrumb
        element joined with " > ", or null) and ``metaLoc`` (content of the
        ``og:locality`` meta tag, or null).
    """
    # Runs inside the browser; the selectors are tried in order and the first
    # element found is used.
    extract_hints_js = """
() => {
  const breadcrumb = document.querySelector("nav[aria-label='breadcrumb']") ||
                     document.querySelector(".breadcrumb-container") ||
                     document.querySelector(".breadcrumb");
  let crumbText = null;
  if (breadcrumb) {
    const parts = Array.from(breadcrumb.querySelectorAll("a, span"))
      .map(el => el.textContent?.trim())
      .filter(Boolean);
    if (parts.length) crumbText = parts.join(" > ");
  }
  const metaLoc = document.querySelector("meta[property='og:locality']")?.content || null;
  return {breadcrumb: crumbText, metaLoc};
}
"""
    await page.goto(url, wait_until="domcontentloaded", timeout=120000)
    # Fixed 4 s pause after DOMContentLoaded before touching the DOM
    # (presumably to let client-side rendering finish -- TODO confirm).
    await page.wait_for_timeout(4000)
    return await page.evaluate(extract_hints_js)


def geocode_hint(
    hint: str,
    geolocator,
    *,
    delay: float = 1.0,
    attempts: int = 2,
) -> Optional[Tuple[float, float]]:
    """Resolve a free-text location hint to ``(latitude, longitude)``.

    Args:
        hint: Text to geocode. Anything that is not a non-blank string
            (``None``, ``""``, a pandas NaN float, ...) yields ``None``
            without contacting the geocoder.
        geolocator: Object with a geopy-style
            ``geocode(query, exactly_one=..., timeout=...)`` method.
        delay: Seconds to sleep before every request, to throttle calls to
            the geocoding service. ``0`` disables the pause.
        attempts: Maximum number of requests; only timeouts and service
            errors trigger another attempt.

    Returns:
        The coordinates of the single best match, or ``None`` when the hint is
        unusable, the service has no match, or every attempt failed.
    """
    if not isinstance(hint, str) or not hint.strip():
        return None

    for _ in range(attempts):
        try:
            if delay > 0:
                time.sleep(delay)
            loc = geolocator.geocode(hint, exactly_one=True, timeout=10)
        except (GeocoderTimedOut, GeocoderServiceError):
            # Transient failure: worth another attempt.
            continue
        except Exception as exc:
            # Best-effort lookup: report the problem instead of hiding it,
            # but never let one bad hint abort the whole run.
            print(f"   ⚠️ Geocoding failed for {hint!r}: {exc}")
            return None
        if not loc:
            # A definitive "no match" will not change on retry, so do not
            # spend a second request (and a second sleep) on it.
            return None
        return loc.latitude, loc.longitude
    return None


def update_files(updates: Dict[str, Tuple[float, float, Optional[str]]]):
    """Write geocoding results back to the analysis CSV and the enriched JSON.

    Args:
        updates: Maps a listing URL to ``(lat, lon, location_text)``. ``None``
            members are ignored; the location text only fills in an empty
            (or, in the JSON file, "Unknown") location and never overwrites
            an existing one.

    Each file is processed only if it exists and is rewritten in place.
    """
    if ANALYSIS_CSV.exists():
        _apply_updates_to_csv(updates)
    if ENRICHED_JSON.exists():
        _apply_updates_to_json(updates)


def _apply_updates_to_csv(updates: Dict[str, Tuple[float, float, Optional[str]]]) -> None:
    """Patch Latitude/Longitude/Locatie in ANALYSIS_CSV for rows whose URL is in *updates*."""
    df = pd.read_csv(ANALYSIS_CSV)
    for col in ("Latitude", "Longitude", "Locatie"):
        if col not in df.columns:
            df[col] = None
    # A "Locatie" column with no values is parsed as float64; assigning a
    # string into such a column is deprecated/rejected by recent pandas, so
    # make sure the column can hold text before writing to it.
    df["Locatie"] = df["Locatie"].astype(object)

    if "URL" in df.columns:
        # Visit only the rows that actually have an update.
        for idx in df.index[df["URL"].isin(list(updates))]:
            lat, lon, loc = updates[df.at[idx, "URL"]]
            if lat is not None:
                df.at[idx, "Latitude"] = lat
            if lon is not None:
                df.at[idx, "Longitude"] = lon
            current = df.at[idx, "Locatie"]
            if loc and not pd.isna(loc) and (pd.isna(current) or not current):
                df.at[idx, "Locatie"] = loc
    df.to_csv(ANALYSIS_CSV, index=False, encoding="utf-8")


def _apply_updates_to_json(updates: Dict[str, Tuple[float, float, Optional[str]]]) -> None:
    """Patch lat/lon/location in ENRICHED_JSON for entries whose url is in *updates*."""
    # NOTE(review): assumes the JSON root is a list of dicts -- confirm.
    data = json.loads(ENRICHED_JSON.read_text(encoding="utf-8"))
    for prop in data:
        url = prop.get("url")
        if url not in updates:
            continue
        lat, lon, loc = updates[url]
        if lat is not None:
            prop["lat"] = lat
        if lon is not None:
            prop["lon"] = lon
        current = prop.get("location")
        if loc and not pd.isna(loc) and (not current or current == "Unknown"):
            prop["location"] = loc
    ENRICHED_JSON.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")


async def main():
    """Scrape location hints for every target listing, geocode them, save results.

    Opens a visible (non-headless) Chromium window so a human can solve any
    challenge page, visits each target URL, and tries the hints in order:
    breadcrumb text, ``og:locality``, then the row's existing ``Locatie``.
    The first hint that geocodes wins. Only listings that produced
    coordinates are recorded, so the output files are left untouched when
    nothing was found.
    """
    targets = load_targets()
    if targets.empty:
        print("✅ No Properstar targets with missing coords.")
        return

    geolocator = Nominatim(user_agent="farmmatch_properstar_breadcrumbs")
    updates: Dict[str, Tuple[float, float, Optional[str]]] = {}
    total = len(targets)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()
            print("🌐 Browser opened headful. Solve any challenge on the first page if shown.")
            for i, (_, row) in enumerate(targets.iterrows(), 1):
                url = row["URL"]
                print(f"[{i}/{total}] {url}")
                try:
                    data = await scrape_page(page, url)
                    candidates = [data.get("breadcrumb"), data.get("metaLoc"), row.get("Locatie")]
                    # A missing CSV cell arrives as NaN, which is truthy, so
                    # filter on "non-blank string" rather than truthiness.
                    hints = [h for h in candidates if isinstance(h, str) and h.strip()]
                    coords = None
                    loc = None
                    for hint in hints:
                        # geocode_hint sleeps and does blocking HTTP; run it in
                        # a worker thread so the event loop driving the
                        # browser connection is not stalled.
                        coords = await asyncio.to_thread(geocode_hint, hint, geolocator)
                        if coords is not None:
                            loc = hint
                            break
                    if coords is not None:
                        lat, lon = coords
                        updates[url] = (lat, lon, loc)
                        print(f"   ✅ ({lat:.5f}, {lon:.5f}) | {loc}")
                    else:
                        print(f"   ❌ No coords found for hints: {hints}")
                except Exception as e:
                    # One failing listing must not abort the rest of the run.
                    print(f"   ❌ Error: {e}")
        finally:
            # Close the browser even if the loop is interrupted.
            await browser.close()

    if updates:
        update_files(updates)
        found = len(updates)
        print(f"💾 Applied {found} coordinate updates.")
    else:
        print("ℹ️ No updates applied.")


# Script entry point: only when executed directly (not on import), run the
# async main() coroutine to completion on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
