#!/usr/bin/env python3
"""
Import favorites from frenchestateagents.com into the FarmMatch pipeline.

Usage:
  ../venv/bin/python3.14 import_french_favorites.py \\
    --url "https://www.frenchestateagents.com/french-property-for-sale/myList?..."
    [--headless] [--output french_favorites.csv]

Behavior:
- Opens the favorites page in Playwright (headful by default to pass Cloudflare).
- Extracts unique property links (JSON-LD list entries, or links matching the
  /french-property-for-sale/view/A... pattern).
- Saves them to a sidecar CSV for reference.
- Appends missing URLs into extracted_property_urls.csv, filling price/location and
  other card metadata when the page exposes them (left empty otherwise),
  so the existing analysis + parse_criteria pipeline can process them.
"""
import argparse
import asyncio
import csv
import os
import re
from pathlib import Path
from urllib.parse import urljoin
from playwright.async_api import async_playwright

# Reference codes of the favorites that make up the built-in default list.
_DEFAULT_FAVORITE_REFS = (
    "A18831CRR83", "A20736CCU56", "A22838ANB17", "A23260DEM22",
    "A25130RL50", "A25266EB17", "A26151JRD22", "A29436CAH33",
    "A32331LRL53", "A34658BBE17", "A34672PRD19", "A34876HA47",
    "A35835RSI30", "A35881SSA17", "A35943JHI17", "A36835AA50",
    "A37466AR36", "A37612BT32", "A37960LOK61", "A38017MKE23",
    "A38051MCW22", "A38161BBE17", "A38429ELM17", "A38952RL50",
    "A39488SNM46", "A40906JHK56", "A41559HL29", "A42361EI61",
    "A42822CST34", "A42949MCW22", "A43089EDA29",
)

# Default favorites page: each reference is encoded as an "<index>=<ref>" query pair.
DEFAULT_URL = (
    "https://www.frenchestateagents.com/french-property-for-sale/myList?"
    + "&".join(f"{index}={ref}" for index, ref in enumerate(_DEFAULT_FAVORITE_REFS))
)

# CSV (relative to the working directory) that newly found URLs are appended to.
EXTRACTED_CSV = Path("extracted_property_urls.csv")


# Strategy 1 (preferred): read property URLs from a JSON-LD block whose
# mainEntity.itemListElement entries carry item.url. Yields [{url}, ...].
_JSONLD_ITEMS_JS = """
(() => {
  try {
    const scripts = Array.from(document.querySelectorAll('script[type="application/ld+json"]'));
    for (const s of scripts) {
      try {
        const data = JSON.parse(s.textContent || '{}');
        // Handle ItemList structure
        const items = (data?.mainEntity?.itemListElement) || [];
        const urls = items
          .map(i => i?.item?.url)
          .filter(Boolean)
          .map(url => ({ url }));
        if (urls.length) return Array.from(new Set(urls.map(o => o.url))).map(u => ({ url: u }));
      } catch (e) {}
    }
  } catch (e) {}
  return [];
})()
"""

# Strategy 2: walk the ".card" elements and collect url plus thumbnail, price,
# coordinates, region, town and maps link where the card exposes them.
_CARD_ITEMS_JS = """
(() => {
  const cards = Array.from(document.querySelectorAll('.card'));
  const results = [];
  for (const card of cards) {
    const anchor = card.querySelector('a[href*="/french-property-for-sale/view/"]');
    const url = anchor?.href;
    if (!url) continue;
    const img = card.querySelector('.card-image img');
    const thumb = img?.src || null;
    const priceEl = card.querySelector('.price .new-price, .price');
    const priceText = priceEl ? priceEl.textContent.replace(/[^0-9,.]/g, '') : null;
    const map = card.querySelector('img.map-region');
    const lat = map?.dataset?.lat || null;
    const lon = map?.dataset?.lon || null;
    const regionText = card.querySelector('.locations .primary')?.textContent?.trim() || null;
    const townText = card.querySelector('.card-footer .town')?.textContent?.trim() || null;
    const mapsLink = card.querySelector('a.btn-map')?.href || null;
    results.push({
      url,
      thumb,
      price: priceText,
      lat,
      lon,
      region: regionText,
      town: townText,
      maps: mapsLink
    });
  }
  return results;
})()
"""

# Strategy 3 (fallback): scan every anchor for a /french-property-for-sale/view/A... href.
_ANCHOR_ITEMS_JS = """(els) => {
    const hrefs = els
      .map(e => e.href || '')
      .filter(h => /french-property-for-sale\\/view\\/A\\w+/.test(h));
    const unique = Array.from(new Set(hrefs));
    return unique.map(url => ({ url }));
}"""


async def _extract_items(page) -> list[dict]:
    """Run the extraction strategies in order; return the first non-empty result."""
    items = await page.evaluate(_JSONLD_ITEMS_JS)
    if not items:
        items = await page.evaluate(_CARD_ITEMS_JS)
    if not items:
        items = await page.eval_on_selector_all("a", _ANCHOR_ITEMS_JS)
    return items


async def fetch_links(url: str, headless: bool, keep_open: bool, wait_seconds: int) -> list[dict]:
    """Load the favorites page, allow solving challenges, and extract property links + metadata.

    Extraction strategy (first non-empty result wins):
    1) Parse JSON-LD ItemList entries (most reliable on this page) → url list.
    2) Parse card DOM for url, thumbnail, price, coords, region/town, maps link.
    3) Fallback to anchor hrefs matching /french-property-for-sale/view/A...

    Args:
        url: Favorites page to open.
        headless: Launch Chromium without a visible window.
        keep_open: Lift the 5-attempt cap that otherwise applies when stdin is
            not a terminal, so extraction keeps being retried.
        wait_seconds: Initial pause after navigation, before the first attempt.

    Returns:
        A list of dicts, each with at least a "url" key (strategy 2 adds the
        metadata keys); an empty list when nothing was found or the user quit.
    """
    import sys

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        # try/finally: the browser is closed on every exit path, including
        # navigation timeouts and errors raised while evaluating the page.
        try:
            context = await browser.new_context(
                user_agent=("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/120.0.0.0 Safari/537.36")
            )
            page = await context.new_page()
            print(f"🌐 Opening favorites page (headless={headless})...")
            await page.goto(url, wait_until="domcontentloaded", timeout=120000)
            if wait_seconds > 0:
                print(f"⏳ Waiting {wait_seconds}s to let the page/challenge settle...")
                await page.wait_for_timeout(wait_seconds * 1000)
            print("⏳ If you see a challenge or blank page, solve it in the opened browser.")

            interactive = sys.stdin.isatty()
            # Only unattended runs without keep_open are capped; otherwise the
            # user (or keep_open) decides when to stop.
            max_attempts = 5 if not keep_open and not interactive else 9999

            for attempt in range(1, max_attempts + 1):
                if interactive:
                    try:
                        user_input = input(f"[Attempt {attempt}] Press Enter to extract links (or 'q' to quit): ").strip().lower()
                    except (KeyboardInterrupt, EOFError):
                        # Ctrl-C or a closed stdin both mean "stop asking".
                        break
                    if user_input.startswith('q'):
                        break
                else:
                    # Non-interactive: pause a bit between attempts
                    await page.wait_for_timeout(20000)

                await page.wait_for_timeout(1500)

                items = await _extract_items(page)
                if items:
                    return items

                print("⚠️  No links found yet. Solve the challenge / ensure the list is visible, then retry.")

            if keep_open:
                # NOTE(review): despite the wording, the browser is still closed
                # right after this short delay (see the finally clause below).
                print("🟢 Keeping browser open; close it manually when done. Exiting without links.")
                await asyncio.sleep(2)  # small delay to ensure message is seen
            return []
        finally:
            await browser.close()


def write_reference_csv(items: list[dict], path: Path):
    """Dump the extracted items to a sidecar CSV so they can be inspected.

    The file is overwritten. Each item contributes one row; keys an item does
    not have end up as empty cells. Falls back to "french_favorites.csv" when
    no path is supplied.
    """
    target = path or Path("french_favorites.csv")

    # CSV column -> key of the extracted item, listed in output order.
    column_sources = {
        "Property URL": "url",
        "Price": "price",
        "Thumbnail": "thumb",
        "Latitude": "lat",
        "Longitude": "lon",
        "Region": "region",
        "Town": "town",
        "MapsLink": "maps",
    }

    with target.open("w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=list(column_sources))
        out.writeheader()
        out.writerows(
            {column: entry.get(key) for column, key in column_sources.items()}
            for entry in items
        )
    print(f"✅ Saved {len(items)} URLs to {target}")


def append_to_extracted(items: list[dict], extracted_path: Path):
    """Append new URLs into extracted_property_urls.csv (deduped, with optional metadata).

    Args:
        items: Extracted entries; each is a dict with a "url" key and optional
            "town", "region", "price", "thumb", "lat", "lon", "maps" keys, or a
            bare URL value.
        extracted_path: CSV to update; falls back to EXTRACTED_CSV when falsy.

    An item is skipped when its URL is empty, already present in the file's
    "Property URL" or "URL" column, or repeated earlier in ``items``. When
    something new is found the whole file is rewritten with the existing rows
    first; when nothing is new the file is left untouched.
    """
    extracted_path = extracted_path or EXTRACTED_CSV

    rows = []
    header = []
    if extracted_path.exists():
        with extracted_path.open(newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            rows = list(reader)
            # Kept so a header-only file does not lose its columns on rewrite.
            header = list(reader.fieldnames or [])

    # Check both URL columns for dedup; missing/blank cells must not count.
    existing = {row.get(column) for row in rows for column in ("Property URL", "URL")}
    existing.discard(None)
    existing.discard("")

    new_rows = []
    for item in items:
        meta = item if isinstance(item, dict) else {"url": item}
        url = meta.get("url")
        if not url or url in existing:
            continue
        existing.add(url)  # also dedupe repeats within this batch
        # Build location string from town/region if available
        loc = ", ".join(part for part in (meta.get("town"), meta.get("region")) if part)
        new_rows.append({
            "Property URL": url,
            "URL": url,  # Also populate URL column for analyze_from_urls.py compatibility
            "Locatie": loc,
            "Prijs": meta.get("price"),
            "Thumbnail": meta.get("thumb"),
            "Latitude": meta.get("lat"),
            "Longitude": meta.get("lon"),
            "MapsLink": meta.get("maps"),
        })

    if new_rows:
        print(f"➕ Adding {len(new_rows)} new URLs to {extracted_path}")
        # Preserve any existing columns, but ensure our known ones are present
        base_fieldnames = ["Property URL", "Locatie", "Prijs", "Thumbnail", "Latitude", "Longitude", "MapsLink"]
        extra_fields = list(rows[0].keys()) if rows else header
        # "URL" goes last so it is always a declared column: new rows carry it,
        # and DictWriter raises ValueError on keys missing from fieldnames.
        fieldnames = list(dict.fromkeys(base_fieldnames + extra_fields + ["URL"]))
        with extracted_path.open("w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows + new_rows)
    else:
        print("ℹ️  No new URLs to add (already present).")


async def main():
    """CLI workflow: parse options, scrape the favorites page, then write both CSVs.

    Returns early (after printing guidance) when no links could be extracted.
    Otherwise every URL is resolved against the favorites page URL before the
    sidecar CSV is written and the pipeline CSV is updated.
    """
    cli = argparse.ArgumentParser(description="Import FrenchEstateAgents favorites")
    cli.add_argument("--url", default=DEFAULT_URL, help="Favorites page URL")
    cli.add_argument("--headless", action="store_true", help="Run browser headless")
    cli.add_argument("--output", default="french_favorites.csv", help="Sidecar CSV for extracted URLs")
    cli.add_argument("--keep-browser-open", action="store_true",
                     help="Leave browser open on failure so you can keep solving challenges")
    cli.add_argument("--wait-seconds", type=int, default=10,
                     help="Wait time before first extraction attempt (default 10s)")
    args = cli.parse_args()

    found = await fetch_links(
        args.url,
        headless=args.headless,
        keep_open=args.keep_browser_open,
        wait_seconds=args.wait_seconds,
    )
    if not found:
        print("❌ No links found. Solve any challenge in the opened browser and retry without --headless.")
        if args.keep_browser_open:
            print("🟢 Browser left open; keep solving, then rerun this command and press Enter on each attempt.")
        return

    # Make every URL absolute; dict entries keep whatever metadata they carry.
    normalized = []
    for entry in found:
        if not isinstance(entry, dict):
            normalized.append({'url': urljoin(args.url, str(entry))})
            continue
        raw_url = entry.get('url')
        if raw_url:
            normalized.append({**entry, 'url': urljoin(args.url, raw_url)})

    write_reference_csv(normalized, Path(args.output))
    append_to_extracted(normalized, EXTRACTED_CSV)
    print("🎯 Done. Run analyze_from_urls.py next, then parse_criteria.py and quality_gate.py")


# Script entry point: only when executed directly (not on import), run the
# async main() coroutine to completion via asyncio.run().
if __name__ == "__main__":
    asyncio.run(main())
