#!/usr/bin/env python3
"""
Import favorites from Idealista (idealista.pt, idealista.com for Spain, idealista.it) into the Paradisomatch pipeline.

Usage:
  source ../venv/bin/activate
  python3 import_idealista_favorites.py --domain pt   # Portugal
  python3 import_idealista_favorites.py --domain es   # Spain
  python3 import_idealista_favorites.py --domain it   # Italy
  python3 import_idealista_favorites.py --domain all  # All three
  Optional flags for any of the above: [--headless] [--wait-seconds 15] [--output FILE]

Behavior:
- Opens the Idealista favorites page in Playwright (headful by default).
- Waits for manual login if needed, then saves auth for next time.
- Extracts property links, prices, thumbnails, and locations from cards.
- Saves to a sidecar CSV (idealista_favorites.csv) for reference.
- Appends missing URLs into extracted_property_urls.csv (both Property URL
  and URL columns) so the analysis + parse_criteria pipeline can process them.
"""
import argparse
import asyncio
import csv
import os
import re
import sys
from pathlib import Path
from urllib.parse import urljoin, urlsplit

from playwright.async_api import Error as PlaywrightError
from playwright.async_api import async_playwright

# Pipeline CSV that append_to_extracted() reads and rewrites; the relative path
# is resolved against the current working directory.
EXTRACTED_CSV = Path("extracted_property_urls.csv")

# Per-domain scraping configuration, keyed by the value accepted by --domain.
#   base            - site origin, used to absolutize listing URLs
#   favorites       - URL of the logged-in user's favorites page
#   auth            - file where the Playwright storage state (login session)
#                     is loaded from and saved to
#   listing_pattern - regex matching the path of a listing detail page; used by
#                     the fallback link scan in fetch_favorites()
DOMAINS = {
    "pt": {
        "base": "https://www.idealista.pt",
        "favorites": "https://www.idealista.pt/area-pessoal/favoritos",
        "auth": "idealista_auth_pt.json",
        "listing_pattern": r"/imovel/\d+",
    },
    # Note: the "es" key maps to idealista.com, not an .es domain.
    "es": {
        "base": "https://www.idealista.com",
        "favorites": "https://www.idealista.com/area-personal/favoritos",
        "auth": "idealista_auth_es.json",
        "listing_pattern": r"/inmueble/\d+",
    },
    "it": {
        "base": "https://www.idealista.it",
        "favorites": "https://www.idealista.it/area-personale/preferiti",
        "auth": "idealista_auth_it.json",
        "listing_pattern": r"/immobile/\d+",
    },
}


# URL-path fragments that mark a favorites page: "favoritos" on the .pt/.com
# sites and "preferiti" on the .it site (see the "favorites" URLs in DOMAINS).
_FAVORITES_PATH_MARKERS = ("favorit", "preferit")

# Card selectors counted while scrolling to detect when lazy loading has stopped.
_CARD_SELECTOR = "article.item, .item-multimedia-container, .item_contains_branding, article[data-adid]"

# Selector for the "next page" control of the favorites list.
_NEXT_PAGE_SELECTOR = "a.icon-arrow-right-after, .pagination .next a"

# Hard cap on visited result pages so a "next" control that never disappears
# cannot keep the scraper looping forever.
_MAX_FAVORITE_PAGES = 50

# Browser-side extractor. It is a function expression so Playwright invokes it
# with the listing regex as an argument; splicing the regex into the JS source
# would let the JS string literal eat the backslash of escapes such as \d.
_EXTRACT_CARDS_JS = """(listingPattern) => {
  const results = [];
  const seen = new Set();

  // Strategy 1: article items (most common Idealista layout)
  const articles = document.querySelectorAll('article.item, article[data-adid], .item-info-container');
  for (const art of articles) {
    const anchor = art.closest('a[href]') || art.querySelector('a[href*="/imovel/"], a[href*="/inmueble/"], a[href*="/immobile/"]');
    if (!anchor) continue;
    const url = anchor.href;
    if (seen.has(url)) continue;
    seen.add(url);

    const img = art.querySelector('img[src], picture img');
    const thumb = img?.src || null;
    const priceEl = art.querySelector('.item-price, .price-row, [class*="price"]');
    const priceText = priceEl?.textContent?.replace(/[^0-9]/g, '') || null;
    const locEl = art.querySelector('.item-detail-char .item-location, .item-description .ellipsis, [class*="location"]');
    const location = locEl?.textContent?.trim() || null;
    const titleEl = art.querySelector('.item-link, a.item-link, .ellipsis');
    const title = titleEl?.textContent?.trim() || null;

    results.push({ url, thumb, price: priceText, location, title });
  }

  // Strategy 2: fallback — scan all links matching listing pattern
  if (results.length === 0) {
    const pattern = new RegExp(listingPattern);
    for (const a of document.querySelectorAll('a[href]')) {
      const href = a.href;
      if (!pattern.test(href) || seen.has(href)) continue;
      seen.add(href);
      results.push({ url: href, thumb: null, price: null, location: null, title: a.textContent?.trim() || null });
    }
  }

  return results;
}"""


def _is_favorites_url(url: str) -> bool:
    """Return True when the *path* of ``url`` points at a favorites page.

    Only the path is inspected so that a login redirect which merely carries
    the favorites URL in its query string is not mistaken for the page itself.
    """
    path = urlsplit(url).path.lower()
    return any(marker in path for marker in _FAVORITES_PATH_MARKERS)


async def _wait_for_login(page, domain_key: str) -> None:
    """Give the user time to log in: prompt on a TTY, otherwise poll the URL."""
    print(f"🔐 [{domain_key}] Redirected to: {page.url}")
    print(f"   [{domain_key}] Please log in in the browser window.")
    if sys.stdin.isatty():
        # Deliberately blocking: the event loop pauses until Enter is pressed,
        # while the browser keeps running in its own process.
        input(f"   [{domain_key}] Press Enter once you're on the favorites page...")
        return

    # Non-interactive: poll for the favorites page (30 checks, 5s apart).
    for _ in range(30):
        await page.wait_for_timeout(5000)
        current_url = page.url
        if _is_favorites_url(current_url):
            return
        print(f"   [{domain_key}] Still waiting... (current: {current_url[:60]})")


async def _wait_for_network_idle(page, timeout_ms: int) -> None:
    """Wait for network idle; on timeout/error fall back to a short fixed pause."""
    try:
        await page.wait_for_load_state("networkidle", timeout=timeout_ms)
    except PlaywrightError:
        await page.wait_for_timeout(3000)


async def _scroll_until_stable(page) -> None:
    """Scroll to the bottom repeatedly until the card count stops growing."""
    prev_count = 0
    for attempt in range(20):
        try:
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        except PlaywrightError:
            break
        await page.wait_for_timeout(1500)
        try:
            cards = await page.query_selector_all(_CARD_SELECTOR)
        except PlaywrightError:
            cards = []
        # Always allow a few scrolls before trusting an unchanged count.
        if len(cards) == prev_count and attempt > 2:
            break
        prev_count = len(cards)


async def _collect_all_pages(page, listing_pattern: str) -> list[dict]:
    """Extract cards from the current page and every following result page.

    Extraction happens *before* each click on "next" so that earlier pages are
    not lost; items are de-duplicated by URL across pages.
    """
    collected = []
    seen_urls = set()
    for _ in range(_MAX_FAVORITE_PAGES):
        await _scroll_until_stable(page)
        page_items = await page.evaluate(_EXTRACT_CARDS_JS, listing_pattern)

        new_on_page = 0
        for item in page_items:
            url = item.get("url")
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)
            collected.append(item)
            new_on_page += 1

        # A page that contributes nothing new means we are looping or done.
        if new_on_page == 0:
            break

        next_btn = await page.query_selector(_NEXT_PAGE_SELECTOR)
        if next_btn is None:
            break
        try:
            await next_btn.click()
        except PlaywrightError:
            break
        await _wait_for_network_idle(page, 30000)
        await page.wait_for_timeout(2000)
    return collected


async def fetch_favorites(domain_key: str, headless: bool, wait_seconds: int) -> list[dict]:
    """Open the favorites page for one domain, handle login, extract all cards.

    Args:
        domain_key: Key into ``DOMAINS`` ("pt", "es" or "it").
        headless: Whether to launch Chromium without a visible window.
        wait_seconds: Fixed pause after the first navigation; skipped when <= 0.

    Returns:
        One dict per favorite with the keys ``url`` (absolute), ``thumb``,
        ``price`` (digits only), ``location``, ``title`` and ``domain``.
        The card-derived fields may be ``None``.

    Raises:
        KeyError: If ``domain_key`` is not in ``DOMAINS``.
        playwright.async_api.Error: If navigation or extraction fails; the
            browser is closed before the error propagates.
    """
    cfg = DOMAINS[domain_key]
    auth_file = cfg["auth"]
    base_url = cfg["base"]

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            if os.path.exists(auth_file):
                context = await browser.new_context(storage_state=auth_file)
                print(f"✅ [{domain_key}] Using saved auth from {auth_file}")
            else:
                context = await browser.new_context()
                print(f"⚠️  [{domain_key}] No auth found — you'll need to log in manually.")

            page = await context.new_page()
            print(f"🌐 [{domain_key}] Opening {cfg['favorites']}...")
            await page.goto(cfg["favorites"], wait_until="domcontentloaded", timeout=60000)

            if wait_seconds > 0:
                print(f"⏳ [{domain_key}] Waiting {wait_seconds}s for page to settle...")
                await page.wait_for_timeout(wait_seconds * 1000)

            # Wait for any redirects to finish (best effort; a timeout is fine).
            try:
                await page.wait_for_load_state("load", timeout=10000)
            except PlaywrightError:
                pass

            # Anything other than the favorites page is treated as a login redirect.
            if not _is_favorites_url(page.url):
                await _wait_for_login(page, domain_key)

                # Save auth for next time; failure here must not abort the scrape.
                try:
                    await context.storage_state(path=auth_file)
                    print(f"💾 [{domain_key}] Auth saved to {auth_file}")
                except (PlaywrightError, OSError) as exc:
                    print(f"⚠️  [{domain_key}] Could not save auth to {auth_file}: {exc}")

            # Navigate to favorites explicitly if not already there
            if not _is_favorites_url(page.url):
                print(f"🔄 [{domain_key}] Navigating to favorites page...")
                await page.goto(cfg["favorites"], wait_until="domcontentloaded", timeout=30000)

            await _wait_for_network_idle(page, 15000)
            items = await _collect_all_pages(page, cfg["listing_pattern"])
        finally:
            # Close the browser even when navigation or extraction raised.
            await browser.close()

    # anchor.href is normally absolute already; urljoin also copes with
    # relative hrefs should one slip through.
    normalized = [
        {**item, "url": urljoin(base_url, item["url"]), "domain": domain_key}
        for item in items
    ]

    print(f"✅ [{domain_key}] Found {len(normalized)} favorites")
    return normalized


def write_reference_csv(items: list[dict], path: Path):
    """Dump the scraped favorites to a human-readable sidecar CSV.

    The file at ``path`` is overwritten. Each item contributes one row; item
    keys that are missing or ``None`` end up as empty cells.
    """
    # CSV column header -> key of the scraped item dict, in output order.
    column_to_key = {
        "Property URL": "url",
        "Price": "price",
        "Thumbnail": "thumb",
        "Location": "location",
        "Title": "title",
        "Domain": "domain",
    }
    with path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=list(column_to_key))
        out.writeheader()
        out.writerows(
            {column: entry.get(key) for column, key in column_to_key.items()}
            for entry in items
        )
    print(f"✅ Saved {len(items)} URLs to {path}")


def append_to_extracted(items: list[dict]):
    """Append new URLs to extracted_property_urls.csv (deduped, both URL columns).

    An item is skipped when its URL is empty, already present in the "URL" or
    "Property URL" column of the existing file, or repeated earlier in
    ``items``. When at least one row is new, the whole file is rewritten with
    the pipeline's base columns first, followed by any additional columns the
    existing file already had. Nothing is written when there is nothing new.
    """
    existing_rows = []
    existing_columns = []
    known_urls = set()

    if EXTRACTED_CSV.exists():
        with EXTRACTED_CSV.open(newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            existing_rows = list(reader)
            # Take columns from the header, not from the first row, so custom
            # columns survive even when the file has a header but no rows.
            existing_columns = list(reader.fieldnames or [])
        for row in existing_rows:
            for column in ("URL", "Property URL"):
                value = row.get(column)
                if value:
                    known_urls.add(value)

    new_rows = []
    for item in items:
        url = item.get("url") or ""
        if not url or url in known_urls:
            continue
        # Remember the URL so duplicates inside this batch are added only once.
        known_urls.add(url)
        new_rows.append({
            "Property URL": url,
            "URL": url,  # Both columns for pipeline compatibility
            "Locatie": item.get("location", ""),
            "Prijs": item.get("price", ""),
            "Thumbnail": item.get("thumb", ""),
        })

    if not new_rows:
        print("ℹ️  No new URLs to add (already present).")
        return

    print(f"➕ Adding {len(new_rows)} new URLs to {EXTRACTED_CSV}")
    base_fieldnames = ["Property URL", "Locatie", "Prijs", "Thumbnail", "Latitude", "Longitude", "MapsLink", "URL", "Breadcrumb", "Status_404"]
    # dict.fromkeys de-duplicates while keeping first-seen order.
    fieldnames = list(dict.fromkeys(base_fieldnames + existing_columns))
    with EXTRACTED_CSV.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(existing_rows + new_rows)


async def main():
    """Command-line entry point: scrape the selected domains and persist results.

    Parses the CLI flags, fetches favorites domain by domain (a failing domain
    is reported and skipped), writes the sidecar CSV, appends new URLs to the
    pipeline CSV and prints a per-domain summary. Returns early, writing
    nothing, when no favorites were found at all.
    """
    cli = argparse.ArgumentParser(description="Import Idealista favorites into Paradisomatch")
    cli.add_argument(
        "--domain",
        choices=["pt", "es", "it", "all"],
        default="all",
        help="Which Idealista domain to scrape (default: all)",
    )
    cli.add_argument("--headless", action="store_true", help="Run browser headless")
    cli.add_argument("--output", default="idealista_favorites.csv", help="Sidecar CSV path")
    cli.add_argument(
        "--wait-seconds",
        type=int,
        default=10,
        help="Wait time before first extraction attempt (default 10s)",
    )
    opts = cli.parse_args()

    if opts.domain == "all":
        selected = ["pt", "es", "it"]
    else:
        selected = [opts.domain]

    favorites = []
    for key in selected:
        # One broken domain must not prevent the remaining ones from running.
        try:
            favorites.extend(
                await fetch_favorites(key, headless=opts.headless, wait_seconds=opts.wait_seconds)
            )
        except Exception as err:
            print(f"❌ [{key}] Failed: {err}")

    if not favorites:
        print("❌ No favorites found on any domain.")
        return

    write_reference_csv(favorites, Path(opts.output))
    append_to_extracted(favorites)

    # Per-domain totals, largest first.
    from collections import Counter

    per_domain = Counter(entry["domain"] for entry in favorites)
    print(f"\n📊 Summary: {len(favorites)} total favorites")
    for name, count in per_domain.most_common():
        print(f"   idealista.{name}: {count}")
    print("\n🎯 Done. Run analyze_from_urls.py next, then parse_criteria.py and quality_gate.py")

# Script entry point: only when executed directly (not on import), run the
# async main() to completion via asyncio.run().
if __name__ == "__main__":
    asyncio.run(main())
