#!/usr/bin/env python3
"""
Fetch Idealista favorites using patchright (undetected browser).

Opens a browser window, loads each favorites list page, extracts property data.
You may need to solve ONE CAPTCHA manually, then the rest proceeds automatically.

Usage:
    python3 fetch_idealista_favorites.py
    python3 fetch_idealista_favorites.py --dry-run
    python3 fetch_idealista_favorites.py --pages 1,2,3
"""
import argparse
import asyncio
import json
from datetime import datetime
from pathlib import Path

from store import load, persist, upsert

# Directory containing this script; the browser profile and auth file live beside it.
SCRIPT_DIR = Path(__file__).parent
# User-data directory passed to the persistent browser context.
PROFILE_DIR = SCRIPT_DIR.joinpath('.idealista_profile_favorites')
# Destination for the storage state saved after the first usable page.
AUTH_FILE = SCRIPT_DIR.joinpath('idealista_auth_it.json')

# URL template for one favorites list; the placeholder is the list number.
FAVORITES_BASE = 'https://www.idealista.it/en/utente/preferiti/lista-{}'
# Lists fetched when --pages is not given: lista-1 through lista-6.
DEFAULT_PAGES = [1, 2, 3, 4, 5, 6]

# URL fragment that identifies a property listing link.
LISTING_PATTERN = '/immobile/'


async def wait_for_usable_page(page, timeout_s=120):
    """Poll *page* until it looks usable, or give up.

    Each poll inspects the first 3000 characters of the document HTML and
    then counts listing-like elements in the DOM.

    Args:
        page: Browser page object exposing async ``evaluate`` and
            ``wait_for_timeout`` methods and a ``url`` attribute.
        timeout_s: Rough time budget in seconds. It is converted to a number
            of polls (one per ~3 seconds); at least one poll is always made.

    Returns:
        True when listing-like elements are present, or when the page has
        stayed free of CAPTCHA/login/block markers for a few polls.
        False when an IP block is detected or the poll budget runs out.
    """
    poll_ms = 3000
    # Always look at the page at least once, even for a tiny timeout.
    max_attempts = max(1, timeout_s // 3)
    captcha_announced = False
    login_announced = False

    for attempt in range(max_attempts):
        try:
            html = await page.evaluate('() => document.documentElement.outerHTML.substring(0, 3000)')
        except Exception:
            # The page may be mid-navigation; try again on the next poll.
            await page.wait_for_timeout(poll_ms)
            continue

        html_lower = html.lower()

        # IP block: nothing the user can do in this window, so bail out.
        if 'uso improprio' in html_lower or 'bloccato' in html_lower:
            print("    IP blocked. Try a different VPN server.")
            return False

        # CAPTCHA: keep polling while the user solves it in the browser.
        if 'captcha-delivery.com' in html or 'datadome' in html_lower:
            if not captcha_announced:
                print("    CAPTCHA detected, please solve it in the browser...")
                captcha_announced = True
            if attempt % 10 == 9:
                print("    Still waiting for CAPTCHA...")
            await page.wait_for_timeout(poll_ms)
            continue

        # Actual content (favorites list cards or listing links).
        try:
            has_content = await page.evaluate(
                '() => document.querySelectorAll("article, .item, .favorite-item, .item-multimedia-container, a[href*=\\"/immobile/\\"]").length'
            )
        except Exception:
            # A navigation between the two probes (e.g. the reload after a
            # solved CAPTCHA) invalidates the evaluation; poll again.
            await page.wait_for_timeout(poll_ms)
            continue
        if has_content > 0:
            return True

        # Might be the login page: wait for the user to sign in.
        current_url = page.url.lower()
        if 'login' in current_url or 'accedi' in current_url:
            if not login_announced:
                print("    Login required, please log in...")
                login_announced = True
            await page.wait_for_timeout(poll_ms)
            continue

        # Page loaded but no content yet; after a few polls, try it anyway.
        if attempt > 3:
            return True
        await page.wait_for_timeout(2000)

    print("    Timeout waiting for usable page")
    return False


async def extract_properties(page):
    """Scrape the listings shown on the current favorites page.

    Runs a single JavaScript snippet in the page. The snippet visits every
    link whose href contains ``/immobile/``, de-duplicates by URL (query
    string and trailing slash removed), finds the enclosing card element and
    reads price, location, title, size, room count and thumbnail from it.

    Returns whatever ``page.evaluate`` yields for that snippet: a list with
    one object per unique listing, holding the keys ``url``, ``price``,
    ``location``, ``title``, ``sqm``, ``rooms`` and ``thumb``. Values the
    snippet could not find are left as null.
    """
    extraction_script = """() => {
        const results = [];
        const seen = new Set();

        // Find all property links
        const allLinks = document.querySelectorAll('a[href*="/immobile/"]');
        for (const a of allLinks) {
            const url = a.href.split('?')[0].replace(/\\/$/, '');
            if (seen.has(url)) continue;
            seen.add(url);

            // Walk up to find the card container
            let card = a.closest('article') || a.closest('.item') || a.closest('.favorite-item') || a.parentElement?.parentElement;

            let price = null, location = null, title = null, sqm = null, rooms = null, thumb = null;

            if (card) {
                // Price
                const priceEl = card.querySelector('.item-price, .price-row, [class*="price"] h3, [class*="price"]');
                if (priceEl) {
                    const digits = priceEl.textContent.replace(/[^0-9]/g, '');
                    if (digits) price = parseInt(digits);
                }

                // Location
                const locEl = card.querySelector('.item-detail-char .item-detail, .item-description, [class*="location"]');
                if (locEl) location = locEl.textContent.trim().substring(0, 100);

                // Title
                const titleEl = card.querySelector('.item-link, a.item-link, .ellipsis');
                if (titleEl) title = titleEl.textContent.trim();

                // Image
                const img = card.querySelector('img[src*="http"], picture img');
                if (img) thumb = img.src;

                // Size and rooms from detail spans
                const details = card.querySelectorAll('.item-detail, span');
                for (const d of details) {
                    const t = d.textContent.trim();
                    const sqmMatch = t.match(/(\\d[\\d.,]*)\\s*m[²2]/);
                    if (sqmMatch && !sqm) sqm = parseInt(sqmMatch[1].replace(/[.,]/g, ''));
                    const roomMatch = t.match(/(\\d+)\\s*(room|local|vano|stanz)/i);
                    if (roomMatch && !rooms) rooms = parseInt(roomMatch[1]);
                }
            }

            results.push({ url, price, location, title, sqm, rooms, thumb });
        }
        return results;
    }"""
    return await page.evaluate(extraction_script)


# Default budget ceiling in EUR; new listings priced above it are not imported.
DEFAULT_MAX_PRICE = 450_000

# Substrings that mark an IP-block page (the same pair wait_for_usable_page checks).
_IP_BLOCK_MARKERS = ('uso improprio', 'bloccato')

# (scraped item key, store field name) pairs copied onto a new record when truthy.
_ITEM_TO_STORE_FIELDS = (
    ('price', 'price'),
    ('location', 'city'),
    ('title', 'title'),
    ('sqm', 'building_size'),
    ('rooms', 'rooms'),
    ('thumb', 'thumbnail'),
)


def _parse_pages(spec, parser):
    """Turn a ``--pages`` value such as ``"1,2,3"`` into a list of ints.

    Falls back to DEFAULT_PAGES when *spec* is empty and reports malformed
    input through ``parser.error`` (usage message + exit) instead of a
    traceback.
    """
    if not spec:
        return list(DEFAULT_PAGES)
    try:
        pages = [int(part) for part in spec.split(',') if part.strip()]
    except ValueError:
        parser.error(f"--pages expects comma-separated integers, got {spec!r}")
    if not pages:
        parser.error(f"--pages expects at least one page number, got {spec!r}")
    return pages


def _new_property_fields(item, discovered_at):
    """Build the store record for a newly discovered listing."""
    fields = {
        'source': 'idealista',
        'discovered_at': discovered_at,
        'country': 'IT',
    }
    for item_key, store_key in _ITEM_TO_STORE_FIELDS:
        value = item.get(item_key)
        if value:
            fields[store_key] = value
    return fields


async def main():
    """Fetch the favorites lists and merge new listings into the store.

    Command-line options:
        --dry-run    Print what would be added without writing the store.
        --pages      Comma-separated list numbers (default: DEFAULT_PAGES).
        --max-price  Skip new listings priced above this many EUR.

    For each list page the browser navigates to the page, waits until it is
    usable (the user may have to solve a CAPTCHA or log in), scrolls to
    trigger lazy loading and extracts the listings. Unknown listings are
    upserted into the store; known ones only get a thumbnail backfilled when
    they have neither ``thumbnail`` nor ``photo_urls``. The store is
    persisted once at the end, and only if something changed.
    """
    parser = argparse.ArgumentParser(description='Fetch Idealista favorites via patchright')
    parser.add_argument('--dry-run', action='store_true', help='Show results without saving')
    parser.add_argument('--pages', type=str, default=None,
                        help='Comma-separated page numbers (default: 1-6)')
    parser.add_argument('--max-price', type=int, default=DEFAULT_MAX_PRICE,
                        help=f'Skip new listings priced above this many EUR '
                             f'(default: {DEFAULT_MAX_PRICE})')
    args = parser.parse_args()

    pages = _parse_pages(args.pages, parser)

    # Imported after argument parsing so --help exits before patchright is loaded.
    from patchright.async_api import async_playwright as patchright_playwright

    store = load()
    existing = set(store.keys())
    all_new = []
    thumbs_updated = 0
    now = datetime.now().isoformat()

    PROFILE_DIR.mkdir(exist_ok=True)

    print(f"Fetching Idealista favorites (lists {', '.join(str(p) for p in pages)})")
    print(f"Store: {len(store)} existing properties")
    print("A browser window will open. Solve CAPTCHA if prompted.\n")

    async with patchright_playwright() as pw:
        context = await pw.chromium.launch_persistent_context(
            user_data_dir=str(PROFILE_DIR),
            channel="chrome",
            headless=False,
            no_viewport=True,
        )
        try:
            page = context.pages[0] if context.pages else await context.new_page()

            auth_saved = False

            for page_num in pages:
                url = FAVORITES_BASE.format(page_num)
                print(f"  Page {page_num}: {url}")

                try:
                    await page.goto(url, wait_until='domcontentloaded', timeout=30000)
                    await page.wait_for_timeout(3000)

                    ok = await wait_for_usable_page(page)
                    if not ok:
                        print(f"    Skipping page {page_num}")
                        # If IP blocked, no point trying more pages. Probe the
                        # same HTML prefix and markers the waiter uses so both
                        # agree on what counts as a block.
                        probe = await page.evaluate(
                            '() => document.documentElement.outerHTML.substring(0, 3000)'
                        )
                        probe = probe.lower()
                        if any(marker in probe for marker in _IP_BLOCK_MARKERS):
                            print("    IP blocked, stopping.")
                            break
                        continue

                    if not auth_saved:
                        auth_saved = True
                        # Save cookies after the first successful page. This is
                        # best-effort: report a failure but keep scraping.
                        try:
                            await context.storage_state(path=str(AUTH_FILE))
                        except Exception as e:
                            print(f"    Warning: could not save session state: {str(e)[:80]}")

                    # Scroll to load lazy content
                    for _ in range(5):
                        await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
                        await page.wait_for_timeout(1000)

                    items = await extract_properties(page)
                    batch_new = 0

                    for item in items:
                        prop_url = item.get('url', '')
                        if not prop_url:
                            continue
                        if prop_url in existing:
                            cur = store.get(prop_url)
                            if cur is None:
                                # Already seen during this run but not in the
                                # store (e.g. a dry run): nothing to backfill.
                                continue
                            # Backfill thumbnail on an existing property that lacks one.
                            if item.get('thumb') and not (cur.get('thumbnail') or cur.get('photo_urls')):
                                if not args.dry_run:
                                    store[prop_url]['thumbnail'] = item['thumb']
                                thumbs_updated += 1
                            continue
                        price = item.get('price')
                        if price and price > args.max_price:
                            continue

                        batch_new += 1
                        existing.add(prop_url)
                        all_new.append(item)

                        if not args.dry_run:
                            upsert(store, prop_url, _new_property_fields(item, now))

                    print(f"    Found {len(items)} listings, {batch_new} new")

                except Exception as e:
                    print(f"    Error: {str(e)[:80]}")

                await page.wait_for_timeout(2000)
        finally:
            # Close the browser context even if something unexpected escaped the loop.
            await context.close()

    print(f"\n{'=' * 50}")
    print(f"  Total new: {len(all_new)} | thumbnails backfilled: {thumbs_updated}")

    if args.dry_run:
        for item in all_new[:20]:
            price_str = f"EUR {item['price']:,}" if item.get('price') else '?'
            # 'location' is always present but may be None, so .get's default never applies.
            loc = (item.get('location') or '')[:30]
            print(f"    {loc:30s} {price_str:>12s}  {item['url'][:55]}")
    elif all_new or thumbs_updated:
        persist(store)
        print(f"  Saved to store ({len(store)} total)")
    else:
        print("  No new properties found")


# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())
