#!/usr/bin/env python3
"""
Playwright-based availability check — handles Cloudflare.

Usage:
    python3 check_availability_pw.py              # Check all active listings in a visible browser window
    python3 check_availability_pw.py --headless   # Same check, with the browser running headless
"""
import argparse
import asyncio
import re
from datetime import datetime

from playwright.async_api import async_playwright

from store import load, persist, upsert, is_active, short_url

# Regular expressions searched (re.search) against the lower-cased body text
# of a listing page. Because the text is lower-cased before matching, every
# pattern here must be written in lower case. The phrases cover Dutch,
# English and French "gone / sold / not found" wording.
UNAVAILABLE_PATTERNS = [
    r'niet beschikbaar', r'not available', r'no longer available',
    r'niet meer beschikbaar', r'listing removed', r'verwijderd',
    r'verkocht', r'\bsold\b', r'under offer', r'in optie',
    r'deze woning is niet meer', r'this property is no longer',
    r'pagina niet gevonden', r'page not found', r'erreur 404',
]

# Regular expressions searched against the URL the browser ends up on after
# navigation. A match means the listing URL led to a search page or a site
# homepage instead of a listing, which is treated as "listing removed".
REDIRECT_PATTERNS = [
    r'/search\b', r'/zoeken\b', r'/recherche\b',
    r'properstar\.nl/$', r'properstar\.nl/nl/$',
]


async def check_one(page, url, timeout=20000):
    """Check whether a single listing URL still looks available.

    The policy is deliberately conservative: a listing is only reported as
    unavailable on positive evidence (404/410, redirect to a search or home
    page, an "unavailable" phrase in the page text, or a near-empty page).
    Anything inconclusive -- timeouts, navigation errors, an unresolved
    Cloudflare challenge, a body that cannot be read -- is reported as
    available so that a flaky check never marks a live listing as removed.

    Args:
        page: Playwright page used for the navigation (reused across calls).
        url: Listing URL to open.
        timeout: Navigation timeout in milliseconds passed to ``page.goto``.

    Returns:
        A ``(available, reason)`` tuple: ``available`` is a bool and
        ``reason`` is a short human-readable explanation.
    """
    try:
        resp = await page.goto(url, wait_until='domcontentloaded', timeout=timeout)

        # Give a Cloudflare interstitial a moment to resolve itself.
        await page.wait_for_timeout(2000)

        # Hard HTTP signals that the listing is gone.
        status = resp.status if resp else None
        if status == 404:
            return False, '404 not found'
        if status == 410:
            return False, '410 gone'

        # Resolve a possible Cloudflare challenge BEFORE reading the URL and
        # body, so every later check looks at the real page and not at the
        # "Just a moment..." interstitial.
        title = await page.title()
        if 'just a moment' in title.lower():
            await page.wait_for_timeout(5000)
            title = await page.title()
        still_challenged = 'just a moment' in title.lower()

        final_url = page.url

        # Redirected to search/homepage?
        for pat in REDIRECT_PATTERNS:
            if re.search(pat, final_url):
                return False, f'Redirected to {final_url}'

        if still_challenged:
            # We never saw the real page, so there is nothing to judge.
            return True, 'Cloudflare blocked (assuming available)'

        # Read the body text only now, after any challenge has cleared.
        # A failure here propagates to the handler below and is treated as
        # "assume available" instead of being mistaken for an empty page.
        text_lower = (await page.inner_text('body')).lower()

        for pat in UNAVAILABLE_PATTERNS:
            if re.search(pat, text_lower):
                return False, f'Page says: {pat}'

        # A real listing normally mentions a price; a tiny page without one
        # is treated as removed.
        has_price = '€' in text_lower or 'eur' in text_lower or 'price' in text_lower
        if not has_price and len(text_lower) < 200:
            return False, 'Empty or minimal page content'

        return True, 'Available'

    except Exception as e:
        # Best-effort by design: never report a listing as removed because
        # the check itself failed.
        err = str(e)[:80]
        if 'timeout' in err.lower():
            return True, 'Timeout (assuming available)'
        return True, f'Error: {err} (assuming available)'


async def run(headless=True):
    """Check every active property in the store and record the outcome.

    Loads the store, visits each active URL with a single Chromium page,
    and upserts either an "available" stamp or a "Removed" status with the
    reason. Progress is persisted every 20 URLs, and once more when the run
    ends -- including when it ends early because of an error or
    cancellation, so results gathered since the last checkpoint are kept.

    Args:
        headless: Whether to launch Chromium without a visible window.
    """
    store = load()
    # Snapshot the active entries up front; the store is updated while looping.
    active = [(u, p) for u, p in store.items() if is_active(p)]
    total = len(active)
    print(f"Checking availability: {total} active properties")
    print(f"  Headless: {headless}")
    print()

    available = 0
    removed = 0

    try:
        async with async_playwright() as pw:
            browser = await pw.chromium.launch(headless=headless)
            try:
                context = await browser.new_context(
                    user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                    viewport={'width': 1280, 'height': 720},
                )
                page = await context.new_page()

                for i, (url, _prop) in enumerate(active, start=1):
                    print(f"  [{i}/{total}] {short_url(url)}...", end=' ', flush=True)

                    ok, reason = await check_one(page, url)
                    now = datetime.now().isoformat()

                    if ok:
                        available += 1
                        upsert(store, url, {'last_checked': now, 'availability': 'available'})
                        print("OK")
                    else:
                        removed += 1
                        upsert(store, url, {
                            'status': 'Removed',
                            'last_checked': now,
                            'availability': 'unavailable',
                            'removal_reason': reason,
                            'removed_at': now,
                        })
                        print(f"REMOVED — {reason}")

                    # Save every 20
                    if i % 20 == 0:
                        persist(store)
                        print("  [saved progress]")

                    # Politeness delay between requests; pointless after the last one.
                    if i < total:
                        await page.wait_for_timeout(1500)
            finally:
                # Always release the browser, even if a check blew up.
                await browser.close()
    finally:
        # Always write back whatever was collected, so an interrupted run
        # does not lose the results since the last checkpoint.
        persist(store)

    print(f"\nDone: {available} available, {removed} removed")
    print(f"Active now: {sum(1 for p in store.values() if is_active(p))}")


def main():
    """Parse the command line and run the availability check.

    The only option is ``--headless``; when it is absent the browser is
    launched with a visible window.
    """
    cli = argparse.ArgumentParser(description='Playwright availability check')
    cli.add_argument('--headless', action='store_true', help='Run headless')
    options = cli.parse_args()

    want_headless = options.headless
    asyncio.run(run(headless=want_headless))


# Run the check only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
