#!/usr/bin/env python3
"""Re-verify property availability via Playwright using per-source POSITIVE signals.

The previous check_availability.py used plain requests + generic 'sold' substring
match, which gave 26 false-positives on Leggett (Cloudflare 403 → "likely removed";
footer nav-link "Some properties sold by us" → "page contains 'sold'").

This rewrite:
  1. Uses Playwright fresh-context-per-page (proven Cloudflare-defeat pattern)
  2. Checks per-source POSITIVE markers:
       - leggett: status 200 + title contains a for-sale listing phrase +
                  JSON-LD RealEstateListing present + no sold/under-offer badge
       - greenacres: status 200 + lb1.green-acres.com photo URLs present +
                  no sold marker in the page title
       - immonot: status 200 + listing-style title (not a search-results page)
  3. Skips sources we can't reliably check (idealista, properstar, immobiliare)
  4. Records `availability_uncertain: true` when signals are ambiguous, rather
     than auto-removing (no more false-positive removals)

Usage:
    python3 recheck_availability.py                 # all checkable sources
    python3 recheck_availability.py --source leggett
    python3 recheck_availability.py --top 30        # only top N by cp_score
"""
from __future__ import annotations

import argparse
import json
import random
import re
import sys
import time
from datetime import datetime
from pathlib import Path

from playwright.sync_api import sync_playwright

# Absolute path of the directory that contains this script.
SCRIPT_DIR = Path(__file__).resolve().parent
# Make the script directory importable BEFORE the `store` import below runs
# (presumably `store.py` lives alongside this file — confirm).
if str(SCRIPT_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPT_DIR))

from store import CHROME_UA, is_active, load, persist  # noqa: E402

# `source` values this script is willing to re-check: main() ignores properties
# from any other source, and --source only accepts these values.
CHECKABLE_SOURCES = {'leggett', 'greenacres', 'immonot', 'leggett_email'}


# ─── Source-specific POSITIVE signal checks ───

def _leggett_signals(html: str, title: str) -> tuple[bool, str]:
    """Leggett (frenchestateagents.com): listing live = title pattern + JSON-LD."""
    title_lower = title.lower()
    if not any(p in title_lower for p in ['for sale in', 'house for sale', 'property for sale']):
        # The site reverts to a search-results / homepage title when a listing is gone
        return False, f'title not a listing: {title[:60]!r}'
    has_listing_ld = bool(re.search(r'"@type"\s*:\s*"RealEstateListing"', html))
    if not has_listing_ld:
        return False, 'no RealEstateListing JSON-LD on page'
    # Explicit "sold" / "under offer" patterns Leggett uses on the LISTING (not nav)
    if re.search(r'<[^>]*class="[^"]*(?:sold|under-offer)[^"]*"[^>]*>', html, re.I):
        return False, 'sold/under-offer status badge'
    return True, 'listing live'


def _greenacres_signals(html: str, title: str) -> tuple[bool, str]:
    """Green-Acres: lb1.green-acres.com photo URLs = listing live."""
    has_photos = bool(re.search(r'https://lb1\.green-acres\.com/[^"\' ]+\.(?:jpg|jpeg|webp|png)', html, re.I))
    if not has_photos:
        # Some listings show the "no-photo" placeholder when gone
        return False, 'no listing photos on lb1.green-acres.com'
    # Green-Acres uses "Sold" or French equivalent badges on removed listings
    if re.search(r'\b(sold|vendu|sous compromis)\b', title, re.I):
        return False, f'sold marker in title: {title[:60]!r}'
    return True, 'listing live'


def _immonot_signals(html: str, title: str) -> tuple[bool, str]:
    """Immonot: title format 'achat-maison-...' / 'annonce' markers."""
    title_lower = title.lower()
    if 'immonot' not in title_lower and 'annonce' not in title_lower and 'maison' not in title_lower:
        return False, f'title not a listing: {title[:60]!r}'
    # Immonot redirects to search results when a listing is gone
    if 'recherche' in title_lower or 'résultats' in title_lower:
        return False, 'redirected to search results'
    return True, 'listing live'


# Dispatch table: property `source` value -> signal checker taking
# (html, title) and returning (is_live, reason).
SOURCE_CHECKERS = dict(
    leggett=_leggett_signals,
    leggett_email=_leggett_signals,  # underlying URL is frenchestateagents.com
    greenacres=_greenacres_signals,
    immonot=_immonot_signals,
)


# Lower-cased title fragments of Cloudflare interstitials: the pending JS
# challenge ("Just a moment...") and the block pages ("Attention Required!",
# "Access denied"). Seeing one after a 403 means we never reached the site.
_CLOUDFLARE_TITLE_MARKERS = ('just a moment', 'attention required', 'access denied')


def _availability_result(available, status, reason: str) -> dict:
    """Build a check_one result dict, stamped with the current local time."""
    return {'available': available, 'status': status,
            'reason': reason,
            'checked_at': datetime.now().isoformat()}


def check_one(context, url: str, source: str) -> dict:
    """Load *url* in a new page of *context* and judge whether the listing is live.

    Returns a dict with keys:
      * ``available`` — ``True`` (live), ``False`` (removed) or ``None``
        (could not tell: error or ambiguous response),
      * ``status`` — the HTTP status of the navigation, ``'error'`` when a
        Playwright call failed, or ``None`` when no response was returned,
      * ``reason`` — short human-readable explanation,
      * ``checked_at`` — ISO-8601 local timestamp.

    Only 404/410 or a failed source-specific signal check yield
    ``available=False``; every other non-success response is reported as
    uncertain so the caller never removes a property on ambiguous evidence.
    The page is always closed before returning.
    """
    page = context.new_page()
    try:
        try:
            response = page.goto(url, wait_until='domcontentloaded', timeout=45000)
        except Exception as e:  # noqa: BLE001
            return _availability_result(None, 'error',
                                        f'navigation: {type(e).__name__}')
        status = response.status if response else None

        # Verdicts that depend on the status code alone need no settle delay.
        if status in (404, 410):
            return _availability_result(False, status, f'HTTP {status}')
        if status is not None and status >= 500:
            return _availability_result(None, status,
                                        f'server error {status} (uncertain)')
        if status is not None and 400 <= status < 500 and status != 403:
            # e.g. 429 rate limiting: the body is an error page, so running the
            # listing checker on it would produce a false "removed" verdict.
            return _availability_result(None, status,
                                        f'HTTP {status} (uncertain)')

        try:
            page.wait_for_timeout(3000)  # Cloudflare settle
            if status == 403:
                # Give the Cloudflare JS challenge extra time to complete.
                page.wait_for_timeout(7000)
            html = page.content()
            title = page.title() or ''
        except Exception as e:  # noqa: BLE001
            # e.g. the page is still navigating or was closed under us; report
            # it as an error instead of aborting the caller's whole run.
            return _availability_result(None, 'error',
                                        f'page read: {type(e).__name__}')

        if status == 403:
            # 403 after Playwright JS solve = real Cloudflare lockout, not removal
            lowered_title = title.lower()
            if any(marker in lowered_title for marker in _CLOUDFLARE_TITLE_MARKERS):
                return _availability_result(
                    None, 403, 'Cloudflare challenge unresolved (uncertain)')
            status = 200  # JS solve completed

        checker = SOURCE_CHECKERS.get(source)
        if not checker:
            return _availability_result(None, status,
                                        f'no checker for source {source}')
        ok, reason = checker(html, title)
        return _availability_result(ok, status, reason)
    finally:
        page.close()


# Console marker printed for each outcome returned by _record_result.
_OUTCOME_MARKS = {
    'removed': 'REMOVED',
    'live': 'live   ',
    'errored': 'ERR    ',
    'uncertain': '?      ',
}


def _checked_since(prop: dict, cutoff: datetime) -> bool:
    """Return True when *prop* has a parsable check timestamp at or after *cutoff*."""
    stamp = prop.get('availability_checked_at')
    if not stamp:
        return False
    try:
        return datetime.fromisoformat(stamp) >= cutoff
    except (TypeError, ValueError):
        # Malformed stamp, or timezone-aware stamp vs naive cutoff: treat the
        # property as not recently checked rather than crashing the run.
        return False


def _select_candidates(store: dict, args) -> list:
    """Apply the CLI filters to *store* and return ``(url, prop)`` pairs to check."""
    candidates = [(u, p) for u, p in store.items()
                  if is_active(p) and p.get('source') in CHECKABLE_SOURCES]
    if args.source:
        candidates = [(u, p) for u, p in candidates if p.get('source') == args.source]
    if args.skip_recent_hours > 0:
        from datetime import timedelta
        cutoff = datetime.now() - timedelta(hours=args.skip_recent_hours)
        before = len(candidates)
        candidates = [(u, p) for u, p in candidates if not _checked_since(p, cutoff)]
        print(f'  skipped {before - len(candidates)} recently-checked '
              f'(within {args.skip_recent_hours}h)')
    if args.top:
        candidates.sort(key=lambda kv: -float(kv[1].get('cp_score') or 0))
        candidates = candidates[: args.top]
    if args.limit:
        candidates = candidates[: args.limit]
    return candidates


def _record_result(prop: dict, res: dict) -> str:
    """Write a check_one result onto *prop* and return the outcome key.

    The outcome is one of the keys of ``_OUTCOME_MARKS``. A definitive verdict
    (live or removed) clears any ``availability_uncertain`` flag left by an
    earlier run so the flag always reflects the latest check.
    """
    prop['availability_checked_at'] = res['checked_at']
    prop['availability_last_status'] = res['status']
    prop['availability_last_reason'] = res['reason']
    if res['available'] is False:
        prop['status'] = 'Removed'
        prop.pop('availability_uncertain', None)
        return 'removed'
    if res['available'] is True:
        # Live again — explicitly clear any stale Removed flag
        if prop.get('status') == 'Removed':
            del prop['status']
        prop.pop('availability_uncertain', None)
        return 'live'
    if res['status'] == 'error':
        # Transient failure: leave the property's flags untouched.
        return 'errored'
    prop['availability_uncertain'] = True
    return 'uncertain'


def main():
    """CLI entry point: re-check active properties and persist the results.

    Loads the store, selects candidates according to the command-line flags,
    checks each one in a fresh browser context, records the outcome on the
    property, and persists the store every 10 properties and once more at the
    end — including when the run is interrupted or fails part-way.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's lists and usage block intact in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('--source', choices=sorted(CHECKABLE_SOURCES))
    ap.add_argument('--top', type=int, help='only top N by cp_score')
    ap.add_argument('--limit', type=int, help='cap total to check')
    ap.add_argument('--skip-recent-hours', type=float, default=0,
                    help='skip properties checked within the last N hours (resume support)')
    args = ap.parse_args()

    store = load()
    candidates = _select_candidates(store, args)

    print(f'Re-checking {len(candidates)} properties (sources: '
          f'{set(p.get("source") for _, p in candidates)})')

    tally = dict.fromkeys(_OUTCOME_MARKS, 0)
    total = len(candidates)
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                for i, (url, prop) in enumerate(candidates, 1):
                    # Fresh context per page (see module docstring).
                    ctx = browser.new_context(user_agent=CHROME_UA, locale='en-GB')
                    try:
                        res = check_one(ctx, url, prop.get('source'))
                    finally:
                        ctx.close()
                    outcome = _record_result(prop, res)
                    tally[outcome] += 1
                    mark = _OUTCOME_MARKS[outcome]
                    city = (prop.get('city') or '?')[:22]
                    print(f'  [{i:>3}/{len(candidates)}] {mark} {city:22} {res["reason"][:70]}')
                    if i % 10 == 0:
                        persist(store)
                    if i < total:
                        # Randomised pause between requests; pointless after the last one.
                        time.sleep(random.uniform(5.0, 9.0))
            finally:
                browser.close()
    finally:
        # Save whatever was recorded, even if the run was interrupted or crashed.
        persist(store)
    live, removed = tally['live'], tally['removed']
    uncertain, errored = tally['uncertain'], tally['errored']
    print(f'\nLive: {live}  Removed: {removed}  Uncertain: {uncertain}  Errored: {errored}')


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
