#!/usr/bin/env python3
"""
Store-native availability checker. Checks active property URLs for 404/sold/removed.

Usage:
    python3 check_availability_store.py              # Check all active
    python3 check_availability_store.py --limit 5    # Test on 5 properties
    python3 check_availability_store.py --dry-run    # Show what would be checked
    python3 check_availability_store.py --force      # Re-check recently checked
"""
import argparse
import re
import time
from datetime import datetime, timedelta
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup

from store import load, persist, upsert, is_active, short_url, CHROME_UA, STATUS_REMOVED

# Request headers passed to requests.get() by check_url; the User-Agent value
# is the CHROME_UA constant imported from the store module.
HEADERS = {
    'User-Agent': CHROME_UA,
}

# STRONG indicators
# -----------------
# Full phrases that are specific enough to be trusted on their own: one hit
# anywhere in the page text is treated as proof that the listing is gone.
STRONG_UNAVAILABLE_INDICATORS = [
    'no longer available',
    'niet meer beschikbaar',
    'listing removed',
    'ce bien a été vendu',
    "cette annonce n'est plus disponible",
    'questo immobile non è più disponibile',
    'this property has been sold',
    'this listing is no longer',
    'annonce supprimée',
    'annonce expirée',
]

# WEAK indicators
# ---------------
# Short, generic words that a perfectly live page can also contain in its
# navigation, footer or filter chrome (a "Sold properties" link, a "not
# available in your region" notice, "verkocht" in a "recently sold" sidebar).
# They only count when at least two distinct ones are present, or when one
# shows up inside a known status container (see STATUS_SELECTORS).
WEAK_UNAVAILABLE_INDICATORS = [
    'not available',
    'niet beschikbaar',
    'verwijderd',
    'sold',
    'verkocht',
    'under offer',
    'in optie',
]

# STATUS selectors
# ----------------
# CSS selectors for elements whose text describes the status of the listing
# itself, so a WEAK indicator found inside one of them is conclusive.
# The list is deliberately narrow: wide globs such as [class*="status"] also
# match sidebar filters and "Recently sold properties" widgets, which leads
# to false removals.
STATUS_SELECTORS = [
    '.listing-status',
    '.property-status',
    '.status-badge',
    '.price-status',
    '.sold-banner',
    '.listing-unavailable',
    '.property-sold',
    '.unavailable-banner',
]


def _normalize_text(raw):
    """Lower-case *raw*, collapse whitespace runs and unify apostrophes."""
    # str.split() with no argument also splits on newlines and non-breaking
    # spaces, so phrases broken across lines/tags still match afterwards.
    return ' '.join(raw.split()).lower().replace('\u2019', "'")


def _contains_phrase(text, phrase):
    """Return True if *phrase* occurs in *text* delimited by non-word characters."""
    # Plain substring matching makes 'sold' fire on "soldi", "soldier",
    # "unsold", ...; the lookarounds require a whole-word/phrase match.
    return re.search(rf'(?<!\w){re.escape(phrase)}(?!\w)', text) is not None


def _is_removal_redirect(url, final):
    """Return True if a redirect from *url* to *final* looks like a removed listing."""
    src, dst = urlsplit(url), urlsplit(final)
    # Only the scheme/host changed (http -> https, added "www", ...): this is
    # still the same resource, not a bounce to the homepage.
    if (src.path.rstrip('/'), src.query) == (dst.path.rstrip('/'), dst.query):
        return False
    if re.search(r'/(search|risultati|zoeken|chercher)\b', final):
        return True
    # Homepage or a single top-level path segment. Counting slashes in the
    # path only keeps slashes inside the query string from hiding this.
    return dst.path.count('/') <= 1


def check_url(url, timeout=10):
    """Check if a property URL is still live.

    The checks run in this order; the first conclusive one wins:

    1. HTTP status: 404/410 mean removed; 5xx, 429 and 403 are treated as
       "cannot tell" and reported as available.
    2. Bot-protection interstitial markers in the first 2000 characters:
       reported as available.
    3. Redirect to a search page or to the homepage/a top-level page: removed.
    4. Page text: any STRONG indicator, a WEAK indicator inside a status
       container, or at least two distinct WEAK indicators: removed.

    Args:
        url: Listing URL to fetch.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A ``(available, reason)`` tuple: ``available`` is a bool and ``reason``
        a short human-readable explanation. Timeouts and other ``requests``
        errors are reported as available so that a transient failure never
        marks a listing as removed.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=timeout, allow_redirects=True)
        code = resp.status_code

        if code == 404:
            return False, '404 Not Found'
        if code == 410:
            return False, '410 Gone'
        if code >= 500:
            return True, f'Server error {code}, assuming available'
        if code == 429:
            # A rate-limit response is not the listing page; do not scan it.
            return True, '429, assuming rate-limited'

        # Cloudflare/bot protection — can't determine status, skip
        text_raw = resp.text[:2000].lower()
        challenge_markers = (
            'just a moment', 'checking your browser',
            'cf-browser-verification', '_cf_chl',
        )
        if any(marker in text_raw for marker in challenge_markers):
            return True, 'Cloudflare protected, skipping'
        if code == 403:
            return True, '403, assuming bot-blocked'

        # Redirected to homepage/search = removed
        if resp.url.rstrip('/') != url.rstrip('/'):
            final = resp.url
            if _is_removal_redirect(url, final):
                return False, f'Redirected to {final[:80]}'

        # Check page text for sold/removed indicators (tightened: scope + multi-match)
        soup = BeautifulSoup(resp.content, 'html.parser')
        # A separator keeps words from adjacent elements apart so that
        # whole-word matching of the WEAK indicators works.
        text = _normalize_text(soup.get_text(' '))

        # 1. STRONG single-match — specific phrases that mean what they say
        for indicator in STRONG_UNAVAILABLE_INDICATORS:
            if indicator in text:
                return False, f'Strong: "{indicator}"'

        # 2. WEAK match inside a known status container — also conclusive
        for sel in STATUS_SELECTORS:
            for el in soup.select(sel):
                el_text = _normalize_text(el.get_text(' ', strip=True))
                for indicator in WEAK_UNAVAILABLE_INDICATORS:
                    if _contains_phrase(el_text, indicator):
                        return False, f'Status[{sel}]: "{indicator}"'

        # 3. WEAK match in full text — require ≥2 distinct indicators
        weak_hits = [i for i in WEAK_UNAVAILABLE_INDICATORS if _contains_phrase(text, i)]
        if len(weak_hits) >= 2:
            return False, f'Multi-weak: {weak_hits[:3]}'

        return True, 'Active'

    except requests.exceptions.Timeout:
        return True, 'Timeout, assuming available'
    except requests.exceptions.RequestException as e:
        return True, f'Connection error: {str(e)[:60]}'


def main():
    """Command-line entry point: check active properties and record the result.

    Loads the store, selects the active properties (skipping those whose
    ``availability_last_checked`` is newer than 24 hours unless ``--force`` is
    given), optionally truncates the selection with ``--limit``, and calls
    ``check_url`` for each one. Every outcome is written back with ``upsert``;
    properties judged unavailable additionally get ``STATUS_REMOVED``.

    With ``--dry-run`` only the plan (first 10 URLs) is printed and nothing is
    fetched or written.

    The store is persisted when the loop finishes and also when it is
    interrupted (Ctrl-C, unexpected error), so completed checks are not lost.
    """
    parser = argparse.ArgumentParser(description='Check property availability (store-native)')
    parser.add_argument('--limit', type=int, help='Max properties to check')
    parser.add_argument('--dry-run', action='store_true', help='Show plan without checking')
    parser.add_argument('--force', action='store_true', help='Re-check recently checked')
    args = parser.parse_args()

    store = load()
    now = datetime.now()
    cutoff = (now - timedelta(hours=24)).isoformat()
    # Single run-start timestamp stamped on every record written by this run.
    ts = now.isoformat()

    # Select active properties, skip recently checked unless --force
    candidates = []
    for url, prop in store.items():
        if not is_active(prop):
            continue
        if not args.force:
            # `or ''` also covers a stored None, which would otherwise raise
            # TypeError when compared with the cutoff string.
            last = prop.get('availability_last_checked') or ''
            # ISO-8601 strings are compared lexicographically.
            if last > cutoff:
                continue
        candidates.append(url)

    if args.limit:
        candidates = candidates[:args.limit]

    print(f'Checking {len(candidates)} properties (of {sum(1 for p in store.values() if is_active(p))} active)')

    if args.dry_run:
        for url in candidates[:10]:
            print(f'  Would check: {short_url(url)}')
        if len(candidates) > 10:
            print(f'  ... and {len(candidates) - 10} more')
        return

    removed = 0
    total = len(candidates)
    try:
        for i, url in enumerate(candidates, 1):
            available, reason = check_url(url)

            # check_url only reports (available, reason), so the stored status
            # code is a synthetic 200/404 rather than the real HTTP status.
            update = {
                'availability_last_checked': ts,
                'availability_checked_at': ts,  # alias read by shortlist.html
                'availability_status_code': 200 if available else 404,
                'availability_reason': reason,
            }
            if not available:
                update = {
                    'status': STATUS_REMOVED,
                    'removed_at': ts,
                    'removal_reason': reason,
                    **update,
                }
            upsert(store, url, update)

            if not available:
                removed += 1
                print(f'  REMOVED: {short_url(url)} -- {reason}')

            if i % 20 == 0:
                print(f'  ... {i}/{total} checked')
            if i < total:
                time.sleep(0.5)  # Rate limit between requests
    finally:
        # Save whatever was checked, even if the run was interrupted.
        persist(store)

    active_count = sum(1 for p in store.values() if is_active(p))
    print(f'Done: {total} checked, {removed} removed, {active_count} active remaining')


# Run the checker only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
