#!/usr/bin/env python3
"""
thumbnail_backfill.py — give every property a thumbnail via the listing page's
og:image meta tag. Low-tech: requests + regex, no browser, no extra deps.

Works for non-DataDome sources (immonot, green-acres, leggett, notaires…).
DataDome-protected pages (idealista) return nothing here → reported, left for the
browser harvester (fetch_idealista_favorites.py). Never silent: prints per-URL outcome.

Usage:
    python3 thumbnail_backfill.py                 # all non-Removed props with a lat that have no thumbnail/photo_urls
    python3 thumbnail_backfill.py --shortlist      # only properties on the current shortlist
    python3 thumbnail_backfill.py --dry-run
"""
import argparse
import html
import json
import re
import time
from pathlib import Path
from urllib.parse import urljoin

import requests

from store import load, persist

# Directory containing this script; the shortlist JSON is read relative to it.
DIR = Path(__file__).parent
# Desktop Chrome-on-macOS User-Agent string sent with every page request.
UA = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/120.0 Safari/537.36')
# Matches a whole <meta ...> tag whose property= or name= attribute is
# og:image or twitter:image (case-insensitive). Other attributes may appear
# before or after it inside the same tag.
OG = re.compile(r'<meta[^>]+(?:property|name)=["\'](?:og:image|twitter:image)["\'][^>]*>', re.I)
# Captures the value of a content="..." attribute; applied to a tag matched by OG.
CONTENT = re.compile(r'content=["\']([^"\']+)["\']', re.I)


def fetch_og_image(url):
    """Fetch a listing page and extract its og:image / twitter:image URL.

    Every <meta> tag matched by ``OG`` is tried in document order; the first
    one whose ``content`` resolves to an http(s) URL wins.

    Args:
        url: Listing page URL to fetch.

    Returns:
        A 2-tuple ``(image_url, error)``. On success ``image_url`` is an
        absolute http(s) URL and ``error`` is ``None``; on failure
        ``image_url`` is ``None`` and ``error`` is a short reason string.

    Never raises: any exception (network error, timeout, ...) is converted
    into the error string, truncated to 60 characters.
    """
    try:
        r = requests.get(url, headers={'User-Agent': UA, 'Accept-Language': 'en'}, timeout=15)
        if r.status_code != 200:
            return None, f'HTTP {r.status_code}'

        # Reported if no candidate tag is usable; refined as tags are examined.
        reason = 'no og:image (likely bot-blocked)'
        for tag in OG.finditer(r.text):
            c = CONTENT.search(tag.group(0))
            if not c:
                reason = 'og:image tag without content'
                continue
            # Attribute values are HTML-escaped (e.g. "&amp;" inside query
            # strings); decode them so the stored URL is actually fetchable.
            img = html.unescape(c.group(1)).strip()
            # Resolve relative and protocol-relative ("//cdn...") references
            # against the final page URL (after redirects).
            img = urljoin(r.url, img)
            if img.startswith(('http://', 'https://')):
                return img, None
            reason = 'non-absolute url'
        return None, reason
    except Exception as e:
        # Deliberate best-effort: one bad URL must not abort the whole batch.
        # Fall back to the exception class name when the message is empty.
        return None, str(e)[:60] or type(e).__name__


def main():
    """Backfill missing thumbnails in the store from each listing's og:image.

    Command-line flags:
        --shortlist  Only consider URLs listed under the ``shortlist`` key of
                     ``cyber_prairie_shortlist.json`` (next to this script).
        --dry-run    Fetch and report, but do not modify or persist the store.

    Without ``--shortlist`` every store entry whose status is not 'Removed'
    and whose ``lat`` is not None is considered. An entry counts as missing a
    thumbnail when both ``thumbnail`` and ``photo_urls`` are falsy.

    Prints one outcome line per URL, then a summary; persists the store only
    when at least one thumbnail was added and this is not a dry run.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--shortlist', action='store_true', help='only shortlisted properties')
    ap.add_argument('--dry-run', action='store_true')
    args = ap.parse_args()

    # Used below as a mapping of listing URL -> property dict.
    store = load()

    if args.shortlist:
        # JSON is UTF-8 by specification; do not depend on the locale encoding.
        raw = (DIR / 'cyber_prairie_shortlist.json').read_text(encoding='utf-8')
        urls = [p['url'] for p in json.loads(raw)['shortlist'] if p.get('url')]
    else:
        urls = [u for u, p in store.items()
                if p.get('status') != 'Removed' and p.get('lat') is not None]

    # Shortlist URLs are not guaranteed to exist in the store. Writing
    # store[u]['thumbnail'] for such a URL would raise KeyError mid-run and
    # lose all progress, so report and skip them up front.
    for u in urls:
        if u not in store:
            print(f'  ! not in store, skipped: {u[:55]}')

    todo = [u for u in urls
            if u in store
            and not (store[u].get('thumbnail') or store[u].get('photo_urls'))]
    print(f'{len(todo)} properties missing a thumbnail (of {len(urls)} considered)\n')

    ok = fail = 0
    total = len(todo)
    for i, u in enumerate(todo, 1):
        img, err = fetch_og_image(u)
        if img:
            ok += 1
            print(f'  [{i}/{total}] ✓ {u[:55]}')
            if not args.dry_run:
                store[u]['thumbnail'] = img
        else:
            fail += 1
            print(f'  [{i}/{total}] ✗ {err:35} {u[:55]}')
        if i < total:
            time.sleep(1.0)  # politeness between requests; pointless after the last one

    print(f'\nBackfilled {ok}, failed {fail}.')
    if ok and not args.dry_run:
        persist(store)
        print('Saved to store.')
    if fail:
        # Failures also include HTTP errors, timeouts and pages without the
        # tag, so this is a hint rather than a diagnosis.
        print('Failed ones are likely DataDome-protected (idealista) → use the browser harvester.')


# Run the backfill only when executed as a script, not when imported.
if __name__ == '__main__':
    main()