#!/usr/bin/env python3
"""
Enrich Green-Acres properties by fetching detail pages for descriptions.

Search results only capture title/price/size. Detail pages have full descriptions
that dramatically improve GPT analysis quality.

Usage:
    python3 enrich_greenacres.py              # Enrich all not yet enriched
    python3 enrich_greenacres.py --limit 10   # Test on 10 properties
    python3 enrich_greenacres.py --force       # Re-fetch even if already enriched
    python3 enrich_greenacres.py --dry-run     # Show what would be fetched
"""
import argparse
import re
import time
from datetime import datetime

import requests

from store import load, persist, upsert, is_active

# Headers attached to every detail-page request: a desktop-browser
# User-Agent string plus an English Accept-Language preference.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36'
    ),
    'Accept-Language': 'en-US,en;q=0.9',
}

# Pause between consecutive requests, in seconds (be polite to the server).
DELAY = 1.5


def fetch_detail(url):
    """Fetch a Green-Acres detail page and extract description + features.

    Args:
        url: URL of the listing's detail page.

    Returns:
        A dict in one of three shapes:

        * ``{'removed': True, 'removed_reason': ...}`` when the server
          answers HTTP 410.
        * ``{'error': ...}`` for any other non-200 status, for a timeout
          (``'timeout'``), or for any other exception (its message,
          truncated to 80 characters).
        * Otherwise the extracted fields. ``'page_enriched': True`` is always
          present; ``'description'``, ``'dpe'``, ``'detail_features'`` (at
          most 20 values) and ``'keyword_signals'`` are present only when
          found in the page.

    Exceptions are never propagated to the caller; they are reported through
    the ``'error'`` key instead.
    """
    # Function-scope import so the block is self-contained; the page markup
    # is kept in ``page`` so it does not shadow the stdlib ``html`` module.
    from html import unescape

    try:
        resp = requests.get(url, headers=HEADERS, timeout=15, allow_redirects=True)
        if resp.status_code == 410:
            return {'removed': True, 'removed_reason': 'HTTP 410 Gone'}
        if resp.status_code != 200:
            return {'error': f'HTTP {resp.status_code}'}

        page = resp.text
        result = {}

        # Description text
        match = re.search(r'description-text">(.*?)</div>', page, re.DOTALL)
        if match:
            desc = re.sub(r'<br\s*/?>', '\n', match.group(1))
            desc = re.sub(r'<[^>]+>', '', desc)
            # Decode entities only after the tags are gone, so an escaped
            # "&lt;...&gt;" in the text is not mistaken for markup.
            desc = unescape(desc).strip()
            if len(desc) > 10:
                result['description'] = desc

        # Meta description as fallback
        if 'description' not in result:
            meta = re.search(r'meta name="description" content="([^"]+)"', page)
            if meta:
                desc = unescape(meta.group(1)).strip()
                # Skip the "listing no longer exists" placeholder. The
                # apostrophe may arrive as an entity or as a typographic
                # quote, so normalise it before comparing.
                placeholder = "n'existe plus" in desc.replace('\u2019', "'")
                if len(desc) > 20 and not placeholder:
                    result['description'] = desc

        # DPE (energy rating)
        dpe_match = re.search(r'dpe-letter[^>]*>[^<]*([A-G])', page)
        if dpe_match:
            result['dpe'] = dpe_match.group(1)

        # Additional features from structured data
        features = []
        for raw in re.findall(r'characteristic-value">\s*([^<]+)', page):
            val = unescape(raw).strip()
            if len(val) > 1:
                features.append(val)
        if features:
            result['detail_features'] = features[:20]

        # Keyword signals from description (matched as substrings of the
        # lower-cased, entity-decoded text, so accented patterns can match).
        if 'description' in result:
            desc_lower = result['description'].lower()
            signal_patterns = {
                'has_outbuilding': r'outbuilding|barn|shed|hangar|grange|dépendance|annex',
                'has_workshop': r'workshop|atelier|studio',
                'has_pool': r'pool|piscine|swimming',
                'has_well': r'well|puits|spring|source',
                'has_garden': r'garden|jardin|potager|orchard|verger',
                'has_terrace': r'terrace|terrasse|patio',
                'has_fireplace': r'fireplace|cheminée|wood.?burn|poêle',
                'has_stone': r'stone|pierre|character|caractère|charm',
                'has_view': r'view|vue|panoram|overlooking',
                'has_electricity': r'electric|électric|mains|connected',
                'has_mains_water': r'mains water|water.*connect|eau.*ville|tout.?à.?l',
            }
            signals = [
                key for key, pattern in signal_patterns.items()
                if re.search(pattern, desc_lower)
            ]
            if signals:
                result['keyword_signals'] = signals

        result['page_enriched'] = True
        return result

    except requests.exceptions.Timeout:
        return {'error': 'timeout'}
    except Exception as e:
        # Deliberate best-effort: one bad page must not abort a batch run.
        return {'error': str(e)[:80]}


def main():
    """Command-line entry point: enrich active Green-Acres listings.

    Loads the store and selects active entries whose ``source`` is
    ``'greenacres'``: all of them with ``--force``, otherwise only those
    without a truthy ``page_enriched`` flag. ``--limit`` caps the batch and
    ``--dry-run`` prints the plan without fetching.

    Each selected URL is fetched with ``fetch_detail`` and the outcome is
    written back with ``upsert``. The store is persisted every 25 listings
    and once more when the loop ends, including when it ends early because
    of an interrupt or an unexpected exception.
    """
    parser = argparse.ArgumentParser(description='Enrich Green-Acres detail pages')
    parser.add_argument('--limit', type=int, help='Max properties to fetch')
    parser.add_argument('--force', action='store_true', help='Re-fetch already enriched')
    parser.add_argument('--dry-run', action='store_true', help='Show plan only')
    args = parser.parse_args()

    store = load()

    # Active Green-Acres entries, in store order.
    active = [
        (url, p) for url, p in store.items()
        if p.get('source') == 'greenacres' and is_active(p)
    ]
    total_ga = len(active)

    # Count the flag directly: deriving this from the candidate list would be
    # skewed by --limit (inflated) and by --force (always zero).
    already = sum(1 for _, p in active if p.get('page_enriched'))

    candidates = [
        url for url, p in active
        if args.force or not p.get('page_enriched')
    ]
    if args.limit:
        candidates = candidates[:args.limit]

    print("GREEN-ACRES ENRICHMENT")
    print(f"  Active Green-Acres: {total_ga}")
    print(f"  Already enriched: {already}")
    print(f"  To fetch: {len(candidates)}")
    print()

    if args.dry_run or not candidates:
        if not candidates:
            print("  Nothing to enrich.")
        return

    enriched = 0
    removed = 0
    errors = 0
    total = len(candidates)
    t0 = time.time()

    try:
        for i, url in enumerate(candidates, 1):
            # Last path segment, ignoring a trailing slash, for the progress line.
            slug = url.rstrip('/').split('/')[-1][:30]
            print(f"  [{i}/{total}] {slug}...", end=' ', flush=True)

            data = fetch_detail(url)

            if data.get('removed'):
                upsert(store, url, {
                    'removed': True,
                    'removed_at': datetime.now().isoformat(),
                    'removed_reason': data['removed_reason'],
                })
                removed += 1
                print(f"REMOVED ({data['removed_reason']})")
            elif data.get('error'):
                # Errors are only counted; the entry stays un-enriched so a
                # later run picks it up again.
                errors += 1
                print(f"ERROR ({data['error']})")
            else:
                desc_len = len(data.get('description', ''))
                signals = data.get('keyword_signals', [])
                upsert(store, url, data)
                enriched += 1
                signal_str = f" [{', '.join(signals[:3])}]" if signals else ""
                print(f"OK ({desc_len} chars{signal_str})")

            # Save periodically
            if i % 25 == 0:
                persist(store)
                print(f"  -- saved ({enriched} enriched, {removed} removed) --")

            # Throttle between requests; no need to wait after the last one.
            if i < total:
                time.sleep(DELAY)
    finally:
        # Always write what was gathered, even on Ctrl-C or an unexpected
        # failure, so work since the last checkpoint is not lost.
        persist(store)

    elapsed = time.time() - t0

    print()
    print(f"  DONE in {elapsed:.0f}s")
    print(f"  Enriched: {enriched}")
    print(f"  Removed:  {removed}")
    print(f"  Errors:   {errors}")

# Run the enrichment CLI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
