#!/usr/bin/env python3
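"""
Rebuild analysis_output.csv so that it lists ALL URLs from enriched_data.json, regardless of gpt_score.

Rows already present in analysis_output_PARTIAL_8.csv are kept as-is; every other URL gets a
'Pending' placeholder row, to be filled with actual data when the analyze script runs with USE_CACHE=y.
This is a one-time recovery script, written to recover the 161 successful analyses held in the GPT cache.
"""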

import json
import pandas as pd

# Input/output locations, relative to the current working directory.
ENRICHED_DATA_PATH = 'enriched_data.json'
PARTIAL_CSV_PATH = 'analysis_output_PARTIAL_8.csv'
OUTPUT_CSV_PATH = 'analysis_output.csv'

# Column layout used when no partial CSV exists and for placeholder rows.
CSV_COLUMNS = ['URL', 'Titel', 'Samenvatting', 'GPT Analyse', 'Gewogen Score']


def collect_pending_urls(all_urls, existing_urls):
    """Return the URLs from *all_urls* that are not in *existing_urls*.

    Input order is preserved and repeated URLs are collapsed to their first
    occurrence, so each missing URL yields exactly one placeholder row.

    Args:
        all_urls: Iterable of (hashable) URLs found in the enriched data.
        existing_urls: Set of URLs that already have a row in the CSV.

    Returns:
        A list of unique URLs that still need a row.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return [url for url in dict.fromkeys(all_urls) if url not in existing_urls]


def build_placeholder_rows(urls):
    """Return one 'Pending' placeholder row (as a dict) per URL in *urls*.

    The rows carry no analysis data; they are expected to be filled in when
    the analyze script runs with USE_CACHE=y.
    """
    return [
        {
            'URL': url,
            'Titel': '',
            'Samenvatting': '',
            'GPT Analyse': 'Pending',
            'Gewogen Score': 0.0,
        }
        for url in urls
    ]


def main():
    """Merge the partial CSV with placeholders for all missing URLs and save it."""
    # Load enriched_data to get all URLs
    with open(ENRICHED_DATA_PATH, 'r', encoding='utf-8') as f:
        properties = json.load(f)

    # Load existing CSV if it exists (to append)
    try:
        existing_df = pd.read_csv(PARTIAL_CSV_PATH, encoding='utf-8')
    except FileNotFoundError:
        existing_df = pd.DataFrame(columns=CSV_COLUMNS)
        print("⚠️  No partial CSV found, starting fresh")
    else:
        print(f"✓ Loaded {len(existing_df)} existing analyses from partial CSV")

    # Get URLs already in CSV
    existing_urls = set(existing_df['URL'].tolist())
    print(f"✓ Found {len(existing_urls)} URLs already in CSV")

    # Get all URLs from enriched_data (regardless of gpt_score)
    all_urls = [p['url'] for p in properties if 'url' in p]
    print(f"✓ Found {len(all_urls)} total properties in enriched_data.json")

    # Filter to URLs that need processing (not in CSV yet)
    urls_to_process = collect_pending_urls(all_urls, existing_urls)
    print(f"\n🔧 Need to process {len(urls_to_process)} URLs not yet in CSV")

    # Create dummy entries for all missing URLs
    # (They'll be filled with actual data when analyze script runs with USE_CACHE=y)
    new_rows = build_placeholder_rows(urls_to_process)

    # Combine and save. When nothing is missing, write the existing rows
    # unchanged instead of concatenating with an empty frame.
    if new_rows:
        placeholder_df = pd.DataFrame(new_rows, columns=CSV_COLUMNS)
        combined_df = pd.concat([existing_df, placeholder_df], ignore_index=True)
    else:
        combined_df = existing_df
    combined_df.to_csv(OUTPUT_CSV_PATH, index=False, encoding='utf-8')

    print(f"\n✅ Saved analysis_output.csv with {len(combined_df)} entries")
    # Report row counts (not unique-URL counts) so the two lines add up to
    # the total printed above.
    print(f"   - {len(existing_df)} from partial CSV (with data)")
    print(f"   - {len(new_rows)} placeholders for cache rebuild")


if __name__ == "__main__":
    main()
