#!/usr/bin/env python3
"""
Clean up analysis_output.csv - Keep only best analysis per URL
"""
import pandas as pd
import shutil
from datetime import datetime

CSV_PATH = 'analysis_output.csv'
URL_COLUMN = 'URL'
SCORE_COLUMN = 'Gewogen Score'


def select_best_per_url(
    df: pd.DataFrame,
    url_col: str = URL_COLUMN,
    score_col: str = SCORE_COLUMN,
) -> pd.DataFrame:
    """Return one row per URL, preferring the highest positive score.

    For every distinct URL the row with the largest score strictly greater
    than zero is kept; when several rows share that maximum, the earliest one
    wins. If a URL has no positive score (only zero, negative or missing
    scores), its first row is kept instead. Rows whose URL is missing cannot
    be attributed to any URL and are dropped.

    Args:
        df: Frame containing at least ``url_col`` and ``score_col``.
        url_col: Name of the column identifying the analysed URL.
        score_col: Name of the numeric score column.

    Returns:
        A new frame with the same columns as ``df``, one row per URL, ordered
        by the first appearance of each URL in ``df``.

    Raises:
        KeyError: If ``url_col`` or ``score_col`` is not a column of ``df``.
    """
    # Positional labels let idxmax results be used directly for row lookup,
    # regardless of what index the caller's frame carries.
    rows = df[df[url_col].notna()].reset_index(drop=True)
    if rows.empty:
        # Nothing to choose from; returning the empty frame keeps its columns
        # so the CSV header survives a round trip.
        return rows

    scores = rows[score_col]
    # Every non-positive or missing score ranks as 0, so it can only win when
    # the URL has no positive score at all; idxmax then falls back to the
    # first row of the group because it returns the first occurrence of the
    # maximum.
    rank_key = scores.where(scores > 0, 0)
    # sort=False keeps groups in order of first appearance of each URL.
    best_labels = rank_key.groupby(rows[url_col], sort=False).idxmax()
    return rows.loc[best_labels.to_numpy()].reset_index(drop=True)


def main() -> None:
    """Back up the analysis CSV, then rewrite it with one row per URL."""
    # Back up first so the original data is recoverable whatever happens next.
    backup_name = f"analysis_output_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    shutil.copy(CSV_PATH, backup_name)
    print(f"✅ Backed up to: {backup_name}")

    df = pd.read_csv(CSV_PATH)
    print(f"📊 Original: {len(df)} entries, {df[URL_COLUMN].nunique()} unique URLs")

    missing_urls = int(df[URL_COLUMN].isna().sum())
    if missing_urls:
        print(f"⚠️  Dropping {missing_urls} entries without a URL")

    cleaned_df = select_best_per_url(df)
    cleaned_df.to_csv(CSV_PATH, index=False, encoding='utf-8')

    print(f"✅ Cleaned: {len(cleaned_df)} entries (1 per URL)")
    print(f"   Removed: {len(df) - len(cleaned_df)} duplicate/bad entries")
    print()
    print("Summary of kept entries:")
    kept_scores = cleaned_df[SCORE_COLUMN]
    print(f"  Score = 0: {int((kept_scores == 0).sum())} entries")
    print(f"  Score > 0: {int((kept_scores > 0).sum())} entries")
    print()
    print("✅ Run sync_gpt_results.py now to update enriched_data.json")


if __name__ == "__main__":
    main()