# analyze_from_urls.py
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from openai import OpenAI
from dotenv import load_dotenv
import re

# Read environment variables from a local .env file (if present) and build
# the OpenAI client from the OPENAI_API_KEY variable.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Load the prompt template; its placeholders are filled in per listing later.
with open("prompt.txt", encoding="utf-8") as prompt_file:
    base_prompt = prompt_file.read()

# Input (URLs to analyse) and output (analysis results) locations
input_file = "extracted_property_urls.csv"
output_file = "analysis_output.csv"


def load_existing_analyses(path):
    """Load a previous results CSV and determine which URLs can be skipped.

    A URL counts as already analysed only when its row has a non-empty
    "GPT Analyse" value AND a non-zero numeric "Gewogen Score". Rows whose
    score is missing or not numeric are treated as score 0, so they are
    analysed again rather than silently skipped.

    Args:
        path: Location of the results CSV written by an earlier run.

    Returns:
        A ``(frame, analyzed_urls)`` tuple: the loaded DataFrame (empty when
        the file is missing or has no content) and the set of URLs to skip.
    """
    if not os.path.exists(path):
        return pd.DataFrame(), set()

    try:
        frame = pd.read_csv(path)
    except pd.errors.EmptyDataError:
        # A zero-byte file (e.g. left by an interrupted run) holds no results.
        return pd.DataFrame(), set()

    # Without both columns there is nothing that can be matched and skipped.
    if "GPT Analyse" not in frame.columns or "URL" not in frame.columns:
        return frame, set()

    analysis = frame["GPT Analyse"]
    has_analysis = analysis.notna() & (analysis.astype(str).str.strip() != "")

    if "Gewogen Score" in frame.columns:
        # NaN / non-numeric scores become 0 so those rows are re-analysed.
        score = pd.to_numeric(frame["Gewogen Score"], errors="coerce").fillna(0)
    else:
        score = pd.Series(0, index=frame.index)

    done = frame.loc[has_analysis & (score != 0), "URL"].dropna()
    return frame, set(done.tolist())


# Existing analyses (if any); only rows with a usable analysis are skipped.
existing_df, analyzed_urls = load_existing_analyses(output_file)

# Result records collected during this run (one dict per processed URL).
results = []
# Rows to analyse, read from the input CSV.
df = pd.read_csv(input_file)

# Request headers with a desktop Safari User-Agent, intended to get past
# sites that block obvious scraping clients.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/14.0.3 Safari/605.1.15"
    ),
}

# Points awarded per raw score (1 through 5); low scores count negatively.
score_weights = {"1": -2, "2": -1, "3": 1, "4": 2, "5": 3}

# Criterion weights, balanced by business impact. The weights add up to 13.0.
criteria_weights = {
    # Core business - primary revenue
    "regeneratieve market garden": 2.5,
    # Highest weight - main income + lifestyle
    "gastenverblijf": 3.0,
    # Important for value-added products
    "werkplaats": 2.0,
    # Secondary income stream
    "zelfstandige verhuureenheden": 2.5,
    # Important but not decisive
    "locatie": 2.0,
    # Nice to have, less critical
    "afstand tot lokale markt": 1.0,
}

# --- Helpers for the per-URL analysis loop ----------------------------------

# Criterion number opening a line: "1. ...", "2) ...", "3: ...".
_NUMBERED_LINE_RE = re.compile(r"^\W*([1-6])\s*(?:[.)](?!\d)|:)")
# Score opening the text after a label's colon: "4 - ...", "**4**", "4/5".
_LEADING_SCORE_RE = re.compile(r"^\W*([1-5])(?!\d)")
# Compact "N:score" items, as requested by the single-line fallback prompt.
_COMPACT_ITEM_RE = re.compile(r"(?<![\w.,/-])([1-6])\s*:\s*<?([1-5])>?(?!\d)")
# Risk profile line; case-insensitive so "risicoprofiel: hoog" is recognised.
_RISK_RE = re.compile(r"risicoprofiel\s*:\s*<?\s*(laag|gemiddeld|hoog)", re.IGNORECASE)
_RISK_FACTORS = {"laag": 1.0, "gemiddeld": 0.9, "hoog": 0.7}

# Numbered criterion lines shared by the strict and score-only prompts.
_SCORED_FORMAT_LINES = (
    "1. Regenerative market garden: <score 1-5> - <korte motivatie>\n"
    "2. Guest accommodation: <score 1-5> - <korte motivatie>\n"
    "3. Workshop: <score 1-5> - <korte motivatie>\n"
    "4. Independent rental units: <score 1-5> - <korte motivatie>\n"
    "5. Location: <score 1-5> - <korte motivatie>\n"
    "6. Distance to local market: <score 1-5> - <korte motivatie>\n"
)


def _row_url(row):
    """Return the first non-empty URL from "Property URL" / "URL", else None."""
    for column in ("Property URL", "URL"):
        value = row.get(column)
        # NaN is truthy, so test for missing values explicitly.
        if value is not None and pd.notna(value) and str(value).strip():
            return value
    return None


def _fetch_html(url, attempts=2, retry_delay=1.0):
    """Download *url* and return its HTML text.

    Raises:
        RuntimeError: if no attempt produced a non-empty HTTP 200 response.
    """
    last_error = "no attempt made"
    for attempt in range(attempts):
        try:
            response = requests.get(url, headers=headers, timeout=20)
        except requests.RequestException as exc:
            last_error = str(exc)
        else:
            if response.status_code == 200 and response.text.strip():
                return response.text
            # Never hand an error page or empty body to the model.
            last_error = (
                f"HTTP {response.status_code}"
                if response.status_code != 200
                else "empty response body"
            )
        if attempt < attempts - 1:
            time.sleep(retry_delay)
    raise RuntimeError(f"Failed to fetch page after retries ({last_error})")


def _extract_page_content(html):
    """Parse *html* and return ``(title, description, extra_text, facts_text)``."""
    import json  # local import: only needed for JSON-LD parsing

    soup = BeautifulSoup(html, "html.parser")

    title = soup.title.text.strip() if soup.title else ""
    desc_tag = soup.find("meta", {"name": "description"})
    # A description meta tag without a "content" attribute yields "".
    description = (desc_tag.get("content") or "") if desc_tag else ""

    def non_empty_texts(tag_names):
        texts = (tag.get_text(" ", strip=True) for tag in soup.find_all(tag_names))
        return [text for text in texts if text]

    # Paragraphs plus headings/list items (the latter often hold specs).
    extra_text = " ".join(
        non_empty_texts("p") + non_empty_texts(["h1", "h2", "h3", "li"])
    )

    # Structured facts: selected JSON-LD fields, then short list items.
    facts = []
    for script in soup.find_all("script", {"type": "application/ld+json"}):
        try:
            data = json.loads(script.string or "{}")
        except (ValueError, TypeError):
            continue  # malformed JSON-LD is skipped, best effort
        if isinstance(data, list):
            data = data[0] if data else {}
        if not isinstance(data, dict):
            continue
        for key in ("name", "description", "address", "areaServed", "numberOfRooms", "floorSize"):
            val = data.get(key)
            if isinstance(val, dict):
                val = " ".join(str(v) for v in val.values() if v)
            if val:
                facts.append(f"{key}: {val}")

    for item_text in non_empty_texts("li"):
        if len(item_text.split()) <= 20:  # avoid huge blobs
            facts.append(item_text)

    facts_text = "\n".join(facts[:50])  # cap
    return title, description, extra_text, facts_text


def _build_context(row, title, description, extra_text):
    """Return ``(full_text, location_context)`` for one listing.

    The page text is capped at 1800 characters to limit prompt size; location
    fields from the input row (when present) are appended after the cap.
    """
    parts = [part for part in (description, extra_text) if part]
    if title and title not in parts:
        parts.append(title)
    full_text = "\n\n".join(parts).strip()[:1800]

    location_context = "\n".join(
        str(row[field])
        for field in ("Plaats", "Provincie", "Adres")
        if field in row and pd.notna(row[field])
    )
    if location_context:
        full_text += f"\n\n[Locatiegegevens]:\n{location_context}"
    return full_text, location_context


def _build_prompt(full_text, location_context, facts_text):
    """Fill the prompt template and append the fixed scoring instructions."""
    # Always fill every placeholder so no template token leaks into the prompt.
    prompt = base_prompt.replace("{advertentietekst}", full_text)
    prompt = prompt.replace("{locatie_context}", location_context or "Geen extra locatiegegevens beschikbaar.")
    prompt = prompt.replace("{custom_criteria_data}", facts_text or "Geen aanvullende objectieve data beschikbaar.")

    prompt += "\n\nAls gegevens ontbreken, geef toch voor ELK criterium een score 1-5 met een korte motivatie op basis van aannames."
    prompt += "\n\nVertaal de bovenstaande titel en beschrijving ook naar het Nederlands, los van je analyse."
    prompt += "\n\nVoeg toe aan het einde: Risicoprofiel: Laag / Gemiddeld / Hoog"
    return prompt


def _has_all_criteria(text):
    """Return True when *text* contains markers for all six criteria.

    Accepts "N.", "N)" and "N:" because the fallback prompts request the
    colon form; a bare ``"N." in text`` check can never accept those replies.
    """
    return all(
        re.search(rf"(?<!\d){n}\s*(?:\.(?!\d)|[):])", text) for n in range(1, 7)
    )


def _ask_gpt(prompt_text):
    """Send one user message to the chat model and return the stripped reply."""
    resp = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=[{"role": "user", "content": prompt_text}],
        temperature=0.0
    )
    # The message content may be None; treat that as an empty reply.
    return (resp.choices[0].message.content or "").strip()


def _request_analysis(final_prompt, full_text):
    """Ask the model for an analysis, tightening the format on each retry.

    Returns the first reply that contains all six criteria. If none does, the
    longest reply of all attempts is returned.
    """
    context = "Context:\n" + full_text
    ladder = [
        (None, final_prompt),
        (
            "  ⚠️ Missing criteria lines, retrying with strict format...",
            final_prompt
            + "\n\nFORMAT REQUIREMENT: Gebruik exact deze opmaak:\n"
            + _SCORED_FORMAT_LINES
            + "\nSluit af met: Risicoprofiel: Laag / Gemiddeld / Hoog",
        ),
        (
            "  ⚠️ Still missing criteria, forcing score-only output...",
            "Geef ALLEEN het volgende format, zonder extra tekst:\n"
            + _SCORED_FORMAT_LINES
            + "Risicoprofiel: Laag / Gemiddeld / Hoog\n\n"
            + context,
        ),
        (
            "  ⚠️ Forcing numbers-only output...",
            "Geef ALLEEN dit formaat, zonder verdere tekst of uitleg:\n"
            + "".join(f"{n}: <score 1-5>\n" for n in range(1, 7))
            + "Risicoprofiel: Laag / Gemiddeld / Hoog\n\n"
            + context,
        ),
        (
            "  ⚠️ Forcing single-line scores-only output...",
            "Geef ALLEEN één regel met precies dit formaat:\n"
            "1:<1-5> 2:<1-5> 3:<1-5> 4:<1-5> 5:<1-5> 6:<1-5> Risicoprofiel:<Laag/Gemiddeld/Hoog>\n\n"
            + context,
        ),
    ]

    replies = []
    for notice, prompt_text in ladder:
        if notice:
            print(notice)
        reply = _ask_gpt(prompt_text)
        if _has_all_criteria(reply):
            return reply
        replies.append(reply)
    # Nothing validated: keep the longest of all attempts.
    return max(replies, key=len)


def _extract_scores(reply):
    """Return ``{criterion_key: score_string}`` parsed from a model reply.

    A criterion is identified by its (Dutch) name from ``criteria_weights``
    when the line label contains it, otherwise by its number 1-6, which
    follows the key order of ``criteria_weights`` (the same order the format
    prompts use). The first parsable score per criterion wins; lines without
    a parsable 1-5 score are ignored instead of being counted as zero.
    """
    criteria = list(criteria_weights)
    scores = {}

    for line in reply.splitlines():
        label, colon, remainder = line.partition(":")
        if not colon:
            continue
        score_match = _LEADING_SCORE_RE.match(remainder)
        if not score_match:
            continue
        label = label.strip().lower()
        key = next((name for name in criteria if name in label), None)
        if key is None:
            numbered = _NUMBERED_LINE_RE.match(line)
            if numbered and int(numbered.group(1)) <= len(criteria):
                key = criteria[int(numbered.group(1)) - 1]
        if key is not None:
            scores.setdefault(key, score_match.group(1))

    # Compact "1:4 2:3 ..." replies keep several items on a single line.
    for number, score in _COMPACT_ITEM_RE.findall(reply):
        if int(number) <= len(criteria):
            scores.setdefault(criteria[int(number) - 1], score)

    return scores


def _weighted_score(reply):
    """Return the risk-adjusted weighted score for *reply* (0 if unparsable)."""
    scores = _extract_scores(reply)
    total_weight = sum(criteria_weights[key] for key in scores)
    if total_weight <= 0:
        return 0
    total_score = sum(
        score_weights.get(score, 0) * criteria_weights[key]
        for key, score in scores.items()
    )

    risk_match = _RISK_RE.search(reply)
    risk_factor = _RISK_FACTORS[risk_match.group(1).lower()] if risk_match else 1.0
    # NOTE(review): the factor also shrinks negative averages toward zero, so
    # a high risk profile slightly improves a negative score - confirm intent.
    return round((total_score / total_weight) * risk_factor, 2)


# Rows from earlier runs; converted once instead of on every save.
_existing_records = existing_df.to_dict("records")


def _save_results():
    """Write this run's results plus earlier rows for other URLs to the CSV.

    Earlier rows for URLs processed in this run are dropped so a re-analysed
    URL does not end up in the output file twice.
    """
    fresh_urls = {record["URL"] for record in results}
    kept = [rec for rec in _existing_records if rec.get("URL") not in fresh_urls]
    pd.DataFrame(results + kept).to_csv(output_file, index=False, encoding="utf-8")


# --- Main loop: analyse every listing that has no usable result yet ---------

_seen_urls = set()  # URLs handled in this run, to skip duplicate input rows

for position, (_, row) in enumerate(df.iterrows(), start=1):
    url = _row_url(row)
    if url is None:
        print(f"⚠️ Geen URL in rij {position}, overslaan")
        continue
    if url in analyzed_urls or url in _seen_urls:
        print(f"⏭️ Al geanalyseerd, overslaan: {url}")
        continue
    _seen_urls.add(url)

    # Reset per row so a failure never reports the previous row's page data.
    title = description = ""
    succeeded = False
    try:
        print(f"🔎 Analyse {position}/{len(df)}: {url}")
        html = _fetch_html(url)
        title, description, extra_text, facts_text = _extract_page_content(html)
        full_text, location_context = _build_context(row, title, description, extra_text)
        final_prompt = _build_prompt(full_text, location_context, facts_text)
        reply = _request_analysis(final_prompt, full_text)

        record = {
            "URL": url,
            "Titel": title,
            "Samenvatting": description,
            "GPT Analyse": reply,
            "Gewogen Score": _weighted_score(reply)
        }
        succeeded = True
    except Exception as e:
        # Top-level boundary per URL: record the failure and move on. The
        # zero score makes the next run retry this URL.
        print(f"❌ Fout bij {url}: {e}")
        record = {
            "URL": url,
            "Titel": title,
            "Samenvatting": description,
            "GPT Analyse": f"Fout: {e}",
            "Gewogen Score": 0
        }

    # Save after every URL so an interrupted run keeps its progress.
    results.append(record)
    _save_results()
    if succeeded:
        time.sleep(2)

print("✅ Analyse voltooid. Resultaten opgeslagen in analysis_output.csv")