#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
PhishGuard AI 
Autor: Mejra Mujanović
Opis:
  Napredna verzija aplikacije za detekciju phishing i spam e-mailova putem kombinacije
  Hugging Face NLP modela i forenzičke heurističke analize.
"""

# ==============================================================
# 🔹 1. IMPORT BIBLIOTEKA
# ==============================================================

import html
import os
import re
from email import policy
from email.parser import BytesParser
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from bs4 import BeautifulSoup
from transformers import pipeline

# ==============================================================
# 🔹 2. MODEL KONFIGURACIJA
# ==============================================================

# Hugging Face Hub id of the text-classification model used for scoring.
HF_MODEL = "MujMej/phishguard-ai"

# Optional Hub access token taken from the environment (None when unset).
token = os.getenv("HUGGINGFACE_HUB_TOKEN") or os.getenv("HF_TOKEN")

try:
    # Pass the token explicitly: it was previously read but never used, so a
    # token supplied only via HUGGINGFACE_HUB_TOKEN had no effect here.
    hf = pipeline(
        "text-classification",
        model=HF_MODEL,
        token=token,
        truncation=True,
    )
except Exception as e:
    # Fail fast at import time, keeping the original exception as the cause
    # so the full traceback stays available.
    raise RuntimeError(f"❌ Greška prilikom učitavanja modela: {e}") from e

# ==============================================================
# 🔹 3. HEURISTIČKA PRAVILA I REGEX
# ==============================================================

# Matches an http(s) URL (case-insensitive scheme) and runs until the first
# whitespace character, ')', '>', ']' or '"'.
URL_REGEX = re.compile(r'https?://[^\s)>\]"]+', flags=re.I)

# Cue words/phrases checked as lowercase substrings of the e-mail text.
CUES = [
    "urgent",
    "verify",
    "password",
    "reset",
    "account",
    "limited",
    "suspend",
    "confirm",
    "click",
    "login",
    "bank",
    "invoice",
    "unusual",
    "action required",
    "payment",
    "locked",
    "security alert",
    "update your information",
    "unauthorized",
    "verify identity",
    "immediately",
    "gift card",
    "invoice due",
    "credit",
    "paypal",
    "apple id",
    "unauthorized login",
]

# ==============================================================
# 🔹 4. EKSTRAKCIJA SADRŽAJA IZ .eml FAJLA
# ==============================================================

def extract_text_from_eml(eml_bytes):
    """Parse a raw .eml message into body text, subject, sender and URLs.

    Args:
        eml_bytes: The complete e-mail message as bytes.

    Returns:
        A ``(text, subject, sender, urls)`` tuple:

        * ``text`` - all text/plain parts joined by newlines, followed by the
          visible text of all text/html parts, stripped of surrounding
          whitespace.
        * ``subject`` / ``sender`` - the Subject and From header values, or
          ``""`` when the header is missing.
        * ``urls`` - sorted, de-duplicated http(s) URLs found in ``text`` and
          in the ``href`` / form ``action`` attributes of the HTML parts.
    """
    msg = BytesParser(policy=policy.default).parsebytes(eml_bytes)
    plain_chunks, html_chunks = [], []

    for part in msg.walk():
        # Containers have no body of their own; walk() yields their children.
        if part.is_multipart():
            continue
        ctype = part.get_content_type()
        # Only textual bodies are used below, so skip everything else
        # (attachments, images, ...) without decoding it.
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            payload = part.get_content()
        except Exception:
            # Deliberate best effort: a part that cannot be decoded (for
            # example an unknown charset) must not abort the whole analysis.
            continue
        if payload is None:
            continue
        if ctype == "text/plain":
            plain_chunks.append(str(payload))
        else:
            html_chunks.append(str(payload))

    text = "\n".join(plain_chunks)
    link_targets = []
    if html_chunks:
        try:
            soup = BeautifulSoup(" ".join(html_chunks), "html.parser")
            text += "\n" + soup.get_text(" ", strip=True)
            # get_text() drops all markup, which would lose the real link
            # targets, so collect them from the attributes explicitly.
            for tag in soup.find_all(["a", "area"], href=True):
                link_targets.append(str(tag["href"]))
            for tag in soup.find_all("form", action=True):
                link_targets.append(str(tag["action"]))
        except Exception:
            # Best effort: malformed HTML falls back to the plain-text parts.
            pass

    found = set(URL_REGEX.findall(text))
    for target in link_targets:
        # Run attribute values through the same pattern so every URL is
        # normalised (and limited to http/https) the same way.
        found.update(URL_REGEX.findall(target))
    urls = sorted(found)

    subject = msg["Subject"] or ""
    sender = msg["From"] or ""

    return text.strip(), subject, sender, urls

# ==============================================================
# 🔹 5. NAPREDNA HEURISTIČKA ANALIZA (FORENZIČKA)
# ==============================================================

# Top-level domains treated as suspicious (matched against the URL host).
_SUSPICIOUS_TLDS = (".ru", ".cn", ".tk", ".zip", ".xyz", ".top", ".gq", ".ml")

# Hosts of URL-shortening services.
_SHORTENER_HOSTS = ("bit.ly", "tinyurl.com", "goo.gl", "short.ly", "t.co")

# Threat / urgency wording that earns an extra penalty.
_URGENCY_TERMS = ("immediately", "suspend", "action required", "unauthorized login")


def _url_host(url):
    """Return the lowercase host of *url*, or '' when it cannot be parsed."""
    candidate = url.strip()
    # urlparse only recognises the host when a scheme or a leading '//' exists.
    if "://" not in candidate:
        candidate = "//" + candidate
    try:
        host = urlparse(candidate).hostname
    except ValueError:
        # Raised for mangled links such as unbalanced IPv6 brackets.
        return ""
    # Drop punctuation that a greedy URL pattern may have captured.
    return (host or "").lower().rstrip(".,;:!?'")


def heuristic_score(text, urls, cues=None):
    """Score an e-mail with additive forensic heuristics.

    Rules (each adds to the score, which is capped at 1.0):

    1. any cue word/phrase occurs in the text            (+0.25)
    2. three or more URLs (+0.15), five or more (+0.10),
       any URL longer than 100 characters                (+0.10)
    3. any URL host ends in a suspicious TLD             (+0.15)
    4. any URL host is a known link shortener            (+0.10)
    5. ``urls`` contains repeated entries                (+0.10)
    6. the text contains '@' and at least one URL exists (+0.10)
    7. the text contains '<form' or 'input type='        (+0.15)
    8. threat / urgency wording occurs in the text       (+0.10)

    Args:
        text: E-mail body text.
        urls: URLs extracted from the e-mail. Rule 5 only fires when this
            list still contains duplicates.
        cues: Optional iterable of lowercase cue phrases for rule 1;
            defaults to the module-level ``CUES`` list.

    Returns:
        A float in the range [0.0, 1.0].
    """
    if cues is None:
        cues = CUES

    score = 0.0
    text_l = text.lower()
    hosts = [_url_host(u) for u in urls]

    # 1. Social-engineering keywords.
    if any(k in text_l for k in cues):
        score += 0.25

    # 2. Number and length of URLs.
    if len(urls) >= 3:
        score += 0.15
    if len(urls) >= 5:
        score += 0.10
    if any(len(u) > 100 for u in urls):
        score += 0.10

    # 3. Suspicious TLDs: compare against the host, not the end of the whole
    #    URL, so links with a path or query string are still caught and file
    #    extensions such as '/archive.zip' are not mistaken for a TLD.
    if any(h.endswith(_SUSPICIOUS_TLDS) for h in hosts):
        score += 0.15

    # 4. Link shorteners: exact host (or subdomain) match; a substring test
    #    would also flag e.g. 'microsoft.com' because it contains 't.co'.
    if any(
        h == s or h.endswith("." + s)
        for h in hosts
        for s in _SHORTENER_HOSTS
    ):
        score += 0.10

    # 5. Repeated links (tracking pattern).
    if len(urls) > len(set(urls)):
        score += 0.10

    # 6. An e-mail address in the text together with at least one URL.
    if "@" in text_l and urls:
        score += 0.10

    # 7. HTML form or input fields in the text.
    if "<form" in text_l or "input type=" in text_l:
        score += 0.15

    # 8. Threat / urgency wording.
    if any(w in text_l for w in _URGENCY_TERMS):
        score += 0.10

    return min(score, 1.0)

# ==============================================================
# 🔹 6. GLAVNA FUNKCIJA ZA ANALIZU I KLASIFIKACIJU
# ==============================================================

def _risk_verdict(hf_label, hf_score, h_score, combined, text_l):
    """Map the scores to a ``(risk, label, color, action)`` tuple."""
    model_says_phish = hf_label.startswith("PHISH")
    if model_says_phish and (hf_score >= 0.90 or h_score >= 0.85):
        return "Critical", "PHISH", "#FF0000", "🚨 Prijaviti incident i blokirati pošiljaoca"
    if model_says_phish and (hf_score >= 0.75 or h_score >= 0.60):
        return "High", "PHISH", "#FF6600", "🚫 Ne otvarati linkove, blokirati pošiljaoca"
    if "unsubscribe" in text_l or "newsletter" in text_l or (0.40 <= combined < 0.65):
        return "Medium", "SPAM", "#FFD700", "⚠️ Označiti kao SPAM, nije maliciozno"
    return "Low", "LEGIT", "#4CAF50", "✅ Legitimna poruka — bez akcije"


def _build_gauge(malicious_percent, color):
    """Build the Plotly gauge that displays the combined score in percent."""
    chart = go.Figure()

    chart.add_trace(go.Indicator(
        mode="gauge+number",
        value=malicious_percent,
        number={'suffix': "%", 'font': {'size': 36, 'color': "#222"}},
        title={'text': "Procjena malicioznosti", 'font': {'size': 20, 'color': "#333"}},
        gauge={
            'axis': {'range': [0, 100], 'tickwidth': 1, 'tickcolor': "#999"},
            'bar': {'color': color},
            'steps': [
                {'range': [0, 40], 'color': "#a8e6a3"},
                {'range': [40, 70], 'color': "#ffe680"},
                {'range': [70, 100], 'color': "#ff9b9b"}
            ],
            'threshold': {
                'line': {'color': "#000", 'width': 4},
                'thickness': 0.8,
                'value': malicious_percent
            }
        }
    ))

    chart.update_layout(
        height=380,
        margin=dict(l=30, r=30, t=50, b=30),
        paper_bgcolor="#fafafa"
    )

    # Legend below the gauge explaining the three coloured bands.
    chart.add_annotation(
        text="🟢 Legitimno (0–40%) 🟡 Sumnjivo (40–70%) 🔴 Maliciozno (70–100%)",
        showarrow=False,
        xref="paper", yref="paper",
        x=0.5, y=-0.15, xanchor="center",
        font=dict(size=12, color="#444")
    )
    return chart


def _render_result_html(label, risk, color, hf_score, h_score, combined, sender, subject, action):
    """Render the result card; values taken from the e-mail are HTML-escaped."""
    # Sender and subject come straight from the analysed (untrusted) message,
    # so they must not be able to inject markup into the page.
    safe_sender = html.escape(str(sender))
    safe_subject = html.escape(str(subject))
    return f"""
    <div style="font-size:1.1em; line-height:1.6em;">
    <strong>🧪 Classification:</strong> {label}<br>
    <strong>⚠️ Risk Level:</strong> <span style='color:{color}; font-weight:bold;'>{risk}</span><br>
    <strong>🤖 HF Score:</strong> {hf_score:.2f}<br>
    <strong>🧠 Heuristic Score:</strong> {h_score:.2f}<br>
    <strong>🔀 Combined Score:</strong> {combined:.2f}<br>
    <strong>📧 Sender:</strong> {safe_sender}<br>
    <strong>📝 Subject:</strong> {safe_subject}<br>
    <strong>🛡️ Preporučena akcija:</strong> {action}
    </div>
    """


def classify_email(eml_file):
    """Analyse an uploaded .eml file with the NLP model and the heuristics.

    Args:
        eml_file: The uploaded file, either a filesystem path (``str``) or an
            object exposing the path as ``.name``; ``None`` when nothing was
            uploaded.

    Returns:
        A ``(result_html, chart, url_table)`` tuple. On any failure the first
        element is an error message and the other two are ``None``.
    """
    if eml_file is None:
        return "❗ Upload a valid .eml file.", None, None

    # Accept both a plain path and a wrapper object carrying the path in
    # ``.name`` (which form arrives presumably depends on the Gradio version
    # and the File component's ``type`` setting).
    eml_path = eml_file if isinstance(eml_file, str) else getattr(eml_file, "name", eml_file)

    try:
        with open(eml_path, "rb") as f:
            eml_bytes = f.read()
    except Exception as e:
        return f"❌ Failed to read uploaded file: {e}", None, None

    try:
        text, subject, sender, urls = extract_text_from_eml(eml_bytes)
    except Exception as e:
        # A malformed message should yield a readable error, not a crash.
        return f"❌ Failed to parse email: {e}", None, None

    if not text.strip():
        return "❗ Email body appears to be empty.", None, None

    try:
        # Only the first 4000 characters are sent to the model.
        hf_res = hf(text[:4000])[0]
    except Exception as e:
        return f"⚠️ Error running HF model: {e}", None, None

    hf_label = str(hf_res.get("label", "")).upper()
    hf_score = float(hf_res.get("score", 0.0))
    h_score = heuristic_score(text, urls)

    # NOTE(review): hf_score is presumably the confidence of whichever label
    # the model predicted, so a confident non-PHISH prediction also raises
    # `combined`. Confirm the model's label set before changing the weighting.
    combined = 0.6 * hf_score + 0.4 * h_score

    risk, label, color, action = _risk_verdict(
        hf_label, hf_score, h_score, combined, text.lower()
    )

    chart = _build_gauge(round(combined * 100, 1), color)
    result = _render_result_html(
        label, risk, color, hf_score, h_score, combined, sender, subject, action
    )

    if urls:
        url_table = pd.DataFrame({"Detected URLs": urls})
    else:
        url_table = pd.DataFrame({"Detected URLs": ["(none)"]})
    return result, chart, url_table

# ==============================================================
# 🔹 7. GRADIO FRONTEND
# ==============================================================

# Build the Gradio UI: upload row, result card, gauge and URL table.
with gr.Blocks(css="#output-box {border:1px solid #ddd;padding:20px;border-radius:10px;background:#fafafa;}") as app:
    # Page header.
    gr.Markdown("""
    <h1 style='text-align:center;'>🛡️ PhishGuard AI</h1>
    <p style='text-align:center;font-size:1.2em;color:#555;'>
    Upload a <code>.eml</code> email to analyze it using <strong>Hugging Face NLP</strong> + <strong>forensic heuristic scoring</strong>.
    </p>""")

    # Upload widget next to the trigger button. Column `scale` is meant to be
    # an integer, so the former 1 : 0.3 split is expressed as roughly 3 : 1.
    with gr.Row():
        with gr.Column(scale=3):
            eml_file = gr.File(label="📤 Upload Email (.eml)", file_types=[".eml"])
        with gr.Column(scale=1):
            analyze_btn = gr.Button("🔍 Analyze Email", size="lg")

    gr.Markdown("---")

    # Textual verdict on the left, gauge chart on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🧾 Analysis Result")
            output_md = gr.HTML(elem_id="output-box")
        with gr.Column():
            gr.Markdown("### 📊 Procjena malicioznosti")
            chart_output = gr.Plot()

    gr.Markdown("### 🌐 Detected URLs")
    url_table_output = gr.Dataframe(headers=["Detected URLs"], wrap=True)

    # classify_email returns (html, figure, dataframe) in this output order.
    analyze_btn.click(fn=classify_email, inputs=eml_file,
                      outputs=[output_md, chart_output, url_table_output])

# ==============================================================
# 🚀 8. POKRETANJE
# ==============================================================

# Launch the Gradio Blocks app; this runs whenever the module is executed.
app.launch()