#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PhishGuard AI

Author: Mejra Mujanović
Description: Advanced version of an application for detecting phishing and
spam e-mails by combining a Hugging Face NLP model with forensic heuristic
analysis.
"""

# ==============================================================
# 🔹 1. LIBRARY IMPORTS
# ==============================================================
import os
import re
from email import policy
from email.parser import BytesParser
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from bs4 import BeautifulSoup
from transformers import pipeline

# ==============================================================
# 🔹 2. MODEL CONFIGURATION
# ==============================================================
HF_MODEL = "MujMej/phishguard-ai"

# Optional Hub access token: HUGGINGFACE_HUB_TOKEN wins, HF_TOKEN is the
# fallback. The value is None when neither variable is set (an empty first
# variable also falls through to the second).
token = os.getenv("HUGGINGFACE_HUB_TOKEN") or os.getenv("HF_TOKEN")

try:
    # Pass the token explicitly so the value read above is actually used when
    # the model files are fetched; token=None keeps the library default.
    hf = pipeline(
        "text-classification",
        model=HF_MODEL,
        truncation=True,
        token=token,
    )
except Exception as e:
    # Fail fast at import time with a readable message; chain the original
    # exception so the real cause stays in the traceback.
    raise RuntimeError(f"❌ Greška prilikom učitavanja modela: {e}") from e

# ==============================================================
# 🔹 3. HEURISTIC RULES AND REGEX
# ==============================================================
# Matches an http:// or https:// URL (case-insensitive) up to the first
# whitespace character or one of  ) > ] "
URL_REGEX = re.compile(r'https?://[^\s)>\]"]+', re.IGNORECASE)

# Lower-case words and phrases treated as social-engineering cues.
CUES = [
    "urgent",
    "verify",
    "password",
    "reset",
    "account",
    "limited",
    "suspend",
    "confirm",
    "click",
    "login",
    "bank",
    "invoice",
    "unusual",
    "action required",
    "payment",
    "locked",
    "security alert",
    "update your information",
    "unauthorized",
    "verify identity",
    "immediately",
    "gift card",
    "invoice due",
    "credit",
    "paypal",
    "apple id",
    "unauthorized login",
]

# ==============================================================
# 🔹 4.
# EXTRACTING CONTENT FROM AN .eml FILE
# ==============================================================
def extract_text_from_eml(eml_bytes):
    """Parse a raw .eml message and return its text, subject, sender and URLs.

    Args:
        eml_bytes: The raw bytes of the e-mail message.

    Returns:
        A ``(text, subject, sender, urls)`` tuple where

        * ``text`` is every ``text/plain`` part joined by newlines, followed
          by the visible text of the ``text/html`` parts, stripped of
          surrounding whitespace;
        * ``subject`` and ``sender`` are the ``Subject`` and ``From`` header
          values, or ``""`` when the header is absent;
        * ``urls`` is a sorted, de-duplicated list of http(s) URLs found in
          the text and in ``href`` / ``action`` attributes of the HTML parts.
    """
    msg = BytesParser(policy=policy.default).parsebytes(eml_bytes)

    text_parts, html_parts = [], []
    for part in msg.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            # Multipart containers and non-text attachments contribute
            # nothing here, so do not try to decode them at all.
            continue
        try:
            payload = part.get_content()
        except Exception:
            # Deliberately best-effort: the message is untrusted input, and a
            # single undecodable part must not abort the whole analysis.
            continue
        if payload is None:
            continue
        if ctype == "text/plain":
            text_parts.append(str(payload))
        else:
            html_parts.append(str(payload))

    text = "\n".join(text_parts)

    # Link targets that exist only inside the markup. get_text() drops
    # attribute values, so without this a link such as
    # <a href="http://...">Click here</a> would never be reported.
    markup_urls = []
    if html_parts:
        try:
            soup = BeautifulSoup(" ".join(html_parts), "html.parser")
            text += "\n" + soup.get_text(" ", strip=True)
            for tag in soup.find_all(True):
                for attr in ("href", "action"):
                    value = tag.get(attr)
                    if isinstance(value, str):
                        markup_urls.extend(URL_REGEX.findall(value))
        except Exception:
            # Best-effort as well: fall back to whatever text was collected
            # before the HTML could not be processed.
            pass

    urls = sorted(set(URL_REGEX.findall(text)) | set(markup_urls))
    subject = str(msg["Subject"] or "")
    sender = str(msg["From"] or "")
    return text.strip(), subject, sender, urls


# NOTE(review): the rest of this source line is kept verbatim below. It holds
# the opening of heuristic_score, whose remainder on the following line is
# truncated, so it is intentionally not reformatted or changed here.
# ============================================================== # 🔹 5. NAPREDNA HEURISTIČKA ANALIZA (FORENZIČKA) # ============================================================== def heuristic_score(text, urls): """ Napredna forenzička heuristika: - prepoznaje ključne riječi - analizira domene, dužinu i broj URL-ova - otkriva HTML forme (phishing logine) - mjeri ponavljanje istih linkova """ score = 0.0 text_l = text.lower() # --- 1️⃣ Ključne riječi (socijalni inženjering) if any(k in text_l for k in CUES): score += 0.25 # --- 2️⃣ Broj i dužina URL-ova if len(urls) >= 3: score += 0.15 if len(urls) >= 5: score += 0.10 long_urls = [u for u in urls if len(u) > 100] if len(long_urls) > 0: score += 0.10 # --- 3️⃣ Analiza domena (sumnjive TLD ekstenzije) suspicious_tlds = (".ru", ".cn", ".tk", ".zip", ".xyz", ".top", ".gq", ".ml") if any(u.lower().endswith(suspicious_tlds) for u in urls): score += 0.15 # --- 4️⃣ Ako koristi skraćene linkove if any(short in u for u in urls for short in ["bit.ly", "tinyurl", "goo.gl", "short.ly", "t.co"]): score += 0.10 # --- 5️⃣ Ako su linkovi 
ponavljani (tracking pattern) if len(urls) > len(set(urls)): score += 0.10 # --- 6️⃣ Ako ima email adresu u tekstu i barem jedan URL if "@" in text_l and len(urls) > 0: score += 0.10 # --- 7️⃣ Ako sadrži HTML formu ili input polja if "= 0.90 or h_score >= 0.85): risk, label, color, action = "Critical", "PHISH", "#FF0000", "🚨 Prijaviti incident i blokirati pošiljaoca" elif hf_label.startswith("PHISH") and (hf_score >= 0.75 or h_score >= 0.60): risk, label, color, action = "High", "PHISH", "#FF6600", "🚫 Ne otvarati linkove, blokirati pošiljaoca" elif "unsubscribe" in text.lower() or "newsletter" in text.lower() or (0.40 <= combined < 0.65): risk, label, color, action = "Medium", "SPAM", "#FFD700", "⚠️ Označiti kao SPAM, nije maliciozno" else: risk, label, color, action = "Low", "LEGIT", "#4CAF50", "✅ Legitimna poruka — bez akcije" # ============================================================== # 📊 Vizualni prikaz – Procjena malicioznosti # ============================================================== malicious_percent = round(combined * 100, 1) chart = go.Figure() chart.add_trace(go.Indicator( mode="gauge+number", value=malicious_percent, number={'suffix': "%", 'font': {'size': 36, 'color': "#222"}}, title={'text': "Procjena malicioznosti", 'font': {'size': 20, 'color': "#333"}}, gauge={ 'axis': {'range': [0, 100], 'tickwidth': 1, 'tickcolor': "#999"}, 'bar': {'color': color}, 'steps': [ {'range': [0, 40], 'color': "#a8e6a3"}, {'range': [40, 70], 'color': "#ffe680"}, {'range': [70, 100], 'color': "#ff9b9b"} ], 'threshold': { 'line': {'color': "#000", 'width': 4}, 'thickness': 0.8, 'value': malicious_percent } } )) chart.update_layout( height=380, margin=dict(l=30, r=30, t=50, b=30), paper_bgcolor="#fafafa" ) chart.add_annotation( text="🟢 Legitimno (0–40%) 🟡 Sumnjivo (40–70%) 🔴 Maliciozno (70–100%)", showarrow=False, xref="paper", yref="paper", x=0.5, y=-0.15, xanchor="center", font=dict(size=12, color="#444") ) # 
============================================================== # 📄 HTML ispis rezultata # ============================================================== result = f"""
🧪 Classification: {label}
⚠️ Risk Level: {risk}
🤖 HF Score: {hf_score:.2f}
🧠 Heuristic Score: {h_score:.2f}
🔀 Combined Score: {combined:.2f}
📧 Sender: {sender}
📝 Subject: {subject}
🛡️ Preporučena akcija: {action}
""" url_table = pd.DataFrame({"Detected URLs": urls}) if urls else pd.DataFrame({"Detected URLs": ["(none)"]}) return result, chart, url_table # ============================================================== # 🔹 7. GRADIO FRONTEND # ============================================================== with gr.Blocks(css="#output-box {border:1px solid #ddd;padding:20px;border-radius:10px;background:#fafafa;}") as app: gr.Markdown("""

🛡️ PhishGuard AI

Upload a .eml email to analyze it using Hugging Face NLP + forensic heuristic scoring.

""") with gr.Row(): with gr.Column(scale=1): eml_file = gr.File(label="📤 Upload Email (.eml)", file_types=[".eml"]) with gr.Column(scale=0.3): analyze_btn = gr.Button("🔍 Analyze Email", size="lg") gr.Markdown("---") with gr.Row(): with gr.Column(): gr.Markdown("### 🧾 Analysis Result") output_md = gr.HTML(elem_id="output-box") with gr.Column(): gr.Markdown("### 📊 Procjena malicioznosti") chart_output = gr.Plot() gr.Markdown("### 🌐 Detected URLs") url_table_output = gr.Dataframe(headers=["Detected URLs"], wrap=True) analyze_btn.click(fn=classify_email, inputs=eml_file, outputs=[output_md, chart_output, url_table_output]) # ============================================================== # 🚀 8. POKRETANJE # ============================================================== app.launch()