#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PhishGuard AI

Author: Mejra Mujanović
Description: Advanced version of an application for detecting phishing and
spam e-mails by combining a Hugging Face NLP model with forensic heuristic
analysis.
"""

# ==============================================================
# 🔹 1. LIBRARY IMPORTS
# ==============================================================
import os
import re
from email import policy
from email.parser import BytesParser
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from bs4 import BeautifulSoup
from transformers import pipeline

# ==============================================================
# 🔹 2. MODEL CONFIGURATION
# ==============================================================
HF_MODEL = "MujMej/phishguard-ai"

# Optional Hugging Face Hub access token. An unset or empty variable leaves
# the value falsy, which is normalised to None (anonymous access) below.
token = os.getenv("HUGGINGFACE_HUB_TOKEN") or os.getenv("HF_TOKEN")

try:
    # The token is forwarded explicitly: the value was previously read but
    # never used, so a token supplied through HUGGINGFACE_HUB_TOKEN had no
    # effect on model loading.
    hf = pipeline(
        "text-classification",
        model=HF_MODEL,
        truncation=True,
        token=token or None,
    )
except Exception as e:
    # Chain the original exception so the root cause stays in the traceback.
    raise RuntimeError(f"❌ Greška prilikom učitavanja modela: {e}") from e

# ==============================================================
# 🔹 3. HEURISTIC RULES AND REGEX
# ==============================================================
# Matches http/https URLs up to the first whitespace or closing delimiter
# character: ')', '>', ']' or '"'.
URL_REGEX = re.compile(r'https?://[^\s)>\]"]+', re.IGNORECASE)

# Lower-case phrases treated as social-engineering cues.
CUES = [
    "urgent", "verify", "password", "reset", "account", "limited",
    "suspend", "confirm", "click", "login", "bank", "invoice",
    "unusual", "action required", "payment", "locked", "security alert",
    "update your information", "unauthorized", "verify identity",
    "immediately", "gift card", "invoice due", "credit", "paypal",
    "apple id", "unauthorized login"
]

# ==============================================================
# 🔹 4.
# EXTRACTING CONTENT FROM AN .eml FILE
# ==============================================================
def extract_text_from_eml(eml_bytes):
    """Parse a raw .eml message and return its text, subject, sender and URLs.

    Args:
        eml_bytes: Raw bytes of the .eml file. ``None`` or empty input
            produces empty results instead of raising.

    Returns:
        A 4-tuple ``(text, subject, sender, urls)``:

        * ``text`` - all ``text/plain`` parts joined by newlines, followed
          by the visible text of the ``text/html`` parts, stripped.
        * ``subject`` / ``sender`` - the ``Subject`` and ``From`` header
          values as plain strings (``""`` when the header is absent).
        * ``urls`` - sorted, de-duplicated list of http(s) URLs found in
          the text and in the ``href`` targets of HTML anchors.
    """
    if not eml_bytes:
        return "", "", "", []

    msg = BytesParser(policy=policy.default).parsebytes(eml_bytes)

    text_parts, html_parts = [], []
    for part in msg.walk():
        ctype = part.get_content_type()
        # Only textual bodies are used below; filtering first avoids
        # decoding multipart containers and binary attachments.
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            payload = part.get_content()
        except Exception:
            # Best effort: one undecodable part must not abort the parse.
            continue
        if payload is None:
            continue
        if ctype == "text/plain":
            text_parts.append(str(payload))
        else:
            html_parts.append(str(payload))

    text = "\n".join(text_parts)
    found_urls = set()

    if html_parts:
        try:
            soup = BeautifulSoup(" ".join(html_parts), "html.parser")
            html_text = soup.get_text(" ", strip=True)
            hrefs = [str(a["href"]) for a in soup.find_all("a", href=True)]
        except Exception:
            # Best effort: if the HTML cannot be processed, fall back to
            # the plain-text parts only.
            pass
        else:
            text += "\n" + html_text
            # get_text() drops link targets, so a link whose visible label
            # is not itself a URL would otherwise be missed entirely.
            for href in hrefs:
                found_urls.update(URL_REGEX.findall(href))

    found_urls.update(URL_REGEX.findall(text))
    urls = sorted(found_urls)

    subject = str(msg["Subject"] or "")
    sender = str(msg["From"] or "")
    return text.strip(), subject, sender, urls


# NOTE(review): the next line is the rest of the original source line, kept
# verbatim. It holds the start of heuristic_score, whose definition is
# truncated in this view, so it was not reformatted.
# ============================================================== # 🔹 5. NAPREDNA HEURISTIČKA ANALIZA (FORENZIČKA) # ============================================================== def heuristic_score(text, urls): """ Napredna forenzička heuristika: - prepoznaje ključne riječi - analizira domene, dužinu i broj URL-ova - otkriva HTML forme (phishing logine) - mjeri ponavljanje istih linkova """ score = 0.0 text_l = text.lower() # --- 1️⃣ Ključne riječi (socijalni inženjering) if any(k in text_l for k in CUES): score += 0.25 # --- 2️⃣ Broj i dužina URL-ova if len(urls) >= 3: score += 0.15 if len(urls) >= 5: score += 0.10 long_urls = [u for u in urls if len(u) > 100] if len(long_urls) > 0: score += 0.10 # --- 3️⃣ Analiza domena (sumnjive TLD ekstenzije) suspicious_tlds = (".ru", ".cn", ".tk", ".zip", ".xyz", ".top", ".gq", ".ml") if any(u.lower().endswith(suspicious_tlds) for u in urls): score += 0.15 # --- 4️⃣ Ako koristi skraćene linkove if any(short in u for u in urls for short in ["bit.ly", "tinyurl", "goo.gl", "short.ly", "t.co"]): score += 0.10 # --- 5️⃣ Ako su linkovi 
ponavljani (tracking pattern) if len(urls) > len(set(urls)): score += 0.10 # --- 6️⃣ Ako ima email adresu u tekstu i barem jedan URL if "@" in text_l and len(urls) > 0: score += 0.10 # --- 7️⃣ Ako sadrži HTML formu ili input polja if "