Sami2000 commited on
Commit
f3a3f04
·
verified ·
1 Parent(s): a66ce70

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +1503 -0
  2. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,1503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from typing import List, Dict, Any, TypedDict, Optional, Tuple
3
+ from dataclasses import dataclass
4
+ from datetime import datetime
5
+ import json
6
+ import ipaddress
7
+ import os
8
+ try:
9
+ import requests
10
+ except Exception:
11
+ requests = None
12
+
13
+ # Optional libraries
14
+ try:
15
+ from duckduckgo_search import DDGS
16
+ except Exception:
17
+ DDGS = None
18
+
19
+ try:
20
+ from PyPDF2 import PdfReader
21
+ except Exception:
22
+ PdfReader = None
23
+
24
+ try:
25
+ import docx
26
+ except Exception:
27
+ docx = None
28
+
29
+ try:
30
+ import olefile
31
+ except Exception:
32
+ olefile = None
33
+
34
+ try:
35
+ from mutagen import File as MutagenFile
36
+ except Exception:
37
+ MutagenFile = None
38
+
39
+ try:
40
+ from rapidfuzz import fuzz
41
+ except Exception:
42
+ fuzz = None
43
+
44
+ try:
45
+ import exifread
46
+ except Exception:
47
+ exifread = None
48
+
49
+ try:
50
+ import networkx as nx
51
+ except Exception:
52
+ nx = None
53
+
54
+ try:
55
+ from pyvis.network import Network
56
+ except Exception:
57
+ Network = None
58
+
59
+ try:
60
+ from sentence_transformers import SentenceTransformer
61
+ except Exception:
62
+ SentenceTransformer = None
63
+
64
+ try:
65
+ from jinja2 import Template
66
+ except Exception:
67
+ Template = None
68
+
69
+ # ---------------------------
70
+ # Config & Styles
71
+ # ---------------------------
72
+ st.set_page_config(page_title="OSINT Investigator", layout="wide")
73
+
74
+ HIDE_STREAMLIT_STYLE = """
75
+ <style>
76
+ #MainMenu {visibility: hidden;}
77
+ footer {visibility: hidden;}
78
+ .small {font-size: 0.85rem; color: #666}
79
+ code {white-space: pre-wrap;}
80
+ /* Floating Chat Styles */
81
+ .chat-window {position: fixed; bottom: 20px; right: 20px; width: 360px; max-height: 560px; background:#1c1c1c; border:1px solid #444; border-radius:14px; z-index:1000; display:flex; flex-direction:column; box-shadow:0 8px 24px rgba(0,0,0,.55);}
82
+ .chat-header {padding:8px 12px; display:flex; align-items:center; gap:8px; border-bottom:1px solid #333; background:#222; border-top-left-radius:14px; border-top-right-radius:14px;}
83
+ .chat-header .title {font-weight:600; color:#ffcc66;}
84
+ .chat-close {margin-left:auto; cursor:pointer; font-weight:700; color:#bbb;}
85
+ .chat-close:hover {color:#fff;}
86
+ .chat-messages {padding:10px 12px; overflow-y:auto; flex:1; font-size:0.8rem;}
87
+ .chat-messages p {margin:0 0 10px;}
88
+ .msg-user {color:#fff;}
89
+ .msg-bot {color:#ffcc66; font-style:italic;}
90
+ .chat-input {padding:8px 10px; border-top:1px solid #333; background:#181818; border-bottom-left-radius:14px; border-bottom-right-radius:14px;}
91
+ .chat-input textarea {font-size:0.75rem !important;}
92
+ .badge-action {display:inline-block; background:#333; color:#ffcc66; padding:2px 6px; margin:2px 4px 6px 0; border-radius:6px; font-size:0.6rem; cursor:pointer;}
93
+ .badge-action:hover {background:#444;}
94
+ .chat-mini-btn {position:fixed; bottom:20px; right:20px; width:62px; height:62px; border-radius:50%; background:#222; border:2px solid #ffcc66; display:flex; align-items:center; justify-content:center; font-size:30px; cursor:pointer; z-index:999; box-shadow:0 0 8px rgba(0,0,0,.6);}
95
+ .chat-mini-btn:hover {background:#333;}
96
+ /* App Enhancements */
97
+ .app-brand-bar {display:flex; align-items:center; gap:14px; padding:8px 18px 4px 8px; border-bottom:1px solid #262626; margin:-1rem -1rem 1.2rem -1rem; background:linear-gradient(90deg,#141414,#181818);}
98
+ .app-brand-title {font-size:1.35rem; font-weight:600; letter-spacing:.5px; color:#ffcc66;}
99
+ .app-badge {display:inline-block; padding:2px 8px; border-radius:12px; font-size:0.65rem; font-weight:600; text-transform:uppercase; letter-spacing:.5px; margin-right:6px; background:#222; border:1px solid #333; color:#bbb;}
100
+ .level-high {background:#11391f; border-color:#1f6d3b; color:#3ddc84;}
101
+ .level-medium {background:#3a2e12; border-color:#72581a; color:#ffcf66;}
102
+ .level-low {background:#3a1616; border-color:#7a2727; color:#ff6b6b;}
103
+ .metric-row {margin-top:.4rem;}
104
+ .stDataFrame {border:1px solid #262626; border-radius:10px; overflow:hidden;}
105
+ .styled-section {background:#141414; border:1px solid #2a2a2a; padding:1rem 1.2rem; border-radius:14px; box-shadow:0 0 0 1px #111 inset, 0 4px 18px -8px #000;}
106
+ .kpi-grid div[data-testid='metric-container'] {background:#181818; border:1px solid #262626; border-radius:12px; padding:.75rem;}
107
+ .kpi-grid div[data-testid='stMetric'] {padding:.25rem .5rem .35rem .5rem;}
108
+ .plan-expander summary {font-weight:600; letter-spacing:.5px;}
109
+ .report-btn button {background:#ffcc66 !important; color:#111 !important; font-weight:600;}
110
+ .stDownloadButton button {border-radius:10px;}
111
+ .stTextInput input, .stTextArea textarea {border-radius:10px !important;}
112
+ .stTabs [data-baseweb='tab-list'] {gap:4px;}
113
+ .stTabs [data-baseweb='tab'] {background:#161616; padding:.5rem .9rem; border-radius:10px; border:1px solid #262626;}
114
+ .stTabs [data-baseweb='tab']:hover {background:#1d1d1d;}
115
+ .stTabs [aria-selected='true'] {background:#222 !important; border-color:#444 !important;}
116
+ .section-title {font-size:1.05rem; font-weight:600; letter-spacing:.5px; margin-bottom:.35rem;}
117
+ .sticky-toolbar {position:sticky; top:0; z-index:50; background:linear-gradient(90deg,#181818,#141414); padding:.4rem .6rem; border:1px solid #262626; border-radius:10px; margin-bottom:.6rem; box-shadow:0 6px 12px -8px rgba(0,0,0,.6);}
118
+ .sticky-toolbar button {margin-right:.35rem;}
119
+ .score-table {width:100%; border-collapse:collapse; font-size:0.75rem;}
120
+ .score-table th {text-align:left; padding:6px 8px; background:#202020; position:sticky; top:0; z-index:2;}
121
+ .score-table td {padding:6px 8px; border-top:1px solid #262626; vertical-align:top;}
122
+ .badge {display:inline-block; padding:2px 7px; border-radius:10px; font-size:0.6rem; font-weight:600; letter-spacing:.5px;}
123
+ .badge.high {background:#11391f; color:#3ddc84;}
124
+ .badge.medium {background:#3a2e12; color:#ffcf66;}
125
+ .badge.low {background:#3a1616; color:#ff6b6b;}
126
+ .methodology-box {background:#141414; border:1px solid #262626; padding:.8rem 1rem; border-radius:12px; font-size:0.8rem; line-height:1.25rem;}
127
+ body.light-mode, .light-mode [data-testid='stAppViewContainer'] {background:#f6f7f9; color:#222;}
128
+ .light-mode .app-brand-bar {background:linear-gradient(90deg,#fafafa,#eceff1); border-color:#d8dadd;}
129
+ .light-mode .app-brand-title {color:#7a4d00;}
130
+ .light-mode .app-badge {background:#fff; border-color:#d1d4d8; color:#555;}
131
+ .light-mode .sticky-toolbar {background:linear-gradient(90deg,#fff,#f3f5f7); border-color:#d8dade;}
132
+ .light-mode .score-table th {background:#eceff1;}
133
+ .light-mode .score-table td {border-color:#d9dde1;}
134
+ .light-mode .badge.high {background:#d8f5e6; color:#0d7a3d;}
135
+ .light-mode .badge.medium {background:#fbeccb; color:#8a6500;}
136
+ .light-mode .badge.low {background:#fbd5d5; color:#b80000;}
137
+ .light-mode .stTabs [data-baseweb='tab'] {background:#f5f6f7; border-color:#d9dde1;}
138
+ .light-mode .stTabs [aria-selected='true'] {background:#ffffff !important; border-color:#b9bdc1 !important;}
139
+ /* Skeleton / Shimmer */
140
+ @keyframes shimmer {0% {transform:translateX(-60%);} 100% {transform:translateX(120%);} }
141
+ .skeleton-block {position:relative; overflow:hidden; background:#1e1e1e; border-radius:6px; margin:4px 0;}
142
+ .skeleton-block.light-mode {background:#e2e5e9;}
143
+ .skeleton-block::after {content:""; position:absolute; top:0; left:0; height:100%; width:50%; background:linear-gradient(90deg, rgba(255,255,255,0), rgba(255,255,255,.15), rgba(255,255,255,0)); animation:shimmer 1.25s infinite;}
144
+ .sk-line-sm {height:10px;}
145
+ .sk-line-md {height:14px;}
146
+ .sk-line-lg {height:22px;}
147
+ .sk-fade {animation:fadeIn .3s ease-in;}
148
+ @keyframes fadeIn {from {opacity:0;} to {opacity:1;}}
149
+ </style>
150
+ """
151
+
152
+ st.markdown(HIDE_STREAMLIT_STYLE, unsafe_allow_html=True)
153
+ st.markdown("""
154
+ <head>
155
+ <link rel='icon' type='image/svg+xml' href="data:image/svg+xml;utf8,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 64 64'><circle fill='%23111' cx='32' cy='32' r='32'/><path fill='%23ffcc66' d='M12 38l4-14h32l4 14H12zm8 4h24c0 6-6 10-12 10s-12-4-12-10zM24 18c0-4 4-8 8-8s8 4 8 8v4H24v-4z'/></svg>">
156
+ <meta name='description' content='OSINT Investigator Suite - AI-augmented open source intelligence enumeration & scoring platform.'>
157
+ <meta name='viewport' content='width=device-width, initial-scale=1'>
158
+ </head>
159
+ """, unsafe_allow_html=True)
160
+ if st.session_state.get("settings", {}).get("light_mode"):
161
+ st.markdown("""<script>const b=window.parent.document.querySelector('body'); if(b&&!b.classList.contains('light-mode')) b.classList.add('light-mode');</script>""", unsafe_allow_html=True)
162
+ else:
163
+ st.markdown("""<script>const b=window.parent.document.querySelector('body'); if(b) b.classList.remove('light-mode');</script>""", unsafe_allow_html=True)
164
+
165
+ # ---------------------------
166
+ # Sidebar: Settings
167
+ # ---------------------------
168
+ def _get_settings() -> Dict[str, Any]:
169
+ with st.sidebar:
170
+ st.header("Settings")
171
+ model = st.selectbox(
172
+ "Advisor model (CPU-friendly)",
173
+ [
174
+ "qwen2.5-1.5b-instruct",
175
+ "phi-3-mini-4k-instruct",
176
+ "gemma-2-2b-it",
177
+ ],
178
+ index=0,
179
+ key="advisor_model_select",
180
+ help="Choose which free local LLM to use for advisor suggestions."
181
+ )
182
+ max_per = st.slider(
183
+ "Default max results per dork",
184
+ min_value=3,
185
+ max_value=50,
186
+ value=10,
187
+ step=1,
188
+ key="default_max_results",
189
+ help="Used as the default when executing dorks in Step 4."
190
+ )
191
+ logging = st.checkbox(
192
+ "Enable audit logging",
193
+ value=True,
194
+ key="enable_audit_logging",
195
+ help="If off, actions won't be written to the audit trail."
196
+ )
197
+ use_embeddings = st.checkbox(
198
+ "Enable semantic similarity (embeddings)",
199
+ value=False,
200
+ key="enable_embeddings",
201
+ help="Loads a small sentence-transformer to boost scoring by context relevance."
202
+ )
203
+ light_mode = st.checkbox(
204
+ "Light mode UI override",
205
+ value=False,
206
+ key="light_mode_toggle",
207
+ help="Apply a lighter palette without reloading base theme"
208
+ )
209
+ return {"model": model, "max_per": max_per, "logging": logging, "light_mode": light_mode}
210
+
211
+ SETTINGS = _get_settings()
212
+ st.session_state["settings"] = SETTINGS
213
+ st.session_state.setdefault("_embed_model", None)
214
+
215
+ # ---------------------------
216
+ # Google Dorks (typed catalog for many entities)
217
+ # ---------------------------
218
+ class TypedDork(TypedDict):
219
+ q: str
220
+ type: str
221
+ why: str
222
+
223
+ # Dork category glossary (shown in explainer)
224
+ DORK_TYPES: Dict[str, str] = {
225
+ "Footprinting": "Map surface area: sites/subdomains, logins, admin panels, basic presence.",
226
+ "Directory/Index": "Hunt for open listings or auto-generated indexes exposing files.",
227
+ "Docs/Collab": "Live docs/boards accidentally exposed (docs.google, Trello, etc.).",
228
+ "Code/Repo": "Public repos that may contain references, issues, or credentials.",
229
+ "Credentials/Secrets": "Clues that hint at passwords/keys or places leaks may exist.",
230
+ "Exposure/Leak": "Mentions of breaches, leaks, or dumps involving the entity.",
231
+ "People/Profiles": "Official bios, resumes/CVs, speaker pages, researcher profiles.",
232
+ "Social Activity": "Usernames/handles across social and developer communities.",
233
+ "Regulatory/Legal": "Filings and official records (e.g., SEC/EDGAR).",
234
+ "Incidents/Risk": "Incident reports, outages, protests, negative events.",
235
+ "Academic/Research": "Scholarly/technical works tied to a name or org.",
236
+ }
237
+
238
+ # ---- Typed dork builders ----
239
+
240
+ def typed_dorks_for_email(email: str) -> List[TypedDork]:
241
+ user, dom = (email.split("@", 1) + [""])[:2]
242
+ return [
243
+ {"q": f'"{email}"', "type": "Footprinting", "why": "Exact email mentions across the web."},
244
+ {"q": f'intext:"{email}"', "type": "Footprinting", "why": "Mentions inside page bodies."},
245
+ {"q": f'intext:"{user}" intext:"{dom}"', "type": "Footprinting", "why": "Mentions with split user/domain."},
246
+ {"q": f'site:{dom} intext:"@{dom}"', "type": "Footprinting", "why": "Emails published on the same domain."},
247
+ {"q": f'"{email}" filetype:pdf OR filetype:doc OR filetype:xls OR filetype:csv', "type": "Docs/Collab", "why": "Docs that may expose PII/roles."},
248
+ {"q": f'"{email}" site:github.com OR site:gitlab.com', "type": "Code/Repo", "why": "Commits/issues referencing the email."},
249
+ {"q": f'"{email}" site:gravatar.com', "type": "People/Profiles", "why": "Avatar/profile tied to the email hash."},
250
+ {"q": f'"{email}" site:pastebin.com OR site:ghostbin.com OR site:hastebin.com', "type": "Exposure/Leak", "why": "Common paste sites for leaks."},
251
+ {"q": f'"{email}" inurl:wp- OR inurl:wp-content OR inurl:wp-config', "type": "Directory/Index", "why": "WordPress artifacts sometimes leak emails."},
252
+ {"q": f'"{email}" AROUND(3) "password"', "type": "Credentials/Secrets", "why": "Heuristic for password-adjacent mentions."},
253
+ ]
254
+
255
+
256
+ def typed_dorks_for_domain(d: str) -> List[TypedDork]:
257
+ return [
258
+ {"q": f"site:{d} -www", "type": "Footprinting", "why": "Apex domain excluding www."},
259
+ {"q": f"site:*.{d} -www", "type": "Footprinting", "why": "Enumerate subdomains exposed to crawlers."},
260
+ {"q": f'"@{d}"', "type": "Footprinting", "why": "Emails belonging to the domain across the web."},
261
+ {"q": f'site:linkedin.com "{d}"', "type": "People/Profiles", "why": "Employees listing org domain."},
262
+ {"q": f'site:github.com "{d}"', "type": "Code/Repo", "why": "Repositories/issues referencing the domain."},
263
+ {"q": f'site:gitlab.com "{d}"', "type": "Code/Repo", "why": "Alternate forge often used by teams."},
264
+ {"q": f'site:docs.google.com "{d}"', "type": "Docs/Collab", "why": "Potentially exposed Google Docs/Sheets/Slides."},
265
+ {"q": f'site:trello.com "{d}"', "type": "Docs/Collab", "why": "Public Trello boards occasionally misconfigured."},
266
+ {"q": f'"{d}" filetype:pdf OR filetype:doc OR filetype:xls OR filetype:ppt OR filetype:csv', "type": "Docs/Collab", "why": "Documents with the org name/domain."},
267
+ {"q": f"site:{d} inurl:login OR inurl:admin OR inurl:signup", "type": "Footprinting", "why": "Auth surfaces (discovery only)."},
268
+ {"q": f'site:{d} intitle:"index of"', "type": "Directory/Index", "why": "Open directory listings on that domain."},
269
+ {"q": f"site:{d} ext:env OR ext:.git OR ext:git-credentials OR ext:sql OR ext:log", "type": "Credentials/Secrets", "why": "Common secret-bearing file extensions."},
270
+ {"q": f'"{d}" breach OR leak OR "data exposure"', "type": "Exposure/Leak", "why": "Press and trackers mentioning exposures."},
271
+ ]
272
+
273
+
274
+ def typed_dorks_for_ip(ip: str) -> List[TypedDork]:
275
+ return [
276
+ {"q": f'"{ip}"', "type": "Footprinting", "why": "Places where the raw IP is printed or logged."},
277
+ {"q": f'intext:"{ip}"', "type": "Footprinting", "why": "Body text mentions (forums, logs)."},
278
+ {"q": f'"{ip}" filetype:log OR filetype:txt', "type": "Directory/Index", "why": "Exposed logs referencing the IP."},
279
+ {"q": f'"{ip}" blacklist OR abuse', "type": "Incidents/Risk", "why": "Blacklist/abuse mentions and reports."},
280
+ {"q": f'"{ip}" intitle:"index of"', "type": "Directory/Index", "why": "Open indexes listing files with that IP."},
281
+ ]
282
+
283
+
284
+ def typed_dorks_for_username(u: str) -> List[TypedDork]:
285
+ return [
286
+ {"q": f'"{u}"', "type": "Footprinting", "why": "Exact handle mentions across the web."},
287
+ {"q": f'"{u}" site:twitter.com OR site:x.com OR site:reddit.com OR site:github.com OR site:stackexchange.com', "type": "Social Activity", "why": "Find consistent identity across major platforms."},
288
+ {"q": f'"{u}" site:medium.com OR site:substack.com', "type": "People/Profiles", "why": "Author pages tied to the handle."},
289
+ {"q": f'"{u}" site:keybase.io', "type": "People/Profiles", "why": "Cryptographic identity/proofs."},
290
+ {"q": f'"{u}" inurl:users OR inurl:profile', "type": "Footprinting", "why": "Generic user profile URLs."},
291
+ {"q": f'"{u}" filetype:pdf resume OR "curriculum vitae"', "type": "People/Profiles", "why": "CVs/resumes listing the handle."},
292
+ {"q": f'"{u}" AROUND(3) email', "type": "People/Profiles", "why": "Correlate handle to emails in bios/posts."},
293
+ {"q": f'"{u}" avatar OR "profile photo"', "type": "People/Profiles", "why": "Images tied to the identity."},
294
+ ]
295
+
296
+
297
+ def typed_dorks_for_person(name: str) -> List[TypedDork]:
298
+ return [
299
+ {"q": f'"{name}"', "type": "Footprinting", "why": "Exact full-name mentions."},
300
+ {"q": f'"{name}" site:linkedin.com', "type": "People/Profiles", "why": "Primary professional profile."},
301
+ {"q": f'"{name}" filetype:pdf resume OR "curriculum vitae"', "type": "People/Profiles", "why": "Resume/CV documents."},
302
+ {"q": f'"{name}" conference OR talk OR keynote', "type": "People/Profiles", "why": "Speaker bios and conference pages."},
303
+ {"q": f'"{name}" site:github.com OR site:gitlab.com', "type": "Code/Repo", "why": "Developer activity tied to the name."},
304
+ {"q": f'"{name}" site:researchgate.net OR site:scholar.google.com', "type": "Academic/Research", "why": "Scholarly output."},
305
+ {"q": f'"{name}" site:medium.com OR site:substack.com', "type": "People/Profiles", "why": "Editorial/social writing."},
306
+ {"q": f'"{name}" "email" OR "contact"', "type": "People/Profiles", "why": "Pages listing contact info."},
307
+ ]
308
+
309
+
310
+ def typed_dorks_for_org(org: str) -> List[TypedDork]:
311
+ return [
312
+ {"q": f'"{org}" site:sec.gov OR site:edgar', "type": "Regulatory/Legal", "why": "Official SEC/EDGAR filings."},
313
+ {"q": f'"{org}" contract award OR RFP OR "sources sought"', "type": "Regulatory/Legal", "why": "Gov procurement history and notices."},
314
+ {"q": f'"{org}" breach OR incident OR "data exposure"', "type": "Incidents/Risk", "why": "News/trackers about incidents/leaks."},
315
+ {"q": f'"{org}" site:linkedin.com', "type": "People/Profiles", "why": "Employees and org page."},
316
+ {"q": f'"{org}" site:github.com OR site:gitlab.com', "type": "Code/Repo", "why": "Public repos under org name."},
317
+ {"q": f'"{org}" filetype:pdf OR filetype:doc OR filetype:ppt OR filetype:xls', "type": "Docs/Collab", "why": "Documents carrying org name."},
318
+ {"q": f'"{org}" site:docs.google.com OR site:trello.com', "type": "Docs/Collab", "why": "Potentially exposed docs/boards."},
319
+ ]
320
+
321
+
322
+ def typed_dorks_for_location(loc: str) -> List[TypedDork]:
323
+ return [
324
+ {"q": f'"{loc}" incident OR protest OR outage', "type": "Incidents/Risk", "why": "Events/incidents tied to the place."},
325
+ {"q": f'"{loc}" satellite imagery OR "before after"', "type": "Footprinting", "why": "Imagery context for geospatial checks."},
326
+ {"q": f'"{loc}" site:news', "type": "Incidents/Risk", "why": "Recent news mentions for the place."},
327
+ {"q": f'"{loc}" filetype:pdf report', "type": "Docs/Collab", "why": "Reports that reference the location."},
328
+ ]
329
+
330
+
331
+ def typed_dorks_for_file(desc: str) -> List[TypedDork]:
332
+ return [
333
+ {"q": f'"{desc}" filetype:pdf OR filetype:doc OR filetype:xls OR filetype:ppt OR filetype:csv', "type": "Docs/Collab", "why": "Document hunting by keyword."},
334
+ {"q": f'"{desc}" site:archive.org', "type": "Docs/Collab", "why": "Wayback/Archive artifacts."},
335
+ {"q": f'"{desc}" intitle:"index of"', "type": "Directory/Index", "why": "Open listings that may contain files."},
336
+ ]
337
+
338
+ TYPED_DORK_MAP: Dict[str, Any] = {
339
+ "Email Address": typed_dorks_for_email,
340
+ "Domain / Website": typed_dorks_for_domain,
341
+ "IP Address": typed_dorks_for_ip,
342
+ "Username / Handle": typed_dorks_for_username,
343
+ "Named Individual": typed_dorks_for_person,
344
+ "Organization / Company": typed_dorks_for_org,
345
+ "Location": typed_dorks_for_location,
346
+ "File / Image": typed_dorks_for_file,
347
+ }
348
+
349
+ # ---------------------------
350
+ # STEP 1: Explainer
351
+ # ---------------------------
352
+ def render_dorks_explainer(entity_type: str, entity_value: str):
353
+ st.subheader("Step 1: Dork Explainer")
354
+ st.caption("These are categorized OSINT search operators. Copy/paste into Google if you like; this app automates via DuckDuckGo to respect ToS.")
355
+ with st.expander("Dork categories explained", expanded=False):
356
+ for t, desc in DORK_TYPES.items():
357
+ st.markdown(f"**{t}** — {desc}")
358
+
359
+ builder = TYPED_DORK_MAP.get(entity_type)
360
+ typed = builder(entity_value) if (builder and entity_value) else []
361
+ if not typed:
362
+ st.info("Enter an entity value above to see a tailored catalog.")
363
+ return
364
+ for d in typed:
365
+ st.markdown(f"- **[{d['type']}]** `{d['q']}`")
366
+ st.markdown(f" <span class='small'>{d['why']}</span>", unsafe_allow_html=True)
367
+
368
+ # ---------------------------
369
+ # STEP 2: Advisor (LLM-powered with rules fallback)
370
+ # ---------------------------
371
+
372
+ # Goal weights for rules-based fallback / blending
373
+ GOAL_WEIGHTS: Dict[str, Dict[str, int]] = {
374
+ "Map footprint / surface": {"Footprinting": 3, "Directory/Index": 2},
375
+ "Find documents & spreadsheets": {"Docs/Collab": 3, "Directory/Index": 2},
376
+ "Discover code & credentials": {"Code/Repo": 3, "Credentials/Secrets": 3, "Directory/Index": 2},
377
+ "Identify breaches/leaks": {"Exposure/Leak": 3, "Credentials/Secrets": 2},
378
+ "Find people & org info": {"People/Profiles": 3, "Regulatory/Legal": 2},
379
+ "Track incidents / risk": {"Incidents/Risk": 3},
380
+ "Academic/technical trails": {"Academic/Research": 3},
381
+ }
382
+ DEFAULT_GOALS = list(GOAL_WEIGHTS.keys())
383
+
384
+ MODEL_ID_MAP = {
385
+ "qwen2.5-1.5b-instruct": "Qwen/Qwen2.5-1.5B-Instruct",
386
+ "phi-3-mini-4k-instruct": "microsoft/phi-3-mini-4k-instruct",
387
+ "gemma-2-2b-it": "google/gemma-2-2b-it",
388
+ }
389
+
390
+ # ---------------------------
391
+ # Known Facts Model
392
+ # ---------------------------
393
+ @dataclass
394
+ class KnownFacts:
395
+ handles: List[str]
396
+ real_names: List[str]
397
+ emails: List[str]
398
+ domains: List[str]
399
+ ips: List[str]
400
+ locations: List[str]
401
+ orgs: List[str]
402
+ context: str
403
+
404
+ @classmethod
405
+ def from_session(cls) -> "KnownFacts":
406
+ return st.session_state.get("known_facts") or cls([], [], [], [], [], [], [], "")
407
+
408
+ def _parse_csv(s: str) -> List[str]:
409
+ return [x.strip() for x in (s or "").split(",") if x.strip()]
410
+
411
+ def _known_facts_ui():
412
+ st.subheader("Known Facts / Prior Intelligence")
413
+ st.caption("Provide what you already know. This seeds scoring & generation.")
414
+ col_a, col_b, col_c = st.columns(3)
415
+ with col_a:
416
+ handles = st.text_area("Handles / Usernames (comma)", key="kf_handles", height=70)
417
+ emails = st.text_area("Emails (comma)", key="kf_emails", height=70)
418
+ ips = st.text_area("IP addresses (comma)", key="kf_ips", height=70)
419
+ with col_b:
420
+ real_names = st.text_area("Real Names (comma)", key="kf_real_names", height=70, help="Full names or key name variants")
421
+ domains = st.text_area("Domains (comma)", key="kf_domains", height=70)
422
+ orgs = st.text_area("Organizations (comma)", key="kf_orgs", height=70)
423
+ with col_c:
424
+ locations = st.text_area("Locations (comma)", key="kf_locations", height=70)
425
+ context = st.text_area("Context / Keywords", key="kf_context", height=160, help="Free-text mission context, tech stack, roles, etc.")
426
+ if st.button("Save Known Facts", key="btn_save_facts"):
427
+ facts = KnownFacts(
428
+ handles=_parse_csv(handles),
429
+ real_names=_parse_csv(real_names),
430
+ emails=_parse_csv(emails),
431
+ domains=_parse_csv(domains),
432
+ ips=_parse_csv(ips),
433
+ locations=_parse_csv(locations),
434
+ orgs=_parse_csv(orgs),
435
+ context=context.strip(),
436
+ )
437
+ st.session_state["known_facts"] = facts
438
+ st.success("Facts saved (session only).")
439
+ facts = KnownFacts.from_session()
440
+ st.markdown(f"**Current facts loaded:** {len(facts.handles)} handles, {len(facts.emails)} emails, {len(facts.domains)} domains, {len(facts.real_names)} names.")
441
+ st.markdown("---")
442
+ st.markdown("### Candidate Generation")
443
+ st.caption("Generate permutations / derived candidates from known facts.")
444
+ if st.button("Generate Candidates", key="btn_gen_candidates"):
445
+ facts = KnownFacts.from_session()
446
+ usernames = set(facts.handles)
447
+ # simple mutations
448
+ for h in list(usernames):
449
+ for suf in ["123", "01", "_sec", "_research", "-dev"]:
450
+ usernames.add(h + suf)
451
+ if h.isalpha():
452
+ usernames.add(h + "1")
453
+ # email permutations (if have names + domains)
454
+ emails = set(facts.emails)
455
+ if facts.real_names and facts.domains:
456
+ first = facts.real_names[0].split()[0].lower()
457
+ last = facts.real_names[0].split()[-1].lower()
458
+ for d in facts.domains[:3]:
459
+ emails.update({
460
+ f"{first}.{last}@{d}",
461
+ f"{first}{last}@{d}",
462
+ f"{first[0]}{last}@{d}",
463
+ f"{first}_{last}@{d}",
464
+ })
465
+ # domain variants (very light)
466
+ dom_vars = set(facts.domains)
467
+ for d in facts.domains:
468
+ if d.count('.') >= 1:
469
+ root = d.split('.')[0]
470
+ tld = d.split('.')[-1]
471
+ dom_vars.add(root + "-dev." + tld)
472
+ dom_vars.add(root + "-staging." + tld)
473
+ st.session_state["generated_candidates"] = {
474
+ "usernames": sorted(list(usernames))[:100],
475
+ "emails": sorted(list(emails))[:100],
476
+ "domains": sorted(list(dom_vars))[:100]
477
+ }
478
+ st.success("Candidates generated.")
479
+ cand = st.session_state.get("generated_candidates")
480
+ if cand:
481
+ st.write("Usernames (sample)", cand["usernames"][:10])
482
+ st.write("Emails (sample)", cand["emails"][:10])
483
+ st.write("Domains (sample)", cand["domains"][:10])
484
+ if st.button("Add All Candidates to Facts", key="btn_add_cand"):
485
+ facts = KnownFacts.from_session()
486
+ facts.handles = sorted(list(set(facts.handles + cand["usernames"])))
487
+ facts.emails = sorted(list(set(facts.emails + cand["emails"])))
488
+ facts.domains = sorted(list(set(facts.domains + cand["domains"])))
489
+ st.session_state["known_facts"] = facts
490
+ st.success("Candidates merged into facts.")
491
+
492
+ def _generate_investigation_plan(entity_type: str, entity_value: str, facts: KnownFacts) -> Dict[str, Any]:
493
+ """Produce a structured investigation plan based on current facts and target type."""
494
+ objectives = [
495
+ "Establish definitive identifiers (emails, handles, domains) to anchor pivots",
496
+ "Map exposed surface (sites, code, documents, credentials indicators)",
497
+ "Correlate identities across platforms and artifacts",
498
+ "Identify signs of exposure, breach, or sensitive data leakage",
499
+ "Prioritize high-confidence findings for deeper manual review"
500
+ ]
501
+ # Gap analysis
502
+ gaps = []
503
+ if not facts.emails: gaps.append("No confirmed email addresses")
504
+ if not facts.handles: gaps.append("No social/developer handles")
505
+ if not facts.domains and entity_type != "Domain / Website": gaps.append("No related domains captured")
506
+ if not facts.real_names and entity_type in ("Named Individual", "Organization / Company"): gaps.append("No individual name variants")
507
+ if not facts.orgs and entity_type == "Named Individual": gaps.append("No employing organizations")
508
+ if not facts.context: gaps.append("Context / mission keywords empty (reduces scoring nuance)")
509
+ if not gaps: gaps = ["Current fact set sufficient for first enumeration pass"]
510
+
511
+ # Phase recommendations
512
+ phases: List[Dict[str, Any]] = []
513
+ phases.append({
514
+ "phase": "Phase 1 - Baseline & Fact Hardening",
515
+ "goals": ["Normalize entity value", "Collect canonical facts", "Note obvious pivots"],
516
+ "actions": [
517
+ "Record primary identifier in Known Facts",
518
+ "Add any immediately known emails, domains, handles",
519
+ "Capture mission / context keywords (tech stack, industry, roles)",
520
+ "Run Advisor for broad Footprinting and People queries"
521
+ ]
522
+ })
523
+ phases.append({
524
+ "phase": "Phase 2 - Surface Enumeration",
525
+ "goals": ["Map public assets", "Discover documents & code"],
526
+ "actions": [
527
+ "Select dorks: site:, filetype:, intitle:'index of' variations",
528
+ "Enumerate repo references (GitHub/GitLab) and note unique strings",
529
+ "Pull down high-signal docs (PDF/DOCX) and extract metadata for hidden emails/handles"
530
+ ]
531
+ })
532
+ phases.append({
533
+ "phase": "Phase 3 - Identity Correlation",
534
+ "goals": ["Link handles to emails", "Find cross-platform reuse"],
535
+ "actions": [
536
+ "Search handles with platform-specific queries (social + developer)",
537
+ "Leverage resume / CV / speaker page dorks for name-email alignment",
538
+ "Add newly confirmed identifiers back into Known Facts and re-score"
539
+ ]
540
+ })
541
+ phases.append({
542
+ "phase": "Phase 4 - Exposure & Risk Signals",
543
+ "goals": ["Detect leak indicators", "Prioritize potential sensitive exposure"],
544
+ "actions": [
545
+ "Run leak / breach / paste oriented dorks including credential keywords",
546
+ "Inspect any pastebin / gist / artifact snippets for policy or secret references",
547
+ "Flag findings with multiple co-occurring identifiers for manual escalation"
548
+ ]
549
+ })
550
+ phases.append({
551
+ "phase": "Phase 5 - Consolidation & Reporting",
552
+ "goals": ["Score & rank findings", "Produce exportable report"],
553
+ "actions": [
554
+ "Re-score after final fact enrichment",
555
+ "Visualize graph to ensure high-score nodes connect multiple anchors",
556
+ "Export HTML report and retain audit log",
557
+ "Document residual gaps & next potential pivots (e.g., historical archives, certificate transparency)"
558
+ ]
559
+ })
560
+ return {
561
+ "entity_type": entity_type,
562
+ "entity_value": entity_value,
563
+ "objectives": objectives,
564
+ "gaps": gaps,
565
+ "phases": phases,
566
+ "facts_snapshot": facts.__dict__,
567
+ }
568
+
569
+ def render_investigation_plan(entity_type: str, entity_value: str):
570
+ st.subheader("Investigation Plan")
571
+ facts = KnownFacts.from_session()
572
+ plan = _generate_investigation_plan(entity_type, entity_value, facts)
573
+ st.markdown("### Core Objectives")
574
+ for o in plan["objectives"]:
575
+ st.markdown(f"- {o}")
576
+ st.markdown("### Current Gaps")
577
+ for g in plan["gaps"]:
578
+ st.markdown(f"- {g}")
579
+ st.markdown("### Phased Approach")
580
+ for ph in plan["phases"]:
581
+ with st.expander(ph["phase"], expanded=False):
582
+ st.markdown("**Goals**")
583
+ for g in ph["goals"]:
584
+ st.markdown(f"- {g}")
585
+ st.markdown("**Actions**")
586
+ for a in ph["actions"]:
587
+ st.markdown(f"- {a}")
588
+ if st.button("Export Plan (Markdown)", key="btn_export_plan"):
589
+ md_lines = [f"# Investigation Plan: {plan['entity_type']} — {plan['entity_value']}", "", "## Objectives"]
590
+ md_lines += [f"- {o}" for o in plan["objectives"]]
591
+ md_lines += ["", "## Gaps"] + [f"- {g}" for g in plan["gaps"]]
592
+ md_lines += ["", "## Phases"]
593
+ for ph in plan["phases"]:
594
+ md_lines.append(f"### {ph['phase']}")
595
+ md_lines.append("**Goals**")
596
+ md_lines += [f"- {g}" for g in ph["goals"]]
597
+ md_lines.append("**Actions**")
598
+ md_lines += [f"- {a}" for a in ph["actions"]]
599
+ md_lines.append("")
600
+ md = "\n".join(md_lines)
601
+ st.download_button("Download Plan", md, file_name="investigation_plan.md", mime="text/markdown")
602
+
603
+
604
+ def _score_dork_rule(d: TypedDork, goals: List[str], user_note: str) -> float:
605
+ s = 1.0
606
+ for g in goals:
607
+ for cat, w in GOAL_WEIGHTS.get(g, {}).items():
608
+ if d["type"] == cat:
609
+ s += w
610
+ note = (user_note or "").lower()
611
+ if any(k in note for k in ["password", "credential", "secret", "token"]):
612
+ if d["type"] in {"Credentials/Secrets", "Code/Repo", "Directory/Index"}:
613
+ s += 1.5
614
+ if any(k in note for k in ["resume", "cv", "employee", "contact"]):
615
+ if d["type"] in {"People/Profiles"}:
616
+ s += 1.0
617
+ if any(k in note for k in ["breach", "leak", "dump", "paste"]):
618
+ if d["type"] in {"Exposure/Leak", "Credentials/Secrets"}:
619
+ s += 1.5
620
+ if any(k in note for k in ["paper", "research", "doi", "citation"]):
621
+ if d["type"] in {"Academic/Research"}:
622
+ s += 1.0
623
+ return s
624
+
625
+
626
+ def _recommend_rules(entity_type: str, entity_value: str, goals: List[str], user_note: str, top_k: int = 10) -> List[TypedDork]:
627
+ builder = TYPED_DORK_MAP.get(entity_type)
628
+ typed = builder(entity_value) if (builder and entity_value) else []
629
+ ranked = sorted(typed, key=lambda d: _score_dork_rule(d, goals, user_note), reverse=True)
630
+ return ranked[:top_k]
631
+
632
+
633
+ def _safe_json_list(txt: str) -> List[Dict[str, Any]]:
634
+ """Best-effort extraction of a JSON list from raw LLM text or user input.
635
+
636
+ Strategy:
637
+ 1. Strip surrounding markdown code fences (with or without language tag).
638
+ 2. Attempt direct json.loads.
639
+ 3. Locate outermost '[' ... ']' span and attempt parse.
640
+ Returns [] on any failure or non-list root.
641
+ """
642
+ if not txt:
643
+ return []
644
+ s = txt.strip()
645
+ # Remove markdown fences like ```json ... ```
646
+ if s.startswith("```"):
647
+ lines = s.split("\n")
648
+ # drop first fence line
649
+ lines = lines[1:]
650
+ # drop trailing fence line if present
651
+ if lines and lines[-1].strip() == "```":
652
+ lines = lines[:-1]
653
+ s = "\n".join(lines).strip()
654
+ # Try direct parse
655
+ try:
656
+ data = json.loads(s)
657
+ if isinstance(data, list):
658
+ return data # type: ignore[return-value]
659
+ except Exception:
660
+ pass
661
+ # Fallback: largest bracketed list slice
662
+ start = s.find("[")
663
+ end = s.rfind("]")
664
+ if start != -1 and end != -1 and end > start:
665
+ candidate = s[start:end+1]
666
+ try:
667
+ data = json.loads(candidate)
668
+ if isinstance(data, list):
669
+ return data # type: ignore[return-value]
670
+ except Exception:
671
+ pass
672
+ return []
673
+
674
+
675
+ def _hf_infer(model_id: str, prompt: str, max_new_tokens: int = 384, temperature: float = 0.2) -> Optional[str]:
676
+ """Call Hugging Face Inference API if token & requests available.
677
+
678
+ Returns generated text or None (which triggers rule-based fallback)."""
679
+ if requests is None:
680
+ st.warning("'requests' not installed; cannot call Hugging Face Inference API. Falling back to rules.")
681
+ return None
682
+ api_token = os.getenv("HF_API_TOKEN")
683
+ if not api_token:
684
+ st.warning("HF_API_TOKEN not set. Add it as a secret/environment variable to enable LLM advisor. Falling back to rules.")
685
+ return None
686
+ url = f"https://api-inference.huggingface.co/models/{model_id}"
687
+ headers = {"Authorization": f"Bearer {api_token}"}
688
+ payload = {
689
+ "inputs": prompt,
690
+ "parameters": {
691
+ "max_new_tokens": max_new_tokens,
692
+ "temperature": temperature,
693
+ "return_full_text": False,
694
+ },
695
+ }
696
+ try:
697
+ resp = requests.post(url, headers=headers, json=payload, timeout=90)
698
+ resp.raise_for_status()
699
+ data = resp.json()
700
+ if isinstance(data, list) and data and isinstance(data[0], dict) and "generated_text" in data[0]:
701
+ return data[0]["generated_text"]
702
+ if isinstance(data, dict) and "generated_text" in data:
703
+ return data["generated_text"]
704
+ # Unknown shape: return serialized
705
+ return json.dumps(data)
706
+ except Exception as e:
707
+ st.warning(f"HF inference error: {e}. Falling back to rules.")
708
+ return None
709
+
710
+
711
+ def _build_llm_prompt(entity_type: str, entity_value: str, goals: List[str], hint: str, baseline: List[TypedDork], top_k: int) -> str:
712
+ cat_list = ", ".join(sorted(DORK_TYPES.keys()))
713
+ baseline_lines = "\n".join([f"- {d['type']}: {d['q']} // {d['why']}" for d in baseline[:25]])
714
+ return f"""
715
+ You are an OSINT assistant that crafts focused Google dorks.
716
+ Given the entity type and value, the user's goals, and an optional hint, return a JSON array (and ONLY a JSON array) of up to {top_k} objects with this schema:
717
+ {{"q": "<google dork string>", "type": "<one of [{cat_list}]>", "why": "<1 sentence rationale>"}}
718
+ Rules:
719
+ - Prefer free, public sources; avoid paid services.
720
+ - Keep queries precise; quote exact strings; use site:, filetype:, inurl:, intitle:, and AROUND(n) when helpful.
721
+ - Use ONLY categories from the allowed list above.
722
+ - Output must be valid JSON (no prose, no markdown fences).
723
+
724
+ ENTITY_TYPE: {entity_type}
725
+ ENTITY_VALUE: {entity_value}
726
+ GOALS: {goals}
727
+ HINT: {hint or '(none)'}
728
+ BASELINE_CATALOG (for inspiration, don't just repeat):
729
+ {baseline_lines}
730
+ """
731
+
732
+
733
+ def _recommend_llm(entity_type: str, entity_value: str, goals: List[str], hint: str, top_k: int) -> List[TypedDork]:
734
+ builder = TYPED_DORK_MAP.get(entity_type)
735
+ baseline = builder(entity_value) if (builder and entity_value) else []
736
+ model_key = st.session_state.get("settings", {}).get("model", "qwen2.5-1.5b-instruct")
737
+ model_id = MODEL_ID_MAP.get(model_key, model_key)
738
+ prompt = _build_llm_prompt(entity_type, entity_value, goals, hint, baseline, top_k)
739
+ raw = _hf_infer(model_id, prompt)
740
+ if not raw:
741
+ return []
742
+ parsed = _safe_json_list(raw)
743
+ out: List[TypedDork] = []
744
+ for item in parsed:
745
+ if not isinstance(item, dict):
746
+ continue
747
+ q = str(item.get("q", "")).strip()
748
+ typ = str(item.get("type", "Footprinting")).strip()
749
+ why = str(item.get("why", "Suggested by LLM")).strip()
750
+ if not q:
751
+ continue
752
+ if typ not in DORK_TYPES:
753
+ typ = "Footprinting"
754
+ out.append({"q": q, "type": typ, "why": why})
755
+ # Dedupe while preserving order
756
+ seen = set()
757
+ deduped: List[TypedDork] = []
758
+ for d in out:
759
+ if d["q"] in seen:
760
+ continue
761
+ seen.add(d["q"])
762
+ deduped.append(d)
763
+ return deduped[:top_k]
764
+
765
+
766
+ def render_dork_recommender(entity_type: str, entity_value: str):
767
+ st.subheader("Step 2: Advisor")
768
+ goals = st.multiselect("What are you trying to do?", DEFAULT_GOALS, default=["Map footprint / surface", "Find documents & spreadsheets"], key="advisor_goals")
769
+ hint = st.text_input("Optional hint (e.g., 'credentials around build system', 'employee directory')", key="advisor_hint")
770
+ top_k = st.slider("How many suggestions?", 3, 20, 10, key="advisor_topk")
771
+ use_llm = st.checkbox("Use advisor LLM (Hugging Face Inference API)", value=False, key="use_llm_checkbox", help="Requires HF_API_TOKEN environment secret. Falls back to rules if unavailable.")
772
+
773
+ if st.button("Suggest dorks", key="btn_suggest"):
774
+ recs: List[TypedDork] = []
775
+ if use_llm:
776
+ recs = _recommend_llm(entity_type, entity_value, goals, hint, top_k)
777
+ if not recs:
778
+ recs = _recommend_rules(entity_type, entity_value, goals, hint, top_k)
779
+ if not recs:
780
+ st.warning("Enter a valid entity value first.")
781
+ return
782
+ st.session_state["dork_recs"] = recs
783
+ st.markdown("#### Recommended dorks")
784
+ for r in recs:
785
+ st.markdown(f"- **[{r['type']}]** `{r['q']}`")
786
+ st.markdown(f" <span class='small'>{r['why']}</span>", unsafe_allow_html=True)
787
+
788
+ # ---------------------------
789
+ # STEP 3: Selection
790
+ # ---------------------------
791
+ def render_dork_selection(entity_type: str, entity_value: str):
792
+ st.subheader("Step 3: Select dorks")
793
+ recs = st.session_state.get("dork_recs", [])
794
+ choice = st.radio("Select method", ["Accept advisor", "Pick from catalog", "Custom"], key="method_radio")
795
+ final = []
796
+ if choice == "Accept advisor":
797
+ final = [r["q"] for r in recs]
798
+ elif choice == "Pick from catalog":
799
+ typed = TYPED_DORK_MAP[entity_type](entity_value)
800
+ for idx, d in enumerate(typed):
801
+ if st.checkbox(d["q"], key=f"pick_{idx}"):
802
+ final.append(d["q"])
803
+ elif choice == "Custom":
804
+ txt = st.text_area("Enter custom dorks")
805
+ if txt:
806
+ final = [l.strip() for l in txt.splitlines() if l.strip()]
807
+ st.session_state["selected_dorks"] = final
808
+ st.write("Final Basket:", final)
809
+
810
+ # ---------------------------
811
+ # STEP 4: Execution + Metadata
812
+ # ---------------------------
813
+ def _audit_init():
814
+ st.session_state.setdefault("audit", [])
815
+
816
+ def _audit_log(action: str, **details):
817
+ if not st.session_state.get("settings", {}).get("logging", True):
818
+ return
819
+ _audit_init()
820
+ st.session_state["audit"].append({"ts": datetime.utcnow().isoformat()+"Z", "action": action, **details})
821
+
822
+ def ddg_search(query: str, max_results: int=5):
823
+ if DDGS is None:
824
+ return []
825
+ with DDGS() as ddgs:
826
+ return list(ddgs.text(query, max_results=max_results))
827
+
828
+ # ---------------------------
829
+ # Scoring
830
+ # ---------------------------
831
+ SOURCE_RELIABILITY = {
832
+ "high": [".gov", ".mil", ".edu", "sec.gov", "reuters", "bloomberg", "nytimes", "wsj"],
833
+ "med": ["github.com", "gitlab.com", "medium.com", "substack.com", "bbc"],
834
+ }
835
+
836
+ def _source_reliability(url: str) -> str:
837
+ url_l = (url or "").lower()
838
+ for kw in SOURCE_RELIABILITY["high"]:
839
+ if kw in url_l:
840
+ return "High"
841
+ for kw in SOURCE_RELIABILITY["med"]:
842
+ if kw in url_l:
843
+ return "Medium"
844
+ return "Low"
845
+
846
+ def _fuzzy_match(a: str, b: str) -> float:
847
+ if not a or not b:
848
+ return 0.0
849
+ if a.lower() == b.lower():
850
+ return 1.0
851
+ if fuzz:
852
+ return fuzz.ratio(a.lower(), b.lower()) / 100.0
853
+ return 0.0
854
+
855
+ def score_finding(row: Dict[str, Any], facts: KnownFacts) -> Dict[str, Any]:
856
+ title = row.get("title") or row.get("heading") or ""
857
+ snippet = row.get("body") or row.get("snippet") or ""
858
+ url = row.get("href") or row.get("link") or ""
859
+ text = f"{title}\n{snippet}".lower()
860
+ score = 0
861
+ comps: List[Dict[str, Any]] = []
862
+
863
+ def add(points: int, label: str, reason: str):
864
+ nonlocal score
865
+ score += points
866
+ comps.append({"label": label, "points": points, "reason": reason})
867
+
868
+ # Exact matches
869
+ hits = 0
870
+ for e in facts.emails:
871
+ if e.lower() in text:
872
+ add(25, "Email match", e)
873
+ hits += 1
874
+ for h in facts.handles:
875
+ if h.lower() in text:
876
+ add(15, "Handle match", h)
877
+ hits += 1
878
+ for d in facts.domains:
879
+ if d.lower() in text:
880
+ add(10, "Domain mention", d)
881
+ hits += 1
882
+ for ip in facts.ips:
883
+ if ip and ip.lower() in text:
884
+ add(10, "IP mention", ip)
885
+ hits += 1
886
+ for org in facts.orgs:
887
+ if org.lower() in text:
888
+ add(8, "Org mention", org)
889
+ hits += 1
890
+ for name in facts.real_names:
891
+ if name.lower() in text:
892
+ add(20, "Name mention", name)
893
+ hits += 1
894
+ else:
895
+ # fuzzy
896
+ for token in name.split():
897
+ for word in text.split():
898
+ if _fuzzy_match(token, word) >= 0.9:
899
+ add(8, "Fuzzy name token", f"{token}->{word}")
900
+ hits += 1
901
+ break
902
+
903
+ if hits >= 2:
904
+ add(10, "Co-occurrence", f"{hits} fact tokens present")
905
+
906
+ # Source reliability
907
+ rel = _source_reliability(url)
908
+ if rel == "High":
909
+ add(10, "Source reliability", rel)
910
+ elif rel == "Medium":
911
+ add(5, "Source reliability", rel)
912
+
913
+ # Context keywords basic
914
+ ctx_hits = 0
915
+ if facts.context:
916
+ ctx_hits = sum(1 for kw in facts.context.lower().split() if kw and kw in text)
917
+ if ctx_hits >= 3:
918
+ add(10, "Context alignment", f"{ctx_hits} context keywords")
919
+ elif ctx_hits == 2:
920
+ add(6, "Context alignment", "2 context keywords")
921
+ elif ctx_hits == 1:
922
+ add(3, "Context alignment", "1 context keyword")
923
+
924
+ # Optional embedding similarity (semantic relevance to context)
925
+ if ctx_hits < 3 and st.session_state.get("settings", {}).get("enable_embeddings") and facts.context and SentenceTransformer:
926
+ emb_model = st.session_state.get("_embed_model")
927
+ if emb_model is None:
928
+ with st.spinner("Loading embedding model (once)..."):
929
+ try:
930
+ emb_model = SentenceTransformer("all-MiniLM-L6-v2")
931
+ st.session_state["_embed_model"] = emb_model
932
+ except Exception:
933
+ emb_model = None
934
+ if emb_model:
935
+ try:
936
+ q_emb = emb_model.encode([facts.context[:512]])[0]
937
+ doc_emb = emb_model.encode([text[:1024]])[0]
938
+ # cosine
939
+ dot = float((q_emb @ doc_emb) / ((q_emb**2).sum()**0.5 * (doc_emb**2).sum()**0.5))
940
+ if dot > 0.35:
941
+ pts = int(min(20, (dot - 0.35) / (0.30) * 20)) # scale 0.35..0.65 -> 0..20
942
+ if pts > 0:
943
+ add(pts, "Semantic similarity", f"cos={dot:.2f}")
944
+ except Exception:
945
+ pass
946
+
947
+ level = "High" if score >= 70 else ("Medium" if score >= 40 else "Low")
948
+ explanation = "; ".join(f"{c['label']} +{c['points']} ({c['reason']})" for c in comps)
949
+ return {
950
+ **row,
951
+ "score": score,
952
+ "level": level,
953
+ "explanation": explanation,
954
+ "components": comps,
955
+ "reliability": rel,
956
+ "url": url,
957
+ "title": title,
958
+ "snippet": snippet,
959
+ }
960
+
961
+ def score_all_findings(rows: List[Dict[str, Any]], facts: KnownFacts) -> List[Dict[str, Any]]:
962
+ return [score_finding(r, facts) for r in rows]
963
+
964
+ # File/Image metadata extraction
965
+ def extract_metadata(upload) -> Dict[str, Any]:
966
+ info: Dict[str, Any] = {}
967
+ if not upload:
968
+ return info
969
+ name = upload.name.lower()
970
+ try:
971
+ if name.endswith(".pdf") and PdfReader:
972
+ reader = PdfReader(upload)
973
+ info = {"Pages": len(reader.pages), "Meta": dict(reader.metadata)}
974
+ elif name.endswith(".docx") and docx:
975
+ doc = docx.Document(upload)
976
+ cp = doc.core_properties
977
+ info = {"Title": cp.title, "Author": cp.author, "Created": cp.created}
978
+ elif (name.endswith(".doc") or name.endswith(".xls")) and olefile:
979
+ if olefile.isOleFile(upload):
980
+ info = {"OLE": "Legacy Office file detected"}
981
+ elif name.endswith((".mp3", ".flac", ".ogg", ".m4a")) and MutagenFile:
982
+ audio = MutagenFile(upload)
983
+ info = dict(audio) if audio else {}
984
+ elif name.endswith((".jpg", ".jpeg", ".png")) and exifread:
985
+ tags = exifread.process_file(upload)
986
+ info = {tag: str(val) for tag, val in tags.items()}
987
+ except Exception as e:
988
+ info = {"error": str(e)}
989
+ return info
990
+
991
+ # ---------------------------
992
+ # Graph Visualization
993
+ # ---------------------------
994
+ def build_graph(scored: List[Dict[str, Any]], facts: KnownFacts) -> Optional[str]:
995
+ if not nx or not Network:
996
+ return None
997
+ G = nx.Graph()
998
+ # Add fact nodes
999
+ for email in facts.emails:
1000
+ G.add_node(email, type="email")
1001
+ for h in facts.handles:
1002
+ G.add_node(h, type="handle")
1003
+ for d in facts.domains:
1004
+ G.add_node(d, type="domain")
1005
+ for n in facts.real_names:
1006
+ G.add_node(n, type="name")
1007
+ # Add finding nodes & edges
1008
+ for f in scored[:300]:
1009
+ url = f.get("url") or "unknown"
1010
+ G.add_node(url, type="finding", score=f.get("score",0))
1011
+ text = (f.get("title","") + " " + f.get("snippet",""))[:400].lower()
1012
+ linked = False
1013
+ for token in facts.emails + facts.handles + facts.domains + facts.real_names:
1014
+ if token.lower() and token.lower() in text:
1015
+ G.add_edge(token, url)
1016
+ linked = True
1017
+ if not linked and f.get("level") == "High":
1018
+ # still include high score node
1019
+ continue
1020
+ # Visualize
1021
+ net = Network(height="550px", width="100%", bgcolor="#111", font_color="white")
1022
+ for n, data in G.nodes(data=True):
1023
+ color = {
1024
+ "email": "#ff7f50",
1025
+ "handle": "#1e90ff",
1026
+ "domain": "#32cd32",
1027
+ "name": "#daa520",
1028
+ "finding": "#888"
1029
+ }.get(data.get("type"), "#999")
1030
+ size = 15 if data.get("type") != "finding" else max(5, min(25, int(data.get("score",10)/4)))
1031
+ net.add_node(n, label=n[:30], color=color, title=n, size=size)
1032
+ for u,v in G.edges():
1033
+ net.add_edge(u,v)
1034
+ path = "graph.html"
1035
+ net.show(path)
1036
+ try:
1037
+ with open(path, "r", encoding="utf-8") as f:
1038
+ return f.read()
1039
+ except Exception:
1040
+ return None
1041
+
1042
+ # ---------------------------
1043
+ # Report Export
1044
+ # ---------------------------
1045
+ HTML_TEMPLATE = """<!doctype html><html><head><meta charset='utf-8'/><title>OSINT Report</title>
1046
+ <style>body{font-family:Arial,Helvetica,sans-serif;margin:2rem;background:#111;color:#eee;} h1,h2{color:#ffcc66} table{border-collapse:collapse;width:100%;margin:1rem 0;} th,td{border:1px solid #444;padding:6px;font-size:0.85rem;} .high{color:#4caf50;font-weight:700}.medium{color:#ffc107}.low{color:#f44336} code{background:#222;padding:2px 4px;border-radius:4px;} .small{font-size:0.75rem;color:#ccc}</style>
1047
+ </head><body>
1048
+ <h1>OSINT Investigation Report</h1>
1049
+ <h2>Summary</h2>
1050
+ <p><b>Entity Type:</b> {{ entity_type }}<br/><b>Entity Value:</b> {{ entity_value }}<br/>
1051
+ <b>Generated:</b> {{ generated }} UTC</p>
1052
+ <h2>Known Facts</h2>
1053
+ <pre>{{ facts_json }}</pre>
1054
+ <h2>Findings (Top {{ findings|length }})</h2>
1055
+ <table><thead><tr><th>Score</th><th>Level</th><th>Title</th><th>URL</th><th>Reliability</th><th>Explanation</th></tr></thead><tbody>
1056
+ {% for f in findings %}
1057
+ <tr><td>{{ f.score }}</td><td class='{{ f.level|lower }}'>{{ f.level }}</td><td>{{ f.title }}</td><td><a href='{{ f.url }}' target='_blank'>link</a></td><td>{{ f.reliability }}</td><td class='small'>{{ f.explanation }}</td></tr>
1058
+ {% endfor %}
1059
+ </tbody></table>
1060
+ </body></html>"""
1061
+
1062
+ def export_report(entity_type: str, entity_value: str, facts: KnownFacts, scored: List[Dict[str, Any]]):
1063
+ if not Template:
1064
+ st.warning("jinja2 not installed; cannot build HTML report.")
1065
+ return
1066
+ tpl = Template(HTML_TEMPLATE)
1067
+ html = tpl.render(
1068
+ entity_type=entity_type,
1069
+ entity_value=entity_value,
1070
+ generated=datetime.utcnow().isoformat(),
1071
+ facts_json=json.dumps(facts.__dict__, indent=2),
1072
+ findings=scored[:200],
1073
+ )
1074
+ st.download_button("Download HTML Report", data=html.encode("utf-8"), file_name="osint_report.html", mime="text/html")
1075
+
1076
+ # ---------------------------
1077
+ # Username Availability Probe (simple)
1078
+ # ---------------------------
1079
+ PLATFORM_PATTERNS: Dict[str,str] = {
1080
+ "GitHub": "https://github.com/{user}",
1081
+ "Twitter": "https://x.com/{user}",
1082
+ "Reddit": "https://www.reddit.com/user/{user}",
1083
+ "Medium": "https://medium.com/@{user}",
1084
+ }
1085
+
1086
+ def probe_usernames(users: List[str], limit: int = 10) -> List[Dict[str,str]]:
1087
+ out = []
1088
+ if requests is None:
1089
+ return out
1090
+ for u in users[:limit]:
1091
+ for plat, pattern in PLATFORM_PATTERNS.items():
1092
+ url = pattern.format(user=u)
1093
+ status = "?"
1094
+ try:
1095
+ r = requests.get(url, timeout=5)
1096
+ if r.status_code == 200:
1097
+ status = "Exists"
1098
+ elif r.status_code == 404:
1099
+ status = "Not Found"
1100
+ else:
1101
+ status = str(r.status_code)
1102
+ except Exception:
1103
+ status = "Error"
1104
+ out.append({"platform": plat, "username": u, "status": status})
1105
+ return out
1106
+
1107
+ def render_step4_execution(entity_type: str, entity_value: str):
1108
+ st.subheader("Step 4: Execute & Metadata")
1109
+ final = st.session_state.get("selected_dorks", [])
1110
+ if not final:
1111
+ st.info("No dorks selected.")
1112
+ return
1113
+ max_per = st.slider("Max results", 3, 20, st.session_state.get("settings", {}).get("max_per", 10))
1114
+ if st.button("Run dorks"):
1115
+ # Progressive skeleton loader while executing each query
1116
+ placeholder = st.empty()
1117
+ results: List[Dict[str, Any]] = []
1118
+ total_expected = len(final) * max_per
1119
+ for i, q in enumerate(final, start=1):
1120
+ remaining = len(final) - i + 1
1121
+ est_remaining = remaining * max_per
1122
+ # Render skeletons representing expected remaining results (capped for performance)
1123
+ with placeholder.container():
1124
+ st.markdown("#### Running searches…")
1125
+ st.caption(f"Query {i}/{len(final)}: {q}")
1126
+ skel_blocks = min(est_remaining, 18) # avoid huge DOM
1127
+ # Distribute size variations for visual interest
1128
+ sizes = ["sm", "md", "lg"]
1129
+ rows_html = []
1130
+ for j in range(skel_blocks):
1131
+ size = sizes[j % len(sizes)]
1132
+ rows_html.append(f'<div class="skeleton-block skeleton-h {size}"></div>')
1133
+ st.markdown(
1134
+ '<div class="skeleton-group">' + "".join(rows_html) + "</div>",
1135
+ unsafe_allow_html=True,
1136
+ )
1137
+ # Execute the actual search
1138
+ rows = ddg_search(q, max_results=max_per)
1139
+ _audit_log("dork_run", dork=q, results=len(rows))
1140
+ results.extend(rows)
1141
+ # Clear placeholder after completion
1142
+ placeholder.empty()
1143
+ st.session_state["dork_results"] = results
1144
+ # compute scores after acquiring all results
1145
+ facts = KnownFacts.from_session()
1146
+ st.session_state["scored_results"] = score_all_findings(results, facts)
1147
+ if res := st.session_state.get("dork_results"):
1148
+ st.json(res)
1149
+ audit_str = "\n".join(json.dumps(ev) for ev in st.session_state.get("audit", []))
1150
+ st.download_button("Download audit", audit_str, "audit.jsonl")
1151
+
1152
+ st.markdown("---")
1153
+ st.subheader("File/Image Metadata Extractor")
1154
+ upload = st.file_uploader("Upload a file (pdf, docx, mp3, jpg, etc.)")
1155
+ if upload:
1156
+ meta = extract_metadata(upload)
1157
+ st.json(meta)
1158
+
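The audit download above writes one JSON object per line (JSONL). Below is a small sketch for reading it back offline; the field names are assumptions based on the `_audit_log("dork_run", dork=q, results=len(rows))` call, so adjust them to whatever keys your audit events actually carry.

```python
# Sketch: load the exported audit trail back into Python for offline review.
import json

def load_audit(path: str = "audit.jsonl") -> list:
    events = []
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            if line.strip():
                events.append(json.loads(line))
    return events

# e.g. summarize how many results each dork returned (assumed field names):
# print({e["dork"]: e.get("results") for e in load_audit() if "dork" in e})
```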
1159
+ # ---------------------------
1160
+ # Main
1161
+ # ---------------------------
1162
+ def render_help_tab():
1163
+ st.subheader("How To Use This OSINT Investigator Suite")
1164
+ st.markdown("""
1165
+ This tab is a quick field manual. It shows the purpose of every tab, the workflow order, and pro tips.
1166
+
1167
+ ### Recommended Workflow (Fast Path)
1168
+ 1. Known Facts – Load seed identifiers (handles, emails, domains, names).
1169
+ 2. Plan – Review the autogenerated phased investigation plan; adjust facts if gaps are obvious.
1170
+ 3. Explainer – Learn the dork building logic for transparency (optional).
1171
+ 4. Advisor – Get recommended dorks (rule + optional LLM). Refine, then accept.
1172
+ 5. Selection – Curate / edit / remove dorks; finalize the set to run.
1173
+ 6. Execution – Run dorks (skeleton loaders show progress); extract file/image metadata if you have artifacts.
1174
+ 7. Scoring – Review confidence scores, filter, read explanations, iterate by adding new facts and re-scoring.
1175
+ 8. Graph – Visual relationship view (requires networkx + pyvis) to spot high‑intersection nodes.
1176
+ 9. Report – Export an HTML snapshot for stakeholders / evidence chain.
1177
+ 10. Usernames – Probe handle existence across common platforms.
1178
+ 11. Help – this reference card, available anytime.
1179
+
1180
+ ---
1181
+ ### Tab Details & Tips
1182
+ **Known Facts**
1183
+ - Add all solid identifiers early; scoring & dork generation leverage them.
1184
+ - Handles & emails dramatically raise confidence when co-occurring in sources.
1185
+ - Update facts after each scouting loop (new domains from findings, etc.).
1186
+
1187
+ **Plan**
1188
+ - Generated phases: Recon, Expansion, Correlation, Deep Dive, Reporting.
1189
+ - Use it as a narrative backbone for your final export or task tickets.
1190
+
1191
+ **Explainer**
1192
+ - Shows how base + contextual tokens assemble into search dorks by entity type.
1193
+ - Use to justify methodology or teach newcomers.
1194
+
1195
+ **Advisor**
1196
+ - Hybrid: deterministic heuristic rules plus optional LLM (if HF token + model set in settings).
1197
+ - Toggle embedding/semantic features in settings (if present) to enrich scoring later.
1198
+ - Accept the generated list to push candidates to Selection.
1199
+
1200
+ **Selection**
1201
+ - Final edit surface. Remove noisy / redundant queries before execution.
1202
+ - Keep a balanced mix: broad footprint + specific leak/file/resource patterns.
1203
+
1204
+ **Execution**
1205
+ - Click Run dorks: animated skeleton placeholders appear per batch while searches resolve.
1206
+ - Results cached in session: re-running overwrites (audit log tracks runs).
1207
+ - Metadata Extractor: Upload docs / images to pull EXIF, PDF metadata, docx core props, audio tags.
1208
+
1209
+ **Scoring**
1210
+ - Each finding scored from component signals (exact identifiers, fuzzy tokens, co-occurrence, reliability, context keywords, semantic similarity).
1211
+ - Levels: High ≥70, Medium ≥40. Use filters + search bar to triage.
1212
+ - Re-score after updating Known Facts or enabling embeddings.
1213
+ - "Full Explanations" expands reasoning transparency for defensibility.
1214
+
1215
+ **Graph**
1216
+ - Visual pivot map: nodes sized by aggregated score; edges for shared identifiers.
1217
+ - Use to spot central assets (good pivot candidates) quickly.
1218
+ - If the graph libraries are missing, you'll see an install hint (they're listed in requirements.txt).
1219
+
1220
+ **Report**
1221
+ - Generates a standalone HTML (includes styling + key metrics) for sharing.
1222
+ - Consider exporting after each major iteration to preserve state (version trail).
1223
+
1224
+ **Usernames**
1225
+ - Lightweight existence probe (HTTP status heuristic). "Exists" ≠ ownership proof.
1226
+ - Add more platforms by extending PLATFORM_PATTERNS in code.
1227
+
1228
+ **Chat Assistant (Floating)**
1229
+ - Noir-style guidance; quick buttons for common pivots.
1230
+ - If a model and token are configured, responses may blend LLM nuance with rule hints; otherwise replies are rule-based only.
1231
+ - Close with ✕; reopen with the 🕵️ button.
1232
+
1233
+ **Light / Dark Toggle**
1234
+ - Sidebar toggle (if present) swaps theme classes; custom components auto-adapt.
1235
+
1236
+ **Skeleton Loaders**
1237
+ - Shimmering bars appear during long search batches to indicate progress.
1238
+
1239
+ ---
1240
+ ### Power User Tips
1241
+ - Iterative Loop: (Run) → (Score) → (Add new facts from findings) → (Re-score) → (Graph) → (Report).
1242
+ - High-value pivots: Rare email domains, unique handles in code repos, author names in PDF metadata.
1243
+ - Noise Control: Remove generic dorks that return unrelated trending content before executing.
1244
+ - Evidence Chain: Audit log (download on Execution tab) + HTML reports form a defensible trail.
1245
+
1246
+ ### Performance Notes
1247
+ - Limiting Max results reduces API latency & keeps scoring responsive.
1248
+ - Embedding model loads lazily—first semantic scoring may pause a few seconds.
1249
+ - Graph view caps large result sets to avoid browser lockups.
1250
+
1251
+ ### Glossary
1252
+ - Dork: Crafted search query combining identifiers + context tokens.
1253
+ - Pivot: New investigative direction unlocked by a discovered unique attribute.
1254
+ - Co-occurrence: Multiple target identifiers appearing together in one source.
1255
+
1256
+ ### Ethics Reminder
1257
+ Public sources only. No credential stuffing, intrusion, or accessing private data stores. Respect rate limits & platform ToS.
1258
+ """)
1259
+
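To make the Glossary's "identifiers + context tokens" definition concrete, here is a hand-built set of dorks for an email identifier. This is illustrative only; the app's real templates come from `TYPED_DORK_MAP` and the Advisor tab, and the address is a placeholder.

```python
# Illustrative dorks composed from one email identifier.
email = "user@example.com"  # placeholder
example_dorks = [
    f'"{email}"',                    # broad footprint
    f'"{email}" filetype:pdf',       # documents mentioning the address
    f'"{email}" site:github.com',    # code / repo chatter
    f'"{email}" site:pastebin.com',  # paste traces
]
```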
1260
+ def main():
1261
+ st.markdown("""
1262
+ <div class='app-brand-bar'>
1263
+ <div style='font-size:28px'>🕵️</div>
1264
+ <div class='app-brand-title'>OSINT Investigator Suite</div>
1265
+ <div class='app-badge'>AI-Augmented</div>
1266
+ <div class='app-badge'>Heuristic Scoring</div>
1267
+ <div class='app-badge'>Report Export</div>
1268
+ </div>
1269
+ """, unsafe_allow_html=True)
1270
+ entity_type = st.selectbox("Entity type", list(TYPED_DORK_MAP.keys()), key="entity_type")
1271
+ entity_value = st.text_input("Entity value", "[email protected]", key="entity_value")
1272
+ if entity_type and entity_value:
1273
+ tabs = st.tabs(["Known Facts", "Plan", "Explainer", "Advisor", "Selection", "Execution", "Scoring", "Graph", "Report", "Usernames", "Help"])
1274
+ with tabs[0]:
1275
+ _known_facts_ui()
1276
+ with tabs[1]:
1277
+ render_investigation_plan(entity_type, entity_value)
1278
+ with tabs[2]:
1279
+ render_dorks_explainer(entity_type, entity_value)
1280
+ with tabs[3]:
1281
+ render_dork_recommender(entity_type, entity_value)
1282
+ with tabs[4]:
1283
+ render_dork_selection(entity_type, entity_value)
1284
+ with tabs[5]:
1285
+ render_step4_execution(entity_type, entity_value)
1286
+ with tabs[6]:
1287
+ st.subheader("Scoring & Confidence")
1288
+ facts = KnownFacts.from_session()
1289
+ scored = st.session_state.get("scored_results")
1290
+ if not scored:
1291
+ st.info("Run dorks first to generate findings and scores.")
1292
+ else:
1293
+ high = sum(1 for r in scored if r["level"] == "High")
1294
+ med = sum(1 for r in scored if r["level"] == "Medium")
1295
+ low = sum(1 for r in scored if r["level"] == "Low")
1296
+ st.markdown("<div class='sticky-toolbar'><strong>Findings Overview</strong></div>", unsafe_allow_html=True)
1297
+ k1,k2,k3,k4 = st.columns(4)
1298
+ k1.metric("Total", len(scored))
1299
+ k2.metric("High", high)
1300
+ k3.metric("Medium", med)
1301
+ k4.metric("Low", low)
1302
+ level_filter = st.multiselect("Levels", ["High", "Medium", "Low"], default=["High", "Medium", "Low"], key="lvl_filter")
1303
+ q = st.text_input("Search title/snippet", key="score_search")
1304
+ view = [r for r in scored if r["level"] in level_filter and (not q or q.lower() in (r.get("snippet", '')).lower() or q.lower() in (r.get("title", '')).lower())]
1305
+ rows_html = []
1306
+ for r in view:
1307
+ lvl = r["level"].lower()
1308
+ badge = f"<span class='badge {lvl}'>{r['level']}</span>"
1309
+ title = (r.get('title',''))[:120]
1310
+ expl_short = (r.get('explanation',''))[:180]
1311
+ url = r.get('url') or ''
1312
+ rows_html.append(f"<tr><td>{r['score']}</td><td>{badge}</td><td>{title}</td><td><a href='{url}' target='_blank'>link</a></td><td>{r['reliability']}</td><td>{expl_short}</td></tr>")
1313
+ table_html = """
1314
+ <div style='max-height:520px;overflow:auto;border:1px solid #262626;border-radius:12px;'>
1315
+ <table class='score-table'>
1316
+ <thead><tr><th>Score</th><th>Level</th><th>Title</th><th>URL</th><th>Reliab.</th><th>Explanation (truncated)</th></tr></thead>
1317
+ <tbody>{rows}</tbody>
1318
+ </table>
1319
+ </div>
1320
+ """.format(rows="".join(rows_html))
1321
+ st.markdown(table_html, unsafe_allow_html=True)
1322
+ col_rescore, col_full, col_export = st.columns([1,2,1])
1323
+ with col_rescore:
1324
+ if st.button("Re-score", key="btn_rescore_now"):
1325
+ rescored = score_all_findings(st.session_state.get("dork_results", []), facts)
1326
+ st.session_state["scored_results"] = rescored
1327
+ st.success("Re-scored.")
1328
+ with col_full:
1329
+ with st.expander("Full Explanations"):
1330
+ for r in view:
1331
+ st.markdown(f"**{r.get('title','')}** — {r['level']} ({r['score']})\n\n{r.get('explanation','')}")
1332
+ with col_export:
1333
+ if st.button("Export Report (HTML)", key="btn_export_report_inline"):
1334
+ export_report(entity_type, entity_value, facts, scored)
1335
+ with tabs[7]:
1336
+ st.subheader("Entity Graph")
1337
+ facts = KnownFacts.from_session()
1338
+ scored = st.session_state.get("scored_results") or []
1339
+ if scored:
1340
+ html = build_graph(scored, facts)
1341
+ if html:
1342
+ st.components.v1.html(html, height=600, scrolling=True)
1343
+ else:
1344
+ st.info("Install networkx & pyvis for graph visualization.")
1345
+ else:
1346
+ st.info("No scored findings yet.")
1347
+ with tabs[8]:
1348
+ st.subheader("Report Export")
1349
+ facts = KnownFacts.from_session()
1350
+ scored = st.session_state.get("scored_results") or []
1351
+ if scored:
1352
+ export_report(entity_type, entity_value, facts, scored)
1353
+ else:
1354
+ st.info("Run and score findings to export a report.")
1355
+ with tabs[9]:
1356
+ st.subheader("Username Availability Probe")
1357
+ facts = KnownFacts.from_session()
1358
+ sample_users = facts.handles[:10] or ([entity_value] if entity_type == "Username / Handle" else [])  # parenthesized so known handles are used regardless of entity type
1359
+ if not sample_users:
1360
+ st.info("Add handles in Known Facts or pick a username entity.")
1361
+ else:
1362
+ if st.button("Probe Platforms", key="btn_probe_users"):
1363
+ data = probe_usernames(sample_users)
1364
+ st.session_state["probe_results"] = data
1365
+ if pr := st.session_state.get("probe_results"):
1366
+ st.dataframe(pr, use_container_width=True)
1367
+ with tabs[10]:
1368
+ render_help_tab()
1369
+ # Floating chat widget render
1370
+ render_chat_widget(entity_type, entity_value)
1371
+ with st.expander("Methodology / Scoring Rubric", expanded=False):
1372
+ st.markdown("""
1373
+ **Scoring Components**
1374
+ - Email (+25) / Name exact (+20) / Handle (+15) / Domain (+10) / IP (+10) / Org (+8)
1375
+ - Fuzzy name token (+8) / Co-occurrence (+10)
1376
+ - Source reliability High (+10) / Medium (+5)
1377
+ - Context alignment (1:+3 / 2:+6 / ≥3:+10)
1378
+ - Semantic similarity (0–20 scaled) if enabled
1379
+ **Levels:** High ≥70, Medium ≥40, else Low.
1380
+ """)
1381
+ with st.expander("Ethical Use Notice", expanded=False):
1382
+ st.markdown("Lawful OSINT only. No intrusion, auth bypass, or accessing non-public data. Respect platform ToS & privacy.")
1383
+
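The "Methodology / Scoring Rubric" expander above lists the component weights and thresholds. The sketch below is a worked restatement of that rubric for readers, not the app's actual `score_all_findings` implementation; the component names are invented for illustration.

```python
# Worked restatement of the rubric (illustrative; not the app's scorer).
def rubric_sketch(components: dict) -> tuple:
    weights = {
        "email": 25, "name_exact": 20, "handle": 15, "domain": 10,
        "ip": 10, "org": 8, "fuzzy_name_token": 8, "co_occurrence": 10,
        "reliability_high": 10, "reliability_medium": 5,
    }
    score = sum(w for key, w in weights.items() if components.get(key))
    score += {0: 0, 1: 3, 2: 6}.get(components.get("context_hits", 0), 10)  # >=3 hits -> +10
    score += min(max(components.get("semantic", 0), 0), 20)                 # semantic similarity, 0-20
    level = "High" if score >= 70 else "Medium" if score >= 40 else "Low"
    return score, level

# Example: email + handle on a high-reliability source with two context keywords
# rubric_sketch({"email": True, "handle": True, "reliability_high": True, "context_hits": 2})
# -> (56, "Medium")
```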
1384
+ # ---------------------------
1385
+ # Chat Assistant
1386
+ # ---------------------------
1387
+ GUIDE_SYSTEM = (
1388
+ "You are a noir-style seasoned OSINT investigator named 'The Analyst'. Speak like classic crime noir: terse, vivid metaphors, professional, never cheesy. "
1389
+ "Guide the user step-by-step in enumerating a digital entity using only ethical open sources. "
1390
+ "Each answer: <=150 words, 2-4 compact paragraphs or bullet fragments. Provide concrete next actions, pivot angles, and a light ethics reminder if user drifts. "
1391
+ "Avoid sensationalism. No illegal guidance. Occasionally finish with a brief noir tag line like 'That's the shape of the alley, kid.'" )
1392
+
1393
+ def _summarize_context(entity_type: str, entity_value: str) -> str:
1394
+ facts: KnownFacts = KnownFacts.from_session()
1395
+ scored = st.session_state.get("scored_results") or []
1396
+ high_titles = [s.get("title") for s in scored if s.get("level") == "High"][:5]
1397
+ parts = [f"Entity: {entity_type}={entity_value}"]
1398
+ if facts.handles: parts.append(f"Handles:{len(facts.handles)}")
1399
+ if facts.emails: parts.append(f"Emails:{len(facts.emails)}")
1400
+ if facts.domains: parts.append(f"Domains:{len(facts.domains)}")
1401
+ if high_titles: parts.append("HighHits:" + ";".join(high_titles))
1402
+ return " | ".join(parts)
1403
+
1404
+ def _rule_based_reply(user_msg: str, entity_type: str, entity_value: str) -> str:
1405
+ msg = user_msg.lower()
1406
+ lines = []
1407
+ ctx = _summarize_context(entity_type, entity_value)
1408
+ if any(k in msg for k in ["start", "hello", "hi", "first"]):
1409
+ lines.append("First we empty our pockets—handles, domains, emails. Solid identifiers become compass bearings.")
1410
+ if "dork" in msg or "search" in msg:
1411
+ lines.append("Open with wide footprint dorks. Then tighten: docs leaks, repo chatter, paste traces. Each query is a flashlight beam.")
1412
+ if "score" in msg or "confidence" in msg:
1413
+ lines.append("Confidence breathes when multiple facts collide in a clean source. Add precise emails or stable handles—re-score, watch the highs rise.")
1414
+ if "graph" in msg:
1415
+ lines.append("Graph shows the intersections. Nodes struck by multiple identifiers—those corners hide stories.")
1416
+ if "pivot" in msg or "next" in msg:
1417
+ lines.append("Pivot off unique anchors: a handle in a PDF, an email in a commit, a domain in a press note. Each pivot narrows the alley.")
1418
+ if not lines:
1419
+ lines.append("Playbook: 1) Lock facts 2) Advisor for 10 sharp dorks 3) Select & run 4) Score 5) Add new facts 6) Graph pivots 7) Export report.")
1420
+ lines.append(f"Context snapshot: {ctx}")
1421
+ lines.append("Stay clean—public sources only. That's the shape of the alley, kid.")
1422
+ return "\n\n".join(lines)
1423
+
1424
+ def render_chat_widget(entity_type: str, entity_value: str):
1425
+ # Session setup
1426
+ st.session_state.setdefault("chat_history", [])
1427
+ st.session_state.setdefault("chat_open", True)
1428
+ open_flag = st.session_state["chat_open"]
1429
+
1430
+ # Mini open button (when closed)
1431
+ if not open_flag:
1432
+ if st.button("🕵️", key="open_chat_button"):
1433
+ st.session_state["chat_open"] = True
1434
+ # Style the button to float
1435
+ st.markdown("""
1436
+ <style>
1437
+ div[data-testid='stButton'] button[kind='secondary'] {background:#222;border:2px solid #ffcc66;}
1438
+ </style>
1439
+ <div class='chat-mini-btn'></div>
1440
+ """, unsafe_allow_html=True)
1441
+ return
1442
+
1443
+ # Build chat window
1444
+ messages = st.session_state["chat_history"]
1445
+ # Render HTML shell
1446
+ st.markdown("<div class='chat-window'>", unsafe_allow_html=True)
1447
+ # Header with close control
1448
+ c1, c2, c3 = st.columns([0.2, 0.65, 0.15])
1449
+ with c1:
1450
+ st.markdown("<div class='chat-header' style='background:transparent;padding:4px 0 0 6px;'>🕵️</div>", unsafe_allow_html=True)
1451
+ with c2:
1452
+ st.markdown("<div class='chat-header' style='background:transparent;padding:4px 0;'> <span class='title'>Investigator</span></div>", unsafe_allow_html=True)
1453
+ with c3:
1454
+ if st.button("✕", key="close_chat_btn"):
1455
+ st.session_state["chat_open"] = False
1456
+ st.markdown("</div>", unsafe_allow_html=True)
1457
+ return
1458
+ # Messages area
1459
+ # Use an empty container to emulate scroll (Streamlit limitation)
1460
+ msg_container = st.container()
1461
+ with msg_container:
1462
+ if messages:
1463
+ for turn in messages[-18:]:
1464
+ st.markdown(f"<p class='msg-user'><b>You:</b> {turn['user']}</p>", unsafe_allow_html=True)
1465
+ st.markdown(f"<p class='msg-bot'><b>Inv:</b> {turn['assistant']}</p>", unsafe_allow_html=True)
1466
+ else:
1467
+ st.markdown("<p class='msg-bot'>Need a lead? Ask me about dorks, scoring, or pivots.</p>", unsafe_allow_html=True)
1468
+
1469
+ # Input form
1470
+ with st.form("chat_form", clear_on_submit=True):
1471
+ q = st.text_area("Message", key="chat_input_area", height=70, label_visibility="collapsed")
1472
+ col_a, col_b, col_c, col_d = st.columns(4)
1473
+ send = False
1474
+ with col_a:
1475
+ if st.form_submit_button("Send"):
1476
+ send = True
1477
+ with col_b:
1478
+ if st.form_submit_button("Dorks"):
1479
+ q = "What dorks should I run next?"; send = True
1480
+ with col_c:
1481
+ if st.form_submit_button("Confidence"):
1482
+ q = "How do I improve confidence now?"; send = True
1483
+ with col_d:
1484
+ if st.form_submit_button("Pivot"):
1485
+ q = "Give me a pivot strategy."; send = True
1486
+ if send and q.strip():
1487
+ reply: Optional[str] = None
1488
+ if st.session_state.get("settings", {}).get("model") and os.getenv("HF_API_TOKEN"):
1489
+ convo = st.session_state["chat_history"][-6:]
1490
+ history_str = "\n".join([f"User: {h['user']}\nAssistant: {h['assistant']}" for h in convo if h.get('assistant')])
1491
+ prompt = (
1492
+ f"{GUIDE_SYSTEM}\nCurrentContext: {_summarize_context(entity_type, entity_value)}\n" +
1493
+ history_str + f"\nUser: {q}\nAssistant:")
1494
+ reply = _hf_infer(MODEL_ID_MAP.get(st.session_state["settings"]["model"], st.session_state["settings"]["model"]), prompt, max_new_tokens=190, temperature=0.35)
1495
+ if not reply:
1496
+ reply = _rule_based_reply(q, entity_type, entity_value)
1497
+ st.session_state["chat_history"].append({"user": q, "assistant": reply})
1498
+ st.markdown("<div class='chat-input small'>Ethical OSINT only.🕵️‍♂️</div>", unsafe_allow_html=True)
1499
+ st.markdown("</div>", unsafe_allow_html=True)
1500
+
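The reply logic above is easiest to see stripped of the UI: try the configured hosted model first, then fall back to the deterministic rules. `ask()` below is a hypothetical helper for illustration only; it reuses the app's `GUIDE_SYSTEM`, `MODEL_ID_MAP`, `_hf_infer`, and `_rule_based_reply` exactly as the widget does.

```python
# Sketch (hypothetical helper): model-first reply with a rule-based fallback.
def ask(question: str, entity_type: str, entity_value: str) -> str:
    reply = None
    settings = st.session_state.get("settings", {})
    if settings.get("model") and os.getenv("HF_API_TOKEN"):
        prompt = f"{GUIDE_SYSTEM}\nUser: {question}\nAssistant:"
        reply = _hf_infer(MODEL_ID_MAP.get(settings["model"], settings["model"]),
                          prompt, max_new_tokens=190, temperature=0.35)
    return reply or _rule_based_reply(question, entity_type, entity_value)
```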
1501
+ if __name__ == "__main__":
1502
+ main()
1503
+
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.32
2
+ duckduckgo_search
3
+ rapidfuzz
4
+ sentence-transformers
5
+ networkx
6
+ pyvis
7
+ jinja2
8
+ PyPDF2
9
+ python-docx
10
+ olefile
11
+ mutagen
12
+ exifread
13
+ requests