Spaces:
Running
Running
Upload 2 files
Browse files- app.py +1503 -0
- requirements.txt +13 -0
app.py
ADDED
|
@@ -0,0 +1,1503 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from typing import List, Dict, Any, TypedDict, Optional, Tuple
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
import json
|
| 6 |
+
import ipaddress
|
| 7 |
+
import os
|
| 8 |
+
try:
|
| 9 |
+
import requests
|
| 10 |
+
except Exception:
|
| 11 |
+
requests = None
|
| 12 |
+
|
| 13 |
+
# Optional libraries
|
| 14 |
+
try:
|
| 15 |
+
from duckduckgo_search import DDGS
|
| 16 |
+
except Exception:
|
| 17 |
+
DDGS = None
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
from PyPDF2 import PdfReader
|
| 21 |
+
except Exception:
|
| 22 |
+
PdfReader = None
|
| 23 |
+
|
| 24 |
+
try:
|
| 25 |
+
import docx
|
| 26 |
+
except Exception:
|
| 27 |
+
docx = None
|
| 28 |
+
|
| 29 |
+
try:
|
| 30 |
+
import olefile
|
| 31 |
+
except Exception:
|
| 32 |
+
olefile = None
|
| 33 |
+
|
| 34 |
+
try:
|
| 35 |
+
from mutagen import File as MutagenFile
|
| 36 |
+
except Exception:
|
| 37 |
+
MutagenFile = None
|
| 38 |
+
|
| 39 |
+
try:
|
| 40 |
+
from rapidfuzz import fuzz
|
| 41 |
+
except Exception:
|
| 42 |
+
fuzz = None
|
| 43 |
+
|
| 44 |
+
try:
|
| 45 |
+
import exifread
|
| 46 |
+
except Exception:
|
| 47 |
+
exifread = None
|
| 48 |
+
|
| 49 |
+
try:
|
| 50 |
+
import networkx as nx
|
| 51 |
+
except Exception:
|
| 52 |
+
nx = None
|
| 53 |
+
|
| 54 |
+
try:
|
| 55 |
+
from pyvis.network import Network
|
| 56 |
+
except Exception:
|
| 57 |
+
Network = None
|
| 58 |
+
|
| 59 |
+
try:
|
| 60 |
+
from sentence_transformers import SentenceTransformer
|
| 61 |
+
except Exception:
|
| 62 |
+
SentenceTransformer = None
|
| 63 |
+
|
| 64 |
+
try:
|
| 65 |
+
from jinja2 import Template
|
| 66 |
+
except Exception:
|
| 67 |
+
Template = None
|
| 68 |
+
|
| 69 |
+
# ---------------------------
# Config & Styles
# ---------------------------
# Page chrome must be configured before any other Streamlit rendering call.
st.set_page_config(page_title="OSINT Investigator", layout="wide")

# Global stylesheet injected later via st.markdown(..., unsafe_allow_html=True).
# Hides default Streamlit chrome (#MainMenu, footer), then styles: the floating
# chat widget, brand bar, severity badges, KPI cards, tabs, sticky toolbar,
# score tables, an opt-in light-mode palette, and skeleton/shimmer loaders.
HIDE_STREAMLIT_STYLE = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
.small {font-size: 0.85rem; color: #666}
code {white-space: pre-wrap;}
/* Floating Chat Styles */
.chat-window {position: fixed; bottom: 20px; right: 20px; width: 360px; max-height: 560px; background:#1c1c1c; border:1px solid #444; border-radius:14px; z-index:1000; display:flex; flex-direction:column; box-shadow:0 8px 24px rgba(0,0,0,.55);}
.chat-header {padding:8px 12px; display:flex; align-items:center; gap:8px; border-bottom:1px solid #333; background:#222; border-top-left-radius:14px; border-top-right-radius:14px;}
.chat-header .title {font-weight:600; color:#ffcc66;}
.chat-close {margin-left:auto; cursor:pointer; font-weight:700; color:#bbb;}
.chat-close:hover {color:#fff;}
.chat-messages {padding:10px 12px; overflow-y:auto; flex:1; font-size:0.8rem;}
.chat-messages p {margin:0 0 10px;}
.msg-user {color:#fff;}
.msg-bot {color:#ffcc66; font-style:italic;}
.chat-input {padding:8px 10px; border-top:1px solid #333; background:#181818; border-bottom-left-radius:14px; border-bottom-right-radius:14px;}
.chat-input textarea {font-size:0.75rem !important;}
.badge-action {display:inline-block; background:#333; color:#ffcc66; padding:2px 6px; margin:2px 4px 6px 0; border-radius:6px; font-size:0.6rem; cursor:pointer;}
.badge-action:hover {background:#444;}
.chat-mini-btn {position:fixed; bottom:20px; right:20px; width:62px; height:62px; border-radius:50%; background:#222; border:2px solid #ffcc66; display:flex; align-items:center; justify-content:center; font-size:30px; cursor:pointer; z-index:999; box-shadow:0 0 8px rgba(0,0,0,.6);}
.chat-mini-btn:hover {background:#333;}
/* App Enhancements */
.app-brand-bar {display:flex; align-items:center; gap:14px; padding:8px 18px 4px 8px; border-bottom:1px solid #262626; margin:-1rem -1rem 1.2rem -1rem; background:linear-gradient(90deg,#141414,#181818);}
.app-brand-title {font-size:1.35rem; font-weight:600; letter-spacing:.5px; color:#ffcc66;}
.app-badge {display:inline-block; padding:2px 8px; border-radius:12px; font-size:0.65rem; font-weight:600; text-transform:uppercase; letter-spacing:.5px; margin-right:6px; background:#222; border:1px solid #333; color:#bbb;}
.level-high {background:#11391f; border-color:#1f6d3b; color:#3ddc84;}
.level-medium {background:#3a2e12; border-color:#72581a; color:#ffcf66;}
.level-low {background:#3a1616; border-color:#7a2727; color:#ff6b6b;}
.metric-row {margin-top:.4rem;}
.stDataFrame {border:1px solid #262626; border-radius:10px; overflow:hidden;}
.styled-section {background:#141414; border:1px solid #2a2a2a; padding:1rem 1.2rem; border-radius:14px; box-shadow:0 0 0 1px #111 inset, 0 4px 18px -8px #000;}
.kpi-grid div[data-testid='metric-container'] {background:#181818; border:1px solid #262626; border-radius:12px; padding:.75rem;}
.kpi-grid div[data-testid='stMetric'] {padding:.25rem .5rem .35rem .5rem;}
.plan-expander summary {font-weight:600; letter-spacing:.5px;}
.report-btn button {background:#ffcc66 !important; color:#111 !important; font-weight:600;}
.stDownloadButton button {border-radius:10px;}
.stTextInput input, .stTextArea textarea {border-radius:10px !important;}
.stTabs [data-baseweb='tab-list'] {gap:4px;}
.stTabs [data-baseweb='tab'] {background:#161616; padding:.5rem .9rem; border-radius:10px; border:1px solid #262626;}
.stTabs [data-baseweb='tab']:hover {background:#1d1d1d;}
.stTabs [aria-selected='true'] {background:#222 !important; border-color:#444 !important;}
.section-title {font-size:1.05rem; font-weight:600; letter-spacing:.5px; margin-bottom:.35rem;}
.sticky-toolbar {position:sticky; top:0; z-index:50; background:linear-gradient(90deg,#181818,#141414); padding:.4rem .6rem; border:1px solid #262626; border-radius:10px; margin-bottom:.6rem; box-shadow:0 6px 12px -8px rgba(0,0,0,.6);}
.sticky-toolbar button {margin-right:.35rem;}
.score-table {width:100%; border-collapse:collapse; font-size:0.75rem;}
.score-table th {text-align:left; padding:6px 8px; background:#202020; position:sticky; top:0; z-index:2;}
.score-table td {padding:6px 8px; border-top:1px solid #262626; vertical-align:top;}
.badge {display:inline-block; padding:2px 7px; border-radius:10px; font-size:0.6rem; font-weight:600; letter-spacing:.5px;}
.badge.high {background:#11391f; color:#3ddc84;}
.badge.medium {background:#3a2e12; color:#ffcf66;}
.badge.low {background:#3a1616; color:#ff6b6b;}
.methodology-box {background:#141414; border:1px solid #262626; padding:.8rem 1rem; border-radius:12px; font-size:0.8rem; line-height:1.25rem;}
body.light-mode, .light-mode [data-testid='stAppViewContainer'] {background:#f6f7f9; color:#222;}
.light-mode .app-brand-bar {background:linear-gradient(90deg,#fafafa,#eceff1); border-color:#d8dadd;}
.light-mode .app-brand-title {color:#7a4d00;}
.light-mode .app-badge {background:#fff; border-color:#d1d4d8; color:#555;}
.light-mode .sticky-toolbar {background:linear-gradient(90deg,#fff,#f3f5f7); border-color:#d8dade;}
.light-mode .score-table th {background:#eceff1;}
.light-mode .score-table td {border-color:#d9dde1;}
.light-mode .badge.high {background:#d8f5e6; color:#0d7a3d;}
.light-mode .badge.medium {background:#fbeccb; color:#8a6500;}
.light-mode .badge.low {background:#fbd5d5; color:#b80000;}
.light-mode .stTabs [data-baseweb='tab'] {background:#f5f6f7; border-color:#d9dde1;}
.light-mode .stTabs [aria-selected='true'] {background:#ffffff !important; border-color:#b9bdc1 !important;}
/* Skeleton / Shimmer */
@keyframes shimmer {0% {transform:translateX(-60%);} 100% {transform:translateX(120%);} }
.skeleton-block {position:relative; overflow:hidden; background:#1e1e1e; border-radius:6px; margin:4px 0;}
.skeleton-block.light-mode {background:#e2e5e9;}
.skeleton-block::after {content:""; position:absolute; top:0; left:0; height:100%; width:50%; background:linear-gradient(90deg, rgba(255,255,255,0), rgba(255,255,255,.15), rgba(255,255,255,0)); animation:shimmer 1.25s infinite;}
.sk-line-sm {height:10px;}
.sk-line-md {height:14px;}
.sk-line-lg {height:22px;}
.sk-fade {animation:fadeIn .3s ease-in;}
@keyframes fadeIn {from {opacity:0;} to {opacity:1;}}
</style>
"""
|
| 152 |
+
# Inject the global stylesheet defined in HIDE_STREAMLIT_STYLE.
st.markdown(HIDE_STREAMLIT_STYLE, unsafe_allow_html=True)
# Inject favicon + meta tags as raw HTML. NOTE(review): st.markdown renders
# inside the app body, so this <head> block may not reach the real document
# head in all Streamlit versions — confirm in the deployed app.
st.markdown("""
<head>
<link rel='icon' type='image/svg+xml' href="data:image/svg+xml;utf8,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 64 64'><circle fill='%23111' cx='32' cy='32' r='32'/><path fill='%23ffcc66' d='M12 38l4-14h32l4 14H12zm8 4h24c0 6-6 10-12 10s-12-4-12-10zM24 18c0-4 4-8 8-8s8 4 8 8v4H24v-4z'/></svg>">
<meta name='description' content='OSINT Investigator Suite - AI-augmented open source intelligence enumeration & scoring platform.'>
<meta name='viewport' content='width=device-width, initial-scale=1'>
</head>
""", unsafe_allow_html=True)
# Toggle the 'light-mode' CSS class on the parent document's <body> based on
# the persisted settings. On the very first run st.session_state has no
# "settings" entry yet (it is stored further below), so the else branch runs.
if st.session_state.get("settings", {}).get("light_mode"):
    st.markdown("""<script>const b=window.parent.document.querySelector('body'); if(b&&!b.classList.contains('light-mode')) b.classList.add('light-mode');</script>""", unsafe_allow_html=True)
else:
    st.markdown("""<script>const b=window.parent.document.querySelector('body'); if(b) b.classList.remove('light-mode');</script>""", unsafe_allow_html=True)
|
| 165 |
+
# ---------------------------
|
| 166 |
+
# Sidebar: Settings
|
| 167 |
+
# ---------------------------
|
| 168 |
+
def _get_settings() -> Dict[str, Any]:
    """Render the sidebar settings panel and return the chosen values.

    Returns a dict with keys:
        model:          advisor LLM identifier (UI label; see MODEL_ID_MAP).
        max_per:        default max results per dork (3-50).
        logging:        whether audit logging is enabled.
        use_embeddings: whether semantic-similarity scoring is enabled.
        light_mode:     whether the light UI palette override is active.

    Each widget also persists its own value in st.session_state under the
    widget key, so values survive Streamlit reruns.
    """
    with st.sidebar:
        st.header("Settings")
        model = st.selectbox(
            "Advisor model (CPU-friendly)",
            [
                "qwen2.5-1.5b-instruct",
                "phi-3-mini-4k-instruct",
                "gemma-2-2b-it",
            ],
            index=0,
            key="advisor_model_select",
            help="Choose which free local LLM to use for advisor suggestions."
        )
        max_per = st.slider(
            "Default max results per dork",
            min_value=3,
            max_value=50,
            value=10,
            step=1,
            key="default_max_results",
            help="Used as the default when executing dorks in Step 4."
        )
        # Renamed local from "logging" so it no longer shadows the stdlib
        # logging module name.
        audit_logging = st.checkbox(
            "Enable audit logging",
            value=True,
            key="enable_audit_logging",
            help="If off, actions won't be written to the audit trail."
        )
        use_embeddings = st.checkbox(
            "Enable semantic similarity (embeddings)",
            value=False,
            key="enable_embeddings",
            help="Loads a small sentence-transformer to boost scoring by context relevance."
        )
        light_mode = st.checkbox(
            "Light mode UI override",
            value=False,
            key="light_mode_toggle",
            help="Apply a lighter palette without reloading base theme"
        )
    # Fix: use_embeddings was previously collected but dropped from the
    # returned settings dict; expose it alongside the existing keys (adding a
    # key is backward-compatible for dict consumers).
    return {
        "model": model,
        "max_per": max_per,
        "logging": audit_logging,
        "use_embeddings": use_embeddings,
        "light_mode": light_mode,
    }
| 210 |
+
|
| 211 |
+
# Collect sidebar settings once per rerun; expose them both as a module-level
# constant and in session_state so later code (and the next rerun's
# light-mode check) can read them.
SETTINGS = _get_settings()
st.session_state["settings"] = SETTINGS
# Lazy slot for the optional sentence-transformer model; starts unloaded.
st.session_state.setdefault("_embed_model", None)
|
| 215 |
+
# ---------------------------
|
| 216 |
+
# Google Dorks (typed catalog for many entities)
|
| 217 |
+
# ---------------------------
|
| 218 |
+
class TypedDork(TypedDict):
    """One categorized OSINT search query with a short analyst rationale."""

    # Literal search-engine query string (e.g. 'site:example.com -www').
    q: str
    # Category label; the builders below use the DORK_TYPES glossary keys.
    type: str
    # One-line rationale shown to the analyst explaining why to run it.
    why: str
|
| 223 |
+
# Dork category glossary (shown in explainer)
# Maps category label -> one-line description rendered in the Step 1
# "Dork categories explained" expander. Keys match the TypedDork "type"
# values produced by the typed_dorks_for_* builders.
DORK_TYPES: Dict[str, str] = {
    "Footprinting": "Map surface area: sites/subdomains, logins, admin panels, basic presence.",
    "Directory/Index": "Hunt for open listings or auto-generated indexes exposing files.",
    "Docs/Collab": "Live docs/boards accidentally exposed (docs.google, Trello, etc.).",
    "Code/Repo": "Public repos that may contain references, issues, or credentials.",
    "Credentials/Secrets": "Clues that hint at passwords/keys or places leaks may exist.",
    "Exposure/Leak": "Mentions of breaches, leaks, or dumps involving the entity.",
    "People/Profiles": "Official bios, resumes/CVs, speaker pages, researcher profiles.",
    "Social Activity": "Usernames/handles across social and developer communities.",
    "Regulatory/Legal": "Filings and official records (e.g., SEC/EDGAR).",
    "Incidents/Risk": "Incident reports, outages, protests, negative events.",
    "Academic/Research": "Scholarly/technical works tied to a name or org.",
}
+
|
| 238 |
+
# ---- Typed dork builders ----
|
| 239 |
+
|
| 240 |
+
def typed_dorks_for_email(email: str) -> List[TypedDork]:
    """Build the categorized dork catalog for an email address.

    Splits the address into local part and domain (the domain falls back to
    "" when no "@" is present) and returns queries covering footprinting,
    document exposure, code forges, paste sites, and credential heuristics.
    """
    pieces = email.split("@", 1)
    user = pieces[0]
    dom = pieces[1] if len(pieces) > 1 else ""
    specs = [
        (f'"{email}"', "Footprinting", "Exact email mentions across the web."),
        (f'intext:"{email}"', "Footprinting", "Mentions inside page bodies."),
        (f'intext:"{user}" intext:"{dom}"', "Footprinting", "Mentions with split user/domain."),
        (f'site:{dom} intext:"@{dom}"', "Footprinting", "Emails published on the same domain."),
        (f'"{email}" filetype:pdf OR filetype:doc OR filetype:xls OR filetype:csv', "Docs/Collab", "Docs that may expose PII/roles."),
        (f'"{email}" site:github.com OR site:gitlab.com', "Code/Repo", "Commits/issues referencing the email."),
        (f'"{email}" site:gravatar.com', "People/Profiles", "Avatar/profile tied to the email hash."),
        (f'"{email}" site:pastebin.com OR site:ghostbin.com OR site:hastebin.com', "Exposure/Leak", "Common paste sites for leaks."),
        (f'"{email}" inurl:wp- OR inurl:wp-content OR inurl:wp-config', "Directory/Index", "WordPress artifacts sometimes leak emails."),
        (f'"{email}" AROUND(3) "password"', "Credentials/Secrets", "Heuristic for password-adjacent mentions."),
    ]
    return [{"q": q, "type": kind, "why": why} for q, kind, why in specs]
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def typed_dorks_for_domain(d: str) -> List[TypedDork]:
    """Build the categorized dork catalog for a domain/website.

    Covers footprinting (apex/subdomains/auth pages), people, code forges,
    collaborative docs, secret-bearing extensions, and leak mentions.
    """
    def entry(q: str, kind: str, why: str) -> TypedDork:
        # Keep key order identical to the other builders.
        return {"q": q, "type": kind, "why": why}

    return [
        entry(f"site:{d} -www", "Footprinting", "Apex domain excluding www."),
        entry(f"site:*.{d} -www", "Footprinting", "Enumerate subdomains exposed to crawlers."),
        entry(f'"@{d}"', "Footprinting", "Emails belonging to the domain across the web."),
        entry(f'site:linkedin.com "{d}"', "People/Profiles", "Employees listing org domain."),
        entry(f'site:github.com "{d}"', "Code/Repo", "Repositories/issues referencing the domain."),
        entry(f'site:gitlab.com "{d}"', "Code/Repo", "Alternate forge often used by teams."),
        entry(f'site:docs.google.com "{d}"', "Docs/Collab", "Potentially exposed Google Docs/Sheets/Slides."),
        entry(f'site:trello.com "{d}"', "Docs/Collab", "Public Trello boards occasionally misconfigured."),
        entry(f'"{d}" filetype:pdf OR filetype:doc OR filetype:xls OR filetype:ppt OR filetype:csv', "Docs/Collab", "Documents with the org name/domain."),
        entry(f"site:{d} inurl:login OR inurl:admin OR inurl:signup", "Footprinting", "Auth surfaces (discovery only)."),
        entry(f'site:{d} intitle:"index of"', "Directory/Index", "Open directory listings on that domain."),
        entry(f"site:{d} ext:env OR ext:.git OR ext:git-credentials OR ext:sql OR ext:log", "Credentials/Secrets", "Common secret-bearing file extensions."),
        entry(f'"{d}" breach OR leak OR "data exposure"', "Exposure/Leak", "Press and trackers mentioning exposures."),
    ]
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def typed_dorks_for_ip(ip: str) -> List[TypedDork]:
    """Build the categorized dork catalog for a raw IP address.

    Targets log exposure, open directory listings, and abuse/blacklist
    mentions of the literal address.
    """
    specs = [
        (f'"{ip}"', "Footprinting", "Places where the raw IP is printed or logged."),
        (f'intext:"{ip}"', "Footprinting", "Body text mentions (forums, logs)."),
        (f'"{ip}" filetype:log OR filetype:txt', "Directory/Index", "Exposed logs referencing the IP."),
        (f'"{ip}" blacklist OR abuse', "Incidents/Risk", "Blacklist/abuse mentions and reports."),
        (f'"{ip}" intitle:"index of"', "Directory/Index", "Open indexes listing files with that IP."),
    ]
    return [{"q": q, "type": kind, "why": why} for q, kind, why in specs]
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def typed_dorks_for_username(u: str) -> List[TypedDork]:
    """Build the categorized dork catalog for a username/handle.

    Seeks the same handle across social platforms, author pages,
    cryptographic identity services, and resume documents.
    """
    specs = [
        (f'"{u}"', "Footprinting", "Exact handle mentions across the web."),
        (f'"{u}" site:twitter.com OR site:x.com OR site:reddit.com OR site:github.com OR site:stackexchange.com', "Social Activity", "Find consistent identity across major platforms."),
        (f'"{u}" site:medium.com OR site:substack.com', "People/Profiles", "Author pages tied to the handle."),
        (f'"{u}" site:keybase.io', "People/Profiles", "Cryptographic identity/proofs."),
        (f'"{u}" inurl:users OR inurl:profile', "Footprinting", "Generic user profile URLs."),
        (f'"{u}" filetype:pdf resume OR "curriculum vitae"', "People/Profiles", "CVs/resumes listing the handle."),
        (f'"{u}" AROUND(3) email', "People/Profiles", "Correlate handle to emails in bios/posts."),
        (f'"{u}" avatar OR "profile photo"', "People/Profiles", "Images tied to the identity."),
    ]
    return [{"q": q, "type": kind, "why": why} for q, kind, why in specs]
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def typed_dorks_for_person(name: str) -> List[TypedDork]:
    """Build the categorized dork catalog for a named individual.

    Emphasizes professional profiles, resumes, speaking engagements,
    developer footprints, and scholarly output.
    """
    specs = [
        (f'"{name}"', "Footprinting", "Exact full-name mentions."),
        (f'"{name}" site:linkedin.com', "People/Profiles", "Primary professional profile."),
        (f'"{name}" filetype:pdf resume OR "curriculum vitae"', "People/Profiles", "Resume/CV documents."),
        (f'"{name}" conference OR talk OR keynote', "People/Profiles", "Speaker bios and conference pages."),
        (f'"{name}" site:github.com OR site:gitlab.com', "Code/Repo", "Developer activity tied to the name."),
        (f'"{name}" site:researchgate.net OR site:scholar.google.com', "Academic/Research", "Scholarly output."),
        (f'"{name}" site:medium.com OR site:substack.com', "People/Profiles", "Editorial/social writing."),
        (f'"{name}" "email" OR "contact"', "People/Profiles", "Pages listing contact info."),
    ]
    return [{"q": q, "type": kind, "why": why} for q, kind, why in specs]
| 308 |
+
|
| 309 |
+
|
| 310 |
+
def typed_dorks_for_org(org: str) -> List[TypedDork]:
    """Build the categorized dork catalog for an organization/company.

    Covers regulatory filings, procurement notices, incident mentions,
    personnel, public repositories, and exposed documents.
    """
    specs = [
        (f'"{org}" site:sec.gov OR site:edgar', "Regulatory/Legal", "Official SEC/EDGAR filings."),
        (f'"{org}" contract award OR RFP OR "sources sought"', "Regulatory/Legal", "Gov procurement history and notices."),
        (f'"{org}" breach OR incident OR "data exposure"', "Incidents/Risk", "News/trackers about incidents/leaks."),
        (f'"{org}" site:linkedin.com', "People/Profiles", "Employees and org page."),
        (f'"{org}" site:github.com OR site:gitlab.com', "Code/Repo", "Public repos under org name."),
        (f'"{org}" filetype:pdf OR filetype:doc OR filetype:ppt OR filetype:xls', "Docs/Collab", "Documents carrying org name."),
        (f'"{org}" site:docs.google.com OR site:trello.com', "Docs/Collab", "Potentially exposed docs/boards."),
    ]
    return [{"q": q, "type": kind, "why": why} for q, kind, why in specs]
| 320 |
+
|
| 321 |
+
|
| 322 |
+
def typed_dorks_for_location(loc: str) -> List[TypedDork]:
    """Build the categorized dork catalog for a geographic location.

    Focuses on incident/news coverage, imagery context, and reports
    referencing the place name.
    """
    specs = [
        (f'"{loc}" incident OR protest OR outage', "Incidents/Risk", "Events/incidents tied to the place."),
        (f'"{loc}" satellite imagery OR "before after"', "Footprinting", "Imagery context for geospatial checks."),
        (f'"{loc}" site:news', "Incidents/Risk", "Recent news mentions for the place."),
        (f'"{loc}" filetype:pdf report', "Docs/Collab", "Reports that reference the location."),
    ]
    return [{"q": q, "type": kind, "why": why} for q, kind, why in specs]
| 329 |
+
|
| 330 |
+
|
| 331 |
+
def typed_dorks_for_file(desc: str) -> List[TypedDork]:
    """Build the categorized dork catalog for a file/image description.

    Hunts documents by keyword across filetype operators, the Internet
    Archive, and open directory indexes.
    """
    specs = [
        (f'"{desc}" filetype:pdf OR filetype:doc OR filetype:xls OR filetype:ppt OR filetype:csv', "Docs/Collab", "Document hunting by keyword."),
        (f'"{desc}" site:archive.org', "Docs/Collab", "Wayback/Archive artifacts."),
        (f'"{desc}" intitle:"index of"', "Directory/Index", "Open listings that may contain files."),
    ]
    return [{"q": q, "type": kind, "why": why} for q, kind, why in specs]
| 337 |
+
|
| 338 |
+
# Dispatch table: UI entity-type label -> builder returning that entity's
# typed dork catalog. Values are one-argument callables (entity value string
# -> List[TypedDork]); annotated as Any because Callable is not imported in
# this module's typing imports.
TYPED_DORK_MAP: Dict[str, Any] = {
    "Email Address": typed_dorks_for_email,
    "Domain / Website": typed_dorks_for_domain,
    "IP Address": typed_dorks_for_ip,
    "Username / Handle": typed_dorks_for_username,
    "Named Individual": typed_dorks_for_person,
    "Organization / Company": typed_dorks_for_org,
    "Location": typed_dorks_for_location,
    "File / Image": typed_dorks_for_file,
}
| 348 |
+
|
| 349 |
+
# ---------------------------
|
| 350 |
+
# STEP 1: Explainer
|
| 351 |
+
# ---------------------------
|
| 352 |
+
def render_dorks_explainer(entity_type: str, entity_value: str):
    """Step 1 UI: show the dork-category legend plus a catalog tailored to the entity."""
    st.subheader("Step 1: Dork Explainer")
    st.caption("These are categorized OSINT search operators. Copy/paste into Google if you like; this app automates via DuckDuckGo to respect ToS.")
    with st.expander("Dork categories explained", expanded=False):
        for category, blurb in DORK_TYPES.items():
            st.markdown(f"**{category}** — {blurb}")

    # Build the tailored catalog only when both a known type and a value exist.
    catalog_builder = TYPED_DORK_MAP.get(entity_type)
    catalog = []
    if catalog_builder and entity_value:
        catalog = catalog_builder(entity_value)
    if not catalog:
        st.info("Enter an entity value above to see a tailored catalog.")
        return
    for entry in catalog:
        st.markdown(f"- **[{entry['type']}]** `{entry['q']}`")
        st.markdown(f" <span class='small'>{entry['why']}</span>", unsafe_allow_html=True)
|
| 367 |
+
|
| 368 |
+
# ---------------------------
|
| 369 |
+
# STEP 2: Advisor (LLM-powered with rules fallback)
|
| 370 |
+
# ---------------------------
|
| 371 |
+
|
| 372 |
+
# Goal weights for rules-based fallback / blending
# Maps each user-selectable goal to the dork categories it favors; the int is
# the additive bonus a dork of that category receives in _score_dork_rule.
GOAL_WEIGHTS: Dict[str, Dict[str, int]] = {
    "Map footprint / surface": {"Footprinting": 3, "Directory/Index": 2},
    "Find documents & spreadsheets": {"Docs/Collab": 3, "Directory/Index": 2},
    "Discover code & credentials": {"Code/Repo": 3, "Credentials/Secrets": 3, "Directory/Index": 2},
    "Identify breaches/leaks": {"Exposure/Leak": 3, "Credentials/Secrets": 2},
    "Find people & org info": {"People/Profiles": 3, "Regulatory/Legal": 2},
    "Track incidents / risk": {"Incidents/Risk": 3},
    "Academic/technical trails": {"Academic/Research": 3},
}
# The goal labels double as the option list for the Advisor multiselect.
DEFAULT_GOALS = list(GOAL_WEIGHTS.keys())
|
| 383 |
+
|
| 384 |
+
# Short model aliases (presumably the options offered in the settings UI —
# confirm at the settings widget) -> full Hugging Face repo ids for the
# Inference API. Unknown aliases pass through unchanged via
# MODEL_ID_MAP.get(key, key) in _recommend_llm.
MODEL_ID_MAP = {
    "qwen2.5-1.5b-instruct": "Qwen/Qwen2.5-1.5B-Instruct",
    "phi-3-mini-4k-instruct": "microsoft/phi-3-mini-4k-instruct",
    "gemma-2-2b-it": "google/gemma-2-2b-it",
}
|
| 389 |
+
|
| 390 |
+
# ---------------------------
|
| 391 |
+
# Known Facts Model
|
| 392 |
+
# ---------------------------
|
| 393 |
+
@dataclass
class KnownFacts:
    """Session-scoped container for prior intelligence about the target.

    All list fields hold plain strings (parsed from comma-separated UI
    input); ``context`` is free-text mission keywords used by the scoring
    heuristics.
    """
    handles: List[str]      # social / developer usernames
    real_names: List[str]   # full names or key name variants
    emails: List[str]
    domains: List[str]
    ips: List[str]
    locations: List[str]
    orgs: List[str]
    context: str            # free-text mission context / keywords

    @classmethod
    def from_session(cls) -> "KnownFacts":
        """Return the facts saved in st.session_state, or an empty instance."""
        return st.session_state.get("known_facts") or cls([], [], [], [], [], [], [], "")
|
| 407 |
+
|
| 408 |
+
def _parse_csv(s: str) -> List[str]:
|
| 409 |
+
return [x.strip() for x in (s or "").split(",") if x.strip()]
|
| 410 |
+
|
| 411 |
+
def _known_facts_ui():
    """Render the Known Facts editor plus the candidate-generation panel.

    Facts are stored only in st.session_state["known_facts"]; candidates
    live in st.session_state["generated_candidates"] until merged.
    """
    st.subheader("Known Facts / Prior Intelligence")
    st.caption("Provide what you already know. This seeds scoring & generation.")
    # Three-column entry form; each text_area takes comma-separated values.
    col_a, col_b, col_c = st.columns(3)
    with col_a:
        handles = st.text_area("Handles / Usernames (comma)", key="kf_handles", height=70)
        emails = st.text_area("Emails (comma)", key="kf_emails", height=70)
        ips = st.text_area("IP addresses (comma)", key="kf_ips", height=70)
    with col_b:
        real_names = st.text_area("Real Names (comma)", key="kf_real_names", height=70, help="Full names or key name variants")
        domains = st.text_area("Domains (comma)", key="kf_domains", height=70)
        orgs = st.text_area("Organizations (comma)", key="kf_orgs", height=70)
    with col_c:
        locations = st.text_area("Locations (comma)", key="kf_locations", height=70)
        context = st.text_area("Context / Keywords", key="kf_context", height=160, help="Free-text mission context, tech stack, roles, etc.")
    if st.button("Save Known Facts", key="btn_save_facts"):
        facts = KnownFacts(
            handles=_parse_csv(handles),
            real_names=_parse_csv(real_names),
            emails=_parse_csv(emails),
            domains=_parse_csv(domains),
            ips=_parse_csv(ips),
            locations=_parse_csv(locations),
            orgs=_parse_csv(orgs),
            context=context.strip(),
        )
        st.session_state["known_facts"] = facts
        st.success("Facts saved (session only).")
    facts = KnownFacts.from_session()
    st.markdown(f"**Current facts loaded:** {len(facts.handles)} handles, {len(facts.emails)} emails, {len(facts.domains)} domains, {len(facts.real_names)} names.")
    st.markdown("---")
    st.markdown("### Candidate Generation")
    st.caption("Generate permutations / derived candidates from known facts.")
    if st.button("Generate Candidates", key="btn_gen_candidates"):
        facts = KnownFacts.from_session()
        usernames = set(facts.handles)
        # simple mutations: common suffixes appended to every known handle
        # (iterate over a snapshot since the set grows while looping)
        for h in list(usernames):
            for suf in ["123", "01", "_sec", "_research", "-dev"]:
                usernames.add(h + suf)
            if h.isalpha():
                usernames.add(h + "1")
        # email permutations (if have names + domains): first/last-name
        # patterns over the first real name and up to three domains
        emails = set(facts.emails)
        if facts.real_names and facts.domains:
            first = facts.real_names[0].split()[0].lower()
            last = facts.real_names[0].split()[-1].lower()
            for d in facts.domains[:3]:
                emails.update({
                    f"{first}.{last}@{d}",
                    f"{first}{last}@{d}",
                    f"{first[0]}{last}@{d}",
                    f"{first}_{last}@{d}",
                })
        # domain variants (very light): -dev / -staging siblings on the root label
        dom_vars = set(facts.domains)
        for d in facts.domains:
            if d.count('.') >= 1:
                root = d.split('.')[0]
                tld = d.split('.')[-1]
                dom_vars.add(root + "-dev." + tld)
                dom_vars.add(root + "-staging." + tld)
        # Cap each candidate class at 100 entries to keep the UI responsive.
        st.session_state["generated_candidates"] = {
            "usernames": sorted(list(usernames))[:100],
            "emails": sorted(list(emails))[:100],
            "domains": sorted(list(dom_vars))[:100]
        }
        st.success("Candidates generated.")
    cand = st.session_state.get("generated_candidates")
    if cand:
        st.write("Usernames (sample)", cand["usernames"][:10])
        st.write("Emails (sample)", cand["emails"][:10])
        st.write("Domains (sample)", cand["domains"][:10])
        if st.button("Add All Candidates to Facts", key="btn_add_cand"):
            # Merge candidates into facts, deduplicating and keeping a stable order.
            facts = KnownFacts.from_session()
            facts.handles = sorted(list(set(facts.handles + cand["usernames"])))
            facts.emails = sorted(list(set(facts.emails + cand["emails"])))
            facts.domains = sorted(list(set(facts.domains + cand["domains"])))
            st.session_state["known_facts"] = facts
            st.success("Candidates merged into facts.")
|
| 491 |
+
|
| 492 |
+
def _generate_investigation_plan(entity_type: str, entity_value: str, facts: KnownFacts) -> Dict[str, Any]:
    """Produce a structured investigation plan based on current facts and target type.

    Args:
        entity_type: Entity category label selected in the UI.
        entity_value: The concrete target identifier.
        facts: Current KnownFacts snapshot (drives the gap analysis).

    Returns:
        Dict with keys: entity_type, entity_value, objectives, gaps,
        phases (each phase: phase/goals/actions), facts_snapshot.
    """
    objectives = [
        "Establish definitive identifiers (emails, handles, domains) to anchor pivots",
        "Map exposed surface (sites, code, documents, credentials indicators)",
        "Correlate identities across platforms and artifacts",
        "Identify signs of exposure, breach, or sensitive data leakage",
        "Prioritize high-confidence findings for deeper manual review"
    ]
    # Gap analysis: each missing fact class becomes a human-readable gap;
    # some gaps only apply to certain entity types.
    gaps = []
    if not facts.emails: gaps.append("No confirmed email addresses")
    if not facts.handles: gaps.append("No social/developer handles")
    if not facts.domains and entity_type != "Domain / Website": gaps.append("No related domains captured")
    if not facts.real_names and entity_type in ("Named Individual", "Organization / Company"): gaps.append("No individual name variants")
    if not facts.orgs and entity_type == "Named Individual": gaps.append("No employing organizations")
    if not facts.context: gaps.append("Context / mission keywords empty (reduces scoring nuance)")
    if not gaps: gaps = ["Current fact set sufficient for first enumeration pass"]

    # Phase recommendations: a fixed five-phase playbook (static content;
    # not derived from the facts beyond what is shown above).
    phases: List[Dict[str, Any]] = []
    phases.append({
        "phase": "Phase 1 - Baseline & Fact Hardening",
        "goals": ["Normalize entity value", "Collect canonical facts", "Note obvious pivots"],
        "actions": [
            "Record primary identifier in Known Facts",
            "Add any immediately known emails, domains, handles",
            "Capture mission / context keywords (tech stack, industry, roles)",
            "Run Advisor for broad Footprinting and People queries"
        ]
    })
    phases.append({
        "phase": "Phase 2 - Surface Enumeration",
        "goals": ["Map public assets", "Discover documents & code"],
        "actions": [
            "Select dorks: site:, filetype:, intitle:'index of' variations",
            "Enumerate repo references (GitHub/GitLab) and note unique strings",
            "Pull down high-signal docs (PDF/DOCX) and extract metadata for hidden emails/handles"
        ]
    })
    phases.append({
        "phase": "Phase 3 - Identity Correlation",
        "goals": ["Link handles to emails", "Find cross-platform reuse"],
        "actions": [
            "Search handles with platform-specific queries (social + developer)",
            "Leverage resume / CV / speaker page dorks for name-email alignment",
            "Add newly confirmed identifiers back into Known Facts and re-score"
        ]
    })
    phases.append({
        "phase": "Phase 4 - Exposure & Risk Signals",
        "goals": ["Detect leak indicators", "Prioritize potential sensitive exposure"],
        "actions": [
            "Run leak / breach / paste oriented dorks including credential keywords",
            "Inspect any pastebin / gist / artifact snippets for policy or secret references",
            "Flag findings with multiple co-occurring identifiers for manual escalation"
        ]
    })
    phases.append({
        "phase": "Phase 5 - Consolidation & Reporting",
        "goals": ["Score & rank findings", "Produce exportable report"],
        "actions": [
            "Re-score after final fact enrichment",
            "Visualize graph to ensure high-score nodes connect multiple anchors",
            "Export HTML report and retain audit log",
            "Document residual gaps & next potential pivots (e.g., historical archives, certificate transparency)"
        ]
    })
    return {
        "entity_type": entity_type,
        "entity_value": entity_value,
        "objectives": objectives,
        "gaps": gaps,
        "phases": phases,
        # Shallow dict view of the dataclass for export/reporting.
        "facts_snapshot": facts.__dict__,
    }
|
| 568 |
+
|
| 569 |
+
def render_investigation_plan(entity_type: str, entity_value: str):
    """Render the phased investigation plan and offer a Markdown export.

    Args:
        entity_type: Entity category label selected in the UI.
        entity_value: The concrete target identifier.
    """
    st.subheader("Investigation Plan")
    facts = KnownFacts.from_session()
    plan = _generate_investigation_plan(entity_type, entity_value, facts)
    st.markdown("### Core Objectives")
    for o in plan["objectives"]:
        st.markdown(f"- {o}")
    st.markdown("### Current Gaps")
    for g in plan["gaps"]:
        st.markdown(f"- {g}")
    st.markdown("### Phased Approach")
    for ph in plan["phases"]:
        with st.expander(ph["phase"], expanded=False):
            st.markdown("**Goals**")
            for g in ph["goals"]:
                st.markdown(f"- {g}")
            st.markdown("**Actions**")
            for a in ph["actions"]:
                st.markdown(f"- {a}")
    if st.button("Export Plan (Markdown)", key="btn_export_plan"):
        # Flatten the plan dict into a Markdown document for download.
        md_lines = [f"# Investigation Plan: {plan['entity_type']} — {plan['entity_value']}", "", "## Objectives"]
        md_lines += [f"- {o}" for o in plan["objectives"]]
        md_lines += ["", "## Gaps"] + [f"- {g}" for g in plan["gaps"]]
        md_lines += ["", "## Phases"]
        for ph in plan["phases"]:
            md_lines.append(f"### {ph['phase']}")
            md_lines.append("**Goals**")
            md_lines += [f"- {g}" for g in ph["goals"]]
            md_lines.append("**Actions**")
            md_lines += [f"- {a}" for a in ph["actions"]]
            md_lines.append("")
        md = "\n".join(md_lines)
        st.download_button("Download Plan", md, file_name="investigation_plan.md", mime="text/markdown")
|
| 602 |
+
|
| 603 |
+
|
| 604 |
+
def _score_dork_rule(d: TypedDork, goals: List[str], user_note: str) -> float:
    """Rules-based relevance score for one dork.

    Starts at 1.0, adds the GOAL_WEIGHTS bonus for the dork's category per
    selected goal, then adds hint-keyword boosts when the user's note
    mentions themes matching the dork's category.
    """
    total = 1.0
    for goal in goals:
        total += GOAL_WEIGHTS.get(goal, {}).get(d["type"], 0)

    note = (user_note or "").lower()
    # (trigger keywords in the note, categories boosted, bonus)
    boosts = (
        (("password", "credential", "secret", "token"), {"Credentials/Secrets", "Code/Repo", "Directory/Index"}, 1.5),
        (("resume", "cv", "employee", "contact"), {"People/Profiles"}, 1.0),
        (("breach", "leak", "dump", "paste"), {"Exposure/Leak", "Credentials/Secrets"}, 1.5),
        (("paper", "research", "doi", "citation"), {"Academic/Research"}, 1.0),
    )
    for keywords, categories, bonus in boosts:
        if any(k in note for k in keywords) and d["type"] in categories:
            total += bonus
    return total
|
| 624 |
+
|
| 625 |
+
|
| 626 |
+
def _recommend_rules(entity_type: str, entity_value: str, goals: List[str], user_note: str, top_k: int = 10) -> List[TypedDork]:
    """Rank the static catalog for this entity by goal/hint fit; return the top_k."""
    builder = TYPED_DORK_MAP.get(entity_type)
    if not (builder and entity_value):
        return []
    catalog = builder(entity_value)
    # In-place stable sort is safe: builders return a fresh list each call.
    catalog.sort(key=lambda dork: _score_dork_rule(dork, goals, user_note), reverse=True)
    return catalog[:top_k]
|
| 631 |
+
|
| 632 |
+
|
| 633 |
+
def _safe_json_list(txt: str) -> List[Dict[str, Any]]:
|
| 634 |
+
"""Best-effort extraction of a JSON list from raw LLM text or user input.
|
| 635 |
+
|
| 636 |
+
Strategy:
|
| 637 |
+
1. Strip surrounding markdown code fences (with or without language tag).
|
| 638 |
+
2. Attempt direct json.loads.
|
| 639 |
+
3. Locate outermost '[' ... ']' span and attempt parse.
|
| 640 |
+
Returns [] on any failure or non-list root.
|
| 641 |
+
"""
|
| 642 |
+
if not txt:
|
| 643 |
+
return []
|
| 644 |
+
s = txt.strip()
|
| 645 |
+
# Remove markdown fences like ```json ... ```
|
| 646 |
+
if s.startswith("```"):
|
| 647 |
+
lines = s.split("\n")
|
| 648 |
+
# drop first fence line
|
| 649 |
+
lines = lines[1:]
|
| 650 |
+
# drop trailing fence line if present
|
| 651 |
+
if lines and lines[-1].strip() == "```":
|
| 652 |
+
lines = lines[:-1]
|
| 653 |
+
s = "\n".join(lines).strip()
|
| 654 |
+
# Try direct parse
|
| 655 |
+
try:
|
| 656 |
+
data = json.loads(s)
|
| 657 |
+
if isinstance(data, list):
|
| 658 |
+
return data # type: ignore[return-value]
|
| 659 |
+
except Exception:
|
| 660 |
+
pass
|
| 661 |
+
# Fallback: largest bracketed list slice
|
| 662 |
+
start = s.find("[")
|
| 663 |
+
end = s.rfind("]")
|
| 664 |
+
if start != -1 and end != -1 and end > start:
|
| 665 |
+
candidate = s[start:end+1]
|
| 666 |
+
try:
|
| 667 |
+
data = json.loads(candidate)
|
| 668 |
+
if isinstance(data, list):
|
| 669 |
+
return data # type: ignore[return-value]
|
| 670 |
+
except Exception:
|
| 671 |
+
pass
|
| 672 |
+
return []
|
| 673 |
+
|
| 674 |
+
|
| 675 |
+
def _hf_infer(model_id: str, prompt: str, max_new_tokens: int = 384, temperature: float = 0.2) -> Optional[str]:
    """Call the Hugging Face Inference API for text generation.

    Returns the generated text (or a JSON dump for unrecognized response
    shapes), or None when prerequisites are missing or the call fails —
    None triggers the rules-based fallback upstream.
    """
    # Preconditions: the optional 'requests' dependency and an API token.
    if requests is None:
        st.warning("'requests' not installed; cannot call Hugging Face Inference API. Falling back to rules.")
        return None
    token = os.getenv("HF_API_TOKEN")
    if not token:
        st.warning("HF_API_TOKEN not set. Add it as a secret/environment variable to enable LLM advisor. Falling back to rules.")
        return None

    endpoint = f"https://api-inference.huggingface.co/models/{model_id}"
    request_body = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "return_full_text": False,
        },
    }
    try:
        response = requests.post(endpoint, headers={"Authorization": f"Bearer {token}"}, json=request_body, timeout=90)
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        st.warning(f"HF inference error: {e}. Falling back to rules.")
        return None

    # Known response shapes: [{"generated_text": ...}] or {"generated_text": ...}.
    if isinstance(data, list) and data and isinstance(data[0], dict) and "generated_text" in data[0]:
        return data[0]["generated_text"]
    if isinstance(data, dict) and "generated_text" in data:
        return data["generated_text"]
    # Unknown shape: hand back the serialized payload for downstream parsing.
    return json.dumps(data)
|
| 709 |
+
|
| 710 |
+
|
| 711 |
+
def _build_llm_prompt(entity_type: str, entity_value: str, goals: List[str], hint: str, baseline: List[TypedDork], top_k: int) -> str:
    """Assemble the instruction prompt sent to the HF text-generation model.

    Asks for a bare JSON array of dork objects constrained to the known
    categories; the baseline catalog (capped at 25 entries) is included as
    inspiration only.
    """
    cat_list = ", ".join(sorted(DORK_TYPES.keys()))
    baseline_lines = "\n".join([f"- {d['type']}: {d['q']} // {d['why']}" for d in baseline[:25]])
    # NOTE: doubled braces below render as literal {} inside the f-string,
    # so the schema example reaches the model verbatim.
    return f"""
You are an OSINT assistant that crafts focused Google dorks.
Given the entity type and value, the user's goals, and an optional hint, return a JSON array (and ONLY a JSON array) of up to {top_k} objects with this schema:
{{"q": "<google dork string>", "type": "<one of [{cat_list}]>", "why": "<1 sentence rationale>"}}
Rules:
- Prefer free, public sources; avoid paid services.
- Keep queries precise; quote exact strings; use site:, filetype:, inurl:, intitle:, and AROUND(n) when helpful.
- Use ONLY categories from the allowed list above.
- Output must be valid JSON (no prose, no markdown fences).

ENTITY_TYPE: {entity_type}
ENTITY_VALUE: {entity_value}
GOALS: {goals}
HINT: {hint or '(none)'}
BASELINE_CATALOG (for inspiration, don't just repeat):
{baseline_lines}
"""
|
| 731 |
+
|
| 732 |
+
|
| 733 |
+
def _recommend_llm(entity_type: str, entity_value: str, goals: List[str], hint: str, top_k: int) -> List[TypedDork]:
    """Ask the LLM for dork suggestions; validate, dedupe, and cap at top_k.

    Returns [] when inference is unavailable or yields nothing parseable,
    which signals the caller to fall back to the rules engine.
    """
    builder = TYPED_DORK_MAP.get(entity_type)
    baseline = builder(entity_value) if (builder and entity_value) else []
    settings = st.session_state.get("settings", {})
    model_key = settings.get("model", "qwen2.5-1.5b-instruct")
    model_id = MODEL_ID_MAP.get(model_key, model_key)

    raw = _hf_infer(model_id, _build_llm_prompt(entity_type, entity_value, goals, hint, baseline, top_k))
    if not raw:
        return []

    # Single pass: validate each item, coerce unknown categories, and
    # dedupe by query string while preserving order.
    suggestions: List[TypedDork] = []
    seen_queries = set()
    for item in _safe_json_list(raw):
        if not isinstance(item, dict):
            continue
        query = str(item.get("q", "")).strip()
        if not query or query in seen_queries:
            continue
        category = str(item.get("type", "Footprinting")).strip()
        if category not in DORK_TYPES:
            category = "Footprinting"
        rationale = str(item.get("why", "Suggested by LLM")).strip()
        seen_queries.add(query)
        suggestions.append({"q": query, "type": category, "why": rationale})
        if len(suggestions) == top_k:
            break
    return suggestions
|
| 764 |
+
|
| 765 |
+
|
| 766 |
+
def render_dork_recommender(entity_type: str, entity_value: str):
    """Step 2 UI: collect goals/hint and surface ranked dork suggestions."""
    st.subheader("Step 2: Advisor")
    goals = st.multiselect("What are you trying to do?", DEFAULT_GOALS, default=["Map footprint / surface", "Find documents & spreadsheets"], key="advisor_goals")
    hint = st.text_input("Optional hint (e.g., 'credentials around build system', 'employee directory')", key="advisor_hint")
    top_k = st.slider("How many suggestions?", 3, 20, 10, key="advisor_topk")
    use_llm = st.checkbox("Use advisor LLM (Hugging Face Inference API)", value=False, key="use_llm_checkbox", help="Requires HF_API_TOKEN environment secret. Falls back to rules if unavailable.")

    if not st.button("Suggest dorks", key="btn_suggest"):
        return
    # LLM first (when enabled), rules engine as fallback.
    suggestions: List[TypedDork] = _recommend_llm(entity_type, entity_value, goals, hint, top_k) if use_llm else []
    if not suggestions:
        suggestions = _recommend_rules(entity_type, entity_value, goals, hint, top_k)
    if not suggestions:
        st.warning("Enter a valid entity value first.")
        return
    st.session_state["dork_recs"] = suggestions
    st.markdown("#### Recommended dorks")
    for rec in suggestions:
        st.markdown(f"- **[{rec['type']}]** `{rec['q']}`")
        st.markdown(f" <span class='small'>{rec['why']}</span>", unsafe_allow_html=True)
|
| 787 |
+
|
| 788 |
+
# ---------------------------
|
| 789 |
+
# STEP 3: Selection
|
| 790 |
+
# ---------------------------
|
| 791 |
+
def render_dork_selection(entity_type: str, entity_value: str):
    """Step 3 UI: build the final basket of dorks to execute.

    Fixes: the original indexed TYPED_DORK_MAP[entity_type] directly, which
    raised KeyError for any entity type without a catalog builder, and it
    called the builder even with an empty entity value (producing dorks
    quoting an empty string). Both paths are now guarded.
    """
    st.subheader("Step 3: Select dorks")
    recs = st.session_state.get("dork_recs", [])
    choice = st.radio("Select method", ["Accept advisor", "Pick from catalog", "Custom"], key="method_radio")
    final: List[str] = []
    if choice == "Accept advisor":
        final = [r["q"] for r in recs]
    elif choice == "Pick from catalog":
        # Guarded lookup: unmapped entity types / empty values get no catalog
        # instead of a KeyError.
        builder = TYPED_DORK_MAP.get(entity_type)
        typed = builder(entity_value) if (builder and entity_value) else []
        if not typed:
            st.info("No catalog available — enter an entity value above first.")
        for idx, d in enumerate(typed):
            if st.checkbox(d["q"], key=f"pick_{idx}"):
                final.append(d["q"])
    elif choice == "Custom":
        # Explicit widget key for stable state across reruns, matching the
        # keyed-widget convention used elsewhere in this file.
        txt = st.text_area("Enter custom dorks", key="custom_dorks_text")
        if txt:
            final = [line.strip() for line in txt.splitlines() if line.strip()]
    st.session_state["selected_dorks"] = final
    st.write("Final Basket:", final)
|
| 809 |
+
|
| 810 |
+
# ---------------------------
|
| 811 |
+
# STEP 4: Execution + Metadata
|
| 812 |
+
# ---------------------------
|
| 813 |
+
def _audit_init():
    """Ensure the session-scoped audit trail list exists."""
    if "audit" not in st.session_state:
        st.session_state["audit"] = []
|
| 815 |
+
|
| 816 |
+
def _audit_log(action: str, **details):
    """Append a timestamped entry to the session audit trail.

    Honors the settings 'logging' toggle (default on); no-op when disabled.

    Fix: datetime.utcnow() is deprecated (Python 3.12+). Use an aware UTC
    timestamp and strip tzinfo so the existing "...Z" suffix format is
    preserved byte-for-byte.
    """
    if not st.session_state.get("settings", {}).get("logging", True):
        return
    _audit_init()
    from datetime import timezone  # local import: file-level imports not in view
    ts = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    st.session_state["audit"].append({"ts": ts, "action": action, **details})
|
| 821 |
+
|
| 822 |
+
def ddg_search(query: str, max_results: int=5):
    """Run a DuckDuckGo text search and return a list of result dicts.

    Returns [] when the duckduckgo_search package is unavailable OR when the
    request fails (rate limit, network error). Previously a failed request
    raised and crashed the Streamlit run; callers already treat an empty
    list as "no results", so failure now degrades gracefully — consistent
    with the existing DDGS-is-None path.
    """
    if DDGS is None:
        return []
    try:
        with DDGS() as ddgs:
            return list(ddgs.text(query, max_results=max_results))
    except Exception:
        # Best-effort: DDG rate-limits aggressively; swallow and return empty.
        return []
|
| 827 |
+
|
| 828 |
+
# ---------------------------
|
| 829 |
+
# Scoring
|
| 830 |
+
# ---------------------------
|
| 831 |
+
SOURCE_RELIABILITY = {
|
| 832 |
+
"high": [".gov", ".mil", ".edu", "sec.gov", "reuters", "bloomberg", "nytimes", "wsj"],
|
| 833 |
+
"med": ["github.com", "gitlab.com", "medium.com", "substack.com", "bbc"],
|
| 834 |
+
}
|
| 835 |
+
|
| 836 |
+
def _source_reliability(url: str) -> str:
|
| 837 |
+
url_l = (url or "").lower()
|
| 838 |
+
for kw in SOURCE_RELIABILITY["high"]:
|
| 839 |
+
if kw in url_l:
|
| 840 |
+
return "High"
|
| 841 |
+
for kw in SOURCE_RELIABILITY["med"]:
|
| 842 |
+
if kw in url_l:
|
| 843 |
+
return "Medium"
|
| 844 |
+
return "Low"
|
| 845 |
+
|
| 846 |
+
def _fuzzy_match(a: str, b: str) -> float:
|
| 847 |
+
if not a or not b:
|
| 848 |
+
return 0.0
|
| 849 |
+
if a.lower() == b.lower():
|
| 850 |
+
return 1.0
|
| 851 |
+
if fuzz:
|
| 852 |
+
return fuzz.ratio(a.lower(), b.lower()) / 100.0
|
| 853 |
+
return 0.0
|
| 854 |
+
|
| 855 |
+
def score_finding(row: Dict[str, Any], facts: KnownFacts) -> Dict[str, Any]:
    """Score one search result against the known facts.

    Returns the original row augmented with: score (int), level
    ("High"/"Medium"/"Low"), explanation (readable breakdown), components
    (structured breakdown), reliability, url, title, snippet.
    """
    # Result rows come with either DDG keys (title/body/href) or
    # scraper-style keys (heading/snippet/link) — accept both.
    title = row.get("title") or row.get("heading") or ""
    snippet = row.get("body") or row.get("snippet") or ""
    url = row.get("href") or row.get("link") or ""
    text = f"{title}\n{snippet}".lower()
    score = 0
    comps: List[Dict[str, Any]] = []

    def add(points: int, label: str, reason: str):
        # Accumulate into the enclosing score and record the component
        # so the UI can explain exactly how the total was reached.
        nonlocal score
        score += points
        comps.append({"label": label, "points": points, "reason": reason})

    # Exact matches: each fact token found in the result text adds points
    # weighted by how identifying that token class is (email > name > ...).
    hits = 0
    for e in facts.emails:
        if e.lower() in text:
            add(25, "Email match", e)
            hits += 1
    for h in facts.handles:
        if h.lower() in text:
            add(15, "Handle match", h)
            hits += 1
    for d in facts.domains:
        if d.lower() in text:
            add(10, "Domain mention", d)
            hits += 1
    for ip in facts.ips:
        if ip and ip.lower() in text:
            add(10, "IP mention", ip)
            hits += 1
    for org in facts.orgs:
        if org.lower() in text:
            add(8, "Org mention", org)
            hits += 1
    for name in facts.real_names:
        if name.lower() in text:
            add(20, "Name mention", name)
            hits += 1
        else:
            # fuzzy: partial credit per name token that closely matches
            # some word of the result text (first close word wins per token)
            for token in name.split():
                for word in text.split():
                    if _fuzzy_match(token, word) >= 0.9:
                        add(8, "Fuzzy name token", f"{token}->{word}")
                        hits += 1
                        break

    if hits >= 2:
        # Several independent fact tokens in one result is a strong signal.
        add(10, "Co-occurrence", f"{hits} fact tokens present")

    # Source reliability
    rel = _source_reliability(url)
    if rel == "High":
        add(10, "Source reliability", rel)
    elif rel == "Medium":
        add(5, "Source reliability", rel)

    # Context keywords basic: count mission keywords present in the text.
    ctx_hits = 0
    if facts.context:
        ctx_hits = sum(1 for kw in facts.context.lower().split() if kw and kw in text)
        if ctx_hits >= 3:
            add(10, "Context alignment", f"{ctx_hits} context keywords")
        elif ctx_hits == 2:
            add(6, "Context alignment", "2 context keywords")
        elif ctx_hits == 1:
            add(3, "Context alignment", "1 context keyword")

    # Optional embedding similarity (semantic relevance to context).
    # Attempted only when keyword alignment was weak, the feature is enabled
    # in settings, and sentence-transformers imported successfully. The
    # model is cached in session state so it loads once per session.
    if ctx_hits < 3 and st.session_state.get("settings", {}).get("enable_embeddings") and facts.context and SentenceTransformer:
        emb_model = st.session_state.get("_embed_model")
        if emb_model is None:
            with st.spinner("Loading embedding model (once)..."):
                try:
                    emb_model = SentenceTransformer("all-MiniLM-L6-v2")
                    st.session_state["_embed_model"] = emb_model
                except Exception:
                    emb_model = None
        if emb_model:
            try:
                # Truncate inputs to bound encode time.
                q_emb = emb_model.encode([facts.context[:512]])[0]
                doc_emb = emb_model.encode([text[:1024]])[0]
                # cosine
                dot = float((q_emb @ doc_emb) / ((q_emb**2).sum()**0.5 * (doc_emb**2).sum()**0.5))
                if dot > 0.35:
                    pts = int(min(20, (dot - 0.35) / (0.30) * 20))  # scale 0.35..0.65 -> 0..20
                    if pts > 0:
                        add(pts, "Semantic similarity", f"cos={dot:.2f}")
            except Exception:
                # Best-effort: embedding failures never break scoring.
                pass

    level = "High" if score >= 70 else ("Medium" if score >= 40 else "Low")
    explanation = "; ".join(f"{c['label']} +{c['points']} ({c['reason']})" for c in comps)
    return {
        **row,
        "score": score,
        "level": level,
        "explanation": explanation,
        "components": comps,
        "reliability": rel,
        "url": url,
        "title": title,
        "snippet": snippet,
    }
|
| 960 |
+
|
| 961 |
+
def score_all_findings(rows: List[Dict[str, Any]], facts: KnownFacts) -> List[Dict[str, Any]]:
    """Score every raw search result against the known facts (see score_finding)."""
    return [score_finding(entry, facts) for entry in rows]
|
| 963 |
+
|
| 964 |
+
# File/Image metadata extraction
def extract_metadata(upload) -> Dict[str, Any]:
    """Extract lightweight metadata from an uploaded file.

    Dispatches on the filename extension to whichever optional parser
    library imported successfully (PdfReader / docx / olefile / mutagen /
    exifread); any parse failure is reported as {"error": ...} rather than
    raised. Unsupported extensions (or missing parsers) return {}.

    Args:
        upload: a Streamlit UploadedFile-like object — assumes it has a
            .name attribute and a file-like read API (TODO confirm caller).
    """
    info: Dict[str, Any] = {}
    if not upload:
        return info
    name = upload.name.lower()
    try:
        if name.endswith(".pdf") and PdfReader:
            reader = PdfReader(upload)
            # NOTE(review): reader.metadata may be None for some PDFs;
            # dict(None) would raise and land in the except below — confirm.
            info = {"Pages": len(reader.pages), "Meta": dict(reader.metadata)}
        elif name.endswith(".docx") and docx:
            doc = docx.Document(upload)
            cp = doc.core_properties
            info = {"Title": cp.title, "Author": cp.author, "Created": cp.created}
        elif (name.endswith(".doc") or name.endswith(".xls")) and olefile:
            # Legacy OLE container: detection only, no property parsing.
            if olefile.isOleFile(upload):
                info = {"OLE": "Legacy Office file detected"}
        elif name.endswith((".mp3", ".flac", ".ogg", ".m4a")) and MutagenFile:
            audio = MutagenFile(upload)
            info = dict(audio) if audio else {}
        elif name.endswith((".jpg", ".jpeg", ".png")) and exifread:
            tags = exifread.process_file(upload)
            info = {tag: str(val) for tag, val in tags.items()}
    except Exception as e:
        # Best-effort: surface the failure inside the returned dict.
        info = {"error": str(e)}
    return info
|
| 990 |
+
|
| 991 |
+
# ---------------------------
# Graph Visualization
# ---------------------------
def build_graph(scored: List[Dict[str, Any]], facts: KnownFacts) -> Optional[str]:
    """Build an interactive pyvis graph linking known facts to findings.

    Fact identifiers (emails, handles, domains, names) become typed nodes;
    each finding URL becomes a node edged to every identifier mentioned in
    its title/snippet. Returns the rendered HTML as a string, or None when
    the optional graph libraries (networkx / pyvis) are unavailable or the
    generated file cannot be read back.
    """
    if not nx or not Network:
        return None
    G = nx.Graph()
    # Seed the graph with one node per known identifier.
    for email in facts.emails:
        G.add_node(email, type="email")
    for h in facts.handles:
        G.add_node(h, type="handle")
    for d in facts.domains:
        G.add_node(d, type="domain")
    for n in facts.real_names:
        G.add_node(n, type="name")
    # Add finding nodes and edge them to any identifier they mention.
    # Capped at 300 findings to keep the rendered graph responsive.
    for f in scored[:300]:
        url = f.get("url") or "unknown"
        G.add_node(url, type="finding", score=f.get("score", 0))
        text = (f.get("title", "") + " " + f.get("snippet", ""))[:400].lower()
        for token in facts.emails + facts.handles + facts.domains + facts.real_names:
            if token.lower() and token.lower() in text:
                G.add_edge(token, url)
    # Visualize
    net = Network(height="550px", width="100%", bgcolor="#111", font_color="white")
    for n, data in G.nodes(data=True):
        color = {
            "email": "#ff7f50",
            "handle": "#1e90ff",
            "domain": "#32cd32",
            "name": "#daa520",
            "finding": "#888",
        }.get(data.get("type"), "#999")
        # Fact nodes get a fixed size; finding nodes scale with their score.
        size = 15 if data.get("type") != "finding" else max(5, min(25, int(data.get("score", 10) / 4)))
        net.add_node(n, label=n[:30], color=color, title=n, size=size)
    for u, v in G.edges():
        net.add_edge(u, v)
    path = "graph.html"
    # write_html only renders the file; Network.show would additionally try
    # to display it (notebook/browser), which is wrong on a headless server
    # and requires extra arguments in newer pyvis releases.
    try:
        net.write_html(path)
    except AttributeError:
        # Older pyvis releases expose save_graph instead of write_html.
        net.save_graph(path)
    try:
        with open(path, "r", encoding="utf-8") as fh:
            return fh.read()
    except Exception:
        return None
|
| 1041 |
+
|
| 1042 |
+
# ---------------------------
# Report Export
# ---------------------------
# Jinja2 template for the standalone HTML report. Context variables:
#   entity_type / entity_value - the investigated entity
#   generated                  - ISO timestamp string (rendered next to "UTC")
#   facts_json                 - pretty-printed known-facts JSON
#   findings                   - scored finding dicts exposing score, level,
#                                title, url, reliability, explanation
# Styling is inlined so the exported file is fully self-contained.
HTML_TEMPLATE = """<!doctype html><html><head><meta charset='utf-8'/><title>OSINT Report</title>
<style>body{font-family:Arial,Helvetica,sans-serif;margin:2rem;background:#111;color:#eee;} h1,h2{color:#ffcc66} table{border-collapse:collapse;width:100%;margin:1rem 0;} th,td{border:1px solid #444;padding:6px;font-size:0.85rem;} .high{color:#4caf50;font-weight:700}.medium{color:#ffc107}.low{color:#f44336} code{background:#222;padding:2px 4px;border-radius:4px;} .small{font-size:0.75rem;color:#ccc}</style>
</head><body>
<h1>OSINT Investigation Report</h1>
<h2>Summary</h2>
<p><b>Entity Type:</b> {{ entity_type }}<br/><b>Entity Value:</b> {{ entity_value }}<br/>
<b>Generated:</b> {{ generated }} UTC</p>
<h2>Known Facts</h2>
<pre>{{ facts_json }}</pre>
<h2>Findings (Top {{ findings|length }})</h2>
<table><thead><tr><th>Score</th><th>Level</th><th>Title</th><th>URL</th><th>Reliability</th><th>Explanation</th></tr></thead><tbody>
{% for f in findings %}
<tr><td>{{ f.score }}</td><td class='{{ f.level|lower }}'>{{ f.level }}</td><td>{{ f.title }}</td><td><a href='{{ f.url }}' target='_blank'>link</a></td><td>{{ f.reliability }}</td><td class='small'>{{ f.explanation }}</td></tr>
{% endfor %}
</tbody></table>
</body></html>"""
|
| 1061 |
+
|
| 1062 |
+
def export_report(entity_type: str, entity_value: str, facts: KnownFacts, scored: List[Dict[str, Any]]):
    """Render the investigation into a standalone HTML file and offer it for download.

    Degrades to a Streamlit warning when jinja2 (Template) is not installed.
    Only the top 200 scored findings are included to bound report size.
    """
    if not Template:
        st.warning("jinja2 not installed; cannot build HTML report.")
        return
    # Local import keeps the module-level dependency block untouched.
    from datetime import timezone
    tpl = Template(HTML_TEMPLATE)
    html = tpl.render(
        entity_type=entity_type,
        entity_value=entity_value,
        # Timezone-aware "now"; datetime.utcnow() is deprecated since 3.12.
        generated=datetime.now(timezone.utc).isoformat(),
        # default=str keeps the export working even if facts hold values
        # json can't serialize natively (dates, sets, ...).
        facts_json=json.dumps(facts.__dict__, indent=2, default=str),
        findings=scored[:200],
    )
    st.download_button("Download HTML Report", data=html.encode("utf-8"), file_name="osint_report.html", mime="text/html")
|
| 1075 |
+
|
| 1076 |
+
# ---------------------------
# Username Availability Probe (simple)
# ---------------------------
# Platform name -> profile URL template; "{user}" is substituted with the
# handle by probe_usernames(). Extend this mapping to cover more platforms.
PLATFORM_PATTERNS: Dict[str,str] = {
    "GitHub": "https://github.com/{user}",
    "Twitter": "https://x.com/{user}",
    "Reddit": "https://www.reddit.com/user/{user}",
    "Medium": "https://medium.com/@{user}",
}
|
| 1085 |
+
|
| 1086 |
+
def probe_usernames(users: List[str], limit: int = 10) -> List[Dict[str,str]]:
    """Probe whether each handle resolves on the platforms in PLATFORM_PATTERNS.

    Returns one row per (platform, username) pair with a coarse status.
    HTTP 200 means only that the profile URL resolves — existence evidence,
    not proof of ownership. Returns an empty list when requests is absent.
    Checks at most *limit* usernames.
    """
    out: List[Dict[str, str]] = []
    if requests is None:
        return out
    # The default python-requests User-Agent is blanket-blocked by many
    # platforms (403/999), which would skew every probe; present a
    # browser-like UA instead.
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"}
    for u in users[:limit]:
        for plat, pattern in PLATFORM_PATTERNS.items():
            url = pattern.format(user=u)
            try:
                r = requests.get(url, headers=headers, timeout=5)
                if r.status_code == 200:
                    status = "Exists"
                elif r.status_code in (404, 410):
                    # 410 Gone == deleted/suspended profile: also "not there".
                    status = "Not Found"
                else:
                    status = str(r.status_code)
            except Exception:
                status = "Error"
            out.append({"platform": plat, "username": u, "status": status})
    return out
|
| 1106 |
+
|
| 1107 |
+
def render_step4_execution(entity_type: str, entity_value: str):
    """Streamlit tab: execute the selected dorks and extract file metadata.

    Reads ``selected_dorks`` from session state, runs each query through
    ddg_search with an animated skeleton-loader progress display, stores raw
    results in ``dork_results`` and scored results in ``scored_results``,
    and offers the audit log for download. A second section lets the user
    upload a file and view extracted metadata.
    """
    st.subheader("Step 4: Execute & Metadata")
    final = st.session_state.get("selected_dorks", [])
    if not final:
        st.info("No dorks selected.")
        return
    max_per = st.slider("Max results", 3, 20, st.session_state.get("settings", {}).get("max_per", 10))
    if st.button("Run dorks"):
        # Progressive skeleton loader while executing each query
        placeholder = st.empty()
        results: List[Dict[str, Any]] = []
        total_expected = len(final) * max_per  # NOTE(review): computed but never used — confirm intent
        for i, q in enumerate(final, start=1):
            remaining = len(final) - i + 1
            est_remaining = remaining * max_per
            # Render skeletons representing expected remaining results (capped for performance)
            with placeholder.container():
                st.markdown("#### Running searches…")
                st.caption(f"Query {i}/{len(final)}: {q}")
                skel_blocks = min(est_remaining, 18)  # avoid huge DOM
                # Distribute size variations for visual interest
                sizes = ["sm", "md", "lg"]
                rows_html = []
                for j in range(skel_blocks):
                    size = sizes[j % len(sizes)]
                    rows_html.append(f'<div class="skeleton-block skeleton-h {size}"></div>')
                st.markdown(
                    '<div class="skeleton-group">' + "".join(rows_html) + "</div>",
                    unsafe_allow_html=True,
                )
            # Execute the actual search
            rows = ddg_search(q, max_results=max_per)
            _audit_log("dork_run", dork=q, results=len(rows))
            results.extend(rows)
        # Clear placeholder after completion
        placeholder.empty()
        st.session_state["dork_results"] = results
        # compute scores after acquiring all results
        facts = KnownFacts.from_session()
        st.session_state["scored_results"] = score_all_findings(results, facts)
    # Results persist across reruns via session state, so display outside the button branch.
    if res := st.session_state.get("dork_results"):
        st.json(res)
        audit_str = "\n".join(json.dumps(ev) for ev in st.session_state["audit"])
        st.download_button("Download audit", audit_str, "audit.jsonl")

    st.markdown("---")
    st.subheader("File/Image Metadata Extractor")
    upload = st.file_uploader("Upload a file (pdf, docx, mp3, jpg, etc.)")
    if upload:
        meta = extract_metadata(upload)
        st.json(meta)
|
| 1158 |
+
|
| 1159 |
+
# ---------------------------
# Main
# ---------------------------
def render_help_tab():
    """Streamlit tab: static field manual covering workflow, tabs, and tips."""
    st.subheader("How To Use This OSINT Investigator Suite")
    st.markdown("""
This tab is a quick field manual. It shows the purpose of every tab, the workflow order, and pro tips.

### Recommended Workflow (Fast Path)
1. Known Facts – Load seed identifiers (handles, emails, domains, names).
2. Plan – Review the autogenerated phased investigation plan; adjust facts if gaps obvious.
3. Explainer – Learn the dork building logic for transparency (optional).
4. Advisor – Get recommended dorks (rule + optional LLM). Refine, then accept.
5. Selection – Curate / edit / remove dorks; finalize the set to run.
6. Execution – Run dorks (skeleton loaders show progress); extract file/image metadata if you have artifacts.
7. Scoring – Review confidence scores, filter, read explanations, iterate by adding new facts and re-scoring.
8. Graph – Visual relationship view (requires networkx + pyvis) to spot high‑intersection nodes.
9. Report – Export an HTML snapshot for stakeholders / evidence chain.
10. Usernames – Probe handle existence across common platforms.
11. Help – (This) reference card anytime.

---
### Tab Details & Tips
**Known Facts**
- Add all solid identifiers early; scoring & dork generation leverage them.
- Handles & emails dramatically raise confidence when co-occurring in sources.
- Update facts after each scouting loop (new domains from findings, etc.).

**Plan**
- Generated phases: Recon, Expansion, Correlation, Deep Dive, Reporting.
- Use it as a narrative backbone for your final export or task tickets.

**Explainer**
- Shows how base + contextual tokens assemble into search dorks by entity type.
- Use to justify methodology or teach newcomers.

**Advisor**
- Hybrid: deterministic heuristic rules plus optional LLM (if HF token + model set in settings).
- Toggle embedding/semantic features in settings (if present) to enrich scoring later.
- Accept the generated list to push candidates to Selection.

**Selection**
- Final edit surface. Remove noisy / redundant queries before execution.
- Keep a balanced mix: broad footprint + specific leak/file/resource patterns.

**Execution**
- Click Run dorks: animated skeleton placeholders appear per batch while searches resolve.
- Results cached in session: re-running overwrites (audit log tracks runs).
- Metadata Extractor: Upload docs / images to pull EXIF, PDF metadata, docx core props, audio tags.

**Scoring**
- Each finding scored from component signals (exact identifiers, fuzzy tokens, co-occurrence, reliability, context keywords, semantic similarity).
- Levels: High ≥70, Medium ≥40. Use filters + search bar to triage.
- Re-score after updating Known Facts or enabling embeddings.
- "Full Explanations" expands reasoning transparency for defensibility.

**Graph**
- Visual pivot map: nodes sized by aggregated score; edges for shared identifiers.
- Use to spot central assets (good pivot candidates) quickly.
- If graph libs missing you'll see an install hint (they're listed in requirements).

**Report**
- Generates a standalone HTML (includes styling + key metrics) for sharing.
- Consider exporting after each major iteration to preserve state (version trail).

**Usernames**
- Lightweight existence probe (HTTP status heuristic). "Exists" ≠ ownership proof.
- Add more platforms by extending PLATFORM_PATTERNS in code.

**Chat Assistant (Floating)**
- Noir-style guidance; quick buttons for common pivots.
- If a model + token configured, responses may blend LLM nuance with rule hints; otherwise rule-based only.
- Close with ✕; reopen with the 🕵️ button.

**Light / Dark Toggle**
- Sidebar toggle (if present) swaps theme classes; custom components auto-adapt.

**Skeleton Loaders**
- Shimmering bars appear during long search batches to indicate progress.

---
### Power User Tips
- Iterative Loop: (Run) → (Score) → (Add new facts from findings) → (Re-score) → (Graph) → (Report).
- High-value pivots: Rare email domains, unique handles in code repos, author names in PDF metadata.
- Noise Control: Remove generic dorks that return unrelated trending content before executing.
- Evidence Chain: Audit log (download on Execution tab) + HTML reports form a defensible trail.

### Performance Notes
- Limiting Max results reduces API latency & keeps scoring responsive.
- Embedding model loads lazily—first semantic scoring may pause a few seconds.
- Graph view caps large result sets to avoid browser lockups.

### Glossary
- Dork: Crafted search query combining identifiers + context tokens.
- Pivot: New investigative direction unlocked by a discovered unique attribute.
- Co-occurrence: Multiple target identifiers appearing together in one source.

### Ethics Reminder
Public sources only. No credential stuffing, intrusion, or accessing private data stores. Respect rate limits & platform ToS.
""")
|
| 1259 |
+
|
| 1260 |
+
def main():
    """Application entry point.

    Renders the brand bar and entity inputs, then — once an entity type and
    value are set — the full tab suite, the floating chat widget, and the
    methodology / ethics expanders.
    """
    st.markdown("""
<div class='app-brand-bar'>
<div style='font-size:28px'>🕵️</div>
<div class='app-brand-title'>OSINT Investigator Suite</div>
<div class='app-badge'>AI-Augmented</div>
<div class='app-badge'>Heuristic Scoring</div>
<div class='app-badge'>Report Export</div>
</div>
""", unsafe_allow_html=True)
    entity_type = st.selectbox("Entity type", list(TYPED_DORK_MAP.keys()), key="entity_type")
    entity_value = st.text_input("Entity value", "[email protected]", key="entity_value")
    if entity_type and entity_value:
        tabs = st.tabs(["Known Facts", "Plan", "Explainer", "Advisor", "Selection", "Execution", "Scoring", "Graph", "Report", "Usernames", "Help"])
        with tabs[0]:
            _known_facts_ui()
        with tabs[1]:
            render_investigation_plan(entity_type, entity_value)
        with tabs[2]:
            render_dorks_explainer(entity_type, entity_value)
        with tabs[3]:
            render_dork_recommender(entity_type, entity_value)
        with tabs[4]:
            render_dork_selection(entity_type, entity_value)
        with tabs[5]:
            render_step4_execution(entity_type, entity_value)
        with tabs[6]:
            st.subheader("Scoring & Confidence")
            facts = KnownFacts.from_session()
            scored = st.session_state.get("scored_results")
            if not scored:
                st.info("Run dorks first to generate findings and scores.")
            else:
                # Headline metrics by confidence level.
                high = sum(1 for r in scored if r["level"] == "High")
                med = sum(1 for r in scored if r["level"] == "Medium")
                low = sum(1 for r in scored if r["level"] == "Low")
                st.markdown("<div class='sticky-toolbar'><strong>Findings Overview</strong></div>", unsafe_allow_html=True)
                k1, k2, k3, k4 = st.columns(4)
                k1.metric("Total", len(scored))
                k2.metric("High", high)
                k3.metric("Medium", med)
                k4.metric("Low", low)
                # Level + free-text filters applied to title/snippet.
                level_filter = st.multiselect("Levels", ["High", "Medium", "Low"], default=["High", "Medium", "Low"], key="lvl_filter")
                q = st.text_input("Search title/snippet", key="score_search")
                view = [r for r in scored if r["level"] in level_filter and (not q or q.lower() in (r.get("snippet", '')).lower() or q.lower() in (r.get("title", '')).lower())]
                rows_html = []
                for r in view:
                    lvl = r["level"].lower()
                    badge = f"<span class='badge {lvl}'>{r['level']}</span>"
                    title = (r.get('title',''))[:120]
                    expl_short = (r.get('explanation',''))[:180]
                    url = r.get('url') or ''
                    rows_html.append(f"<tr><td>{r['score']}</td><td>{badge}</td><td>{title}</td><td><a href='{url}' target='_blank'>link</a></td><td>{r['reliability']}</td><td>{expl_short}</td></tr>")
                table_html = """
<div style='max-height:520px;overflow:auto;border:1px solid #262626;border-radius:12px;'>
<table class='score-table'>
<thead><tr><th>Score</th><th>Level</th><th>Title</th><th>URL</th><th>Reliab.</th><th>Explanation (truncated)</th></tr></thead>
<tbody>{rows}</tbody>
</table>
</div>
""".format(rows="".join(rows_html))
                st.markdown(table_html, unsafe_allow_html=True)
                col_rescore, col_full, col_export = st.columns([1, 2, 1])
                with col_rescore:
                    if st.button("Re-score", key="btn_rescore_now"):
                        # Re-score raw results against possibly-updated facts.
                        rescored = score_all_findings(st.session_state.get("dork_results", []), facts)
                        st.session_state["scored_results"] = rescored
                        st.success("Re-scored.")
                with col_full:
                    with st.expander("Full Explanations"):
                        for r in view:
                            st.markdown(f"**{r.get('title','')}** — {r['level']} ({r['score']})\n\n{r.get('explanation','')}")
                with col_export:
                    if st.button("Export Report (HTML)", key="btn_export_report_inline"):
                        export_report(entity_type, entity_value, facts, scored)
        with tabs[7]:
            st.subheader("Entity Graph")
            facts = KnownFacts.from_session()
            scored = st.session_state.get("scored_results") or []
            if scored:
                html = build_graph(scored, facts)
                if html:
                    st.components.v1.html(html, height=600, scrolling=True)
                else:
                    st.info("Install networkx & pyvis for graph visualization.")
            else:
                st.info("No scored findings yet.")
        with tabs[8]:
            st.subheader("Report Export")
            facts = KnownFacts.from_session()
            scored = st.session_state.get("scored_results") or []
            if scored:
                export_report(entity_type, entity_value, facts, scored)
            else:
                st.info("Run and score findings to export a report.")
        with tabs[9]:
            st.subheader("Username Availability Probe")
            facts = KnownFacts.from_session()
            # BUGFIX: without parentheses this parsed as
            # `(handles[:10] or [entity_value]) if entity_type == ... else []`,
            # which discarded known handles for every non-username entity —
            # contradicting the info message below. Known handles are now
            # always used; the entity value is the fallback only for
            # username entities.
            sample_users = facts.handles[:10] or ([entity_value] if entity_type == "Username / Handle" else [])
            if not sample_users:
                st.info("Add handles in Known Facts or pick a username entity.")
            else:
                if st.button("Probe Platforms", key="btn_probe_users"):
                    data = probe_usernames(sample_users)
                    st.session_state["probe_results"] = data
                if pr := st.session_state.get("probe_results"):
                    st.dataframe(pr, use_container_width=True)
        with tabs[10]:
            render_help_tab()
        # Floating chat widget render
        render_chat_widget(entity_type, entity_value)
        with st.expander("Methodology / Scoring Rubric", expanded=False):
            st.markdown("""
**Scoring Components**
- Email (+25) / Name exact (+20) / Handle (+15) / Domain (+10) / IP (+10) / Org (+8)
- Fuzzy name token (+8) / Co-occurrence (+10)
- Source reliability High (+10) / Medium (+5)
- Context alignment (1:+3 / 2:+6 / ≥3:+10)
- Semantic similarity (0–20 scaled) if enabled
**Levels:** High ≥70, Medium ≥40, else Low.
""")
        with st.expander("Ethical Use Notice", expanded=False):
            st.markdown("Lawful OSINT only. No intrusion, auth bypass, or accessing non-public data. Respect platform ToS & privacy.")
|
| 1383 |
+
|
| 1384 |
+
# ---------------------------
# Chat Assistant
# ---------------------------
# System prompt for the floating "noir investigator" chat persona. Used
# verbatim as the prefix of the LLM prompt in render_chat_widget; the
# rule-based fallback path does not consume it.
GUIDE_SYSTEM = (
    "You are a noir-style seasoned OSINT investigator named 'The Analyst'. Speak like classic crime noir: terse, vivid metaphors, professional, never cheesy. "
    "Guide the user step-by-step in enumerating a digital entity using only ethical open sources. "
    "Each answer: <=150 words, 2-4 compact paragraphs or bullet fragments. Provide concrete next actions, pivot angles, and a light ethics reminder if user drifts. "
    "Avoid sensationalism. No illegal guidance. Occasionally finish with a brief noir tag line like 'That's the shape of the alley, kid.'" )
|
| 1392 |
+
|
| 1393 |
+
def _summarize_context(entity_type: str, entity_value: str) -> str:
    """Compose a compact one-line snapshot of the current investigation state."""
    facts: KnownFacts = KnownFacts.from_session()
    scored = st.session_state.get("scored_results") or []
    high_titles = [s.get("title") for s in scored if s.get("level") == "High"][:5]
    parts = [f"Entity: {entity_type}={entity_value}"]
    # Only mention fact buckets that actually contain something.
    for label, bucket in (("Handles", facts.handles), ("Emails", facts.emails), ("Domains", facts.domains)):
        if bucket:
            parts.append(f"{label}:{len(bucket)}")
    if high_titles:
        parts.append("HighHits:" + ";".join(high_titles))
    return " | ".join(parts)
|
| 1403 |
+
|
| 1404 |
+
def _rule_based_reply(user_msg: str, entity_type: str, entity_value: str) -> str:
    """Deterministic fallback chat reply used when no LLM is configured.

    Emits one canned paragraph per keyword group found in the user message
    (generic playbook if none match), then a context snapshot and a standing
    ethics reminder.
    """
    msg = user_msg.lower()
    ctx = _summarize_context(entity_type, entity_value)
    triggers = (
        (("start", "hello", "hi", "first"),
         "First we empty our pockets—handles, domains, emails. Solid identifiers become compass bearings."),
        (("dork", "search"),
         "Open with wide footprint dorks. Then tighten: docs leaks, repo chatter, paste traces. Each query is a flashlight beam."),
        (("score", "confidence"),
         "Confidence breathes when multiple facts collide in a clean source. Add precise emails or stable handles—re-score, watch the highs rise."),
        (("graph",),
         "Graph shows the intersections. Nodes struck by multiple identifiers—those corners hide stories."),
        (("pivot", "next"),
         "Pivot off unique anchors: a handle in a PDF, an email in a commit, a domain in a press note. Each pivot narrows the alley."),
    )
    lines = [text for keywords, text in triggers if any(k in msg for k in keywords)]
    if not lines:
        lines.append("Playbook: 1) Lock facts 2) Advisor for 10 sharp dorks 3) Select & run 4) Score 5) Add new facts 6) Graph pivots 7) Export report.")
    lines.append(f"Context snapshot: {ctx}")
    lines.append("Stay clean—public sources only. That's the shape of the alley, kid.")
    return "\n\n".join(lines)
|
| 1423 |
+
|
| 1424 |
+
def render_chat_widget(entity_type: str, entity_value: str):
    """Floating noir-style chat assistant.

    Maintains ``chat_history`` / ``chat_open`` in session state. When closed,
    only a reopen button is shown. Replies come from the configured HF model
    when a model name and HF_API_TOKEN are present, otherwise from the
    rule-based fallback, so the assistant works fully offline.
    """
    # Session setup
    st.session_state.setdefault("chat_history", [])
    st.session_state.setdefault("chat_open", True)
    open_flag = st.session_state["chat_open"]

    # Mini open button (when closed)
    if not open_flag:
        if st.button("🕵️", key="open_chat_button"):
            st.session_state["chat_open"] = True
        # Style the button to float
        st.markdown("""
<style>
div[data-testid='stButton'] button[kind='secondary'] {background:#222;border:2px solid #ffcc66;}
</style>
<div class='chat-mini-btn'></div>
""", unsafe_allow_html=True)
        return

    # Build chat window
    messages = st.session_state["chat_history"]
    # Render HTML shell
    st.markdown("<div class='chat-window'>", unsafe_allow_html=True)
    # Header with close control
    c1, c2, c3 = st.columns([0.2, 0.65, 0.15])
    with c1:
        st.markdown("<div class='chat-header' style='background:transparent;padding:4px 0 0 6px;'>🕵️</div>", unsafe_allow_html=True)
    with c2:
        st.markdown("<div class='chat-header' style='background:transparent;padding:4px 0;'> <span class='title'>Investigator</span></div>", unsafe_allow_html=True)
    with c3:
        if st.button("✕", key="close_chat_btn"):
            st.session_state["chat_open"] = False
            # Close the shell div before bailing out of the widget.
            st.markdown("</div>", unsafe_allow_html=True)
            return
    # Messages area
    # Use an empty container to emulate scroll (Streamlit limitation)
    msg_container = st.container()
    with msg_container:
        if messages:
            # Show only the most recent 18 turns to bound DOM size.
            for turn in messages[-18:]:
                st.markdown(f"<p class='msg-user'><b>You:</b> {turn['user']}</p>", unsafe_allow_html=True)
                st.markdown(f"<p class='msg-bot'><b>Inv:</b> {turn['assistant']}</p>", unsafe_allow_html=True)
        else:
            st.markdown("<p class='msg-bot'>Need a lead? Ask me about dorks, scoring, or pivots.</p>", unsafe_allow_html=True)

    # Input form
    with st.form("chat_form", clear_on_submit=True):
        q = st.text_area("Message", key="chat_input_area", height=70, label_visibility="collapsed")
        col_a, col_b, col_c, col_d = st.columns(4)
        send = False
        with col_a:
            if st.form_submit_button("Send"):
                send = True
        # Quick-action buttons overwrite q with a canned question.
        with col_b:
            if st.form_submit_button("Dorks"):
                q = "What dorks should I run next?"; send = True
        with col_c:
            if st.form_submit_button("Confidence"):
                q = "How do I improve confidence now?"; send = True
        with col_d:
            if st.form_submit_button("Pivot"):
                q = "Give me a pivot strategy."; send = True
        if send and q.strip():
            reply: Optional[str] = None
            # Prefer the configured LLM when both a model and token exist.
            if st.session_state.get("settings", {}).get("model") and os.getenv("HF_API_TOKEN"):
                convo = st.session_state["chat_history"][-6:]
                history_str = "\n".join([f"User: {h['user']}\nAssistant: {h['assistant']}" for h in convo if h.get('assistant')])
                prompt = (
                    f"{GUIDE_SYSTEM}\nCurrentContext: {_summarize_context(entity_type, entity_value)}\n" +
                    history_str + f"\nUser: {q}\nAssistant:")
                reply = _hf_infer(MODEL_ID_MAP.get(st.session_state["settings"]["model"], st.session_state["settings"]["model"]), prompt, max_new_tokens=190, temperature=0.35)
            if not reply:
                # Rule-based fallback keeps the assistant functional offline.
                reply = _rule_based_reply(q, entity_type, entity_value)
            st.session_state["chat_history"].append({"user": q, "assistant": reply})
    st.markdown("<div class='chat-input small'>Ethical OSINT only.🕵️♂️</div>", unsafe_allow_html=True)
    st.markdown("</div>", unsafe_allow_html=True)
|
| 1500 |
+
|
| 1501 |
+
# Standard script entry point (Streamlit executes the module top to bottom).
if __name__ == "__main__":
    main()
|
| 1503 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.32
|
| 2 |
+
duckduckgo_search
|
| 3 |
+
rapidfuzz
|
| 4 |
+
sentence-transformers
|
| 5 |
+
networkx
|
| 6 |
+
pyvis
|
| 7 |
+
jinja2
|
| 8 |
+
PyPDF2
|
| 9 |
+
python-docx
|
| 10 |
+
olefile
|
| 11 |
+
mutagen
|
| 12 |
+
exifread
|
| 13 |
+
requests
|