import psycopg2
from psycopg2.extras import execute_values
import pandas as pd
from sentence_transformers import SentenceTransformer
import os
import datetime
import logging
from collections import deque
from fastapi import FastAPI, BackgroundTasks, HTTPException
from contextlib import asynccontextmanager
from fastapi.responses import HTMLResponse
import threading

# --- Configuration ---
SUPABASE_CONNECTION_STRING = os.getenv("SUPABASE_CONNECTION_STRING")
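# Expected to be a standard libpq/psycopg2 DSN. Placeholder shape only, taken
# from Supabase's usual direct-connection format (verify against your project):
#   postgresql://postgres:<password>@db.<project-ref>.supabase.co:5432/postgres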

# --- Toggles & Tuning ---
PROCESSING_CHUNK_SIZE = 32    # rows fetched and locked per batch run
EMBEDDING_BATCH_SIZE = 32     # batch size passed to model.encode()
DRY_RUN = False               # if True, skip all database writes

# --- Global State ---
model = None
execution_logs = deque(maxlen=50)   # rolling buffer of the last 50 batch reports
processing_lock = threading.Lock()  # prevents overlapping batch runs

# --- Lifespan Manager ---
@asynccontextmanager
async def lifespan(app: FastAPI):
    global model
    print("⏳ Loading Model...")
    model = SentenceTransformer('Alibaba-NLP/gte-modernbert-base', trust_remote_code=True)
    print("βœ… Model Loaded.")
    yield
    print("πŸ›‘ Shutting down...")

app = FastAPI(lifespan=lifespan)

# --- Helper Functions ---

def fetch_and_lock_chunk(conn, chunk_size):
    """
    Fetches candidates from the denormalized table where embeddings are missing.
    """
    query = """
    SELECT 
        id,
        name,
        summary,
        work_experience,
        projects,
        education,
        achievements,
        certifications,
        volunteering,
        skills,
        languages
    FROM public.candidates
    WHERE 
        -- Condition 1: Embedding is missing (New Job)
        embeddings IS NULL 
        OR 
        -- Condition 2: Job created after the last embedding (Retry/Update Logic)
        -- Note: Since there is no 'updated_at' column, we rely on created_at vs embeddings_created_at
        (embeddings_created_at IS NOT NULL AND created_at > embeddings_created_at)
    FOR UPDATE SKIP LOCKED
    LIMIT %s
    """
    # Note: If you add an 'updated_at' column later, change WHERE to:
    # WHERE embeddings IS NULL OR updated_at > embeddings_created_at
    
    return pd.read_sql_query(query, conn, params=(chunk_size,))
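
# The query above assumes public.candidates already carries the embedding
# bookkeeping columns. A rough sketch of that assumed DDL (the 768-dim vector
# matches gte-modernbert-base but should be verified against your model, and
# pgvector must be enabled, which Supabase supports as a managed extension):
#
#   CREATE EXTENSION IF NOT EXISTS vector;
#   ALTER TABLE public.candidates
#       ADD COLUMN IF NOT EXISTS embeddings vector(768),
#       ADD COLUMN IF NOT EXISTS embeddings_created_at timestamptz;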

def clean_and_format_text(row):
    """
    Parses the JSONB and Array columns from the new schema to create a 
    rich text representation for embedding.
    """
    text_parts = []

    # 1. Basic Info
    if row.get('name'):
        text_parts.append(f"Name: {row['name']}")
    
    if row.get('summary'):
        text_parts.append(f"Summary: {row['summary']}")

    # 2. Skills (Postgres Array -> Python List)
    if row.get('skills') and isinstance(row['skills'], list):
        # Filter out empty strings/None
        valid_skills = [s for s in row['skills'] if s]
        if valid_skills:
            text_parts.append(f"Skills: {', '.join(valid_skills)}")

    # 3. Work Experience (JSONB List of Dicts)
    # Schema keys: role, company, description, duration
    if row.get('work_experience') and isinstance(row['work_experience'], list):
        exps = []
        for item in row['work_experience']:
            if isinstance(item, dict):
                role = item.get('role', '')
                company = item.get('company', '')
                desc = item.get('description', '')
                # Format: "Role at Company: Description"
                entry = f"{role} at {company}".strip()
                if desc:
                    entry += f": {desc}"
                exps.append(entry)
        if exps:
            text_parts.append("Work Experience:\n" + "\n".join(exps))

    # 4. Projects (JSONB List of Dicts)
    # Schema keys: title, description, link
    if row.get('projects') and isinstance(row['projects'], list):
        projs = []
        for item in row['projects']:
            if isinstance(item, dict):
                title = item.get('title', '')
                desc = item.get('description', '')
                entry = f"{title}".strip()
                if desc:
                    entry += f": {desc}"
                projs.append(entry)
        if projs:
            text_parts.append("Projects:\n" + "\n".join(projs))

    # 5. Education (JSONB List of Dicts)
    # Schema keys: degree, institution, year
    if row.get('education') and isinstance(row['education'], list):
        edus = []
        for item in row['education']:
            if isinstance(item, dict):
                degree = item.get('degree', '')
                inst = item.get('institution', '')
                entry = f"{degree} from {inst}".strip()
                edus.append(entry)
        if edus:
            text_parts.append("Education: " + ", ".join(edus))

    # 6. Certifications (JSONB List of Dicts)
    # Schema keys: name, issuer
    if row.get('certifications') and isinstance(row['certifications'], list):
        certs = []
        for item in row['certifications']:
            if isinstance(item, dict):
                name = item.get('name', '')
                issuer = item.get('issuer', '')
                entry = f"{name} by {issuer}".strip()
                certs.append(entry)
        if certs:
            text_parts.append("Certifications: " + ", ".join(certs))
    
    # 7. Achievements (JSONB List of Dicts)
    if row.get('achievements') and isinstance(row['achievements'], list):
        achievements = []
        for item in row['achievements']:
            if isinstance(item, dict):
                title = item.get('title', '')
                desc = item.get('description', '')
                entry = f"{title}: {desc}".strip()
                achievements.append(entry)
        if achievements:
             text_parts.append("Achievements: " + "; ".join(achievements))

    return "\n\n".join(text_parts)

def update_db_batch(conn, updates):
    if DRY_RUN: return

    # Updated to target public.candidates and cast ID to UUID
    query = """
        UPDATE public.candidates AS c
        SET embeddings = data.vector::vector,
            embeddings_created_at = NOW()
        FROM (VALUES %s) AS data (id, vector)
        WHERE c.id = data.id::uuid
    """
    cursor = conn.cursor()
    try:
        execute_values(cursor, query, updates)
        conn.commit() 
    except Exception as e:
        conn.rollback()
        raise e 
    finally:
        cursor.close()

def run_worker_logic():
    """
    The core logic that runs one single batch processing.
    """
    log_buffer = [] 
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    log_buffer.append(f"<b>BATCH RUN: {timestamp}</b>")
    
    conn = None
    try:
        conn = psycopg2.connect(SUPABASE_CONNECTION_STRING, sslmode='require')
        
        # 1. Fetch & Lock
        df = fetch_and_lock_chunk(conn, PROCESSING_CHUNK_SIZE)
        
        if df.empty:
            conn.rollback()
            log_buffer.append("πŸ’€ No pending candidates found.")
            execution_logs.appendleft("<br>".join(log_buffer))
            return "No data"

        log_buffer.append(f"πŸ”’ Locked & Processing {len(df)} candidates...")

        # 2. Clean Text
        df['full_text'] = df.apply(clean_and_format_text, axis=1)
        
        # 3. Log Inputs (For the Root API view)
        for index, row in df.iterrows():
            log_buffer.append(f"<div style='border:1px solid #ccc; margin:5px; padding:5px; background:#f9f9f9'>")
            # row['id'] is now the UUID
            log_buffer.append(f"<strong>ID: {row['id']} ({row.get('name', 'Unknown')})</strong>")
            log_buffer.append(f"<pre style='white-space: pre-wrap;'>{row['full_text']}</pre>")
            log_buffer.append("</div>")

        # 4. Generate Embeddings
        embeddings = model.encode(
            df['full_text'].tolist(), 
            batch_size=EMBEDDING_BATCH_SIZE, 
            show_progress_bar=False, 
            convert_to_numpy=True,
            normalize_embeddings=True 
        )

        # 5. Update DB
        # Ensure ID is converted to string for the tuple list if it isn't already
        updates = list(zip(df['id'].astype(str).tolist(), embeddings.tolist()))
        
        if not DRY_RUN:
            update_db_batch(conn, updates)
            log_buffer.append(f"βœ… Successfully updated {len(df)} profiles.")
        else:
            conn.rollback()
            log_buffer.append("⚠️ Dry Run: No DB updates made.")

    except Exception as e:
        if conn: conn.rollback()
        log_buffer.append(f"❌ ERROR: {str(e)}")
        print(f"Error: {e}")
    finally:
        if conn: conn.close()
        execution_logs.appendleft("<br>".join(log_buffer))

# --- API Endpoints ---

@app.get("/", response_class=HTMLResponse)
async def read_root():
    html_content = """
    <html>
        <head>
            <title>Embedding Worker Logs</title>
            <style>
                body { font-family: monospace; padding: 20px; }
                h1 { color: #333; }
                .log-entry { margin-bottom: 20px; border-bottom: 2px solid #333; padding-bottom: 20px; }
            </style>
        </head>
        <body>
            <h1>πŸ“œ Candidates Embedding Worker</h1>
            <p><i>Most recent batches shown first.</i></p>
            <hr>
    """
    
    if not execution_logs:
        html_content += "<p>No logs yet. Hit the <code>/trigger-batch</code> endpoint to start processing.</p>"
    
    for entry in execution_logs:
        html_content += f"<div class='log-entry'>{entry}</div>"

    html_content += "</body></html>"
    return html_content

@app.get("/trigger-batch")
async def trigger_processing(background_tasks: BackgroundTasks):
    if processing_lock.locked():
        return {"status": "busy", "message": "Worker is currently processing a previous batch."}
    
    background_tasks.add_task(wrapped_worker)
    return {"status": "started", "message": "Batch processing started in background."}

def wrapped_worker():
    if processing_lock.acquire(blocking=False):
        try:
            run_worker_logic()
        finally:
            processing_lock.release()
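
# Local entry point: a minimal sketch, assuming uvicorn is installed and port
# 8000 is free. Adjust host/port for your deployment.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)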