Spaces:
Sleeping
Sleeping
Update backend/app/openrouter_client.py
Browse files- backend/app/openrouter_client.py +57 -64
backend/app/openrouter_client.py
CHANGED
|
@@ -325,10 +325,21 @@ async def _extract_with_openrouter_single(image_bytes: bytes, page_num: int, tot
|
|
| 325 |
|
| 326 |
|
| 327 |
async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
|
| 328 |
-
"""Extract from a single page using HuggingFace Inference API (
|
| 329 |
if not HF_TOKEN:
|
| 330 |
raise RuntimeError("HF_TOKEN environment variable is not set")
|
| 331 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
prompt = (
|
| 333 |
f"Read this document page ({page_num} of {total_pages}) and extract ALL text content. "
|
| 334 |
"Extract every word, number, and piece of information, including any non-English text. "
|
|
@@ -338,74 +349,50 @@ async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int)
|
|
| 338 |
print(f"[INFO] HuggingFace: Processing page {page_num} with model {HF_MODEL_NAME}")
|
| 339 |
|
| 340 |
try:
|
| 341 |
-
#
|
| 342 |
-
api_url = f"{HF_INFERENCE_API_URL}/{HF_MODEL_NAME}"
|
| 343 |
-
|
| 344 |
-
# Convert image to base64
|
| 345 |
image_base64 = base64.b64encode(image_bytes).decode('utf-8')
|
|
|
|
| 346 |
|
| 347 |
-
#
|
| 348 |
-
#
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
{
|
| 353 |
"role": "user",
|
| 354 |
"content": [
|
| 355 |
-
{
|
| 356 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
]
|
| 358 |
}
|
| 359 |
-
]
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
}
|
| 365 |
-
}
|
| 366 |
-
|
| 367 |
-
headers = {
|
| 368 |
-
"Authorization": f"Bearer {HF_TOKEN}",
|
| 369 |
-
"Content-Type": "application/json"
|
| 370 |
-
}
|
| 371 |
-
|
| 372 |
-
timeout = httpx.Timeout(180.0, connect=30.0)
|
| 373 |
-
async with httpx.AsyncClient(timeout=timeout) as client:
|
| 374 |
-
print(f"[INFO] Making POST request to {api_url}...")
|
| 375 |
-
resp = await client.post(api_url, headers=headers, json=payload)
|
| 376 |
-
print(f"[INFO] Received response: Status {resp.status_code}")
|
| 377 |
-
resp.raise_for_status()
|
| 378 |
-
result = resp.json()
|
| 379 |
|
| 380 |
-
# Extract response text
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
elif isinstance(result, dict):
|
| 386 |
-
# Check for different response formats
|
| 387 |
-
if "generated_text" in result:
|
| 388 |
-
response_text = result["generated_text"]
|
| 389 |
-
elif "text" in result:
|
| 390 |
-
response_text = result["text"]
|
| 391 |
-
elif "choices" in result and len(result["choices"]) > 0:
|
| 392 |
-
# Chat completion format
|
| 393 |
-
message = result["choices"][0].get("message", {})
|
| 394 |
-
content = message.get("content", "")
|
| 395 |
-
if isinstance(content, list):
|
| 396 |
-
response_text = "".join(
|
| 397 |
-
item.get("text", "")
|
| 398 |
-
for item in content
|
| 399 |
-
if item.get("type") == "text"
|
| 400 |
-
)
|
| 401 |
-
else:
|
| 402 |
-
response_text = content
|
| 403 |
else:
|
| 404 |
-
response_text = str(
|
| 405 |
-
elif isinstance(result, str):
|
| 406 |
-
response_text = result
|
| 407 |
else:
|
| 408 |
-
response_text = str(
|
| 409 |
|
| 410 |
if not response_text:
|
| 411 |
raise ValueError("Empty response from HuggingFace API")
|
|
@@ -413,12 +400,18 @@ async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int)
|
|
| 413 |
print(f"[DEBUG] HuggingFace response preview: {response_text[:500]}")
|
| 414 |
|
| 415 |
return _parse_model_response(response_text, page_num)
|
| 416 |
-
except httpx.HTTPStatusError as e:
|
| 417 |
-
print(f"[ERROR] HuggingFace API HTTP error: {e.response.status_code} - {e.response.text[:500]}")
|
| 418 |
-
raise RuntimeError(f"HuggingFace API error for page {page_num}: {e.response.status_code} - {str(e)}")
|
| 419 |
except Exception as e:
|
| 420 |
-
|
| 421 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
|
| 423 |
|
| 424 |
def _parse_model_response(text: str, page_num: int = None) -> Dict[str, Any]:
|
|
|
|
| 325 |
|
| 326 |
|
| 327 |
async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
|
| 328 |
+
"""Extract from a single page using HuggingFace Inference API (router endpoint)."""
|
| 329 |
if not HF_TOKEN:
|
| 330 |
raise RuntimeError("HF_TOKEN environment variable is not set")
|
| 331 |
|
| 332 |
+
try:
|
| 333 |
+
from huggingface_hub import InferenceClient
|
| 334 |
+
except ImportError:
|
| 335 |
+
raise RuntimeError("huggingface_hub not installed. Add it to requirements.txt")
|
| 336 |
+
|
| 337 |
+
# Use InferenceClient with router endpoint (required for newer models)
|
| 338 |
+
client = InferenceClient(
|
| 339 |
+
api_key=HF_TOKEN,
|
| 340 |
+
timeout=180.0
|
| 341 |
+
)
|
| 342 |
+
|
| 343 |
prompt = (
|
| 344 |
f"Read this document page ({page_num} of {total_pages}) and extract ALL text content. "
|
| 345 |
"Extract every word, number, and piece of information, including any non-English text. "
|
|
|
|
| 349 |
print(f"[INFO] HuggingFace: Processing page {page_num} with model {HF_MODEL_NAME}")
|
| 350 |
|
| 351 |
try:
|
| 352 |
+
# Convert image bytes to base64 data URL
|
|
|
|
|
|
|
|
|
|
| 353 |
image_base64 = base64.b64encode(image_bytes).decode('utf-8')
|
| 354 |
+
image_data_url = f"data:image/jpeg;base64,{image_base64}"
|
| 355 |
|
| 356 |
+
# Use chat.completions.create() as shown in HuggingFace documentation
|
| 357 |
+
# This uses the router endpoint which is now required
|
| 358 |
+
# Run in executor since it's a blocking synchronous call
|
| 359 |
+
import asyncio
|
| 360 |
+
loop = asyncio.get_event_loop()
|
| 361 |
+
completion = await loop.run_in_executor(
|
| 362 |
+
None,
|
| 363 |
+
lambda: client.chat.completions.create(
|
| 364 |
+
model=HF_MODEL_NAME,
|
| 365 |
+
messages=[
|
| 366 |
{
|
| 367 |
"role": "user",
|
| 368 |
"content": [
|
| 369 |
+
{
|
| 370 |
+
"type": "text",
|
| 371 |
+
"text": prompt
|
| 372 |
+
},
|
| 373 |
+
{
|
| 374 |
+
"type": "image_url",
|
| 375 |
+
"image_url": {
|
| 376 |
+
"url": image_data_url
|
| 377 |
+
}
|
| 378 |
+
}
|
| 379 |
]
|
| 380 |
}
|
| 381 |
+
],
|
| 382 |
+
max_tokens=2048,
|
| 383 |
+
temperature=0.1
|
| 384 |
+
)
|
| 385 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 386 |
|
| 387 |
+
# Extract response text from completion
|
| 388 |
+
if hasattr(completion, 'choices') and len(completion.choices) > 0:
|
| 389 |
+
message = completion.choices[0].message
|
| 390 |
+
if hasattr(message, 'content'):
|
| 391 |
+
response_text = message.content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
else:
|
| 393 |
+
response_text = str(message)
|
|
|
|
|
|
|
| 394 |
else:
|
| 395 |
+
response_text = str(completion)
|
| 396 |
|
| 397 |
if not response_text:
|
| 398 |
raise ValueError("Empty response from HuggingFace API")
|
|
|
|
| 400 |
print(f"[DEBUG] HuggingFace response preview: {response_text[:500]}")
|
| 401 |
|
| 402 |
return _parse_model_response(response_text, page_num)
|
|
|
|
|
|
|
|
|
|
| 403 |
except Exception as e:
|
| 404 |
+
error_msg = str(e)
|
| 405 |
+
print(f"[ERROR] HuggingFace API error details: {type(e).__name__}: {error_msg}")
|
| 406 |
+
|
| 407 |
+
# Check if it's a permissions error
|
| 408 |
+
if "403" in error_msg or "permissions" in error_msg.lower() or "Forbidden" in error_msg:
|
| 409 |
+
raise RuntimeError(
|
| 410 |
+
f"HuggingFace API error for page {page_num}: Insufficient permissions. "
|
| 411 |
+
"Your HF_TOKEN may need to be a token with 'read' access to Inference API. "
|
| 412 |
+
"Check your HuggingFace account settings and token permissions."
|
| 413 |
+
)
|
| 414 |
+
raise RuntimeError(f"HuggingFace API error for page {page_num}: {error_msg}")
|
| 415 |
|
| 416 |
|
| 417 |
def _parse_model_response(text: str, page_num: int = None) -> Dict[str, Any]:
|