Seth0330 committed on
Commit
93fcaaf
·
verified ·
1 Parent(s): 32c001b

Update backend/app/openrouter_client.py

Browse files
Files changed (1) hide show
  1. backend/app/openrouter_client.py +61 -32
backend/app/openrouter_client.py CHANGED
@@ -23,7 +23,7 @@ MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"
23
  # HuggingFace Inference API
24
  HF_TOKEN = os.environ.get("HF_TOKEN")
25
  HF_INFERENCE_API_URL = "https://api-inference.huggingface.co/models"
26
- HF_MODEL_NAME = os.environ.get("HF_MODEL_NAME", "Qwen/Qwen2-VL-7B-Instruct") # Alternative HF model
27
 
28
  # Backend selection: "openrouter" or "huggingface"
29
  EXTRACTION_BACKEND = os.environ.get("EXTRACTION_BACKEND", "openrouter").lower()
@@ -325,17 +325,10 @@ async def _extract_with_openrouter_single(image_bytes: bytes, page_num: int, tot
325
 
326
 
327
  async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
328
- """Extract from a single page using HuggingFace Inference API."""
329
  if not HF_TOKEN:
330
  raise RuntimeError("HF_TOKEN environment variable is not set")
331
 
332
- try:
333
- from huggingface_hub import InferenceClient
334
- except ImportError:
335
- raise RuntimeError("huggingface_hub not installed. Add it to requirements.txt")
336
-
337
- client = InferenceClient(model=HF_MODEL_NAME, token=HF_TOKEN, timeout=180.0)
338
-
339
  prompt = (
340
  f"Read this document page ({page_num} of {total_pages}) and extract ALL text content. "
341
  "Extract every word, number, and piece of information, including any non-English text. "
@@ -345,39 +338,70 @@ async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int)
345
  print(f"[INFO] HuggingFace: Processing page {page_num} with model {HF_MODEL_NAME}")
346
 
347
  try:
348
- # Convert image bytes to base64 data URL for HuggingFace API
 
 
 
349
  image_base64 = base64.b64encode(image_bytes).decode('utf-8')
350
- image_data_url = f"data:image/jpeg;base64,{image_base64}"
351
 
352
- # HF Inference API for vision models - use chat completion with base64 image
353
- result = client.chat_completion(
354
- messages=[
355
- {
356
- "role": "user",
357
- "content": [
358
- {"type": "text", "text": prompt},
359
- {"type": "image", "image": image_data_url} # Use base64 data URL, not raw bytes
360
- ]
361
- }
362
- ],
363
- max_tokens=2048
364
- )
 
 
 
 
 
 
365
 
366
- # Extract response text
367
- if isinstance(result, dict):
368
- if "choices" in result and len(result["choices"]) > 0:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
  message = result["choices"][0].get("message", {})
370
- if isinstance(message.get("content"), list):
371
- # Content might be a list of content blocks
372
  response_text = "".join(
373
  item.get("text", "")
374
- for item in message["content"]
375
  if item.get("type") == "text"
376
  )
377
  else:
378
- response_text = message.get("content", "")
379
  else:
380
- response_text = result.get("generated_text", str(result))
381
  elif isinstance(result, str):
382
  response_text = result
383
  else:
@@ -385,8 +409,13 @@ async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int)
385
 
386
  if not response_text:
387
  raise ValueError("Empty response from HuggingFace API")
 
 
388
 
389
  return _parse_model_response(response_text, page_num)
 
 
 
390
  except Exception as e:
391
  print(f"[ERROR] HuggingFace API error details: {type(e).__name__}: {str(e)}")
392
  raise RuntimeError(f"HuggingFace API error for page {page_num}: {str(e)}")
 
23
# HuggingFace Inference API
# HF_TOKEN authenticates requests to the hosted Inference API;
# _extract_with_hf raises RuntimeError when it is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Base URL of the standard (non-router) Inference API; the model name is
# appended as a path segment when building the request URL.
HF_INFERENCE_API_URL = "https://api-inference.huggingface.co/models"
# Vision-language model used for page text extraction; overridable via env.
HF_MODEL_NAME = os.environ.get("HF_MODEL_NAME", "Qwen/Qwen3-VL-235B-A22B-Instruct") # Default HF model

# Backend selection: "openrouter" or "huggingface"
EXTRACTION_BACKEND = os.environ.get("EXTRACTION_BACKEND", "openrouter").lower()
 
325
 
326
 
327
  async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
328
+ """Extract from a single page using HuggingFace Inference API (standard endpoint)."""
329
  if not HF_TOKEN:
330
  raise RuntimeError("HF_TOKEN environment variable is not set")
331
 
 
 
 
 
 
 
 
332
  prompt = (
333
  f"Read this document page ({page_num} of {total_pages}) and extract ALL text content. "
334
  "Extract every word, number, and piece of information, including any non-English text. "
 
338
  print(f"[INFO] HuggingFace: Processing page {page_num} with model {HF_MODEL_NAME}")
339
 
340
  try:
341
+ # Use standard HuggingFace Inference API endpoint (not chat_completion/router)
342
+ api_url = f"{HF_INFERENCE_API_URL}/{HF_MODEL_NAME}"
343
+
344
+ # Convert image to base64
345
  image_base64 = base64.b64encode(image_bytes).decode('utf-8')
 
346
 
347
+ # For Qwen3-VL models, use the chat format through standard API
348
+ # The standard API accepts chat-completion format for compatible models
349
+ payload = {
350
+ "inputs": {
351
+ "messages": [
352
+ {
353
+ "role": "user",
354
+ "content": [
355
+ {"type": "text", "text": prompt},
356
+ {"type": "image", "image": f"data:image/jpeg;base64,{image_base64}"}
357
+ ]
358
+ }
359
+ ]
360
+ },
361
+ "parameters": {
362
+ "max_new_tokens": 2048,
363
+ "temperature": 0.1
364
+ }
365
+ }
366
 
367
+ headers = {
368
+ "Authorization": f"Bearer {HF_TOKEN}",
369
+ "Content-Type": "application/json"
370
+ }
371
+
372
+ timeout = httpx.Timeout(180.0, connect=30.0)
373
+ async with httpx.AsyncClient(timeout=timeout) as client:
374
+ print(f"[INFO] Making POST request to {api_url}...")
375
+ resp = await client.post(api_url, headers=headers, json=payload)
376
+ print(f"[INFO] Received response: Status {resp.status_code}")
377
+ resp.raise_for_status()
378
+ result = resp.json()
379
+
380
+ # Extract response text - format depends on model response
381
+ response_text = None
382
+ if isinstance(result, list) and len(result) > 0:
383
+ # Standard API often returns list with generated_text
384
+ response_text = result[0].get("generated_text", str(result[0]))
385
+ elif isinstance(result, dict):
386
+ # Check for different response formats
387
+ if "generated_text" in result:
388
+ response_text = result["generated_text"]
389
+ elif "text" in result:
390
+ response_text = result["text"]
391
+ elif "choices" in result and len(result["choices"]) > 0:
392
+ # Chat completion format
393
  message = result["choices"][0].get("message", {})
394
+ content = message.get("content", "")
395
+ if isinstance(content, list):
396
  response_text = "".join(
397
  item.get("text", "")
398
+ for item in content
399
  if item.get("type") == "text"
400
  )
401
  else:
402
+ response_text = content
403
  else:
404
+ response_text = str(result)
405
  elif isinstance(result, str):
406
  response_text = result
407
  else:
 
409
 
410
  if not response_text:
411
  raise ValueError("Empty response from HuggingFace API")
412
+
413
+ print(f"[DEBUG] HuggingFace response preview: {response_text[:500]}")
414
 
415
  return _parse_model_response(response_text, page_num)
416
+ except httpx.HTTPStatusError as e:
417
+ print(f"[ERROR] HuggingFace API HTTP error: {e.response.status_code} - {e.response.text[:500]}")
418
+ raise RuntimeError(f"HuggingFace API error for page {page_num}: {e.response.status_code} - {str(e)}")
419
  except Exception as e:
420
  print(f"[ERROR] HuggingFace API error details: {type(e).__name__}: {str(e)}")
421
  raise RuntimeError(f"HuggingFace API error for page {page_num}: {str(e)}")