{"service":"LLM Router","description":"Unified LLM gateway: local Gemma 4 31B, PaddleX OCR, and external API passthrough (single port)","endpoints":{"GET /":"This help message","GET /help":"This help message","GET /health":"Server status and GPU info","GET /health/liveliness":"Simple liveness check","GET /v1/models":"List available models","POST /v1/chat/completions":"Chat completion (OpenAI-compatible, local + external)","POST /v1/layout/detect":"Layout detection (PaddleX PP-DocLayoutV2), body: {\"image\": \"<base64>\"}","POST /v1/ocr/predict":"OCR (PaddleX PaddleOCR-VL), body: {\"image\": \"<base64>\"}"},"models":{"local":"Gemma 4 31B Q4_K_M on GPU 0+1 (32K context)","openai/*":"Proxied to OpenAI (e.g. openai/gpt-5.4)","anthropic/*":"Proxied to Anthropic (e.g. anthropic/claude-4-sonnet)","google/*":"Proxied to Google (e.g. google/gemini-2.0-flash)","paddlex":"Load balanced across paddlex-0 (GPU 0) and paddlex-1 (GPU 1) for layout/OCR"},"aliases":{"gemma-27b":"local (backward compat)","gemma-31b":"local","gemma":"local"},"example_curl":"curl http://localhost:8088/v1/chat/completions -H \"Content-Type: application/json\" -H \"Authorization: Bearer sk-llm-r...\" -d '{\"model\": \"local\", \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]}'","example_python":"from openai import OpenAI\nclient = OpenAI(base_url=\"http://localhost:8088/v1\", api_key=\"sk-llm-r...\")\nresponse = client.chat.completions.create(\n    model=\"local\",\n    messages=[{\"role\": \"user\", \"content\": \"Hello!\"}]\n)","notes":{"context":"32K context (dual GPU tensor split)","lazy_loading":"Models load on first request (~30-60s for Gemma 4 31B)","auto_unload":"Models unload after 120s inactivity","multimodal":"Supports images via base64 data URLs","thinking":"Gemma 4 thinking is OFF by default. To enable, send \"thinking\": true in the request body.","tracing":"Langfuse tracing is ON by default for local models. To disable, send \"trace\": false in the request body.","external_models":"Models prefixed with openai/, anthropic/, or google/ are proxied to their respective APIs via internal LiteLLM."}}