Commit 1dc1012

Modularization, Batch servers and Documentation
1 parent f69de58 commit 1dc1012

9 files changed: +777 -706 lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ RUN apt-get update && apt-get install -y \
 
 # Install Python dependencies
 RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt && \
-    pip install "fastapi[standard]" "uvicorn[standard]" httpx fastapi-mcp
+    pip install "fastapi[standard]" "uvicorn[standard]" httpx fastapi-mcp psutil
 
 # (Optional) Run your setup_env.py if needed
 RUN python /code/setup_env.py -md /code/models/BitNet-b1.58-2B-4T -q i2_s
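
psutil is the only new package in this layer; the new process-management module later in this commit uses it to size the pool of batch servers against free RAM. A minimal sketch of the call it relies on (the one-GiB-per-server budget is that module's default, not anything psutil defines):

# Estimate how many ~1 GiB server instances fit in currently free RAM.
import psutil

mem = psutil.virtual_memory()
available_gb = (mem.total - mem.used) / (1024 ** 3)
print(int(available_gb // 1))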

app/lib/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-from .endpoints import ChatRequest
+from .endpoints.chat_endpoints import ChatRequest
 from typing import List
 from pydantic import BaseModel

app/lib/endpoints.py

Lines changed: 0 additions & 672 deletions
This file was deleted; its contents were split into the new app/lib/endpoints/ package below.

Lines changed: 191 additions & 0 deletions
@@ -0,0 +1,191 @@
import asyncio
import logging
import os
import subprocess  # kept for subprocess.CalledProcessError

from fastapi import Depends, HTTPException, Query

from ..models import ModelEnum, BenchmarkRequest, PerplexityRequest
from ..utils import parse_benchmark_data, parse_perplexity_data

# --- Logging configuration for this module ---
logger = logging.getLogger(__name__)


def validate_prompt_length(
    prompt: str = Query(..., description="Input text for perplexity calculation"),
    ctx_size: int = Query(10, gt=3),
) -> str:
    """Reject prompts with fewer whitespace tokens than twice the context size."""
    token_count = len(prompt.split())
    min_tokens = 2 * ctx_size
    if token_count < min_tokens:
        raise HTTPException(
            status_code=400,
            detail=f"Prompt too short. Needs at least {min_tokens} tokens, got {token_count}"
        )
    return prompt


async def run_benchmark(
    model: ModelEnum,
    n_token: int = Query(128, gt=0),
    threads: int = Query(2, gt=0),
    n_prompt: int = Query(32, gt=0)
):
    """Run benchmark on the specified model."""
    request = BenchmarkRequest(model=model, n_token=n_token, threads=threads, n_prompt=n_prompt)
    build_dir = os.getenv("BUILD_DIR", "build")
    bench_path = os.path.join(build_dir, "bin", "llama-bench")
    if not os.path.exists(bench_path):
        logger.error(f"Benchmark binary not found at '{bench_path}'.")
        raise HTTPException(status_code=500, detail="Benchmark binary not found")
    command = [
        bench_path,
        '-m', request.model.value,
        '-n', str(request.n_token),
        '-ngl', '0',
        '-b', '1',
        '-t', str(request.threads),
        '-p', str(request.n_prompt),
        '-r', '5'
    ]
    try:
        logger.info(f"Running benchmark with command: {' '.join(command)}")
        # Use asyncio.create_subprocess_exec instead of subprocess.run so the
        # event loop is not blocked while the benchmark runs.
        process = await asyncio.create_subprocess_exec(
            *command,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout_bytes, stderr_bytes = await process.communicate()  # wait for completion

        if process.returncode != 0:
            logger.error(f"Benchmark failed. RC: {process.returncode}. Stderr: {stderr_bytes.decode(errors='ignore')}")
            raise subprocess.CalledProcessError(
                process.returncode, cmd=command, output=stdout_bytes, stderr=stderr_bytes
            )

        parsed_data = parse_benchmark_data(stdout_bytes.decode(errors='ignore'))
        logger.info("Benchmark completed successfully.")
        return parsed_data
    except subprocess.CalledProcessError as e:
        logger.error(f"Benchmark failed: {str(e)}. Command: {e.cmd}. RC: {e.returncode}. Stdout: {e.stdout.decode(errors='ignore') if e.stdout else ''}. Stderr: {e.stderr.decode(errors='ignore') if e.stderr else ''}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Benchmark failed: {e.stderr.decode(errors='ignore') if e.stderr else str(e)}")
    except Exception as e:  # catch any other unexpected errors
        logger.error(f"Unexpected error during benchmark: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred during benchmark: {str(e)}")


async def run_perplexity(
    model: ModelEnum,
    prompt: str = Depends(validate_prompt_length),
    threads: int = Query(2, gt=0),
    ctx_size: int = Query(10, gt=3),
    ppl_stride: int = Query(0, ge=0)
):
    """Calculate perplexity for the given text and model."""
    try:
        request = PerplexityRequest(
            model=model,
            prompt=prompt,
            threads=threads,
            ctx_size=ctx_size,
            ppl_stride=ppl_stride
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    build_dir = os.getenv("BUILD_DIR", "build")
    ppl_path = os.path.join(build_dir, "bin", "llama-perplexity")
    if not os.path.exists(ppl_path):
        logger.error(f"Perplexity binary not found at '{ppl_path}'.")
        raise HTTPException(status_code=500, detail="Perplexity binary not found")

    command = [
        ppl_path,
        '--model', request.model.value,
        '--prompt', request.prompt,
        '--threads', str(request.threads),
        '--ctx-size', str(request.ctx_size),
        '--perplexity',
        '--ppl-stride', str(request.ppl_stride)
    ]

    try:
        logger.info(f"Running perplexity calculation with command: {' '.join(command)}")
        process = await asyncio.create_subprocess_exec(
            *command,
            stdout=asyncio.subprocess.PIPE,  # llama-perplexity may write to stdout or stderr
            stderr=asyncio.subprocess.PIPE
        )
        stdout_bytes, stderr_bytes = await process.communicate()

        if process.returncode != 0:
            logger.error(f"Perplexity calculation failed. RC: {process.returncode}. Stderr: {stderr_bytes.decode(errors='ignore')}")
            raise subprocess.CalledProcessError(
                process.returncode, cmd=command, output=stdout_bytes, stderr=stderr_bytes
            )

        # The original implementation parsed results from stderr; keep that behavior.
        parsed_data = parse_perplexity_data(stderr_bytes.decode(errors='ignore'))
        logger.info("Perplexity calculation completed successfully.")
        return parsed_data
    except subprocess.CalledProcessError as e:
        logger.error(f"Perplexity calculation failed: {str(e)}. Command: {e.cmd}. RC: {e.returncode}. Stderr: {e.stderr.decode(errors='ignore') if e.stderr else ''}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Perplexity calculation failed: {e.stderr.decode(errors='ignore') if e.stderr else str(e)}")
    except Exception as e:  # catch any other unexpected errors
        logger.error(f"Unexpected error during perplexity calculation: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred during perplexity calculation: {str(e)}")


def get_model_sizes():
    """Report the file sizes of supported .gguf models."""
    model_sizes = {}
    models_dir = "models"
    for subdir in os.listdir(models_dir):
        subdir_path = os.path.join(models_dir, subdir)
        if os.path.isdir(subdir_path):
            for file in os.listdir(subdir_path):
                if file.endswith(".gguf"):
                    file_path = os.path.join(subdir_path, file)
                    file_size_bytes = os.path.getsize(file_path)
                    model_sizes[file] = {
                        "bytes": file_size_bytes,
                        "MB": round(file_size_bytes / (1024 * 1024), 3),
                        "GB": round(file_size_bytes / (1024 ** 3), 3),
                    }
    return model_sizes
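
The handlers in this module are plain async functions rather than decorated routes, so the application can mount them wherever it wants. One possible wiring is sketched below; the route paths and the router object are assumptions, since the commit's actual registration code is not in the hunks shown here.

# Hypothetical wiring sketch; the paths are assumptions, not taken from this commit.
from fastapi import APIRouter

router = APIRouter()
router.get("/benchmark")(run_benchmark)      # Query/Depends defaults become query parameters
router.get("/perplexity")(run_perplexity)
router.get("/model-sizes")(get_model_sizes)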

app/lib/endpoints/chat_endpoints.py

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
import asyncio
import logging
from typing import List

import httpx
from fastapi import HTTPException
from pydantic import BaseModel

from .process_management import get_server_processes, get_server_configs

logger = logging.getLogger(__name__)


class ChatRequest(BaseModel):
    message: str
    port: int = 8081
    threads: int = 1
    ctx_size: int = 2048
    n_predict: int = 256
    temperature: float = 0.8


async def chat_with_bitnet(chat: ChatRequest):
    """Forward a single chat message to the BitNet server listening on chat.port."""
    host = "127.0.0.1"
    key = (host, chat.port)
    proc_entry = get_server_processes().get(key)
    cfg = get_server_configs().get(key)
    if not (proc_entry and proc_entry["process"].returncode is None and cfg):
        logger.warning(f"Chat request to non-existent or stopped server on port {chat.port}.")
        raise HTTPException(status_code=404, detail=f"Server on port {chat.port} not running or not configured.")
    server_url = f"http://{host}:{chat.port}/completion"
    payload = {
        "prompt": chat.message,
        "threads": chat.threads,
        "ctx_size": chat.ctx_size,
        "n_predict": chat.n_predict,
        "temperature": chat.temperature
    }
    async with httpx.AsyncClient() as client:
        try:
            logger.info(f"Forwarding chat message to BitNet server on port {chat.port}.")
            response = await client.post(server_url, json=payload, timeout=300.0)
            response.raise_for_status()
            return response.json()
        except httpx.ReadTimeout:
            logger.error(f"ReadTimeout when communicating with BitNet server on port {chat.port}.")
            raise HTTPException(status_code=504, detail=f"Request to BitNet server on port {chat.port} timed out.")
        except httpx.ConnectError:
            logger.error(f"ConnectError when communicating with BitNet server on port {chat.port}.")
            raise HTTPException(status_code=503, detail=f"Could not connect to BitNet server on port {chat.port}.")
        except httpx.HTTPStatusError as e:
            logger.error(f"HTTPStatusError from BitNet server on port {chat.port}: {e.response.status_code} - {e.response.text}", exc_info=True)
            raise HTTPException(status_code=e.response.status_code, detail=f"BitNet server error: {e.response.text}")
        except Exception as e:
            logger.error(f"Unexpected error during chat with BitNet server on port {chat.port}: {str(e)}", exc_info=True)
            raise HTTPException(status_code=500, detail=f"An unexpected error occurred while communicating with BitNet server on port {chat.port}: {str(e)}")


class MultiChatRequest(BaseModel):
    requests: List[ChatRequest]


async def multichat_with_bitnet(multichat: MultiChatRequest):
    """Fan a batch of chat requests out concurrently and collect per-request results."""
    logger.info(f"Multichat request received for {len(multichat.requests)} chats.")
    results = await asyncio.gather(
        *(chat_with_bitnet(req) for req in multichat.requests),
        return_exceptions=True
    )
    formatted = []
    for res in results:
        if isinstance(res, HTTPException):
            formatted.append({"error": res.detail})
        elif isinstance(res, Exception):
            formatted.append({"error": str(res)})
        elif isinstance(res, dict) and "content" in res:
            formatted.append(res["content"])
        else:
            formatted.append(res)
    logger.info("Multichat processing completed.")
    return {"results": formatted}
app/lib/endpoints/process_management.py

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
import asyncio
import atexit
import logging
import psutil
from typing import Any, Dict, Tuple

logger = logging.getLogger(__name__)

# Registry of managed BitNet server subprocesses, keyed by (host, port).
server_processes: Dict[Tuple[str, int], Dict[str, Any]] = {}
server_configs: Dict[Tuple[str, int], Dict[str, Any]] = {}

FASTAPI_PORT = 8080
_atexit_cleanup_completed = False


def get_server_processes():
    return server_processes


def get_server_configs():
    return server_configs


def _max_server_instances_by_ram(per_server_gb=1):
    """Estimate how many server instances fit in the currently free RAM."""
    mem = psutil.virtual_memory()
    available_gb = (mem.total - mem.used) / (1024 ** 3)
    return int(available_gb // per_server_gb)


async def _terminate_server_process(key: Tuple[str, int]):
    """Terminate the tracked server at (host, port), escalating SIGTERM to SIGKILL."""
    host, port = key
    if port == FASTAPI_PORT:
        logger.warning(f"Attempt to terminate FastAPI server on port {port} denied.")
        return f"Operation denied: Port {port} is used by the FastAPI application and cannot be terminated via this endpoint."
    proc_entry = server_processes.get(key)
    if not proc_entry:
        server_configs.pop(key, None)
        logger.info(f"No server process found for key {key} (port {port}) during termination attempt.")
        return f"No server process found for key {key} (port {port})."
    proc_to_terminate = proc_entry["process"]
    pid = proc_entry["pid"]
    if proc_to_terminate.returncode is None:
        logger.info(f"Attempting to terminate server on port {port} (PID: {pid}).")
        try:
            proc_to_terminate.terminate()
            await asyncio.wait_for(proc_to_terminate.wait(), timeout=5.0)
            logger.info(f"Server on port {port} (PID: {pid}) terminated successfully after SIGTERM.")
            server_processes.pop(key, None)
            server_configs.pop(key, None)
            return f"Server on port {port} (PID: {pid}) terminated successfully."
        except asyncio.TimeoutError:
            logger.warning(f"Server on port {port} (PID: {pid}) did not respond to SIGTERM within timeout. Attempting SIGKILL.")
            try:
                proc_to_terminate.kill()
                await proc_to_terminate.wait()
                logger.info(f"Server on port {port} (PID: {pid}) forcefully killed.")
                server_processes.pop(key, None)
                server_configs.pop(key, None)
                return f"Server on port {port} (PID: {pid}) forcefully killed as it did not respond to SIGTERM."
            except Exception as e_kill:
                logger.error(f"Error forcefully killing server on port {port} (PID: {pid}): {str(e_kill)}", exc_info=True)
                return f"Error forcefully killing server on port {port} (PID: {pid}): {str(e_kill)}. Process may still be running."
        except Exception as e_term:
            logger.error(f"Error terminating server on port {port} (PID: {pid}) with SIGTERM: {str(e_term)}", exc_info=True)
            return f"Error terminating server on port {port} (PID: {pid}): {str(e_term)}. Process may still be running."
    else:
        logger.info(f"Server on port {port} was already stopped (return code: {proc_to_terminate.returncode}). Cleaned up tracking.")
        server_processes.pop(key, None)
        server_configs.pop(key, None)
        return f"Server on port {port} was already stopped. Cleaned up tracking."


async def _terminate_all_servers():
    """Terminate every tracked server; runs at most once, at interpreter exit."""
    global _atexit_cleanup_completed
    if _atexit_cleanup_completed:
        return
    logger.info("Attempting to terminate all running server processes asynchronously at exit.")
    keys_to_terminate = list(server_processes.keys())
    tasks = [_terminate_server_process(key) for key in keys_to_terminate]
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for key, result in zip(keys_to_terminate, results):
        if isinstance(result, Exception):
            logger.error(f"Error during atexit termination for server {key}: {result}", exc_info=result)
        else:
            logger.info(f"Atexit termination for server {key}: {result}")
    logger.info("Asynchronous termination of all server processes at exit completed.")
    _atexit_cleanup_completed = True


def _run_async_cleanup_on_exit():
    try:
        asyncio.run(_terminate_all_servers())
    except RuntimeError as e:
        if ("cannot schedule new futures after shutdown" in str(e).lower()
                or "event loop is closed" in str(e).lower()):
            logger.warning(f"Could not run async cleanup at exit because event loop was closed or shutting down: {e}")
        else:
            logger.error(f"Unexpected RuntimeError during atexit async cleanup: {e}", exc_info=True)
    except Exception as e:
        logger.error(f"Unexpected Exception during atexit async cleanup: {e}", exc_info=True)


atexit.register(_run_async_cleanup_on_exit)
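
The registries above only track servers; the code that launches them is not part of the hunks shown here. Below is a sketch of how a start path might consult _max_server_instances_by_ram before spawning another instance; the binary name, flags, and one-GB budget are all assumptions:

# Hypothetical launcher sketch; binary path and flags are assumptions.
import asyncio

async def start_bitnet_server(host: str, port: int, model_path: str):
    if len(server_processes) >= _max_server_instances_by_ram(per_server_gb=1):
        raise RuntimeError("Not enough free RAM for another server instance")
    proc = await asyncio.create_subprocess_exec(
        "build/bin/llama-server", "-m", model_path,
        "--host", host, "--port", str(port),
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    # Register under the same (host, port) key the termination helpers use.
    server_processes[(host, port)] = {"process": proc, "pid": proc.pid}
    server_configs[(host, port)] = {"model": model_path}
    return proc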
