commit 1e2567fd59ca7e1cef12ea5de007a28c2c4981b3 Author: mrmarcus007 Date: Mon Nov 10 16:02:57 2025 +0100 first commit diff --git a/GPU Resource Manager Proxy for Ollama (Designed for Proxmox).py b/GPU Resource Manager Proxy for Ollama (Designed for Proxmox).py new file mode 100644 index 0000000..80e8d08 --- /dev/null +++ b/GPU Resource Manager Proxy for Ollama (Designed for Proxmox).py @@ -0,0 +1,377 @@ +#!/usr/bin/env python3 +import http.server +import socketserver +import requests +import json +import time +import threading +import subprocess +from datetime import datetime, time as dt_time +from urllib.parse import urlparse, parse_qs +import logging + +# Configuration +OLLAMA_HOST = "localhost" # Your Ollama LXC IP +OLLAMA_PORT = 11434 # Your Ollama LXC port +PROXY_PORT = 11435 # Port of this proxy +OLLAMA_BASE_URL = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}" # Don't touch unless you know what you are doing. + +# GPU monitoring +GPU_CHECK_INTERVAL = 10 # seconds to wait between checks for GPU processes other than the known/compute processes + +# Idle-workload process name patterns (from your nvidia-smi output) +IDLE_NvGPU_PROCESSES = ['t-rex', 'trex', 'miner', 'xmrig', 'lolminer', 'nbminer'] +KNOWN_NvidiaGPU_PROCESSES = ['Xorg'] # Processes that are allowed while the "idle" compute process is running +IDLE_CONTAINER_ID = "" # ID of the LXC container running the idle GPU workload, example: IDLE_CONTAINER_ID = "120" +Blackout_schedule_Start = 2, 15 # (hour, minute) when to start stopping the idle NvGPU container. +Blackout_schedule_End = 3, 30 # (hour, minute) when to allow starting the idle NvGPU container again. 
#----------------------------------------------------------active-code--------------------------------------------------------------#
_LOG_FILE = '/var/log/gpu_proxy.log'
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'

# Client request headers that are NOT copied onto the upstream request:
# hop-by-hop headers, plus headers that `requests` must compute itself
# (Host, Content-Length, Accept-Encoding) for the upstream connection.
_REQUEST_HEADERS_NOT_FORWARDED = frozenset({
    'connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization',
    'te', 'trailers', 'upgrade', 'transfer-encoding',
    'host', 'content-length', 'accept-encoding',
})

# Upstream response headers that are NOT relayed to the client. The body is
# relayed already decoded and un-chunked, and send_response() emits its own
# Server/Date headers.
_RESPONSE_HEADERS_NOT_RELAYED = frozenset({
    'content-encoding', 'transfer-encoding', 'connection', 'keep-alive',
    'date', 'server',
})

# Endpoints that make Ollama load or run a model on the GPU.
_GPU_INFERENCE_PATHS = frozenset({
    '/api/generate', '/api/chat', '/api/embeddings', '/api/embed', '/api/load',
    '/v1/chat/completions', '/v1/completions', '/v1/embeddings',
})


def _configure_logging():
    """Configure root logging to the log file and the console.

    If the log file cannot be opened (missing directory, no permission),
    logging falls back to the console only instead of aborting start-up.
    """
    handlers = [logging.StreamHandler()]
    file_error = None
    try:
        handlers.insert(0, logging.FileHandler(_LOG_FILE))
    except OSError as exc:
        file_error = exc

    logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=handlers)
    if file_error is not None:
        logging.warning(f"Cannot write {_LOG_FILE} ({file_error}); logging to console only")


_configure_logging()


class GPUResourceManager:
    """Arbitrate the GPU between Ollama and an "idle" workload LXC container.

    The idle container (``IDLE_CONTAINER_ID``) is stopped through ``pct``
    whenever Ollama needs the GPU, another process uses it, or the blackout
    window is active, and is started again once the GPU is free.
    """

    def __init__(self):
        self.idle_compute_running = False
        self.ollama_active = False
        self.last_ollama_activity = 0
        self.ollama_activity_timeout = 120  # seconds of no activity before considering Ollama done
        self.last_gpu_check = 0
        self.gpu_processes = []
        self.lock = threading.Lock()
        self.operation_in_progress = False

    def run_command(self, cmd, timeout=None):
        """Run *cmd* and return its stripped stdout, or ``None`` on failure.

        Args:
            cmd: A shell command line (``str``, run through the shell) or an
                argument list (run directly, without a shell).
            timeout: Optional limit in seconds; ``None`` waits indefinitely.

        Returns:
            The command's stdout without surrounding whitespace, or ``None``
            when it exits non-zero, cannot be started, or times out.
        """
        try:
            result = subprocess.run(
                cmd,
                shell=isinstance(cmd, str),
                capture_output=True,
                text=True,
                check=True,
                timeout=timeout,
            )
            return result.stdout.strip()
        except subprocess.CalledProcessError as e:
            logging.error(f"Command failed: {cmd}, error: {e.stderr}")
        except subprocess.TimeoutExpired:
            logging.error(f"Command timed out after {timeout}s: {cmd}")
        except OSError as e:
            # e.g. the executable does not exist on this system
            logging.error(f"Command could not be started: {cmd}, error: {e}")
        return None

    def is_container_running(self, container_id):
        """Return True if ``pct list`` reports *container_id* as running.

        The VMID column is compared exactly, so ID ``12`` does not match
        container ``120``. An empty ID never matches.
        """
        wanted = str(container_id).strip()
        if not wanted:
            return False

        output = self.run_command(["pct", "list"])
        if not output:
            return False

        for line in output.splitlines():
            fields = line.split()  # columns: VMID, Status, [Lock], Name
            if len(fields) >= 2 and fields[0] == wanted:
                return fields[1] == "running"
        return False

    def _wait_for_container_state(self, container_id, want_running, max_wait):
        """Poll once per second until the container reaches the wanted state.

        Returns True if the state was reached within *max_wait* seconds.
        """
        waited = 0
        while self.is_container_running(container_id) != want_running and waited < max_wait:
            time.sleep(1)
            waited += 1
        return self.is_container_running(container_id) == want_running

    def stop_container(self, container_id):
        """Stop the container; return True if it is stopped afterwards."""
        if not str(container_id).strip():
            logging.error("No container ID configured, cannot stop container")
            return False

        if not self.is_container_running(container_id):
            logging.debug(f"Container {container_id} already stopped, no action needed")
            return True

        logging.info(f"Stopping container {container_id}")
        if self.run_command(["pct", "stop", str(container_id)]) is None:
            logging.error(f"Failed to stop container {container_id}")
            return False

        max_wait = 15
        if self._wait_for_container_state(container_id, False, max_wait):
            logging.info(f"Container {container_id} stopped successfully")
            return True

        logging.warning(f"Container {container_id} still running after {max_wait} seconds")
        return False

    def start_container(self, container_id):
        """Start the container; return True if it is running afterwards."""
        if not str(container_id).strip():
            logging.error("No container ID configured, cannot start container")
            return False

        if self.is_container_running(container_id):
            logging.debug(f"Container {container_id} already running, no action needed")
            return True

        logging.info(f"Starting container {container_id}")
        if self.run_command(["pct", "start", str(container_id)]) is None:
            logging.error(f"Failed to start container {container_id}")
            return False

        max_wait = 15
        if self._wait_for_container_state(container_id, True, max_wait):
            logging.info(f"Container {container_id} started successfully")
            return True

        logging.error(f"Container {container_id} failed to start within {max_wait} seconds")
        return False

    def get_gpu_processes(self):
        """Return the compute processes reported by ``nvidia-smi``.

        Returns:
            A list of ``{'pid', 'name', 'memory'}`` dicts (memory formatted
            as ``"<n> MiB"``); an empty list if the query fails.
        """
        try:
            output = self.run_command(
                [
                    "nvidia-smi",
                    "--query-compute-apps=pid,process_name,used_memory",
                    "--format=csv,noheader,nounits",
                ],
                timeout=30,
            )
            processes = []
            for line in (output or "").split('\n'):
                if not line.strip():
                    continue
                parts = [part.strip() for part in line.split(',')]
                if len(parts) >= 3:
                    processes.append({
                        'pid': parts[0],
                        # a process name may itself contain commas
                        'name': ','.join(parts[1:-1]),
                        'memory': parts[-1] + ' MiB',
                    })
            return processes
        except Exception as e:
            logging.error(f"Error getting GPU processes: {e}")
            return []

    def is_compute_process(self, process_name):
        """Return True if the name matches an idle-workload pattern (case-insensitive)."""
        process_lower = process_name.lower()
        return any(pattern in process_lower for pattern in IDLE_NvGPU_PROCESSES)

    def is_known_system_process(self, process_name):
        """Return True if the name contains an allow-listed system process name."""
        return any(sys_proc in process_name for sys_proc in KNOWN_NvidiaGPU_PROCESSES)

    def is_gpu_idle(self):
        """Return True when no foreign process holds significant GPU memory.

        Idle-workload and allow-listed processes are ignored. Any other
        process using more than 100 MiB, or whose memory usage cannot be
        parsed (e.g. ``[N/A]``), counts as GPU activity.
        """
        processes = self.get_gpu_processes()

        non_mining_processes = []
        for process in processes:
            name = process['name']
            if self.is_compute_process(name) or self.is_known_system_process(name):
                continue
            try:
                memory_usage = int(process['memory'].split()[0])
            except (ValueError, IndexError):
                memory_usage = None  # unknown usage: be conservative
            if memory_usage is None or memory_usage > 100:  # More than 100MB is significant
                non_mining_processes.append(process)

        is_idle = not non_mining_processes

        if is_idle:
            if any(self.is_compute_process(p['name']) for p in processes):
                logging.debug("GPU is running idle NvGPU task (acceptable idle state)")
            else:
                logging.debug("GPU is truly idle (no significant processes)")
        else:
            logging.debug(f"GPU is active with non-mining processes: {[p['name'] for p in non_mining_processes]}")

        return is_idle

    def is_Idle_NvGPU_process_active(self):
        """Return True if an idle-workload process is currently on the GPU."""
        return any(self.is_compute_process(p['name']) for p in self.get_gpu_processes())

    def is_ollama_still_active(self):
        """Return True while the last Ollama request is within the activity timeout."""
        if not self.ollama_active:
            return False
        time_since_last_activity = time.time() - self.last_ollama_activity
        return time_since_last_activity < self.ollama_activity_timeout

    @staticmethod
    def _to_time(value):
        """Convert a ``datetime.time``, an hour, or an ``(hour, minute)`` pair to ``datetime.time``."""
        if isinstance(value, dt_time):
            return value
        if isinstance(value, int):
            return dt_time(value)
        return dt_time(*value)

    def should_stop_for_schedule(self, now=None, start=None, end=None):
        """Return True while the blackout window is active (bounds inclusive).

        Args:
            now: ``datetime.time`` to test; defaults to the current local time.
            start: Window start; defaults to ``Blackout_schedule_Start``.
            end: Window end; defaults to ``Blackout_schedule_End``.

        *start* and *end* accept a ``datetime.time`` or an ``(hour, minute)``
        tuple. A window whose end is before its start wraps over midnight.
        """
        if now is None:
            now = datetime.now().time()
        # The config values are (hour, minute) tuples and must be unpacked.
        stop_start = self._to_time(Blackout_schedule_Start if start is None else start)
        stop_end = self._to_time(Blackout_schedule_End if end is None else end)

        if stop_start <= stop_end:
            in_window = stop_start <= now <= stop_end
        else:
            in_window = now >= stop_start or now <= stop_end

        if in_window:
            logging.debug(f"Within scheduled maintenance window ({stop_start:%H:%M}-{stop_end:%H:%M})")
        return in_window

    def manage_idle_NvGPU_process(self):
        """Run one monitoring pass: start or stop the idle container as needed.

        Order of checks: blackout window, recent Ollama activity, then actual
        GPU usage. A pass is skipped while another one is still in progress.
        """
        with self.lock:
            if self.operation_in_progress:
                return
            self.operation_in_progress = True

        try:
            if not str(IDLE_CONTAINER_ID).strip():
                logging.debug("IDLE_CONTAINER_ID is not configured, nothing to manage")
                return

            container_running = self.is_container_running(IDLE_CONTAINER_ID)
            mining_active_on_gpu = self.is_Idle_NvGPU_process_active()
            self.idle_compute_running = container_running

            if self.should_stop_for_schedule():
                if container_running:
                    logging.info("Stopping idle NvGPU container due to scheduled maintenance window")
                    self.stop_container(IDLE_CONTAINER_ID)
                    self.idle_compute_running = False
                return

            if self.is_ollama_still_active():
                if container_running or mining_active_on_gpu:
                    logging.info("Ollama still active, keeping idle NvGPU container stopped")
                    if container_running:
                        self.stop_container(IDLE_CONTAINER_ID)
                    self.idle_compute_running = False
                return

            if self.ollama_active:
                self.ollama_active = False
                logging.info("Ollama activity timeout reached")

            if self.is_gpu_idle():
                if not container_running and not mining_active_on_gpu:
                    # A request may have arrived while this pass was running.
                    if self.is_ollama_still_active():
                        return
                    logging.info("GPU idle, starting idle NvGPU container")
                    if self.start_container(IDLE_CONTAINER_ID):
                        self.idle_compute_running = True
                elif mining_active_on_gpu and not container_running:
                    self.idle_compute_running = True
                    logging.debug("Mining active on GPU, updating state")
            elif container_running:
                logging.info("GPU in use by other process, stopping idle NvGPU container")
                self.stop_container(IDLE_CONTAINER_ID)
                self.idle_compute_running = False
        finally:
            self.operation_in_progress = False

    def force_stop_idle_NvGPU_process_for_ollama(self):
        """Mark Ollama as active and free the GPU from the idle workload.

        Returns:
            True if the idle container or idle processes were found and a
            stop was attempted (the caller should let the GPU settle), False
            if the GPU was already free.
        """
        with self.lock:
            self.ollama_active = True
            self.last_ollama_activity = time.time()
            container_running = self.is_container_running(IDLE_CONTAINER_ID)
            mining_active_on_gpu = self.is_Idle_NvGPU_process_active()
            stop_needed = container_running or mining_active_on_gpu

            if stop_needed:
                logging.info("Force stopping idle NvGPU container for Ollama GPU operation")
                if container_running:
                    self.stop_container(IDLE_CONTAINER_ID)
                if mining_active_on_gpu:
                    # Wait for the idle processes to actually leave the GPU.
                    max_wait = 10
                    waited = 0
                    while self.is_Idle_NvGPU_process_active() and waited < max_wait:
                        time.sleep(1)
                        waited += 1
                    if self.is_Idle_NvGPU_process_active():
                        logging.warning("Mining processes still active after container stop")
            else:
                logging.debug("idle NvGPU container already stopped, no action needed")

            self.idle_compute_running = False
            return stop_needed

    def start_monitoring(self):
        """Start the daemon thread that runs a monitoring pass every ``GPU_CHECK_INTERVAL`` seconds."""
        def monitor_loop():
            while True:
                try:
                    self.manage_idle_NvGPU_process()
                except Exception as e:
                    # Keep the monitor alive whatever a single pass raises.
                    logging.error(f"Error in monitor loop: {e}")
                time.sleep(GPU_CHECK_INTERVAL)

        monitor_thread = threading.Thread(target=monitor_loop, daemon=True)
        monitor_thread.start()
        logging.info("GPU monitoring thread started")


class OllamaProxyHandler(http.server.BaseHTTPRequestHandler):
    """HTTP handler that frees the GPU, then relays the request to Ollama.

    Built on ``BaseHTTPRequestHandler`` so that no request is ever answered
    from the local filesystem.

    NOTE: upstream responses are buffered completely before being relayed,
    so streamed responses reach the client in one piece. Request bodies sent
    with chunked transfer encoding (no Content-Length) are not read.
    """

    def __init__(self, *args, **kwargs):
        self.gpu_manager = kwargs.pop('gpu_manager')
        super().__init__(*args, **kwargs)

    def do_GET(self):
        self._proxy('GET', settle_seconds=2, read_body=False)

    def do_HEAD(self):
        self._proxy('HEAD', settle_seconds=2, read_body=False)

    def do_POST(self):
        self._proxy('POST', settle_seconds=3.5)

    def do_PUT(self):
        self._proxy('PUT', settle_seconds=3.5)

    def do_DELETE(self):
        self._proxy('DELETE', settle_seconds=3.5)

    def _read_body(self):
        """Read the request body announced by Content-Length (empty if absent or invalid)."""
        try:
            content_length = int(self.headers.get('Content-Length', 0))
        except (TypeError, ValueError):
            content_length = 0
        return self.rfile.read(content_length) if content_length > 0 else b''

    @staticmethod
    def _parse_json_object(raw_body):
        """Return the body parsed as a JSON object, or ``{}`` if it is not one.

        Non-JSON bodies are legitimate and must still be forwarded, so parse
        failures are not errors.
        """
        if not raw_body:
            return {}
        try:
            parsed = json.loads(raw_body.decode('utf-8'))
        except ValueError:  # covers JSONDecodeError and UnicodeDecodeError
            return {}
        return parsed if isinstance(parsed, dict) else {}

    def _proxy(self, method, settle_seconds, read_body=True):
        """Handle one request: free the GPU if needed, forward, record activity.

        Args:
            method: HTTP method to use upstream.
            settle_seconds: Pause after the idle workload was actually
                stopped, giving the GPU time to release its memory.
            read_body: Whether to read and forward a request body.
        """
        try:
            post_data = self._read_body() if read_body else b''
            request_data = self._parse_json_object(post_data)
            is_gpu_intensive = self._is_gpu_intensive_operation(self.path, request_data)

            if is_gpu_intensive:
                logging.info(f"GPU-intensive operation detected: {self.path}")
                # Only wait when something was really stopped; otherwise every
                # request would pay the settle delay for nothing.
                if self.gpu_manager.force_stop_idle_NvGPU_process_for_ollama():
                    time.sleep(settle_seconds)

            self._forward_request(method, post_data if read_body else None)

            if is_gpu_intensive:
                with self.gpu_manager.lock:
                    self.gpu_manager.last_ollama_activity = time.time()
                logging.info("Ollama request completed, activity timestamp updated")

        except (BrokenPipeError, ConnectionResetError) as e:
            logging.warning(f"Client disconnected before the response was sent: {e}")
        except Exception as e:
            logging.error(f"Error processing request: {e}")
            try:
                self.send_error(500, "Internal server error", str(e))
            except OSError as send_exc:
                logging.debug(f"Could not send error response: {send_exc}")

    def _is_gpu_intensive_operation(self, path, request_data):
        """Return True if the request will make Ollama use the GPU.

        Args:
            path: Request path; a query string, if any, is ignored.
            request_data: Parsed JSON body (anything but a dict counts as empty).
        """
        endpoint = path.split('?', 1)[0]
        if endpoint in _GPU_INFERENCE_PATHS:
            return True
        if endpoint == '/api/pull':
            if not isinstance(request_data, dict):
                return True
            return bool(request_data.get('stream', True))
        return False

    @staticmethod
    def _upstream_request_headers(header_items):
        """Return the client headers that may be forwarded to Ollama.

        Header names are compared case-insensitively.
        """
        return {
            key: value
            for key, value in header_items
            if key.lower() not in _REQUEST_HEADERS_NOT_FORWARDED
        }

    def _forward_request(self, method, data=None):
        """Send the request to Ollama and relay status, headers and body.

        Args:
            method: HTTP method to use upstream.
            data: Request body bytes, or ``None`` for no body.

        Answers 502 if Ollama cannot be reached.
        """
        url = f"{OLLAMA_BASE_URL}{self.path}"
        headers = self._upstream_request_headers(self.headers.items())

        try:
            response = requests.request(method, url, data=data or None, headers=headers, timeout=300)
            body = b'' if method == 'HEAD' else response.content
        except requests.exceptions.RequestException as e:
            logging.error(f"Error forwarding to Ollama: {e}")
            self.send_error(502, "Bad gateway", str(e))
            return

        self.send_response(response.status_code)
        for key, value in response.headers.items():
            lowered = key.lower()
            if lowered in _RESPONSE_HEADERS_NOT_RELAYED:
                continue
            if lowered == 'content-length' and method != 'HEAD':
                continue  # recomputed below from the decoded body
            self.send_header(key, value)
        if method != 'HEAD':
            self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        if body:
            self.wfile.write(body)

    def log_message(self, format, *args):
        logging.info(f"{self.address_string()} - {format % args}")


def main():
    """Check the environment, start GPU monitoring and serve the proxy until interrupted."""
    logging.info("Starting GPU Resource Manager Proxy on Proxmox Host")

    gpu_manager = GPUResourceManager()
    if gpu_manager.run_command(["pct", "list"]) is None:
        logging.error("pct command not available! Running on wrong system?")
        return
    logging.info("pct command available")

    if not str(IDLE_CONTAINER_ID).strip():
        logging.warning("IDLE_CONTAINER_ID is empty: no idle container will be started or stopped")

    gpu_processes = gpu_manager.get_gpu_processes()
    logging.info(f"Current GPU processes: {gpu_processes}")

    idle_NvGPU_process_status = gpu_manager.is_container_running(IDLE_CONTAINER_ID)
    logging.info(f"idle NvGPU {IDLE_CONTAINER_ID} running: {idle_NvGPU_process_status}")

    gpu_manager.start_monitoring()

    def handler(*args, **kwargs):
        return OllamaProxyHandler(*args, gpu_manager=gpu_manager, **kwargs)

    # Threaded server: one long generation must not block other clients.
    # It also sets allow_reuse_address, so a quick restart can re-bind.
    with http.server.ThreadingHTTPServer(("", PROXY_PORT), handler) as httpd:
        logging.info(f"Proxy server running on port {PROXY_PORT}")
        logging.info(f"Forwarding to Ollama at {OLLAMA_HOST}:{OLLAMA_PORT}")
        logging.info(f"Managing idle NvGPU process: {IDLE_CONTAINER_ID}")
        logging.info("Monitoring GPU usage and scheduled maintenance windows")
        logging.info(f"Mining process patterns: {IDLE_NvGPU_PROCESSES}")

        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            logging.info("Shutting down proxy server")
        except Exception as e:
            logging.error(f"Server error: {e}")


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# Remainder of this source line in the original patch dump (not Python).
# It is the start of the README.md diff, which continues on the lines below.
#
# \ No newline at end of file
# diff --git a/README.md b/README.md
# new file mode 100644
# index 0000000..4f4adfc
# --- /dev/null
# +++ b/README.md
# @@ -0,0 +1,268 @@
# +# GPU Resource Manager & Proxy for Ollama
# +*A smart GPU-aware proxy for Proxmox VE that dynamically manages GPU resources between Ollama and GPU-intensive background processes.*
# +
# +---
# +
# +## ✅ Overview
# +
# +The **GPU Resource Manager & Proxy for Ollama** is a lightweight Python service that sits between your applications and the Ollama server. It intelligently supervises NVIDIA GPU usage on **Proxmox VE** hosts and ensures that GPU-intensive background tasks (e.g. mining) are temporarily suspended whenever Ollama requires GPU power.
# +
# +This enables smooth coexistence of AI workloads and other GPU idle tasks on the same host.
# +
# +---
# +
# +## ✨ Features
# +
# +### 🔍 Intelligent GPU Monitoring
# +- Detects GPU usage patterns and active processes via `nvidia-smi`.
# +- Differentiates essential system processes from idle GPU workloads.
# +
# +### ⚙️ Dynamic Resource Allocation
# +- Automatically pauses idle/non-critical GPU processes when Ollama becomes active.
# +- Automatically resumes them after configurable inactivity timeouts.
# +
# +### 🗓️ Scheduled Blackout Window
# +- By default, automatically stops idle GPU processes between **2:15 AM – 3:30 AM** for maintenance.
# +
# +### 🖥️ Proxmox LXC Integration
# +- Direct container control using Proxmox's `pct` command.
# +- Ideal for GPU-passthrough LXC containers (miners, renderers, etc.).
# +
# +### ⚡ Real-Time Process Detection
# +- Inspects NVIDIA GPU processes continuously.
# +- Supports customizable allow-lists and idle-process lists.
+ +--- + +## 📦 Requirements + +- NVIDIA GPU + drivers +- `nvidia-smi` +- Python **3.7+** +- `python3-requests` +- Proxmox VE host +- At least one GPU-passthrough LXC container +- Optional: Ollama server running inside LXC + +Install required packages: + +```bash +sudo apt update +sudo apt install python3 python3-requests +``` + +--- + +## ⚙️ Configuration + +These are the primary configuration variables inside the script: + +```python +# Basic Configuration +OLLAMA_HOST = "localhost" # Ollama container IP +OLLAMA_PORT = 11434 # Ollama API port +PROXY_PORT = 11435 # Proxy server port +GPU_CHECK_INTERVAL = 10 # Seconds between GPU checks + +# GPU Process Management +IDLE_NvGPU_PROCESSES = ['t-rex', 'trex', 'miner', 'xmrig', 'lolminer', 'nbminer'] +KNOWN_NvidiaGPU_PROCESSES = ['Xorg'] +IDLE_CONTAINER_ID = "120" # LXC container ID of idle GPU workload +Blackout_schedule_Start = 2, 15 # When to start stopping the idle NvGPU container. Hour, Minute. +Blackout_schedule_End = 3, 30 # When to allow starting the idle NvGPU container again. Hour, Minute. + +``` + +--- + +## 🛠️ Installation (systemd service) + +Create the service file: + +`/etc/systemd/system/gpu-proxy.service` + +```ini +[Unit] +Description=GPU Resource Manager and Proxy for Ollama +After=network.target + +[Service] +Type=simple +User=root +ExecStart=/usr/bin/python3 /usr/local/bin/gpu-proxy.py +Restart=always +RestartSec=5 + +[Install] +WantedBy=multi-user.target +``` + +Enable and start the service: + +```bash +sudo systemctl daemon-reload +sudo systemctl enable gpu-proxy +sudo systemctl start gpu-proxy +``` + +--- + +## 🚀 Usage + +Forward requests to the **proxy port** instead of directly to Ollama. 
+ +Example: + +```bash +curl http://proxmox-host:11435/api/generate -d '{ + "model": "llama2", + "prompt": "Why is the sky blue?", + "stream": false +}' +``` + +### API Endpoints Automatically Managed +- `/api/generate` +- `/api/chat` +- `/api/embeddings` +- `/api/load` +- `/api/pull` + +Anything GPU-intensive triggers resource management logic. + +--- + +## 🔧 How It Works + +### 🔄 Resource Management Flow + +1. **Request received** by proxy +2. Proxy detects GPU-intensive Ollama endpoint +3. Proxy checks for non-idle GPU processes / idle container +4. If necessary → **stops idle GPU container** +5. Forwards request to Ollama +6. Waits until Ollama has been inactive for the timeout (default 120 s) +7. Watches GPU activity for non-idle processes +8. Once GPU is idle → **starts idle container** + +--- + +## 📜 Logging + +Logs are stored in: + +``` +/var/log/gpu_proxy.log +``` + +Examples: + +``` +2025-11-10 15:36:41,882 - INFO - Starting GPU Resource Manager And Proxy for ollama. +2025-11-10 15:36:42,695 - INFO - pct command available (this is the host) +2025-11-10 15:36:42,717 - INFO - Current GPU processes: [{'pid': '2381690', 'name': '/var/lib/cudo-miner/registry/aaf375fd4c7b39548121985bce1e7b64/t-rex', 'memory': '5478 MiB'}] +2025-11-10 15:36:43,524 - INFO - Idle NvGPU container 120 running: True +2025-11-10 15:36:43,525 - INFO - GPU monitoring thread started +2025-11-10 15:36:43,525 - INFO - Proxy server running on port 11435 +2025-11-10 15:36:43,525 - INFO - Forwarding to Ollama at localhost:11434 +2025-11-10 15:36:43,525 - INFO - Managing idle NvGPU container: 120 +2025-11-10 15:36:43,525 - INFO - Monitoring GPU usage and scheduled maintenance windows +2025-11-10 15:36:43,525 - INFO - Idle NvGPU process patterns: ['t-rex', 'trex', 'miner', 'xmrig', 'lolminer', 'nbminer'] +2025-11-10 15:36:55,310 - INFO - Localhost - "GET /api/tags HTTP/1.1" 200 - +2025-11-10 15:36:55,332 - INFO - Localhost - "GET /api/ps HTTP/1.1" 200 - +2025-11-10 15:37:01,223 - INFO - GPU-intensive 
+ operation detected: /api/chat +2025-11-10 15:37:02,040 - INFO - Force stopping idle NvGPU container for Ollama GPU operation +2025-11-10 15:37:02,859 - INFO - Stopping container 120 +2025-11-10 15:37:06,391 - INFO - Container 120 stopped successfully +2025-11-10 15:37:29,546 - INFO - Localhost - "POST /api/chat HTTP/1.1" 200 - +2025-11-10 15:37:29,551 - INFO - Ollama request completed, activity timestamp updated +2025-11-10 15:37:29,664 - INFO - GPU-intensive operation detected: /api/chat +2025-11-10 15:37:39,896 - INFO - Localhost - "POST /api/chat HTTP/1.1" 200 - +2025-11-10 15:37:39,896 - INFO - Ollama request completed, activity timestamp updated +2025-11-10 15:37:39,903 - INFO - GPU-intensive operation detected: /api/chat +2025-11-10 15:38:17,616 - INFO - Localhost - "POST /api/chat HTTP/1.1" 200 - +2025-11-10 15:38:17,616 - INFO - Ollama request completed, activity timestamp updated +2025-11-10 15:38:17,631 - INFO - GPU-intensive operation detected: /api/chat +2025-11-10 15:38:53,068 - INFO - Localhost - "POST /api/chat HTTP/1.1" 200 - +2025-11-10 15:38:53,069 - INFO - Ollama request completed, activity timestamp updated +2025-11-10 15:39:59,855 - INFO - Ollama activity timeout reached +2025-11-10 15:43:58,186 - INFO - GPU idle, starting idle NvGPU container +2025-11-10 15:43:59,001 - INFO - Starting container 120 +2025-11-10 15:44:02,739 - INFO - Container 120 started successfully +``` + +Enable debug mode: + +```python +logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler('/var/log/gpu_proxy.log'), + logging.StreamHandler() + ] +) +``` + +--- + +## 🧪 Monitoring & Testing + +Check service: + +```bash +systemctl status gpu-proxy +``` + +Tail logs: + +```bash +tail -f /var/log/gpu_proxy.log +``` + +Test GPU processes: + +```bash +nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits +``` + +--- + +## ❗ Troubleshooting + +### `pct` command 
+ not found +→ Script must run **on the Proxmox host**, not inside an LXC. + +### GPU processes not detected +- Verify NVIDIA drivers +- Run `nvidia-smi` manually +- Ensure GPU passthrough is configured + +### Idle container not managed +- Check the LXC exists +- Run `pct list` +- Ensure root permissions + +### Proxy connection refused +- Ensure Ollama is running +- Check firewall rules +- Check via curl inside Proxmox: + +```bash +curl http://<ollama-host>:11434/api/version +``` + +--- + +## 🤝 Contributing + +Pull requests and issues are welcome! +If you’d like to contribute, please open an issue first to discuss your idea. + +--- + +## 📄 License + +This project is licensed under the **MIT License**. See the `LICENSE` file for details. + +---