first commit

2025-11-10 16:02:57 +01:00
commit 1e2567fd59
2 changed files with 645 additions and 0 deletions
@@ -0,0 +1,377 @@
+#!/usr/bin/env python3
+import http.server
+import socketserver
+import requests
+import json
+import time
+import threading
+import subprocess
+from datetime import datetime, time as dt_time
+from urllib.parse import urlparse, parse_qs
+import logging
+
+# Configuration
+OLLAMA_HOST = "localhost"  # Host name or IP of your Ollama LXC
+OLLAMA_PORT = 11434  # Port of your Ollama LXC
+PROXY_PORT = 11435 # Port this proxy listens on
+OLLAMA_BASE_URL = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}" # Don't touch unless you know what you are doing.
+
+# GPU monitoring
+GPU_CHECK_INTERVAL = 10  # Seconds the monitor thread sleeps between GPU/container checks
+
+# Idle-workload process name patterns (as shown in your nvidia-smi output).
+# Matched as lowercase substrings of the reported process name.
+IDLE_NvGPU_PROCESSES = ['t-rex', 'trex', 'miner', 'xmrig', 'lolminer', 'nbminer']
+KNOWN_NvidiaGPU_PROCESSES = ['Xorg']  # Processes ignored when deciding whether the GPU is idle (case-sensitive substring match)
+IDLE_CONTAINER_ID = "" # ID of the LXC container running the idle GPU workload, example: IDLE_CONTAINER_ID = "120"
+Blackout_schedule_Start = 2, 15 # (hour, minute) at which the idle NvGPU container starts being kept stopped.
+Blackout_schedule_End = 3, 30 # (hour, minute) after which the idle NvGPU container may be started again.
+#----------------------------------------------------------active-code--------------------------------------------------------------#
+# Log to both a file and the console (stderr).
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('/var/log/gpu_proxy.log'),
+        logging.StreamHandler()
+    ]
+)
+
+class GPUResourceManager:
+    def __init__(self):
+        self.idle_compute_running = False
+        self.ollama_active = False
+        self.last_ollama_activity = 0
+        self.ollama_activity_timeout = 120  # seconds of no activity before considering Ollama done
+        self.last_gpu_check = 0
+        self.gpu_processes = []
+        self.lock = threading.Lock()
+        self.operation_in_progress = False
+        
+    def run_command(self, cmd):
+        try:
+            result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True)
+            return result.stdout.strip()
+        except subprocess.CalledProcessError as e:
+            logging.error(f"Command failed: {cmd}, error: {e.stderr}")
+            return None
+
+    def is_container_running(self, container_id):
+        output = self.run_command(f"pct list | grep \"^{container_id}\"")
+        if output and "running" in output:
+            return True
+        return False
+
+    def stop_container(self, container_id):
+        if self.is_container_running(container_id):
+            logging.info(f"Stopping container {container_id}")
+            result = self.run_command(f"pct stop {container_id}")
+            if result is not None:
+                max_wait = 15
+                waited = 0
+                while self.is_container_running(container_id) and waited < max_wait:
+                    time.sleep(1)
+                    waited += 1
+                
+                if not self.is_container_running(container_id):
+                    logging.info(f"Container {container_id} stopped successfully")
+                    return True
+                else:
+                    logging.warning(f"Container {container_id} still running after {max_wait} seconds")
+                    return False
+            else:
+                logging.error(f"Failed to stop container {container_id}")
+        else:
+            logging.debug(f"Container {container_id} already stopped, no action needed")
+            return True
+        return False
+
+    def start_container(self, container_id):
+        if not self.is_container_running(container_id):
+            logging.info(f"Starting container {container_id}")
+            result = self.run_command(f"pct start {container_id}")
+            if result is not None:
+                max_wait = 15
+                waited = 0
+                while not self.is_container_running(container_id) and waited < max_wait:
+                    time.sleep(1)
+                    waited += 1
+                
+                if self.is_container_running(container_id):
+                    logging.info(f"Container {container_id} started successfully")
+                    return True
+                else:
+                    logging.error(f"Container {container_id} failed to start within {max_wait} seconds")
+            else:
+                logging.error(f"Failed to start container {container_id}")
+        else:
+            logging.debug(f"Container {container_id} already running, no action needed")
+            return True
+        return False
+
+    def get_gpu_processes(self):
+        try:
+            output = self.run_command("nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits")
+            processes = []
+            if output:
+                for line in output.split('\n'):
+                    if line.strip():
+                        parts = line.split(', ')
+                        if len(parts) >= 3:
+                            processes.append({
+                                'pid': parts[0],
+                                'name': parts[1].strip(),
+                                'memory': parts[2] + ' MiB'
+                            })
+            return processes
+        except Exception as e:
+            logging.error(f"Error getting GPU processes: {e}")
+            return []
+
+    def is_compute_process(self, process_name):
+        process_lower = process_name.lower()
+        for pattern in IDLE_NvGPU_PROCESSES:
+            if pattern in process_lower:
+                return True
+        return False
+
+    def is_known_system_process(self, process_name):
+        return any(sys_proc in process_name for sys_proc in KNOWN_NvidiaGPU_PROCESSES)
+
+    def is_gpu_idle(self):
+        processes = self.get_gpu_processes()
+
+        non_mining_processes = []
+        for process in processes:
+            if not self.is_compute_process(process['name']) and not self.is_known_system_process(process['name']):
+                memory_usage = int(process['memory'].split()[0])
+                if memory_usage > 100:  # More than 100MB is significant
+                    non_mining_processes.append(process)
+        
+        is_idle = len(non_mining_processes) == 0
+        
+        if is_idle:
+            mining_count = len([p for p in processes if self.is_compute_process(p['name'])])
+            if mining_count > 0:
+                logging.debug("GPU is running idle NvGPU task (acceptable idle state)")
+            else:
+                logging.debug("GPU is truly idle (no significant processes)")
+        else:
+            logging.debug(f"GPU is active with non-mining processes: {[p['name'] for p in non_mining_processes]}")
+            
+        return is_idle
+
+    def is_Idle_NvGPU_process_active(self):
+        processes = self.get_gpu_processes()
+        mining_processes = [p for p in processes if self.is_compute_process(p['name'])]
+        return len(mining_processes) > 0
+
+    def is_ollama_still_active(self):
+        if not self.ollama_active:
+            return False
+        time_since_last_activity = time.time() - self.last_ollama_activity
+        return time_since_last_activity < self.ollama_activity_timeout
+
+    def should_stop_for_schedule(self):
+        now = datetime.now().time()
+        stop_start = dt_time(Blackout_schedule_Start)  # 2:15 AM
+        stop_end = dt_time(Blackout_schedule_End)    # 3:30 AM
+        in_window = stop_start <= now <= stop_end
+        if in_window:
+            logging.debug("Within scheduled maintenance window (2:15am-3:30am)")
+        return in_window
+
+    def manage_idle_NvGPU_process(self):
+        with self.lock:
+            if self.operation_in_progress:
+                return  
+                
+            self.operation_in_progress = True
+            try:
+                current_idle_NvGPU_process_state = self.is_container_running(IDLE_CONTAINER_ID)
+                mining_active_on_gpu = self.is_Idle_NvGPU_process_active()
+                self.idle_compute_running = current_idle_NvGPU_process_state
+                if self.should_stop_for_schedule():
+                    if current_idle_NvGPU_process_state:
+                        logging.info("Stopping idle NvGPU container due to scheduled maintenance window")
+                        self.stop_container(IDLE_CONTAINER_ID)
+                        self.idle_compute_running = False
+                    return
+                
+                
+                ollama_still_active = self.is_ollama_still_active()
+                if ollama_still_active:
+                    if current_idle_NvGPU_process_state or mining_active_on_gpu:
+                        logging.info("Ollama still active, keeping idle NvGPU container stopped")
+                        if current_idle_NvGPU_process_state:
+                            self.stop_container(IDLE_CONTAINER_ID)
+                        self.idle_compute_running = False
+                    return
+                else:
+                    
+                    if self.ollama_active:
+                        self.ollama_active = False
+                        logging.info("Ollama activity timeout reached")
+                
+                if self.is_gpu_idle():
+                    if not current_idle_NvGPU_process_state and not mining_active_on_gpu:
+                        logging.info("GPU idle, starting idle NvGPU container")
+                        if self.start_container(IDLE_CONTAINER_ID):
+                            self.idle_compute_running = True
+                    elif mining_active_on_gpu and not current_idle_NvGPU_process_state:
+                        self.idle_compute_running = True
+                        logging.debug("Mining active on GPU, updating state")
+                else:
+                    if current_idle_NvGPU_process_state:
+                        logging.info("GPU in use by other process, stopping idle NvGPU container")
+                        self.stop_container(IDLE_CONTAINER_ID)
+                        self.idle_compute_running = False
+            finally:
+                self.operation_in_progress = False
+
+    def force_stop_idle_NvGPU_process_for_ollama(self):
+        with self.lock:
+            self.ollama_active = True
+            self.last_ollama_activity = time.time()
+            current_idle_NvGPU_process_state = self.is_container_running(IDLE_CONTAINER_ID)
+            mining_active_on_gpu = self.is_Idle_NvGPU_process_active()
+            if current_idle_NvGPU_process_state or mining_active_on_gpu:
+                logging.info("Force stopping idle NvGPU container for Ollama GPU operation")
+                if current_idle_NvGPU_process_state:
+                    self.stop_container(IDLE_CONTAINER_ID)
+                if mining_active_on_gpu:
+                    max_wait = 10
+                    waited = 0
+                    while self.is_Idle_NvGPU_process_active() and waited < max_wait:
+                        time.sleep(1)
+                        waited += 1
+                    if self.is_Idle_NvGPU_process_active():
+                        logging.warning("Mining processes still active after container stop")
+            else:
+                logging.debug("idle NvGPU container already stopped, no action needed")
+                
+            self.idle_compute_running = False
+
+    def start_monitoring(self):
+        def monitor_loop():
+            while True:
+                try:
+                    self.manage_idle_NvGPU_process()
+                except Exception as e:
+                    logging.error(f"Error in monitor loop: {e}")
+                time.sleep(GPU_CHECK_INTERVAL)
+        
+        monitor_thread = threading.Thread(target=monitor_loop, daemon=True)
+        monitor_thread.start()
+        logging.info("GPU monitoring thread started")
+
+class OllamaProxyHandler(http.server.SimpleHTTPRequestHandler):
+    def __init__(self, *args, **kwargs):
+        self.gpu_manager = kwargs.pop('gpu_manager')
+        super().__init__(*args, **kwargs)
+    
+    def do_GET(self):
+        if self._is_gpu_intensive_operation(self.path, {}):
+            self.gpu_manager.force_stop_idle_NvGPU_process_for_ollama()
+            time.sleep(2) 
+        
+        self._forward_request('GET')
+    
+    def do_POST(self):
+        content_length = int(self.headers.get('Content-Length', 0))
+        post_data = self.rfile.read(content_length) if content_length > 0 else b''
+        
+        try:
+            request_data = json.loads(post_data.decode('utf-8')) if post_data else {}
+            is_gpu_intensive = self._is_gpu_intensive_operation(self.path, request_data)
+            
+            if is_gpu_intensive:
+                logging.info(f"GPU-intensive operation detected: {self.path}")
+                self.gpu_manager.force_stop_idle_NvGPU_process_for_ollama()
+                time.sleep(3.5)
+
+            self._forward_request('POST', post_data)
+            
+            if is_gpu_intensive:
+                with self.gpu_manager.lock:
+                    self.gpu_manager.last_ollama_activity = time.time()
+                logging.info("Ollama request completed, activity timestamp updated")
+                
+        except Exception as e:
+            logging.error(f"Error processing request: {e}")
+            self.send_error(500, f"Internal server error: {e}")
+    
+    def _is_gpu_intensive_operation(self, path, request_data):
+        if path in ['/api/generate', '/api/chat', '/api/embeddings']:
+            return True
+        if path == '/api/load':
+            return True
+        if path == '/api/pull':
+            return request_data.get('stream', True)
+        return False
+    
+    def _forward_request(self, method, data=None):
+        url = f"{OLLAMA_BASE_URL}{self.path}"
+        headers = {key: value for key, value in self.headers.items()}
+        hop_headers = ['connection', 'keep-alive', 'proxy-authenticate', 
+                      'proxy-authorization', 'te', 'trailers', 'upgrade']
+        for header in hop_headers:
+            headers.pop(header, None)
+
+        try:
+            if method == 'GET':
+                response = requests.get(url, headers=headers, timeout=300)
+            else:
+                response = requests.post(url, data=data, headers=headers, timeout=300)
+
+            self.send_response(response.status_code)
+            for key, value in response.headers.items():
+                if key.lower() not in ['content-encoding', 'transfer-encoding', 'connection']:
+                    self.send_header(key, value)
+            self.end_headers()
+            self.wfile.write(response.content)
+            
+        except requests.exceptions.RequestException as e:
+            logging.error(f"Error forwarding to Ollama: {e}")
+            self.send_error(502, f"Bad gateway: {e}")
+    
+    def log_message(self, format, *args):
+        logging.info(f"{self.address_string()} - {format % args}")
+
+def main():
+    logging.info("Starting GPU Resource Manager Proxy on Proxmox Host")
+
+    manager = GPUResourceManager()
+    test_output = manager.run_command("pct list > /dev/null && echo 'pct available'")
+    if test_output:
+        logging.info("pct command available")
+    else:
+        logging.error("pct command not available! Running on wrong system?")
+        return
+
+    gpu_processes = manager.get_gpu_processes()
+    logging.info(f"Current GPU processes: {gpu_processes}")
+
+    idle_NvGPU_process_status = manager.is_container_running(IDLE_CONTAINER_ID)
+    logging.info(f"idle NvGPU {IDLE_CONTAINER_ID} running: {idle_NvGPU_process_status}")
+
+    gpu_manager = GPUResourceManager()
+
+    gpu_manager.start_monitoring()
+
+    handler = lambda *args, **kwargs: OllamaProxyHandler(*args, gpu_manager=gpu_manager, **kwargs)
+    
+    with socketserver.TCPServer(("", PROXY_PORT), handler) as httpd:
+        logging.info(f"Proxy server running on port {PROXY_PORT}")
+        logging.info(f"Forwarding to Ollama at {OLLAMA_HOST}:{OLLAMA_PORT}")
+        logging.info(f"Managing idle NvGPU process: {IDLE_CONTAINER_ID}")
+        logging.info("Monitoring GPU usage and scheduled maintenance windows")
+        logging.info(f"Mining process patterns: {IDLE_NvGPU_PROCESSES}")
+        
+        try:
+            httpd.serve_forever()
+        except KeyboardInterrupt:
+            logging.info("Shutting down proxy server")
+        except Exception as e:
+            logging.error(f"Server error: {e}")
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,268 @@
+# GPU Resource Manager & Proxy for Ollama  
+*A smart GPU-aware proxy for Proxmox VE that dynamically manages GPU resources between Ollama and GPU-intensive background processes.*
+
+---
+
+## ✅ Overview
+
+The **GPU Resource Manager & Proxy for Ollama** is a lightweight Python service that sits between your applications and the Ollama server. It intelligently supervises NVIDIA GPU usage on **Proxmox VE** hosts and ensures that GPU-intensive background tasks (e.g. mining) are temporarily suspended whenever Ollama requires GPU power.
+
+This lets AI workloads and idle-time GPU tasks coexist smoothly on the same host.
+
+---
+
+## ✨ Features
+
+### 🔍 Intelligent GPU Monitoring  
+- Detects GPU usage patterns and active processes via `nvidia-smi`.
+- Differentiates essential system processes from idle GPU workloads.
+
+### ⚙️ Dynamic Resource Allocation  
+- Automatically pauses idle/non-critical GPU processes when Ollama becomes active.  
+- Automatically resumes them after configurable inactivity timeouts.
+
+### 🗓️ Scheduled Blackout Window  
+- By default, idle GPU processes are automatically stopped between **2:15 AM – 3:30 AM** for maintenance.
+
+### 🖥️ Proxmox LXC Integration  
+- Direct container control using Proxmox's `pct` command.  
+- Ideal for GPU-passthrough LXC containers (miners, renderers, etc.).
+
+### ⚡ Real-Time Process Detection  
+- Inspects NVIDIA GPU processes continuously.  
+- Supports customizable allow-lists and idle-process lists.
+
+---
+
+## 📦 Requirements
+
+- NVIDIA GPU + drivers  
+- `nvidia-smi`  
+- Python **3.x**  
+- `python3-requests`  
+- Proxmox VE host  
+- At least one GPU-passthrough LXC container  
+- Optional: Ollama server running inside LXC
+
+Install required packages:
+
+```bash
+sudo apt update
+sudo apt install python3 python3-requests
+```
+
+---
+
+## ⚙️ Configuration
+
+These are the primary configuration variables inside the script:
+
+```python
+# Basic Configuration
+OLLAMA_HOST = "localhost"      # Ollama container IP
+OLLAMA_PORT = 11434            # Ollama API port
+PROXY_PORT = 11435             # Proxy server port
+GPU_CHECK_INTERVAL = 10        # Seconds between GPU checks
+
+# GPU Process Management
+IDLE_NvGPU_PROCESSES = ['t-rex', 'trex', 'miner', 'xmrig', 'lolminer', 'nbminer']
+KNOWN_NvidiaGPU_PROCESSES = ['Xorg']  
+IDLE_CONTAINER_ID = "120"      # LXC container ID of idle GPU workload
+Blackout_schedule_Start = 2, 15 #when to start stopping the idle NvGPU container. Hour, Minute.
+Blackout_schedule_End = 3, 30 #when to allow starting the idle NvGPU container again. Hour, Minute.
+
+```
+
+---
+
+## 🛠️ Installation (systemd service)
+
+Create the service file:
+
+`/etc/systemd/system/gpu-proxy.service`
+
+```ini
+[Unit]
+Description=GPU Resource Manager and Proxy for Ollama
+After=network.target
+
+[Service]
+Type=simple
+User=root
+ExecStart=/usr/bin/python3 /usr/local/bin/gpu-proxy.py
+Restart=always
+RestartSec=5
+
+[Install]
+WantedBy=multi-user.target
+```
+
+Enable and start the service:
+
+```bash
+sudo systemctl daemon-reload
+sudo systemctl enable gpu-proxy
+sudo systemctl start gpu-proxy
+```
+
+---
+
+## 🚀 Usage
+
+Forward requests to the **proxy port** instead of directly to Ollama.
+
+Example:
+
+```bash
+curl http://proxmox-host:11435/api/generate -d '{
+  "model": "llama2",
+  "prompt": "Why is the sky blue?",
+  "stream": false
+}'
+```
+
+### API Endpoints Automatically Managed
+- `/api/generate`
+- `/api/chat`
+- `/api/embeddings`
+- `/api/load`
+- `/api/pull`
+
+Anything GPU-intensive triggers resource management logic.
+
+---
+
+## 🔧 How It Works
+
+### 🔄 Resource Management Flow
+
+1. **Request received** by proxy  
+2. Proxy detects GPU-intensive Ollama endpoint  
+3. Proxy checks whether the idle container or idle GPU processes are running  
+4. If necessary → **stops idle GPU container**  
+5. Forwards request to Ollama
+6. Waits for Ollama to finish (considered done after 120 s without activity, by default)
+7. Watches GPU activity for non-idle processes
+8. Once GPU is idle → **starts idle container**
+
+---
+
+## 📜 Logging
+
+Logs stored in:
+
+```
+/var/log/gpu_proxy.log
+```
+
+Examples:
+
+```
+2025-11-10 15:36:41,882 - INFO - Starting GPU Resource Manager And Proxy for ollama.
+2025-11-10 15:36:42,695 - INFO - pct command available (this is the host)
+2025-11-10 15:36:42,717 - INFO - Current GPU processes: [{'pid': '2381690', 'name': '/var/lib/cudo-miner/registry/aaf375fd4c7b39548121985bce1e7b64/t-rex', 'memory': '5478 MiB'}]
+2025-11-10 15:36:43,524 - INFO - Idle NvGPU container 120 running: True
+2025-11-10 15:36:43,525 - INFO - GPU monitoring thread started
+2025-11-10 15:36:43,525 - INFO - Proxy server running on port 11435
+2025-11-10 15:36:43,525 - INFO - Forwarding to Ollama at localhost:11434
+2025-11-10 15:36:43,525 - INFO - Managing idle NvGPU container: 120
+2025-11-10 15:36:43,525 - INFO - Monitoring GPU usage and scheduled maintenance windows
+2025-11-10 15:36:43,525 - INFO - Idle NvGPU process patterns: ['t-rex', 'trex', 'miner', 'xmrig', 'lolminer', 'nbminer']
+2025-11-10 15:36:55,310 - INFO - Localhost - "GET /api/tags HTTP/1.1" 200 -
+2025-11-10 15:36:55,332 - INFO - Localhost - "GET /api/ps HTTP/1.1" 200 -
+2025-11-10 15:37:01,223 - INFO - GPU-intensive operation detected: /api/chat
+2025-11-10 15:37:02,040 - INFO - Force stopping idle NvGPU container for Ollama GPU operation
+2025-11-10 15:37:02,859 - INFO - Stopping container 120
+2025-11-10 15:37:06,391 - INFO - Container 120 stopped successfully
+2025-11-10 15:37:29,546 - INFO - Localhost - "POST /api/chat HTTP/1.1" 200 -
+2025-11-10 15:37:29,551 - INFO - Ollama request completed, activity timestamp updated
+2025-11-10 15:37:29,664 - INFO - GPU-intensive operation detected: /api/chat
+2025-11-10 15:37:39,896 - INFO - Localhost - "POST /api/chat HTTP/1.1" 200 -
+2025-11-10 15:37:39,896 - INFO - Ollama request completed, activity timestamp updated
+2025-11-10 15:37:39,903 - INFO - GPU-intensive operation detected: /api/chat
+2025-11-10 15:38:17,616 - INFO - Localhost - "POST /api/chat HTTP/1.1" 200 -
+2025-11-10 15:38:17,616 - INFO - Ollama request completed, activity timestamp updated
+2025-11-10 15:38:17,631 - INFO - GPU-intensive operation detected: /api/chat
+2025-11-10 15:38:53,068 - INFO - Localhost - "POST /api/chat HTTP/1.1" 200 -
+2025-11-10 15:38:53,069 - INFO - Ollama request completed, activity timestamp updated
+2025-11-10 15:39:59,855 - INFO - Ollama activity timeout reached
+2025-11-10 15:43:58,186 - INFO - GPU idle, starting idle NvGPU container
+2025-11-10 15:43:59,001 - INFO - Starting container 120
+2025-11-10 15:44:02,739 - INFO - Container 120 started successfully
+```
+
+Enable debug mode:
+
+```python
+logging.basicConfig(
+    level=logging.DEBUG,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('/var/log/gpu_proxy.log'),
+        logging.StreamHandler()
+    ]
+)
+```
+
+---
+
+## 🧪 Monitoring & Testing
+
+Check service:
+
+```bash
+systemctl status gpu-proxy
+```
+
+Tail logs:
+
+```bash
+tail -f /var/log/gpu_proxy.log
+```
+
+Test GPU processes:
+
+```bash
+nvidia-smi --query-compute-apps=pid,process_name,used_memory   --format=csv,noheader,nounits
+```
+
+---
+
+## ❗ Troubleshooting
+
+### `pct` command not found  
+→ Script must run **on the Proxmox host**, not inside an LXC.
+
+### GPU processes not detected  
+- Verify NVIDIA drivers  
+- Run `nvidia-smi` manually  
+- Ensure GPU passthrough is configured
+
+### Idle container not managed  
+- Check the LXC exists  
+- Run `pct list`  
+- Ensure root permissions
+
+### Proxy connection refused  
+- Ensure Ollama is running  
+- Check firewall rules  
+- Check via curl inside Proxmox:
+
+```bash
+curl http://<OLLAMA_HOST>:11434/api/version
+```
+
+---
+
+## 🤝 Contributing
+
+Pull requests and issues are welcome!  
+If you’d like to contribute, please open an issue first to discuss your idea.
+
+---
+
+## 📄 License
+
+This project is licensed under the **MIT License**. See the `LICENSE` file for details.
+
+---