first commit

This commit is contained in:
mrmarcus007
2025-11-10 16:02:57 +01:00
commit 1e2567fd59
2 changed files with 645 additions and 0 deletions
@@ -0,0 +1,377 @@
#!/usr/bin/env python3
import http.server
import socketserver
import requests
import json
import time
import threading
import subprocess
from datetime import datetime, time as dt_time
from urllib.parse import urlparse, parse_qs
import logging
# Configuration
OLLAMA_HOST = "localhost" # Host name or IP address of the Ollama server (e.g. the Ollama LXC)
OLLAMA_PORT = 11434 # TCP port of the Ollama API
PROXY_PORT = 11435 # TCP port this proxy listens on
OLLAMA_BASE_URL = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}" # Derived from the two values above; normally left unchanged.
# GPU monitoring
GPU_CHECK_INTERVAL = 10 # Seconds the monitor loop sleeps between two GPU/container checks
# Substrings (matched against the lower-cased nvidia-smi process name) that identify the idle GPU workload
IDLE_NvGPU_PROCESSES = ['t-rex', 'trex', 'miner', 'xmrig', 'lolminer', 'nbminer']
KNOWN_NvidiaGPU_PROCESSES = ['Xorg'] # Process-name substrings that are ignored when deciding whether the GPU is idle
IDLE_CONTAINER_ID = "" # ID of the LXC container that runs the idle GPU workload, example: IDLE_CONTAINER_ID = "120"
Blackout_schedule_Start = 2, 15 # (hour, minute) at which the idle NvGPU container starts being kept stopped.
Blackout_schedule_End = 3, 30 # (hour, minute) from which the idle NvGPU container may be started again.
#----------------------------------------------------------active-code--------------------------------------------------------------#
# Log at INFO level to /var/log/gpu_proxy.log and to a stream handler; switch level to logging.DEBUG for verbose output.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler('/var/log/gpu_proxy.log'),
logging.StreamHandler()
]
)
class GPUResourceManager:
    """Arbitrate the GPU between Ollama and an idle-time LXC workload.

    The manager inspects GPU compute processes through ``nvidia-smi`` and
    controls the container named by ``IDLE_CONTAINER_ID`` through Proxmox's
    ``pct`` command.  The container is stopped while Ollama is active, while
    another significant GPU process is present, or during the configured
    blackout window, and started again once the GPU is idle.
    """

    def __init__(self):
        """Initialise the bookkeeping state; no external command is run."""
        self.idle_compute_running = False
        self.ollama_active = False
        self.last_ollama_activity = 0
        # Seconds without Ollama activity before Ollama counts as finished.
        self.ollama_activity_timeout = 120
        self.last_gpu_check = 0
        self.gpu_processes = []
        self.lock = threading.Lock()
        # True while manage_idle_NvGPU_process() runs, so passes never overlap.
        self.operation_in_progress = False

    def run_command(self, cmd):
        """Run *cmd* through the shell.

        Returns the stripped standard output, or ``None`` (after logging the
        error) when the command exits with a non-zero status.
        """
        try:
            result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True)
            return result.stdout.strip()
        except subprocess.CalledProcessError as e:
            logging.error(f"Command failed: {cmd}, error: {e.stderr}")
            return None

    @staticmethod
    def _is_running_in_listing(listing, container_id):
        """Return True if *listing* (``pct list`` output) shows the container as running.

        The first column must equal *container_id* exactly, so "12" never
        matches "120"; an empty ID or an empty listing yields False.
        """
        wanted = str(container_id).strip()
        if not listing or not wanted:
            return False
        for line in listing.splitlines():
            fields = line.split()
            if len(fields) >= 2 and fields[0] == wanted:
                return fields[1] == "running"
        return False

    def is_container_running(self, container_id):
        """Return True if ``pct list`` reports *container_id* with status ``running``."""
        return self._is_running_in_listing(self.run_command("pct list"), container_id)

    def _wait_for_container_state(self, container_id, running, max_wait=15):
        """Poll once per second (at most *max_wait* times) for the wanted state; return success."""
        for _ in range(max_wait):
            if self.is_container_running(container_id) == running:
                return True
            time.sleep(1)
        return self.is_container_running(container_id) == running

    def stop_container(self, container_id):
        """Stop the container and wait up to 15 seconds for it to go down.

        Returns True if the container is stopped afterwards (or already was),
        False if the stop command failed or the container is still running.
        """
        if not self.is_container_running(container_id):
            logging.debug(f"Container {container_id} already stopped, no action needed")
            return True
        logging.info(f"Stopping container {container_id}")
        if self.run_command(f"pct stop {container_id}") is None:
            logging.error(f"Failed to stop container {container_id}")
            return False
        max_wait = 15
        if self._wait_for_container_state(container_id, running=False, max_wait=max_wait):
            logging.info(f"Container {container_id} stopped successfully")
            return True
        logging.warning(f"Container {container_id} still running after {max_wait} seconds")
        return False

    def start_container(self, container_id):
        """Start the container and wait up to 15 seconds for it to come up.

        Returns True if the container is running afterwards (or already was),
        False if the start command failed or it did not come up in time.
        """
        if self.is_container_running(container_id):
            logging.debug(f"Container {container_id} already running, no action needed")
            return True
        logging.info(f"Starting container {container_id}")
        if self.run_command(f"pct start {container_id}") is None:
            logging.error(f"Failed to start container {container_id}")
            return False
        max_wait = 15
        if self._wait_for_container_state(container_id, running=True, max_wait=max_wait):
            logging.info(f"Container {container_id} started successfully")
            return True
        logging.error(f"Container {container_id} failed to start within {max_wait} seconds")
        return False

    @staticmethod
    def _parse_gpu_processes(output):
        """Turn ``pid, process_name, used_memory`` CSV lines into process dicts.

        The pid is taken from the first field and the memory from the last,
        so a process name that itself contains a comma is kept intact.
        Lines with fewer than three fields are skipped.
        """
        processes = []
        for line in (output or "").splitlines():
            if not line.strip():
                continue
            pid, sep, rest = line.partition(',')
            name, sep2, memory = rest.rpartition(',')
            if not sep or not sep2:
                continue
            processes.append({
                'pid': pid.strip(),
                'name': name.strip(),
                'memory': memory.strip() + ' MiB',
            })
        return processes

    def get_gpu_processes(self):
        """Return the current GPU compute processes as a list of dicts.

        Each dict has the keys ``pid``, ``name`` and ``memory``.  An empty
        list is returned when ``nvidia-smi`` fails or reports nothing.
        """
        try:
            output = self.run_command(
                "nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits"
            )
            return self._parse_gpu_processes(output)
        except Exception as e:
            logging.error(f"Error getting GPU processes: {e}")
            return []

    def is_compute_process(self, process_name):
        """Return True if the lower-cased name contains any ``IDLE_NvGPU_PROCESSES`` pattern."""
        lowered = process_name.lower()
        return any(pattern in lowered for pattern in IDLE_NvGPU_PROCESSES)

    def is_known_system_process(self, process_name):
        """Return True if the name contains any ``KNOWN_NvidiaGPU_PROCESSES`` entry (case-sensitive)."""
        return any(sys_proc in process_name for sys_proc in KNOWN_NvidiaGPU_PROCESSES)

    @staticmethod
    def _memory_mib(process):
        """Return the process's used memory in MiB, or None when it is not a number."""
        try:
            return int(str(process['memory']).split()[0])
        except (KeyError, IndexError, ValueError):
            return None

    def is_gpu_idle(self):
        """Return True when no foreign process holds a significant amount of GPU memory.

        Idle-workload processes and known system processes are ignored.  Any
        other process counts when it uses more than 100 MiB, or when its
        memory figure cannot be parsed (treated as in use, to stay on the
        safe side instead of raising).
        """
        processes = self.get_gpu_processes()
        non_mining_processes = []
        for process in processes:
            if self.is_compute_process(process['name']) or self.is_known_system_process(process['name']):
                continue
            memory_usage = self._memory_mib(process)
            if memory_usage is None or memory_usage > 100:  # More than 100MB is significant
                non_mining_processes.append(process)
        is_idle = not non_mining_processes
        if not is_idle:
            logging.debug(f"GPU is active with non-mining processes: {[p['name'] for p in non_mining_processes]}")
        elif any(self.is_compute_process(p['name']) for p in processes):
            logging.debug("GPU is running idle NvGPU task (acceptable idle state)")
        else:
            logging.debug("GPU is truly idle (no significant processes)")
        return is_idle

    def is_Idle_NvGPU_process_active(self):
        """Return True if at least one idle-workload process is currently on the GPU."""
        return any(self.is_compute_process(p['name']) for p in self.get_gpu_processes())

    def is_ollama_still_active(self):
        """Return True while Ollama is flagged active and its last activity is within the timeout."""
        if not self.ollama_active:
            return False
        time_since_last_activity = time.time() - self.last_ollama_activity
        return time_since_last_activity < self.ollama_activity_timeout

    @staticmethod
    def _to_time(value):
        """Convert an ``(hour, minute)`` sequence, a bare hour or a ``time`` into a ``datetime.time``."""
        if isinstance(value, dt_time):
            return value
        if isinstance(value, int):
            return dt_time(value)
        # The schedule constants are tuples; they must be unpacked, because
        # datetime.time() does not accept a tuple as its hour argument.
        return dt_time(*value)

    @staticmethod
    def _in_window(now, start, end):
        """Return True if *now* lies in the inclusive window, which may wrap past midnight."""
        if start <= end:
            return start <= now <= end
        return now >= start or now <= end

    def should_stop_for_schedule(self):
        """Return True while the local time is inside the configured blackout window."""
        now = datetime.now().time()
        window_start = self._to_time(Blackout_schedule_Start)
        window_end = self._to_time(Blackout_schedule_End)
        in_window = self._in_window(now, window_start, window_end)
        if in_window:
            logging.debug(f"Within scheduled maintenance window ({window_start:%H:%M}-{window_end:%H:%M})")
        return in_window

    def manage_idle_NvGPU_process(self):
        """Run one management pass: stop or start the idle container as the current state requires.

        Order of precedence: blackout window, recent Ollama activity, then
        general GPU idleness.  A pass is skipped if another one is running.
        """
        with self.lock:
            if self.operation_in_progress:
                return
            self.operation_in_progress = True
        try:
            current_idle_NvGPU_process_state = self.is_container_running(IDLE_CONTAINER_ID)
            mining_active_on_gpu = self.is_Idle_NvGPU_process_active()
            self.idle_compute_running = current_idle_NvGPU_process_state

            if self.should_stop_for_schedule():
                if current_idle_NvGPU_process_state:
                    logging.info("Stopping idle NvGPU container due to scheduled maintenance window")
                    self.stop_container(IDLE_CONTAINER_ID)
                    self.idle_compute_running = False
                return

            if self.is_ollama_still_active():
                if current_idle_NvGPU_process_state or mining_active_on_gpu:
                    logging.info("Ollama still active, keeping idle NvGPU container stopped")
                    if current_idle_NvGPU_process_state:
                        self.stop_container(IDLE_CONTAINER_ID)
                    self.idle_compute_running = False
                return

            if self.ollama_active:
                # The activity timeout has elapsed: clear the flag once.
                self.ollama_active = False
                logging.info("Ollama activity timeout reached")

            if self.is_gpu_idle():
                if not current_idle_NvGPU_process_state and not mining_active_on_gpu:
                    logging.info("GPU idle, starting idle NvGPU container")
                    if self.start_container(IDLE_CONTAINER_ID):
                        self.idle_compute_running = True
                elif mining_active_on_gpu and not current_idle_NvGPU_process_state:
                    self.idle_compute_running = True
                    logging.debug("Mining active on GPU, updating state")
            elif current_idle_NvGPU_process_state:
                logging.info("GPU in use by other process, stopping idle NvGPU container")
                self.stop_container(IDLE_CONTAINER_ID)
                self.idle_compute_running = False
        finally:
            self.operation_in_progress = False

    def force_stop_idle_NvGPU_process_for_ollama(self):
        """Mark Ollama as active and stop the idle container right away.

        Holds ``self.lock`` for the whole operation.  After stopping the
        container it waits up to 10 seconds for idle-workload processes to
        leave the GPU and logs a warning if they are still present.
        """
        with self.lock:
            self.ollama_active = True
            self.last_ollama_activity = time.time()
            current_idle_NvGPU_process_state = self.is_container_running(IDLE_CONTAINER_ID)
            mining_active_on_gpu = self.is_Idle_NvGPU_process_active()
            if current_idle_NvGPU_process_state or mining_active_on_gpu:
                logging.info("Force stopping idle NvGPU container for Ollama GPU operation")
                if current_idle_NvGPU_process_state:
                    self.stop_container(IDLE_CONTAINER_ID)
                if mining_active_on_gpu:
                    max_wait = 10
                    waited = 0
                    while self.is_Idle_NvGPU_process_active() and waited < max_wait:
                        time.sleep(1)
                        waited += 1
                    if self.is_Idle_NvGPU_process_active():
                        logging.warning("Mining processes still active after container stop")
            else:
                logging.debug("idle NvGPU container already stopped, no action needed")
            self.idle_compute_running = False

    def start_monitoring(self):
        """Start a daemon thread that runs a management pass every ``GPU_CHECK_INTERVAL`` seconds."""
        def monitor_loop():
            while True:
                try:
                    self.manage_idle_NvGPU_process()
                except Exception as e:
                    # Keep the monitor alive; a failed pass is retried next interval.
                    logging.error(f"Error in monitor loop: {e}")
                time.sleep(GPU_CHECK_INTERVAL)

        monitor_thread = threading.Thread(target=monitor_loop, daemon=True)
        monitor_thread.start()
        logging.info("GPU monitoring thread started")
class OllamaProxyHandler(http.server.SimpleHTTPRequestHandler):
    """HTTP handler that forwards requests to Ollama and frees the GPU first.

    Before a GPU-intensive request is forwarded, the handler asks the GPU
    manager to stop the idle workload.  Upstream responses are read in full
    and then written to the client (they are not streamed).
    """

    # Request headers never forwarded upstream: hop-by-hop headers, plus Host
    # and Content-Length, which `requests` derives from the upstream URL and
    # the body it actually sends.
    _UNFORWARDED_REQUEST_HEADERS = frozenset({
        'connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization',
        'te', 'trailer', 'trailers', 'transfer-encoding', 'upgrade',
        'host', 'content-length',
    })
    # Response headers that describe the upstream connection/encoding, not ours.
    _UNFORWARDED_RESPONSE_HEADERS = frozenset({
        'content-encoding', 'transfer-encoding', 'connection',
    })

    def __init__(self, *args, **kwargs):
        """Take the required ``gpu_manager`` keyword, then initialise the base handler."""
        # Must be set before super().__init__(), which already handles the request.
        self.gpu_manager = kwargs.pop('gpu_manager')
        super().__init__(*args, **kwargs)

    def do_GET(self):
        """Forward a GET request, stopping the idle workload first if the path is GPU-intensive."""
        if self._is_gpu_intensive_operation(self.path, {}):
            self.gpu_manager.force_stop_idle_NvGPU_process_for_ollama()
            time.sleep(2)
        self._forward_request('GET')

    def do_HEAD(self):
        """Forward a HEAD request upstream instead of answering from the local filesystem."""
        self._forward_request('HEAD')

    def do_POST(self):
        """Forward a POST request, stopping the idle workload first when it is GPU-intensive.

        The body is forwarded unchanged; it is parsed as JSON only to classify
        the request, so non-JSON bodies are still proxied.
        """
        try:
            content_length = int(self.headers.get('Content-Length', 0))
        except ValueError:
            self.send_error(400, "Invalid Content-Length header")
            return
        post_data = self.rfile.read(content_length) if content_length > 0 else b''
        request_data = self._parse_json_object(post_data)
        try:
            is_gpu_intensive = self._is_gpu_intensive_operation(self.path, request_data)
            if is_gpu_intensive:
                logging.info(f"GPU-intensive operation detected: {self.path}")
                self.gpu_manager.force_stop_idle_NvGPU_process_for_ollama()
                time.sleep(3.5)
            self._forward_request('POST', post_data)
            if is_gpu_intensive:
                # Restart the inactivity countdown from the end of the request.
                with self.gpu_manager.lock:
                    self.gpu_manager.last_ollama_activity = time.time()
                logging.info("Ollama request completed, activity timestamp updated")
        except Exception as e:
            logging.error(f"Error processing request: {e}")
            self.send_error(500, f"Internal server error: {e}")

    @staticmethod
    def _parse_json_object(raw):
        """Return *raw* decoded as a JSON object, or ``{}`` if it is empty, invalid or not an object."""
        if not raw:
            return {}
        try:
            parsed = json.loads(raw.decode('utf-8'))
        except (UnicodeDecodeError, ValueError):
            return {}
        return parsed if isinstance(parsed, dict) else {}

    def _is_gpu_intensive_operation(self, path, request_data):
        """Return a truthy value if the request path is one that needs the GPU.

        Any query string is ignored.  ``/api/pull`` follows the request's
        ``stream`` field, which defaults to True.
        """
        route = path.split('?', 1)[0]
        if route in ('/api/generate', '/api/chat', '/api/embeddings', '/api/load'):
            return True
        if route == '/api/pull':
            return request_data.get('stream', True)
        return False

    @staticmethod
    def _outbound_headers(headers):
        """Return a dict of *headers* without the ones that must not be forwarded (case-insensitive)."""
        blocked = OllamaProxyHandler._UNFORWARDED_REQUEST_HEADERS
        return {key: value for key, value in headers.items() if key.lower() not in blocked}

    def _forward_request(self, method, data=None):
        """Send the request to Ollama and relay status, headers and body to the client.

        Answers 502 when the upstream request fails.  For non-HEAD requests
        the Content-Length is computed from the body actually written,
        because `requests` may already have decoded a compressed body.
        """
        url = f"{OLLAMA_BASE_URL}{self.path}"
        headers = self._outbound_headers(self.headers)
        try:
            response = requests.request(method, url, data=data, headers=headers, timeout=300)
        except requests.exceptions.RequestException as e:
            logging.error(f"Error forwarding to Ollama: {e}")
            self.send_error(502, f"Bad gateway: {e}")
            return
        is_head = method == 'HEAD'
        body = b'' if is_head else response.content
        self.send_response(response.status_code)
        for key, value in response.headers.items():
            lowered = key.lower()
            if lowered in self._UNFORWARDED_RESPONSE_HEADERS:
                continue
            if lowered == 'content-length' and not is_head:
                continue  # replaced below with the length of the relayed body
            self.send_header(key, value)
        if not is_head:
            self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        if body:
            self.wfile.write(body)

    def log_message(self, format, *args):
        """Route the base handler's access-log lines through the logging module."""
        logging.info(f"{self.address_string()} - {format % args}")
def main():
    """Check the environment, start GPU monitoring and serve the proxy until interrupted.

    Returns early (after logging an error) when the ``pct`` command is not
    usable, i.e. when the script is not running on a Proxmox host.
    """
    logging.info("Starting GPU Resource Manager Proxy on Proxmox Host")
    # One manager is shared by the start-up checks, the monitor thread and
    # every request handler.
    gpu_manager = GPUResourceManager()
    test_output = gpu_manager.run_command("pct list > /dev/null && echo 'pct available'")
    if test_output:
        logging.info("pct command available")
    else:
        logging.error("pct command not available! Running on wrong system?")
        return
    if not IDLE_CONTAINER_ID:
        logging.warning("IDLE_CONTAINER_ID is empty; set it to the ID of the idle NvGPU container")
    gpu_processes = gpu_manager.get_gpu_processes()
    logging.info(f"Current GPU processes: {gpu_processes}")
    idle_NvGPU_process_status = gpu_manager.is_container_running(IDLE_CONTAINER_ID)
    logging.info(f"idle NvGPU {IDLE_CONTAINER_ID} running: {idle_NvGPU_process_status}")
    gpu_manager.start_monitoring()

    class ReusableTCPServer(socketserver.TCPServer):
        # Allow rebinding the port right after a restart instead of failing
        # with "Address already in use".
        allow_reuse_address = True

    def handler(*args, **kwargs):
        # The server instantiates the handler itself, so the shared manager
        # is injected here as a keyword argument.
        return OllamaProxyHandler(*args, gpu_manager=gpu_manager, **kwargs)

    with ReusableTCPServer(("", PROXY_PORT), handler) as httpd:
        logging.info(f"Proxy server running on port {PROXY_PORT}")
        logging.info(f"Forwarding to Ollama at {OLLAMA_HOST}:{OLLAMA_PORT}")
        logging.info(f"Managing idle NvGPU process: {IDLE_CONTAINER_ID}")
        logging.info("Monitoring GPU usage and scheduled maintenance windows")
        logging.info(f"Mining process patterns: {IDLE_NvGPU_PROCESSES}")
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            logging.info("Shutting down proxy server")
        except Exception as e:
            logging.error(f"Server error: {e}")


if __name__ == "__main__":
    main()
+268
View File
@@ -0,0 +1,268 @@
# GPU Resource Manager & Proxy for Ollama
*A smart GPU-aware proxy for Proxmox VE that dynamically manages GPU resources between Ollama and GPU-intensive background processes.*
---
## ✅ Overview
The **GPU Resource Manager & Proxy for Ollama** is a lightweight Python service that sits between your applications and the Ollama server. It intelligently supervises NVIDIA GPU usage on **Proxmox VE** hosts and ensures that GPU-intensive background tasks (e.g. mining) are temporarily suspended whenever Ollama requires GPU power.
This enables smooth coexistence of AI workloads and other GPU idle tasks on the same host.
---
## ✨ Features
### 🔍 Intelligent GPU Monitoring
- Detects GPU usage patterns and active processes via `nvidia-smi`.
- Differentiates essential system processes from idle GPU workloads.
### ⚙️ Dynamic Resource Allocation
- Automatically pauses idle/non-critical GPU processes when Ollama becomes active.
- Automatically resumes them after configurable inactivity timeouts.
### 🗓️ Scheduled Blackout Window
- By default, automatically stops idle GPU processes between **2:15 AM and 3:30 AM** for maintenance.
### 🖥️ Proxmox LXC Integration
- Direct container control using Proxmox's `pct` command.
- Ideal for GPU-passthrough LXC containers (miners, renderers, etc.).
### ⚡ Real-Time Process Detection
- Inspects NVIDIA GPU processes continuously.
- Supports customizable allow-lists and idle-process lists.
---
## 📦 Requirements
- NVIDIA GPU + drivers
- `nvidia-smi`
- Python **3.x**
- `python3-requests`
- Proxmox VE host
- At least one GPU-passthrough LXC container
- Optional: Ollama server running inside LXC
Install required packages:
```bash
sudo apt update
sudo apt install python3 python3-requests
```
---
## ⚙️ Configuration
These are the primary configuration variables inside the script:
```python
# Basic Configuration
OLLAMA_HOST = "localhost" # Ollama container IP
OLLAMA_PORT = 11434 # Ollama API port
PROXY_PORT = 11435 # Proxy server port
GPU_CHECK_INTERVAL = 10 # Seconds between GPU checks
# GPU Process Management
IDLE_NvGPU_PROCESSES = ['t-rex', 'trex', 'miner', 'xmrig', 'lolminer', 'nbminer']
KNOWN_NvidiaGPU_PROCESSES = ['Xorg']
IDLE_CONTAINER_ID = "120" # LXC container ID of idle GPU workload
Blackout_schedule_Start = 2, 15 #when to start stopping the idle NvGPU container. Hour, Minute.
Blackout_schedule_End = 3, 30 #when to allow starting the idle NvGPU container again. Hour, Minute.
```
---
## 🛠️ Installation (systemd service)
Create the service file:
`/etc/systemd/system/gpu-proxy.service`
```ini
[Unit]
Description=GPU Resource Manager and Proxy for Ollama
After=network.target
[Service]
Type=simple
User=root
ExecStart=/usr/bin/python3 /usr/local/bin/gpu-proxy.py
Restart=always
RestartSec=5
[Install]
WantedBy=multi-user.target
```
Enable and start the service:
```bash
sudo systemctl daemon-reload
sudo systemctl enable gpu-proxy
sudo systemctl start gpu-proxy
```
---
## 🚀 Usage
Forward requests to the **proxy port** instead of directly to Ollama.
Example:
```bash
curl http://proxmox-host:11435/api/generate -d '{
"model": "llama2",
"prompt": "Why is the sky blue?",
"stream": false
}'
```
### API Endpoints Automatically Managed
- `/api/generate`
- `/api/chat`
- `/api/embeddings`
- `/api/load`
- `/api/pull`
Anything GPU-intensive triggers resource management logic.
---
## 🔧 How It Works
### 🔄 Resource Management Flow
1. **Request received** by proxy
2. Proxy detects GPU-intensive Ollama endpoint
3. Proxy checks for non-idle GPU processes / idle container
4. If necessary → **stops idle GPU container**
5. Forwards request to Ollama
6. Waits until Ollama has been inactive for the activity timeout (default: 120 seconds)
7. Watches GPU activity for non-idle processes
8. Once GPU is idle → **starts idle container**
---
## 📜 Logging
Logs stored in:
```
/var/log/gpu_proxy.log
```
Examples:
```
2025-11-10 15:36:41,882 - INFO - Starting GPU Resource Manager And Proxy for ollama.
2025-11-10 15:36:42,695 - INFO - pct command available (this is the host)
2025-11-10 15:36:42,717 - INFO - Current GPU processes: [{'pid': '2381690', 'name': '/var/lib/cudo-miner/registry/aaf375fd4c7b39548121985bce1e7b64/t-rex', 'memory': '5478 MiB'}]
2025-11-10 15:36:43,524 - INFO - Idle NvGPU container 120 running: True
2025-11-10 15:36:43,525 - INFO - GPU monitoring thread started
2025-11-10 15:36:43,525 - INFO - Proxy server running on port 11435
2025-11-10 15:36:43,525 - INFO - Forwarding to Ollama at localhost:11434
2025-11-10 15:36:43,525 - INFO - Managing idle NvGPU container: 120
2025-11-10 15:36:43,525 - INFO - Monitoring GPU usage and scheduled maintenance windows
2025-11-10 15:36:43,525 - INFO - Idle NvGPU process patterns: ['t-rex', 'trex', 'miner', 'xmrig', 'lolminer', 'nbminer']
2025-11-10 15:36:55,310 - INFO - Localhost - "GET /api/tags HTTP/1.1" 200 -
2025-11-10 15:36:55,332 - INFO - Localhost - "GET /api/ps HTTP/1.1" 200 -
2025-11-10 15:37:01,223 - INFO - GPU-intensive operation detected: /api/chat
2025-11-10 15:37:02,040 - INFO - Force stopping idle NvGPU container for Ollama GPU operation
2025-11-10 15:37:02,859 - INFO - Stopping container 120
2025-11-10 15:37:06,391 - INFO - Container 120 stopped successfully
2025-11-10 15:37:29,546 - INFO - Localhost - "POST /api/chat HTTP/1.1" 200 -
2025-11-10 15:37:29,551 - INFO - Ollama request completed, activity timestamp updated
2025-11-10 15:37:29,664 - INFO - GPU-intensive operation detected: /api/chat
2025-11-10 15:37:39,896 - INFO - Localhost - "POST /api/chat HTTP/1.1" 200 -
2025-11-10 15:37:39,896 - INFO - Ollama request completed, activity timestamp updated
2025-11-10 15:37:39,903 - INFO - GPU-intensive operation detected: /api/chat
2025-11-10 15:38:17,616 - INFO - Localhost - "POST /api/chat HTTP/1.1" 200 -
2025-11-10 15:38:17,616 - INFO - Ollama request completed, activity timestamp updated
2025-11-10 15:38:17,631 - INFO - GPU-intensive operation detected: /api/chat
2025-11-10 15:38:53,068 - INFO - Localhost - "POST /api/chat HTTP/1.1" 200 -
2025-11-10 15:38:53,069 - INFO - Ollama request completed, activity timestamp updated
2025-11-10 15:39:59,855 - INFO - Ollama activity timeout reached
2025-11-10 15:43:58,186 - INFO - GPU idle, starting idle NvGPU container
2025-11-10 15:43:59,001 - INFO - Starting container 120
2025-11-10 15:44:02,739 - INFO - Container 120 started successfully
```
Enable debug mode:
```python
logging.basicConfig(
level=logging.DEBUG,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler('/var/log/gpu_proxy.log'),
logging.StreamHandler()
]
)
```
---
## 🧪 Monitoring & Testing
Check service:
```bash
systemctl status gpu-proxy
```
Tail logs:
```bash
tail -f /var/log/gpu_proxy.log
```
Test GPU processes:
```bash
nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits
```
---
## ❗ Troubleshooting
### `pct` command not found
→ Script must run **on the Proxmox host**, not inside an LXC.
### GPU processes not detected
- Verify NVIDIA drivers
- Run `nvidia-smi` manually
- Ensure GPU passthrough is configured
### Idle container not managed
- Check the LXC exists
- Run `pct list`
- Ensure root permissions
### Proxy connection refused
- Ensure Ollama is running
- Check firewall rules
- Check via curl inside Proxmox:
```bash
curl http://<OLLAMA_HOST>:11434/api/version
```
---
## 🤝 Contributing
Pull requests and issues are welcome!
If you'd like to contribute, please open an issue first to discuss your idea.
---
## 📄 License
This project is licensed under the **MIT License**. See the `LICENSE` file for details.
---