12 Commits

Author SHA1 Message Date
mrmarcus007 17565331c7 Fix indent 2025-11-14 16:25:29 +01:00
mrmarcus007 7bf61ca44f Clean up blank lines in GPU resource manager script
Removed unnecessary blank lines in the GPU resource manager script.
2025-11-14 16:24:43 +01:00
mrmarcus007 40067ea9c2 Fix indentation 2025-11-14 16:23:59 +01:00
mrmarcus007 8c5b2cd2fe Removed unused imports and cleaned up code for size. 2025-11-14 16:20:36 +01:00
mrmarcus007 146d8456fa Commit 2025-11-14 16:07:24 +01:00
mrmarcus007 2fbc1ffa6f added support to unload ollama model when client requests. 2025-11-14 16:05:15 +01:00
mrmarcus007 8282740de4 Merge branch 'main' of https://github.com/mrmarcus007/GPU-Recource-Manager-and-proxy-for-ollama 2025-11-14 13:26:16 +01:00
mrmarcus007 1594e9ea1f Fixed errors, added hop by hop support and status of loaded ollama model. 2025-11-14 13:25:45 +01:00
mrmarcus007 2e73c69a0d Refine description of GPU Resource Manager 2025-11-11 18:03:07 +01:00
mrmarcus007 32499cd056 Add MIT LICENSE 2025-11-11 14:18:54 +01:00
mrmarcus007 ccd39b4f10 fixed something over looked 2025-11-10 16:13:59 +01:00
mrmarcus007 f0f8f22a6a make more consistant 2025-11-10 16:13:14 +01:00
3 changed files with 112 additions and 92 deletions
@@ -7,25 +7,22 @@ import time
import threading
import subprocess
from datetime import datetime, time as dt_time
from urllib.parse import urlparse, parse_qs
import logging
# Configuration
OLLAMA_HOST = "localhost" # Your Ollama LXC IP
import socket
# ----------------------------------------------------------Host-config--------------------------------------------------------------#
OLLAMA_HOST = "" # Your Ollama LXC IP
OLLAMA_PORT = 11434 # Your Ollama LXC Port
PROXY_PORT = 11435 # Port Of This Proxy
OLLAMA_BASE_URL = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}" # Don't touch unless you know what you are doing.
PROXY_PORT = 11435 # Port Of This Proxy
OLLAMA_BASE_URL = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}"
# GPU monitoring
GPU_CHECK_INTERVAL = 10 # seconds it wait's to check for other process apart from known/compute processes
GPU_CHECK_INTERVAL = 10 # seconds it waits to check for other process apart from known/compute processes
# process process patterns (from your nvidia-smi output)
IDLE_NvGPU_PROCESSES = ['t-rex', 'trex', 'miner', 'xmrig', 'lolminer', 'nbminer']
KNOWN_NvidiaGPU_PROCESSES = ['Xorg'] # Processes that are allowed when "idle" compute process is running
IDLE_CONTAINER_ID = "" # running Idle GPU Container ID, example: COMPUTE_CONTAINER_ID ="120"
Blackout_schedule_Start = 2, 15 #when to start stopping the idle NvGPU container.
Blackout_schedule_End = 3, 30 #when to allow starting the idle NvGPU container again.
#----------------------------------------------------------active-code--------------------------------------------------------------#
KNOWN_NvGPU_PROCESSES = ['Xorg'] # Processes that are allowed when "idle" compute process is running
IDLE_CONTAINER_ID = "" # running Idle GPU Container ID, example: COMPUTE_CONTAINER_ID ="120"
Blackout_schedule_Start = dt_time(2, 15) # 2:15 AM
Blackout_schedule_End = dt_time(3, 30) # 3:30 AM
# ----------------------------------------------------------active-code--------------------------------------------------------------#
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
@@ -34,7 +31,6 @@ logging.basicConfig(
logging.StreamHandler()
]
)
class GPUResourceManager:
def __init__(self):
self.idle_compute_running = False
@@ -45,7 +41,6 @@ class GPUResourceManager:
self.gpu_processes = []
self.lock = threading.Lock()
self.operation_in_progress = False
def run_command(self, cmd):
try:
result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True)
@@ -53,13 +48,11 @@ class GPUResourceManager:
except subprocess.CalledProcessError as e:
logging.error(f"Command failed: {cmd}, error: {e.stderr}")
return None
def is_container_running(self, container_id):
output = self.run_command(f"pct list | grep \"^{container_id}\"")
if output and "running" in output:
return True
return False
def stop_container(self, container_id):
if self.is_container_running(container_id):
logging.info(f"Stopping container {container_id}")
@@ -70,7 +63,6 @@ class GPUResourceManager:
while self.is_container_running(container_id) and waited < max_wait:
time.sleep(1)
waited += 1
if not self.is_container_running(container_id):
logging.info(f"Container {container_id} stopped successfully")
return True
@@ -83,7 +75,6 @@ class GPUResourceManager:
logging.debug(f"Container {container_id} already stopped, no action needed")
return True
return False
def start_container(self, container_id):
if not self.is_container_running(container_id):
logging.info(f"Starting container {container_id}")
@@ -94,7 +85,6 @@ class GPUResourceManager:
while not self.is_container_running(container_id) and waited < max_wait:
time.sleep(1)
waited += 1
if self.is_container_running(container_id):
logging.info(f"Container {container_id} started successfully")
return True
@@ -106,10 +96,11 @@ class GPUResourceManager:
logging.debug(f"Container {container_id} already running, no action needed")
return True
return False
def get_gpu_processes(self):
try:
output = self.run_command("nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits")
output = self.run_command(
"nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits"
)
processes = []
if output:
for line in output.split('\n'):
@@ -125,29 +116,23 @@ class GPUResourceManager:
except Exception as e:
logging.error(f"Error getting GPU processes: {e}")
return []
def is_compute_process(self, process_name):
process_lower = process_name.lower()
for pattern in IDLE_NvGPU_PROCESSES:
if pattern in process_lower:
return True
return False
def is_known_system_process(self, process_name):
return any(sys_proc in process_name for sys_proc in KNOWN_NvidiaGPU_PROCESSES)
return any(sys_proc in process_name for sys_proc in KNOWN_NvGPU_PROCESSES)
def is_gpu_idle(self):
processes = self.get_gpu_processes()
non_mining_processes = []
for process in processes:
if not self.is_compute_process(process['name']) and not self.is_known_system_process(process['name']):
memory_usage = int(process['memory'].split()[0])
if memory_usage > 100: # More than 100MB is significant
if memory_usage > 100: #MB
non_mining_processes.append(process)
is_idle = len(non_mining_processes) == 0
if is_idle:
mining_count = len([p for p in processes if self.is_compute_process(p['name'])])
if mining_count > 0:
@@ -156,34 +141,28 @@ class GPUResourceManager:
logging.debug("GPU is truly idle (no significant processes)")
else:
logging.debug(f"GPU is active with non-mining processes: {[p['name'] for p in non_mining_processes]}")
return is_idle
def is_Idle_NvGPU_process_active(self):
processes = self.get_gpu_processes()
mining_processes = [p for p in processes if self.is_compute_process(p['name'])]
return len(mining_processes) > 0
def is_ollama_still_active(self):
if not self.ollama_active:
return False
time_since_last_activity = time.time() - self.last_ollama_activity
return time_since_last_activity < self.ollama_activity_timeout
def should_stop_for_schedule(self):
now = datetime.now().time()
stop_start = dt_time(Blackout_schedule_Start) # 2:15 AM
stop_end = dt_time(Blackout_schedule_End) # 3:30 AM
stop_start = Blackout_schedule_Start
stop_end = Blackout_schedule_End
in_window = stop_start <= now <= stop_end
if in_window:
logging.debug("Within scheduled maintenance window (2:15am-3:30am)")
return in_window
def manage_idle_NvGPU_process(self):
with self.lock:
if self.operation_in_progress:
return
return
self.operation_in_progress = True
try:
current_idle_NvGPU_process_state = self.is_container_running(IDLE_CONTAINER_ID)
@@ -195,8 +174,7 @@ class GPUResourceManager:
self.stop_container(IDLE_CONTAINER_ID)
self.idle_compute_running = False
return
ollama_still_active = self.is_ollama_still_active()
if ollama_still_active:
if current_idle_NvGPU_process_state or mining_active_on_gpu:
@@ -206,11 +184,9 @@ class GPUResourceManager:
self.idle_compute_running = False
return
else:
if self.ollama_active:
self.ollama_active = False
logging.info("Ollama activity timeout reached")
if self.is_gpu_idle():
if not current_idle_NvGPU_process_state and not mining_active_on_gpu:
logging.info("GPU idle, starting idle NvGPU container")
@@ -226,7 +202,6 @@ class GPUResourceManager:
self.idle_compute_running = False
finally:
self.operation_in_progress = False
def force_stop_idle_NvGPU_process_for_ollama(self):
with self.lock:
self.ollama_active = True
@@ -247,9 +222,8 @@ class GPUResourceManager:
logging.warning("Mining processes still active after container stop")
else:
logging.debug("idle NvGPU container already stopped, no action needed")
self.idle_compute_running = False
self.idle_compute_running = False
def start_monitoring(self):
def monitor_loop():
while True:
@@ -258,48 +232,52 @@ class GPUResourceManager:
except Exception as e:
logging.error(f"Error in monitor loop: {e}")
time.sleep(GPU_CHECK_INTERVAL)
monitor_thread = threading.Thread(target=monitor_loop, daemon=True)
monitor_thread.start()
logging.info("GPU monitoring thread started")
class OllamaProxyHandler(http.server.SimpleHTTPRequestHandler):
def __init__(self, *args, **kwargs):
self.gpu_manager = kwargs.pop('gpu_manager')
super().__init__(*args, **kwargs)
def do_GET(self):
if self._is_gpu_intensive_operation(self.path, {}):
self.gpu_manager.force_stop_idle_NvGPU_process_for_ollama()
time.sleep(2)
time.sleep(2)
self._forward_request('GET')
def do_HEAD(self):
if self._is_gpu_intensive_operation(self.path, {}):
self.gpu_manager.force_stop_idle_NvGPU_process_for_ollama()
time.sleep(2)
self._forward_request('HEAD')
def do_POST(self):
content_length = int(self.headers.get('Content-Length', 0))
post_data = self.rfile.read(content_length) if content_length > 0 else b''
try:
request_data = json.loads(post_data.decode('utf-8')) if post_data else {}
request_data = {}
try:
request_data = json.loads(post_data.decode('utf-8')) if post_data else {}
except Exception:
request_data = {}
is_gpu_intensive = self._is_gpu_intensive_operation(self.path, request_data)
if is_gpu_intensive:
logging.info(f"GPU-intensive operation detected: {self.path}")
self.gpu_manager.force_stop_idle_NvGPU_process_for_ollama()
time.sleep(3.5)
self._forward_request('POST', post_data)
if is_gpu_intensive:
with self.gpu_manager.lock:
self.gpu_manager.last_ollama_activity = time.time()
logging.info("Ollama request completed, activity timestamp updated")
except Exception as e:
logging.error(f"Error processing request: {e}")
self.send_error(500, f"Internal server error: {e}")
def _is_gpu_intensive_operation(self, path, request_data):
if path == '/api/generate' and request_data.get('keep_alive') == 0:
logging.debug("Detected model unload request via /api/generate")
return False
if path == '/api/chat' and request_data.get('keep_alive') == 0 and request_data.get('messages') == []:
logging.debug("Detected model unload request via /api/chat")
return False
if path in ['/api/generate', '/api/chat', '/api/embeddings']:
return True
if path == '/api/load':
@@ -307,71 +285,92 @@ class OllamaProxyHandler(http.server.SimpleHTTPRequestHandler):
if path == '/api/pull':
return request_data.get('stream', True)
return False
def _forward_request(self, method, data=None):
url = f"{OLLAMA_BASE_URL}{self.path}"
headers = {key: value for key, value in self.headers.items()}
hop_headers = ['connection', 'keep-alive', 'proxy-authenticate',
'proxy-authorization', 'te', 'trailers', 'upgrade']
for header in hop_headers:
headers.pop(header, None)
hop_headers = {
'connection', 'keep-alive', 'proxy-authenticate',
'proxy-authorization', 'te', 'trailers', 'upgrade',
'transfer-encoding'
}
for header in list(headers.keys()):
if header.lower() in hop_headers:
headers.pop(header, None)
headers.pop('Host', None)
timeout = (10, None)
try:
if method == 'GET':
response = requests.get(url, headers=headers, timeout=300)
if method.upper() in ('GET', 'HEAD'):
resp = requests.request(method, url, headers=headers, stream=True, timeout=timeout)
else:
response = requests.post(url, data=data, headers=headers, timeout=300)
self.send_response(response.status_code)
for key, value in response.headers.items():
if key.lower() not in ['content-encoding', 'transfer-encoding', 'connection']:
resp = requests.request(method, url, headers=headers, data=data, stream=True, timeout=timeout)
self.send_response(resp.status_code)
for key, value in resp.headers.items():
k_lower = key.lower()
if k_lower in hop_headers:
continue
try:
self.send_header(key, value)
except Exception:
logging.debug(f"Skipping header {key} due to send_header error")
self.end_headers()
self.wfile.write(response.content)
try:
for chunk in resp.iter_content(chunk_size=4096):
if not chunk:
continue
try:
self.wfile.write(chunk)
self.wfile.flush()
except (BrokenPipeError, ConnectionResetError, socket.error) as e:
logging.info(f"Client connection closed during streaming: {e}")
break
finally:
resp.close()
except requests.exceptions.RequestException as e:
logging.error(f"Error forwarding to Ollama: {e}")
self.send_error(502, f"Bad gateway: {e}")
try:
if hasattr(self, '_headers_buffer') and getattr(self, 'wfile', None):
try:
err_msg = f"\n\n[proxy error] upstream request failed: {e}\n"
self.wfile.write(err_msg.encode('utf-8'))
self.wfile.flush()
except Exception:
pass
else:
self.send_error(502, f"Bad gateway: {e}")
except Exception:
pass
def log_message(self, format, *args):
logging.info(f"{self.address_string()} - {format % args}")
class ThreadedTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
daemon_threads = True
allow_reuse_address = True
def main():
logging.info("Starting GPU Resource Manager Proxy on Proxmox Host")
manager = GPUResourceManager()
test_output = manager.run_command("pct list > /dev/null && echo 'pct available'")
if test_output:
logging.info("pct command available")
else:
logging.error("pct command not available! Running on wrong system?")
return
gpu_processes = manager.get_gpu_processes()
logging.info(f"Current GPU processes: {gpu_processes}")
idle_NvGPU_process_status = manager.is_container_running(IDLE_CONTAINER_ID)
logging.info(f"idle NvGPU {IDLE_CONTAINER_ID} running: {idle_NvGPU_process_status}")
gpu_manager = GPUResourceManager()
gpu_manager.start_monitoring()
handler = lambda *args, **kwargs: OllamaProxyHandler(*args, gpu_manager=gpu_manager, **kwargs)
with socketserver.TCPServer(("", PROXY_PORT), handler) as httpd:
with ThreadedTCPServer(("", PROXY_PORT), handler) as httpd:
logging.info(f"Proxy server running on port {PROXY_PORT}")
logging.info(f"Forwarding to Ollama at {OLLAMA_HOST}:{OLLAMA_PORT}")
logging.info(f"Managing idle NvGPU process: {IDLE_CONTAINER_ID}")
logging.info("Monitoring GPU usage and scheduled maintenance windows")
logging.info(f"Mining process patterns: {IDLE_NvGPU_PROCESSES}")
try:
httpd.serve_forever()
except KeyboardInterrupt:
logging.info("Shutting down proxy server")
except Exception as e:
logging.error(f"Server error: {e}")
if __name__ == "__main__":
main()
main()
+21
View File
@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 mrmarcus007
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+2 -2
View File
@@ -1,5 +1,5 @@
# GPU Resource Manager & Proxy for Ollama
*A smart GPU-aware proxy for Proxmox VE that dynamically manages GPU resources between Ollama and GPU-intensive background processes.*
*A smart GPU-aware proxy designed for Proxmox VE that dynamically manages GPU resources between Ollama and GPU-intensive background processes.*
---
@@ -66,7 +66,7 @@ GPU_CHECK_INTERVAL = 10 # Seconds between GPU checks
# GPU Process Management
IDLE_NvGPU_PROCESSES = ['t-rex', 'trex', 'miner', 'xmrig', 'lolminer', 'nbminer']
KNOWN_NvidiaGPU_PROCESSES = ['Xorg']
KNOWN_NvGPU_PROCESSES = ['Xorg']
IDLE_CONTAINER_ID = "120" # LXC container ID of idle GPU workload
Blackout_schedule_Start = 2, 15 #when to start stopping the idle NvGPU container. Hour, Minute.
Blackout_schedule_End = 3, 30 #when to allow starting the idle NvGPU container again. Hour, Minute.