Merge branch 'main' of https://github.com/mrmarcus007/GPU-Recource-Manager-and-proxy-for-ollama

Fixed errors, added hop by hop support and status of loaded ollama model.
Refine description of GPU Resource Manager
2025-11-14 13:26:16 +01:00 · 2025-11-14 13:25:45 +01:00 · 2025-11-11 18:03:07 +01:00 · 2025-11-11 14:18:54 +01:00 · 2025-11-10 16:13:59 +01:00 · 2025-11-10 16:13:14 +01:00
3 changed files with 137 additions and 60 deletions
@@ -9,22 +9,25 @@ import subprocess
 from datetime import datetime, time as dt_time
 from urllib.parse import urlparse, parse_qs
 import logging
+import socket

 # Configuration
-OLLAMA_HOST = "localhost"  # Your Ollama LXC IP
+OLLAMA_HOST = ""  # Your Ollama LXC IP
 OLLAMA_PORT = 11434  # Your Ollama LXC Port
 PROXY_PORT = 11435  # Port Of This Proxy
-OLLAMA_BASE_URL = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}" # Don't touch unless you know what you are doing.
+OLLAMA_BASE_URL = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}"  # Fixed formatting

 # GPU monitoring
-GPU_CHECK_INTERVAL = 10  # seconds it wait's to check for other process apart from known/compute processes
+GPU_CHECK_INTERVAL = 10  # seconds it waits to check for other process apart from known/compute processes

 # process process patterns (from your nvidia-smi output)
 IDLE_NvGPU_PROCESSES = ['t-rex', 'trex', 'miner', 'xmrig', 'lolminer', 'nbminer']
-KNOWN_NvidiaGPU_PROCESSES = ['Xorg']  # Processes that are allowed when "idle" compute process is running
+KNOWN_NvGPU_PROCESSES = ['Xorg']  # Processes that are allowed when "idle" compute process is running
 IDLE_CONTAINER_ID = ""  # running Idle GPU Container ID, example: COMPUTE_CONTAINER_ID ="120"
-Blackout_schedule_Start = 2, 15 #when to start stopping the idle NvGPU container.
-Blackout_schedule_End = 3, 30 #when to allow starting the idle NvGPU container again.
+# Maintenance window (dt_time objects)
+Blackout_schedule_Start = dt_time(2, 15)  # 2:15 AM
+Blackout_schedule_End = dt_time(3, 30)    # 3:30 AM
+
 # ----------------------------------------------------------active-code--------------------------------------------------------------#
 logging.basicConfig(
    level=logging.INFO,
@@ -109,7 +112,9 @@ class GPUResourceManager:

    def get_gpu_processes(self):
        try:
-            output = self.run_command("nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits")
+            output = self.run_command(
+                "nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits"
+            )
            processes = []
            if output:
                for line in output.split('\n'):
@@ -134,7 +139,7 @@ class GPUResourceManager:
        return False

    def is_known_system_process(self, process_name):
-        return any(sys_proc in process_name for sys_proc in KNOWN_NvidiaGPU_PROCESSES)
+        return any(sys_proc in process_name for sys_proc in KNOWN_NvGPU_PROCESSES)

    def is_gpu_idle(self):
        processes = self.get_gpu_processes()
@@ -143,7 +148,7 @@ class GPUResourceManager:
        for process in processes:
            if not self.is_compute_process(process['name']) and not self.is_known_system_process(process['name']):
                memory_usage = int(process['memory'].split()[0])
-                if memory_usage > 100:  # More than 100MB is significant
+                if memory_usage > 100: #MB 
                    non_mining_processes.append(process)

        is_idle = len(non_mining_processes) == 0
@@ -172,8 +177,8 @@ class GPUResourceManager:

    def should_stop_for_schedule(self):
        now = datetime.now().time()
-        stop_start = dt_time(Blackout_schedule_Start)  # 2:15 AM
-        stop_end = dt_time(Blackout_schedule_End)    # 3:30 AM
+        stop_start = Blackout_schedule_Start 
+        stop_end = Blackout_schedule_End    
        in_window = stop_start <= now <= stop_end
        if in_window:
            logging.debug("Within scheduled maintenance window (2:15am-3:30am)")
@@ -196,7 +201,6 @@ class GPUResourceManager:
                        self.idle_compute_running = False
                    return

-                
                ollama_still_active = self.is_ollama_still_active()
                if ollama_still_active:
                    if current_idle_NvGPU_process_state or mining_active_on_gpu:
@@ -206,7 +210,6 @@ class GPUResourceManager:
                        self.idle_compute_running = False
                    return
                else:
-                    
                    if self.ollama_active:
                        self.ollama_active = False
                        logging.info("Ollama activity timeout reached")
@@ -263,6 +266,7 @@ class GPUResourceManager:
        monitor_thread.start()
        logging.info("GPU monitoring thread started")

+
 class OllamaProxyHandler(http.server.SimpleHTTPRequestHandler):
    def __init__(self, *args, **kwargs):
        self.gpu_manager = kwargs.pop('gpu_manager')
@@ -272,15 +276,25 @@ class OllamaProxyHandler(http.server.SimpleHTTPRequestHandler):
        if self._is_gpu_intensive_operation(self.path, {}):
            self.gpu_manager.force_stop_idle_NvGPU_process_for_ollama()
            time.sleep(2)
-        
        self._forward_request('GET')

+    def do_HEAD(self):
+        if self._is_gpu_intensive_operation(self.path, {}):
+            self.gpu_manager.force_stop_idle_NvGPU_process_for_ollama()
+            time.sleep(2)
+        self._forward_request('HEAD')
+
    def do_POST(self):
        content_length = int(self.headers.get('Content-Length', 0))
        post_data = self.rfile.read(content_length) if content_length > 0 else b''

+        try:
+            request_data = {}
            try:
                request_data = json.loads(post_data.decode('utf-8')) if post_data else {}
+            except Exception:
+                request_data = {}
+
            is_gpu_intensive = self._is_gpu_intensive_operation(self.path, request_data)

            if is_gpu_intensive:
@@ -311,31 +325,75 @@ class OllamaProxyHandler(http.server.SimpleHTTPRequestHandler):
    def _forward_request(self, method, data=None):
        url = f"{OLLAMA_BASE_URL}{self.path}"
        headers = {key: value for key, value in self.headers.items()}
-        hop_headers = ['connection', 'keep-alive', 'proxy-authenticate', 
-                      'proxy-authorization', 'te', 'trailers', 'upgrade']
-        for header in hop_headers:
+
+        hop_headers = {
+            'connection', 'keep-alive', 'proxy-authenticate',
+            'proxy-authorization', 'te', 'trailers', 'upgrade',
+            'transfer-encoding'
+        }
+        for header in list(headers.keys()):
+            if header.lower() in hop_headers:
                headers.pop(header, None)

-        try:
-            if method == 'GET':
-                response = requests.get(url, headers=headers, timeout=300)
-            else:
-                response = requests.post(url, data=data, headers=headers, timeout=300)
+        headers.pop('Host', None)

-            self.send_response(response.status_code)
-            for key, value in response.headers.items():
-                if key.lower() not in ['content-encoding', 'transfer-encoding', 'connection']:
+        timeout = (10, None)
+
+        try:
+            if method.upper() in ('GET', 'HEAD'):
+                resp = requests.request(method, url, headers=headers, stream=True, timeout=timeout)
+            else:
+                resp = requests.request(method, url, headers=headers, data=data, stream=True, timeout=timeout)
+
+            self.send_response(resp.status_code)
+
+            for key, value in resp.headers.items():
+                k_lower = key.lower()
+                if k_lower in hop_headers:
+                    continue
+                try:
                    self.send_header(key, value)
+                except Exception:
+                    logging.debug(f"Skipping header {key} due to send_header error")
            self.end_headers()
-            self.wfile.write(response.content)
+
+            try:
+                for chunk in resp.iter_content(chunk_size=4096):
+                    if not chunk:
+                        continue
+                    try:
+                        self.wfile.write(chunk)
+                        self.wfile.flush()
+                    except (BrokenPipeError, ConnectionResetError, socket.error) as e:
+                        logging.info(f"Client connection closed during streaming: {e}")
+                        break
+            finally:
+                resp.close()

        except requests.exceptions.RequestException as e:
            logging.error(f"Error forwarding to Ollama: {e}")
+            try:
+                if hasattr(self, '_headers_buffer') and getattr(self, 'wfile', None):
+                    try:
+                        err_msg = f"\n\n[proxy error] upstream request failed: {e}\n"
+                        self.wfile.write(err_msg.encode('utf-8'))
+                        self.wfile.flush()
+                    except Exception:
+                        pass
+                else:
                    self.send_error(502, f"Bad gateway: {e}")
+            except Exception:
+                pass

    def log_message(self, format, *args):
        logging.info(f"{self.address_string()} - {format % args}")

+
+class ThreadedTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
+    daemon_threads = True
+    allow_reuse_address = True
+
+
 def main():
    logging.info("Starting GPU Resource Manager Proxy on Proxmox Host")

@@ -345,7 +403,6 @@ def main():
        logging.info("pct command available")
    else:
        logging.error("pct command not available! Running on wrong system?")
-        return

    gpu_processes = manager.get_gpu_processes()
    logging.info(f"Current GPU processes: {gpu_processes}")
@@ -354,12 +411,11 @@ def main():
    logging.info(f"idle NvGPU {IDLE_CONTAINER_ID} running: {idle_NvGPU_process_status}")

    gpu_manager = GPUResourceManager()
-
    gpu_manager.start_monitoring()

    handler = lambda *args, **kwargs: OllamaProxyHandler(*args, gpu_manager=gpu_manager, **kwargs)

-    with socketserver.TCPServer(("", PROXY_PORT), handler) as httpd:
+    with ThreadedTCPServer(("", PROXY_PORT), handler) as httpd:
        logging.info(f"Proxy server running on port {PROXY_PORT}")
        logging.info(f"Forwarding to Ollama at {OLLAMA_HOST}:{OLLAMA_PORT}")
        logging.info(f"Managing idle NvGPU process: {IDLE_CONTAINER_ID}")
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 mrmarcus007
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -1,5 +1,5 @@
 # GPU Resource Manager & Proxy for Ollama  
-*A smart GPU-aware proxy for Proxmox VE that dynamically manages GPU resources between Ollama and GPU-intensive background processes.*
+*A smart GPU-aware proxy designed for Proxmox VE that dynamically manages GPU resources between Ollama and GPU-intensive background processes.*

 ---

@@ -66,7 +66,7 @@ GPU_CHECK_INTERVAL = 10        # Seconds between GPU checks

 # GPU Process Management
 IDLE_NvGPU_PROCESSES = ['t-rex', 'trex', 'miner', 'xmrig', 'lolminer', 'nbminer']
-KNOWN_NvidiaGPU_PROCESSES = ['Xorg']  
+KNOWN_NvGPU_PROCESSES = ['Xorg']  
 IDLE_CONTAINER_ID = "120"      # LXC container ID of idle GPU workload
 Blackout_schedule_Start = 2, 15 #when to start stopping the idle NvGPU container. Hour, Minute.
 Blackout_schedule_End = 3, 30 #when to allow starting the idle NvGPU container again. Hour, Minute.
Author	SHA1	Message	Date
mrmarcus007	8282740de4	Merge branch 'main' of https://github.com/mrmarcus007/GPU-Recource-Manager-and-proxy-for-ollama	2025-11-14 13:26:16 +01:00
mrmarcus007	1594e9ea1f	Fixed errors, added hop by hop support and status of loaded ollama model.	2025-11-14 13:25:45 +01:00
mrmarcus007	2e73c69a0d	Refine description of GPU Resource Manager	2025-11-11 18:03:07 +01:00
mrmarcus007	32499cd056	Add MIT LICENSE	2025-11-11 14:18:54 +01:00
mrmarcus007	ccd39b4f10	fixed something overlooked	2025-11-10 16:13:59 +01:00
mrmarcus007	f0f8f22a6a	make more consistent	2025-11-10 16:13:14 +01:00