Fixed errors, added hop-by-hop header support and status of the loaded Ollama model.

This commit is contained in:
mrmarcus007
2025-11-14 13:25:45 +01:00
parent 32499cd056
commit 1594e9ea1f
@@ -9,23 +9,26 @@ import subprocess
from datetime import datetime, time as dt_time
from urllib.parse import urlparse, parse_qs
import logging
import socket
# Configuration
OLLAMA_HOST = ""  # Your Ollama LXC IP or hostname; must be filled in (left empty, the base URL has no host)
OLLAMA_PORT = 11434  # Your Ollama LXC port
PROXY_PORT = 11435  # Port this proxy listens on
OLLAMA_BASE_URL = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}"  # Derived from the two values above; don't touch unless you know what you are doing.

# GPU monitoring
GPU_CHECK_INTERVAL = 10  # Seconds it waits between checks for processes other than the known/compute processes

# Process name patterns (from your nvidia-smi output)
IDLE_NvGPU_PROCESSES = ['t-rex', 'trex', 'miner', 'xmrig', 'lolminer', 'nbminer']
KNOWN_NvGPU_PROCESSES = ['Xorg']  # Processes that are allowed while the "idle" compute process is running
IDLE_CONTAINER_ID = ""  # ID of the running idle-GPU container, example: IDLE_CONTAINER_ID = "120"

# Maintenance window (dt_time objects, compared against datetime.now().time())
Blackout_schedule_Start = dt_time(2, 15)  # 2:15 AM - when to start stopping the idle NvGPU container
Blackout_schedule_End = dt_time(3, 30)  # 3:30 AM - when the idle NvGPU container may be started again
# ----------------------------------------------------------active-code--------------------------------------------------------------#
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
@@ -109,7 +112,9 @@ class GPUResourceManager:
def get_gpu_processes(self):
try:
output = self.run_command("nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits")
output = self.run_command(
"nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits"
)
processes = []
if output:
for line in output.split('\n'):
@@ -143,7 +148,7 @@ class GPUResourceManager:
for process in processes:
if not self.is_compute_process(process['name']) and not self.is_known_system_process(process['name']):
memory_usage = int(process['memory'].split()[0])
if memory_usage > 100: # More than 100MB is significant
if memory_usage > 100: #MB
non_mining_processes.append(process)
is_idle = len(non_mining_processes) == 0
@@ -172,8 +177,8 @@ class GPUResourceManager:
def should_stop_for_schedule(self):
now = datetime.now().time()
stop_start = dt_time(Blackout_schedule_Start) # 2:15 AM
stop_end = dt_time(Blackout_schedule_End) # 3:30 AM
stop_start = Blackout_schedule_Start
stop_end = Blackout_schedule_End
in_window = stop_start <= now <= stop_end
if in_window:
logging.debug("Within scheduled maintenance window (2:15am-3:30am)")
@@ -196,7 +201,6 @@ class GPUResourceManager:
self.idle_compute_running = False
return
ollama_still_active = self.is_ollama_still_active()
if ollama_still_active:
if current_idle_NvGPU_process_state or mining_active_on_gpu:
@@ -206,7 +210,6 @@ class GPUResourceManager:
self.idle_compute_running = False
return
else:
if self.ollama_active:
self.ollama_active = False
logging.info("Ollama activity timeout reached")
@@ -263,6 +266,7 @@ class GPUResourceManager:
monitor_thread.start()
logging.info("GPU monitoring thread started")
class OllamaProxyHandler(http.server.SimpleHTTPRequestHandler):
def __init__(self, *args, **kwargs):
self.gpu_manager = kwargs.pop('gpu_manager')
@@ -272,15 +276,25 @@ class OllamaProxyHandler(http.server.SimpleHTTPRequestHandler):
if self._is_gpu_intensive_operation(self.path, {}):
self.gpu_manager.force_stop_idle_NvGPU_process_for_ollama()
time.sleep(2)
self._forward_request('GET')
def do_HEAD(self):
    """Handle an HTTP HEAD request by relaying it to Ollama.

    When the requested path is classified as GPU-intensive, the idle GPU
    workload is stopped first and the handler pauses for two seconds
    before the request is forwarded.
    """
    needs_gpu = self._is_gpu_intensive_operation(self.path, {})
    if needs_gpu:
        manager = self.gpu_manager
        manager.force_stop_idle_NvGPU_process_for_ollama()
        # Fixed pause after the stop request (presumably so the GPU is
        # released before Ollama is contacted).
        time.sleep(2)

    self._forward_request('HEAD')
def do_POST(self):
content_length = int(self.headers.get('Content-Length', 0))
post_data = self.rfile.read(content_length) if content_length > 0 else b''
try:
request_data = {}
try:
request_data = json.loads(post_data.decode('utf-8')) if post_data else {}
except Exception:
request_data = {}
is_gpu_intensive = self._is_gpu_intensive_operation(self.path, request_data)
if is_gpu_intensive:
@@ -311,31 +325,75 @@ class OllamaProxyHandler(http.server.SimpleHTTPRequestHandler):
def _forward_request(self, method, data=None):
    """Relay the current request to Ollama and stream the reply to the client.

    Args:
        method: HTTP method to use upstream (e.g. 'GET', 'HEAD', 'POST').
        data: Raw request body; only sent for methods other than GET/HEAD.

    Upstream failures (``requests.exceptions.RequestException``) are logged
    and reported to the client: as a 502 response if nothing has been sent
    yet, otherwise as a trailing text marker appended to the partial body.
    They are not raised to the caller.
    """
    url = f"{OLLAMA_BASE_URL}{self.path}"

    # Hop-by-hop headers describe a single connection and must not be relayed.
    hop_headers = {
        'connection', 'keep-alive', 'proxy-authenticate',
        'proxy-authorization', 'te', 'trailers', 'upgrade',
        'transfer-encoding',
    }

    # Host is dropped too (case-insensitively): it names this proxy, and
    # requests fills in the right value from the upstream URL.
    headers = {
        key: value
        for key, value in self.headers.items()
        if key.lower() not in hop_headers and key.lower() != 'host'
    }

    # 10 s to connect, no read timeout, so long-running streamed replies
    # are not cut off by the proxy.
    timeout = (10, None)

    request_kwargs = {'headers': headers, 'stream': True, 'timeout': timeout}
    if method.upper() not in ('GET', 'HEAD'):
        request_kwargs['data'] = data

    # Tracks whether the status line and headers already went to the client,
    # which decides how an upstream failure can still be reported.
    headers_sent = False
    try:
        resp = requests.request(method, url, **request_kwargs)
        try:
            # iter_content() yields the *decoded* body, so an upstream
            # Content-Encoding (and the Content-Length of the encoded bytes)
            # would no longer describe what is relayed. HEAD has no body,
            # so its headers can pass through untouched.
            skip_headers = set(hop_headers)
            if method.upper() != 'HEAD' and resp.headers.get('Content-Encoding'):
                skip_headers.update(('content-encoding', 'content-length'))

            self.send_response(resp.status_code)
            for key, value in resp.headers.items():
                if key.lower() in skip_headers:
                    continue
                try:
                    self.send_header(key, value)
                except Exception:
                    # Best effort: one unsendable header must not abort the reply.
                    logging.debug(f"Skipping header {key} due to send_header error")
            self.end_headers()
            headers_sent = True

            for chunk in resp.iter_content(chunk_size=4096):
                if not chunk:
                    continue
                try:
                    self.wfile.write(chunk)
                    # Flush per chunk so streamed tokens reach the client promptly.
                    self.wfile.flush()
                except OSError as e:
                    # Covers BrokenPipeError / ConnectionResetError / socket.error.
                    logging.info(f"Client connection closed during streaming: {e}")
                    break
        finally:
            # Always release the upstream connection, even if sending the
            # status line or headers to the client failed.
            resp.close()
    except requests.exceptions.RequestException as e:
        logging.error(f"Error forwarding to Ollama: {e}")
        try:
            if headers_sent:
                # Too late for a status code; append a marker to the body.
                err_msg = f"\n\n[proxy error] upstream request failed: {e}\n"
                self.wfile.write(err_msg.encode('utf-8'))
                self.wfile.flush()
            else:
                self.send_error(502, f"Bad gateway: {e}")
        except Exception as report_err:
            # The client is most likely gone; reporting is best effort only.
            logging.debug(f"Could not report upstream failure to client: {report_err}")
def log_message(self, format, *args):
    """Send http.server's per-request log line through the logging module.

    Args:
        format: %-style format string supplied by the base handler.
        *args: Values interpolated into ``format``.
    """
    client = self.address_string()
    rendered = format % args
    logging.info(f"{client} - {rendered}")
class ThreadedTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
    """TCP server that serves every incoming connection on its own thread."""

    # Permit rebinding the listening address immediately after a restart.
    allow_reuse_address = True

    # Handler threads are daemonic, so they do not block interpreter exit.
    daemon_threads = True
def main():
logging.info("Starting GPU Resource Manager Proxy on Proxmox Host")
@@ -345,7 +403,6 @@ def main():
logging.info("pct command available")
else:
logging.error("pct command not available! Running on wrong system?")
return
gpu_processes = manager.get_gpu_processes()
logging.info(f"Current GPU processes: {gpu_processes}")
@@ -354,12 +411,11 @@ def main():
logging.info(f"idle NvGPU {IDLE_CONTAINER_ID} running: {idle_NvGPU_process_status}")
gpu_manager = GPUResourceManager()
gpu_manager.start_monitoring()
handler = lambda *args, **kwargs: OllamaProxyHandler(*args, gpu_manager=gpu_manager, **kwargs)
with socketserver.TCPServer(("", PROXY_PORT), handler) as httpd:
with ThreadedTCPServer(("", PROXY_PORT), handler) as httpd:
logging.info(f"Proxy server running on port {PROXY_PORT}")
logging.info(f"Forwarding to Ollama at {OLLAMA_HOST}:{OLLAMA_PORT}")
logging.info(f"Managing idle NvGPU process: {IDLE_CONTAINER_ID}")