# Watchdog: supervises a GPU application, restarting it when it exits,
# when GPU memory usage crosses a threshold, or when CUDA errors appear
# in its output.
import subprocess
import time
import re
import psutil
import torch
import sys
from threading import Thread, Event
from queue import Queue, Empty
from colorama import Fore, Style, init
import os
def get_gpu_memory_usage():
    """Return the percentage of the current CUDA device's memory that is allocated.

    Returns 0 when no CUDA device is available.
    """
    if not torch.cuda.is_available():
        return 0
    dev = torch.cuda.current_device()
    capacity = torch.cuda.get_device_properties(dev).total_memory
    in_use = torch.cuda.memory_allocated(dev)
    return (in_use / capacity) * 100
# Compiled once at import time instead of re-scanning the line with seven
# separate patterns on every call; the alternation matches exactly the
# same strings as the original pattern list (case-insensitive).
_CUDA_ERROR_RE = re.compile(
    r"CUDA error"
    r"|out of memory"
    r"|cudaError"
    r"|RuntimeError: CUDA"
    r"|CUDA runtime error"
    r"|CUDA out of memory"
    r"|CUDA kernel failed",
    re.IGNORECASE,
)


def check_for_cuda_errors(log_queue):
    """Drain pending lines from *log_queue*; return True if any looks like a CUDA error.

    Consumes queued lines until the queue is empty (returns False) or a line
    matches a known CUDA-error pattern (returns True; later lines stay queued).

    Parameters
    ----------
    log_queue : queue.Queue
        Queue of output lines produced by the reader thread.
    """
    try:
        while True:
            line = log_queue.get_nowait()
            if _CUDA_ERROR_RE.search(line):
                return True
    except Empty:
        # Queue drained without finding an error line.
        pass
    return False
def read_output(pipe, log_queue, print_event):
    """Forward each line from *pipe* into *log_queue*, echoing to stdout while enabled.

    Runs until the pipe reaches EOF; meant to run in a daemon thread.

    Parameters
    ----------
    pipe : text-mode file object
        The child process's combined stdout/stderr stream.
    log_queue : queue.Queue
        Receives every line read, for the watchdog's error scanning.
    print_event : threading.Event
        While set, lines are also echoed to this process's stdout.
    """
    try:
        for line in iter(pipe.readline, ''):
            log_queue.put(line)
            if print_event.is_set():
                print(line, end='', flush=True)
    except (ValueError, OSError):
        # The pipe was closed underneath us (child killed during restart).
        # Narrowed from a bare `except:` so real bugs and KeyboardInterrupt
        # are no longer silently swallowed.
        pass
def run_application(command, max_gpu_usage=90, check_interval=10):
    """Run *command* under a watchdog, restarting on exit, GPU pressure, or CUDA errors.

    Loops forever until Ctrl-C. On each cycle the child's combined
    stdout/stderr is streamed live and scanned for CUDA error patterns.

    Parameters
    ----------
    command : list[str]
        Argv-style command passed to subprocess.Popen.
    max_gpu_usage : float
        GPU-memory percentage above which the child is restarted.
    check_interval : float
        Seconds between watchdog checks.
    """
    print_event = Event()
    print_event.set()  # echo child output by default

    while True:
        print(f"\n{'=' * 40}")
        print(f"Starting application: {' '.join(command)}")
        print(f"{'=' * 40}\n")

        log_queue = Queue()
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge streams so one reader sees everything
            bufsize=1,                 # line-buffered: lines arrive promptly
            universal_newlines=True,
        )
        output_thread = Thread(
            target=read_output,
            args=(process.stdout, log_queue, print_event),
        )
        output_thread.daemon = True
        output_thread.start()

        try:
            # Watchdog loop: poll child liveness, GPU memory, and log content.
            while True:
                if process.poll() is not None:
                    print("\nApplication exited with code:", process.returncode)
                    break
                gpu_usage = get_gpu_memory_usage()
                if gpu_usage > max_gpu_usage:
                    print(f"\nGPU memory usage exceeded threshold ({gpu_usage:.1f}%)")
                    break
                if check_for_cuda_errors(log_queue):
                    print("\nCUDA error detected in application output")
                    break
                time.sleep(check_interval)

            print("\nWaiting 1.5 seconds before restart...")
            print(Fore.RED + f"{30 * '-'}")
            print(Fore.RED + "RESTARTING...")  # fixed typo ("RESTATRING")
            print(Fore.RED + f"{30 * '-'}")
            print(Fore.WHITE)
            # Best-effort audible alert; hard-coded path is deployment-specific.
            os.system("aplay /home/rog/repos/Tracker/NE-Smart-Tracker/Oxygen-Sys-Warning.wav")

            # Kill the whole process tree so no orphaned workers keep the GPU.
            try:
                if process.poll() is None:
                    parent = psutil.Process(process.pid)
                    for child in parent.children(recursive=True):
                        child.kill()
                    parent.kill()
            except psutil.NoSuchProcess:
                pass

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            time.sleep(1.5)
        except KeyboardInterrupt:
            print("\nStopping watchdog...")
            print_event.clear()
            # Kill the child BEFORE joining the reader thread: the original
            # joined first, which could hang forever on a live pipe.
            try:
                process.kill()
            except OSError:
                pass
            output_thread.join(timeout=5)
            break
if __name__ == "__main__":
    # Watchdog configuration -- adjust for your deployment.
    APP_COMMAND = ["python", "app.py"]  # command line of the supervised app
    MAX_GPU_USAGE = 90                  # restart above this GPU-memory percentage
    CHECK_INTERVAL = 0.5                # seconds between watchdog polls
    run_application(APP_COMMAND, MAX_GPU_USAGE, CHECK_INTERVAL)