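"""Watchdog for a GPU application.

Runs the configured command as a subprocess, mirrors its output live, and
restarts it whenever the process exits, GPU memory usage crosses a
threshold, or a CUDA error shows up in the output.
"""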
import os
import re
import subprocess
import time
from queue import Queue, Empty
from threading import Thread, Event

import psutil
import torch
from colorama import Fore, Style, init

init()  # colorama: make the ANSI color codes below work on Windows too


def get_gpu_memory_usage():
    """Get current device-wide GPU memory usage percentage."""
    if torch.cuda.is_available():
        device = torch.cuda.current_device()
        # torch.cuda.memory_allocated() only counts this process's PyTorch
        # allocations and would stay near zero while the app runs in a
        # subprocess, so query device-wide usage (cudaMemGetInfo) instead.
        free_mem, total_mem = torch.cuda.mem_get_info(device)
        return (1 - free_mem / total_mem) * 100
    return 0


def check_for_cuda_errors(log_queue):
    """Check application output for CUDA-related errors."""
    cuda_error_patterns = [
        r"CUDA error",
        r"out of memory",
        r"cudaError",
        r"RuntimeError: CUDA",
        r"CUDA runtime error",
        r"CUDA out of memory",
        r"CUDA kernel failed",
    ]

    try:
        # Drain everything queued since the last check (consumes the lines).
        while True:
            line = log_queue.get_nowait()
            for pattern in cuda_error_patterns:
                if re.search(pattern, line, re.IGNORECASE):
                    return True
    except Empty:
        pass
    return False


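# readline() blocks, so output is pumped on a separate daemon thread: every
# line goes into a queue for error scanning and, optionally, to the console.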
def read_output(pipe, log_queue, print_event):
    """Read output from subprocess and distribute it."""
    try:
        for line in iter(pipe.readline, ''):
            log_queue.put(line)
            if print_event.is_set():
                print(line, end='', flush=True)
    except Exception:
        pass  # pipe closed, e.g. after the process was killed


def run_application(command, max_gpu_usage=90, check_interval=10):
    """Run application with watchdog functionality and live output."""
    print_event = Event()
    print_event.set()  # Enable printing by default

    while True:
        print(f"\n{'=' * 40}")
        print(f"Starting application: {' '.join(command)}")
        print(f"{'=' * 40}\n")

        log_queue = Queue()
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge stderr into stdout
            bufsize=1,                 # line-buffered (text mode)
            universal_newlines=True
        )

        output_thread = Thread(
            target=read_output,
            args=(process.stdout, log_queue, print_event)
        )
        output_thread.daemon = True
        output_thread.start()

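        # Poll until the app exits, GPU memory crosses the threshold, or a
        # CUDA error appears in the output; any of these triggers a restart.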
        try:
            while True:
                if process.poll() is not None:
                    print("\nApplication exited with code:", process.returncode)
                    break

                gpu_usage = get_gpu_memory_usage()
                if gpu_usage > max_gpu_usage:
                    print(f"\nGPU memory usage exceeded threshold ({gpu_usage:.1f}%)")
                    break

                if check_for_cuda_errors(log_queue):
                    print("\nCUDA error detected in application output")
                    break

                time.sleep(check_interval)

            print("\nWaiting 1.5 seconds before restart...")
            print(Fore.RED + f"{30 * '-'}")
            print(Fore.RED + "RESTARTING...")
            print(Fore.RED + f"{30 * '-'}")
            print(Style.RESET_ALL)
            os.system("aplay /home/rog/repos/Tracker/NE-Smart-Tracker/Oxygen-Sys-Warning.wav")
            # Clean up: kill the whole process tree, since the app may have
            # spawned children of its own.
            try:
                if process.poll() is None:
                    parent = psutil.Process(process.pid)
                    for child in parent.children(recursive=True):
                        child.kill()
                    parent.kill()
            except psutil.NoSuchProcess:
                pass

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            time.sleep(1.5)

        except KeyboardInterrupt:
            print("\nStopping watchdog...")
            print_event.clear()
            try:
                process.kill()  # kill first so the reader thread sees EOF
            except Exception:
                pass
            output_thread.join()
            break


if __name__ == "__main__":
    # Configure these parameters
    APP_COMMAND = ["python", "app.py"]  # Your application command
    MAX_GPU_USAGE = 90                  # Percentage threshold for GPU memory usage
    CHECK_INTERVAL = 0.5                # Seconds between checks

    run_application(APP_COMMAND, MAX_GPU_USAGE, CHECK_INTERVAL)