import os
import re
import subprocess
import time
from queue import Empty, Queue
from threading import Event, Thread

import psutil
import torch
from colorama import Fore, Style, init

init()  # initialize colorama (required for ANSI colors on Windows)

ALERT_SOUND = "/home/rog/repos/Tracker/NE-Smart-Tracker/Oxygen-Sys-Warning.wav"


def get_gpu_memory_usage():
    """Return device-wide GPU memory usage as a percentage.

    torch.cuda.memory_allocated() only reports allocations made by *this*
    process, which would always read ~0 here because the monitored app runs
    in a subprocess. mem_get_info() reports free/total for the whole device.
    """
    if torch.cuda.is_available():
        free_mem, total_mem = torch.cuda.mem_get_info(torch.cuda.current_device())
        return (total_mem - free_mem) / total_mem * 100
    return 0


def check_for_cuda_errors(log_queue):
    """Scan queued application output for CUDA-related error messages."""
    cuda_error_patterns = [
        r"CUDA error",
        r"out of memory",
        r"cudaError",
        r"RuntimeError: CUDA",
        r"CUDA runtime error",
        r"CUDA out of memory",
        r"CUDA kernel failed",
    ]
    try:
        while True:
            line = log_queue.get_nowait()
            for pattern in cuda_error_patterns:
                if re.search(pattern, line, re.IGNORECASE):
                    return True
    except Empty:
        pass
    return False


def read_output(pipe, log_queue, print_event):
    """Read subprocess output line by line, queue it, and optionally echo it."""
    try:
        for line in iter(pipe.readline, ''):
            log_queue.put(line)
            if print_event.is_set():
                print(line, end='', flush=True)
    except (ValueError, OSError):
        # The pipe was closed mid-read; the process is being torn down.
        pass


def kill_process_tree(process):
    """Kill the monitored process and all of its children."""
    try:
        if process.poll() is None:
            parent = psutil.Process(process.pid)
            for child in parent.children(recursive=True):
                child.kill()
            parent.kill()
    except psutil.NoSuchProcess:
        pass


def run_application(command, max_gpu_usage=90, check_interval=10):
    """Run an application under a watchdog, with live output.

    The application is restarted whenever it exits, device GPU memory usage
    exceeds max_gpu_usage percent, or a CUDA error appears in its output.
    """
    print_event = Event()
    print_event.set()  # echo application output by default

    while True:
        print(f"\n{'=' * 40}")
        print(f"Starting application: {' '.join(command)}")
        print(f"{'=' * 40}\n")

        log_queue = Queue()
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            bufsize=1,
            universal_newlines=True,
        )

        output_thread = Thread(
            target=read_output,
            args=(process.stdout, log_queue, print_event),
            daemon=True,
        )
        output_thread.start()

        try:
            while True:
                if process.poll() is not None:
                    print("\nApplication exited with code:", process.returncode)
                    break

                gpu_usage = get_gpu_memory_usage()
                if gpu_usage > max_gpu_usage:
                    print(f"\nGPU memory usage exceeded threshold ({gpu_usage:.1f}%)")
                    break

                if check_for_cuda_errors(log_queue):
                    print("\nCUDA error detected in application output")
                    break

                time.sleep(check_interval)

            print("\nWaiting 1.5 seconds before restart...")
            print(Fore.RED + '-' * 30)
            print(Fore.RED + "RESTARTING...")
            print(Fore.RED + '-' * 30)
            print(Style.RESET_ALL)
            os.system(f"aplay {ALERT_SOUND}")

            # Clean up the old process tree and cached GPU memory before restarting.
            kill_process_tree(process)
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            time.sleep(1.5)

        except KeyboardInterrupt:
            print("\nStopping watchdog...")
            print_event.clear()
            # Kill the process first so readline() unblocks and the reader thread exits.
            kill_process_tree(process)
            output_thread.join(timeout=5)
            break


if __name__ == "__main__":
    # Configure these parameters
    APP_COMMAND = ["python", "app.py"]  # Your application command
    MAX_GPU_USAGE = 90                  # Percentage threshold for GPU memory usage
    CHECK_INTERVAL = 0.5                # Seconds between checks

    run_application(APP_COMMAND, MAX_GPU_USAGE, CHECK_INTERVAL)
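
# ---------------------------------------------------------------------------
# Optional alternative: a torch-free device-wide memory check. This is a
# minimal sketch, assuming `nvidia-smi` is on PATH; get_gpu_memory_usage_smi
# is a hypothetical helper, not part of the watchdog above. It could replace
# get_gpu_memory_usage() if you prefer not to import torch into the watchdog
# process itself.
#
# def get_gpu_memory_usage_smi(gpu_index=0):
#     """Device-wide memory usage (%) for one GPU, queried via nvidia-smi."""
#     out = subprocess.check_output(
#         ["nvidia-smi", "--query-gpu=memory.used,memory.total",
#          "--format=csv,noheader,nounits"],
#         text=True,
#     )
#     # One line per GPU, e.g. "1234, 24576" (MiB); pick the requested device.
#     used, total = (float(x) for x in out.splitlines()[gpu_index].split(","))
#     return used / total * 100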