diff --git a/watchdogs.py b/watchdogs.py
new file mode 100644
index 0000000..73021f7
--- /dev/null
+++ b/watchdogs.py
@@ -0,0 +1,144 @@
+import subprocess
+import time
+import re
+import os
+import psutil
+import torch
+from threading import Thread, Event
+from queue import Queue, Empty
+from colorama import Fore, Style, init
+
+init()  # Initialize colorama so ANSI colors also work on Windows terminals
+
+
+def get_gpu_memory_usage():
+    """Get current device-wide GPU memory usage as a percentage."""
+    if torch.cuda.is_available():
+        device = torch.cuda.current_device()
+        # mem_get_info reports free/total memory for the whole device, so it
+        # also reflects memory held by the monitored child process.
+        free_mem, total_mem = torch.cuda.mem_get_info(device)
+        return (1 - free_mem / total_mem) * 100
+    return 0
+
+
+def check_for_cuda_errors(log_queue):
+    """Check application output for CUDA-related errors."""
+    cuda_error_patterns = [
+        r"CUDA error",
+        r"out of memory",
+        r"cudaError",
+        r"RuntimeError: CUDA",
+        r"CUDA runtime error",
+        r"CUDA out of memory",
+        r"CUDA kernel failed",
+    ]
+
+    try:
+        while True:
+            line = log_queue.get_nowait()
+            for pattern in cuda_error_patterns:
+                if re.search(pattern, line, re.IGNORECASE):
+                    return True
+    except Empty:
+        pass
+    return False
+
+
+def read_output(pipe, log_queue, print_event):
+    """Read output from the subprocess, queue it, and optionally echo it."""
+    try:
+        for line in iter(pipe.readline, ''):
+            log_queue.put(line)
+            if print_event.is_set():
+                print(line, end='', flush=True)
+    except (ValueError, OSError):
+        # The pipe was closed while the process was being torn down.
+        pass
+
+
+def run_application(command, max_gpu_usage=90, check_interval=10):
+    """Run the application with watchdog functionality and live output."""
+    print_event = Event()
+    print_event.set()  # Enable printing by default
+
+    while True:
+        print(f"\n{'=' * 40}")
+        print(f"Starting application: {' '.join(command)}")
+        print(f"{'=' * 40}\n")
+
+        log_queue = Queue()
+        process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            bufsize=1,
+            universal_newlines=True,
+        )
+
+        output_thread = Thread(
+            target=read_output,
+            args=(process.stdout, log_queue, print_event),
+        )
+        output_thread.daemon = True
+        output_thread.start()
+
+        try:
+            while True:
+                if process.poll() is not None:
+                    print("\nApplication exited with code:", process.returncode)
+                    break
+
+                gpu_usage = get_gpu_memory_usage()
+                if gpu_usage > max_gpu_usage:
+                    print(f"\nGPU memory usage exceeded threshold ({gpu_usage:.1f}%)")
+                    break
+
+                if check_for_cuda_errors(log_queue):
+                    print("\nCUDA error detected in application output")
+                    break
+
+                time.sleep(check_interval)
+
+            print("\nWaiting 1.5 seconds before restart...")
+            print(Fore.RED + f"{30 * '-'}")
+            print(Fore.RED + "RESTARTING...")
+            print(Fore.RED + f"{30 * '-'}")
+            print(Style.RESET_ALL)
+            os.system("aplay /home/rog/repos/Tracker/NE-Smart-Tracker/Oxygen-Sys-Warning.wav")
+
+            # Clean up: kill the child process and any grandchildren it spawned
+            try:
+                if process.poll() is None:
+                    parent = psutil.Process(process.pid)
+                    for child in parent.children(recursive=True):
+                        child.kill()
+                    parent.kill()
+            except psutil.NoSuchProcess:
+                pass
+
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+            time.sleep(1.5)
+
+        except KeyboardInterrupt:
+            print("\nStopping watchdog...")
+            print_event.clear()
+            try:
+                process.kill()
+            except OSError:
+                pass
+            # Kill the process before joining, otherwise readline() in the
+            # reader thread can block forever and the join never returns.
+            output_thread.join(timeout=5)
+            break
+
+
+if __name__ == "__main__":
+    # Configure these parameters
+    APP_COMMAND = ["python", "app.py"]  # Your application command
+    MAX_GPU_USAGE = 90  # Percentage threshold for GPU memory usage
+    CHECK_INTERVAL = 0.5  # Seconds between checks
+
+    run_application(APP_COMMAND, MAX_GPU_USAGE, CHECK_INTERVAL)
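
For reviewers who want to try the watchdog without editing the __main__ block, a minimal sketch of driving it from another script (the module name watchdogs matches the added file; the threshold and interval values below are illustrative, not taken from the diff):

    from watchdogs import run_application

    # Restart the app when device memory crosses 85%, polling once per second.
    run_application(
        ["python", "app.py"],
        max_gpu_usage=85,
        check_interval=1.0,
    )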