1 changed file with 136 additions and 0 deletions
watchdogs.py
@@ -0,0 +1,136 @@
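"""Watchdog: run a command, mirror its output, and restart it whenever the process
exits, GPU memory usage crosses a threshold, or a CUDA error shows up in its logs."""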
import os
import re
import subprocess
import time
from queue import Queue, Empty
from threading import Thread, Event

import psutil
import torch
from colorama import Fore, Style, init

init()  # Initialize colorama so the colored restart banner also renders on Windows consoles

def get_gpu_memory_usage():
    """Get current device-wide GPU memory usage as a percentage."""
    if torch.cuda.is_available():
        device = torch.cuda.current_device()
        # mem_get_info() reports free/total bytes for the whole device, so it also
        # sees memory held by the monitored child process; memory_allocated() would
        # only count tensors allocated by this watchdog process itself.
        free_mem, total_mem = torch.cuda.mem_get_info(device)
        return ((total_mem - free_mem) / total_mem) * 100
    return 0


def check_for_cuda_errors(log_queue):
    """Check application output for CUDA-related errors."""
    cuda_error_patterns = [
        r"CUDA error",
        r"out of memory",
        r"cudaError",
        r"RuntimeError: CUDA",
        r"CUDA runtime error",
        r"CUDA out of memory",
        r"CUDA kernel failed",
    ]

    try:
        # Drain everything queued since the last check; Empty ends the loop.
        while True:
            line = log_queue.get_nowait()
            for pattern in cuda_error_patterns:
                if re.search(pattern, line, re.IGNORECASE):
                    return True
    except Empty:
        pass
    return False


def read_output(pipe, log_queue, print_event):
    """Read output from the subprocess and distribute it."""
    try:
        for line in iter(pipe.readline, ''):
            log_queue.put(line)
            if print_event.is_set():
                print(line, end='', flush=True)
    except (ValueError, OSError):
        # The pipe was closed while we were reading; the process is gone.
        pass


def run_application(command, max_gpu_usage=90, check_interval=10):
    """Run application with watchdog functionality and live output."""
    print_event = Event()
    print_event.set()  # Enable printing by default

    while True:
        print(f"\n{'=' * 40}")
        print(f"Starting application: {' '.join(command)}")
        print(f"{'=' * 40}\n")

        log_queue = Queue()
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            bufsize=1,
            universal_newlines=True
        )

        output_thread = Thread(
            target=read_output,
            args=(process.stdout, log_queue, print_event),
            daemon=True
        )
        output_thread.start()

        try:
            while True:
                if process.poll() is not None:
                    print("\nApplication exited with code:", process.returncode)
                    break

                gpu_usage = get_gpu_memory_usage()
                if gpu_usage > max_gpu_usage:
                    print(f"\nGPU memory usage exceeded threshold ({gpu_usage:.1f}%)")
                    break

                if check_for_cuda_errors(log_queue):
                    print("\nCUDA error detected in application output")
                    break

                time.sleep(check_interval)

            print("\nWaiting 1.5 seconds before restart...")
            print(Fore.RED + "-" * 30)
            print(Fore.RED + "RESTARTING...")
            print(Fore.RED + "-" * 30)
            print(Style.RESET_ALL, end="")
            os.system("aplay /home/rog/repos/Tracker/NE-Smart-Tracker/Oxygen-Sys-Warning.wav")

            # Clean up: kill the whole process tree if it is still running
            try:
                if process.poll() is None:
                    parent = psutil.Process(process.pid)
                    for child in parent.children(recursive=True):
                        child.kill()
                    parent.kill()
            except psutil.NoSuchProcess:
                pass

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            time.sleep(1.5)

        except KeyboardInterrupt:
            print("\nStopping watchdog...")
            print_event.clear()
            # Kill the process first so the reader thread's pipe closes and join() can return
            try:
                process.kill()
            except OSError:
                pass
            output_thread.join(timeout=5)
            break


if __name__ == "__main__":
    # Configure these parameters
    APP_COMMAND = ["python", "app.py"]  # Your application command
    MAX_GPU_USAGE = 90                  # Percentage threshold for GPU memory usage
    CHECK_INTERVAL = 0.5                # Seconds between checks

    run_application(APP_COMMAND, MAX_GPU_USAGE, CHECK_INTERVAL)
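    # A hypothetical alternative invocation (command, threshold, and interval are
    # illustrative only), e.g. wrapping a training script and polling once per second:
    # run_application(["python", "train.py"], max_gpu_usage=85, check_interval=1.0)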