1 changed file with 136 additions and 0 deletions
watchdogs.py
@@ -0,0 +1,136 @@
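"""Watchdog that launches a command, mirrors its output live, and restarts it
whenever the process exits, GPU memory use crosses a threshold, or a CUDA
error appears in the output."""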
import os
import re
import subprocess
import time
from queue import Queue, Empty
from threading import Thread, Event

import psutil
import torch
from colorama import Fore, init

init()  # initialize colorama so the Fore color codes below work on all platforms


def get_gpu_memory_usage():
    """Get current GPU memory usage as a percentage of total device memory."""
    if torch.cuda.is_available():
        # mem_get_info() reports device-wide (free, total) bytes, so memory used
        # by the monitored child process is counted; memory_allocated() would only
        # track tensors allocated inside this watchdog process and always read ~0.
        free_mem, total_mem = torch.cuda.mem_get_info(torch.cuda.current_device())
        return ((total_mem - free_mem) / total_mem) * 100
    return 0


def check_for_cuda_errors(log_queue):
    """Check application output for CUDA-related errors."""
    cuda_error_patterns = [
        r"CUDA error",
        r"out of memory",
        r"cudaError",
        r"RuntimeError: CUDA",
        r"CUDA runtime error",
        r"CUDA out of memory",
        r"CUDA kernel failed",
    ]

    try:
        while True:
            line = log_queue.get_nowait()
            for pattern in cuda_error_patterns:
                if re.search(pattern, line, re.IGNORECASE):
                    return True
    except Empty:
        pass
    return False


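# read_output() runs on its own thread because readline() blocks: a dedicated
# reader keeps the monitoring loop responsive while each line is both echoed
# live and queued for CUDA-error scanning.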
def read_output(pipe, log_queue, print_event):
    """Read output from subprocess and distribute it."""
    try:
        for line in iter(pipe.readline, ''):
            log_queue.put(line)
            if print_event.is_set():
                print(line, end='', flush=True)
    except ValueError:
        pass  # pipe was closed mid-read, e.g. after the process was killed


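# run_application() is the supervisor loop: launch the command, watch for an
# exit, GPU memory pressure, or CUDA errors, then kill the process tree and
# relaunch. Ctrl+C is the intended way to stop the watchdog.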
def run_application(command, max_gpu_usage=90, check_interval=10):
    """Run application with watchdog functionality and live output."""
    print_event = Event()
    print_event.set()  # Enable printing by default

    while True:
        print(f"\n{'=' * 40}")
        print(f"Starting application: {' '.join(command)}")
        print(f"{'=' * 40}\n")

        log_queue = Queue()
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge stderr so CUDA errors are scanned too
            bufsize=1,                 # line-buffered; valid because text mode is on
            universal_newlines=True,
        )

        # Daemon thread: a readline() blocked on a dead pipe can never
        # prevent the interpreter from exiting
        output_thread = Thread(
            target=read_output,
            args=(process.stdout, log_queue, print_event),
        )
        output_thread.daemon = True
        output_thread.start()

        try:
            while True:
                if process.poll() is not None:
                    print("\nApplication exited with code:", process.returncode)
                    break

                gpu_usage = get_gpu_memory_usage()
                if gpu_usage > max_gpu_usage:
                    print(f"\nGPU memory usage exceeded threshold ({gpu_usage:.1f}%)")
                    break

                if check_for_cuda_errors(log_queue):
                    print("\nCUDA error detected in application output")
                    break

                time.sleep(check_interval)

print("\nWaiting 1.5 seconds before restart...") |
||||
|
print(Fore.RED + f"{30 * '-'}") |
||||
|
print(Fore.RED + "RESTATRING...") |
||||
|
print(Fore.RED + f"{30 * '-'}") |
||||
|
print(Fore.WHITE) |
||||
|
os.system("aplay /home/rog/repos/Tracker/NE-Smart-Tracker/Oxygen-Sys-Warning.wav") |
||||
|
# Clean up |
||||
|
try: |
||||
|
if process.poll() is None: |
||||
|
parent = psutil.Process(process.pid) |
||||
|
for child in parent.children(recursive=True): |
||||
|
child.kill() |
||||
|
parent.kill() |
||||
|
except psutil.NoSuchProcess: |
||||
|
pass |
||||
|
|
||||
|
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            time.sleep(1.5)

        except KeyboardInterrupt:
            print("\nStopping watchdog...")
            print_event.clear()
            # Kill the process before joining the reader thread; otherwise
            # join() can block forever on a readline() that never returns
            try:
                process.kill()
            except OSError:
                pass
            output_thread.join()
            break


if __name__ == "__main__":
    # Configure these parameters
    APP_COMMAND = ["python", "app.py"]  # Your application command
    MAX_GPU_USAGE = 90    # Percentage threshold for GPU memory usage
    CHECK_INTERVAL = 0.5  # Seconds between checks (overrides the function's 10 s default)

    run_application(APP_COMMAND, MAX_GPU_USAGE, CHECK_INTERVAL)
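# A typical run, assuming app.py next to this script is the program to supervise:
#   $ python watchdogs.py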