# Watchdog: supervises a GPU application, restarting it when it exits,
# when GPU memory usage crosses a threshold, or when CUDA errors appear
# in its output.
import subprocess
import time
import re
import psutil
import torch
import sys
from threading import Thread, Event
from queue import Queue, Empty
from colorama import Fore, Style, init
import os
def get_gpu_memory_usage():
    """Return the percentage of the current CUDA device's memory that is allocated.

    Returns 0 when no CUDA device is available.
    """
    if not torch.cuda.is_available():
        return 0
    dev = torch.cuda.current_device()
    capacity = torch.cuda.get_device_properties(dev).total_memory
    in_use = torch.cuda.memory_allocated(dev)
    return (in_use / capacity) * 100
# Compiled once at import time instead of re-scanning the line with seven
# separate patterns on every call; the alternation matches exactly the
# same strings as the original pattern list (case-insensitive).
_CUDA_ERROR_RE = re.compile(
    r"CUDA error"
    r"|out of memory"
    r"|cudaError"
    r"|RuntimeError: CUDA"
    r"|CUDA runtime error"
    r"|CUDA out of memory"
    r"|CUDA kernel failed",
    re.IGNORECASE,
)


def check_for_cuda_errors(log_queue):
    """Drain pending lines from *log_queue*; return True if any looks like a CUDA error.

    Consumes queued lines until the queue is empty (returns False) or a line
    matches a known CUDA-error pattern (returns True; later lines stay queued).

    Parameters
    ----------
    log_queue : queue.Queue
        Queue of output lines produced by the reader thread.
    """
    try:
        while True:
            line = log_queue.get_nowait()
            if _CUDA_ERROR_RE.search(line):
                return True
    except Empty:
        # Queue drained without finding an error line.
        pass
    return False
def read_output(pipe, log_queue, print_event):
    """Forward each line from *pipe* into *log_queue*, echoing to stdout while enabled.

    Runs until the pipe reaches EOF; meant to run in a daemon thread.

    Parameters
    ----------
    pipe : text-mode file object
        The child process's combined stdout/stderr stream.
    log_queue : queue.Queue
        Receives every line read, for the watchdog's error scanning.
    print_event : threading.Event
        While set, lines are also echoed to this process's stdout.
    """
    try:
        for line in iter(pipe.readline, ''):
            log_queue.put(line)
            if print_event.is_set():
                print(line, end='', flush=True)
    except (ValueError, OSError):
        # The pipe was closed underneath us (child killed during restart).
        # Narrowed from a bare `except:` so real bugs and KeyboardInterrupt
        # are no longer silently swallowed.
        pass
def run_application(command, max_gpu_usage=90, check_interval=10):
    """Run *command* under a watchdog, restarting on exit, GPU pressure, or CUDA errors.

    Loops forever until Ctrl-C. On each cycle the child's combined
    stdout/stderr is streamed live and scanned for CUDA error patterns.

    Parameters
    ----------
    command : list[str]
        Argv-style command passed to subprocess.Popen.
    max_gpu_usage : float
        GPU-memory percentage above which the child is restarted.
    check_interval : float
        Seconds between watchdog checks.
    """
    print_event = Event()
    print_event.set()  # echo child output by default

    while True:
        print(f"\n{'=' * 40}")
        print(f"Starting application: {' '.join(command)}")
        print(f"{'=' * 40}\n")

        log_queue = Queue()
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge streams so one reader sees everything
            bufsize=1,                 # line-buffered: lines arrive promptly
            universal_newlines=True,
        )
        output_thread = Thread(
            target=read_output,
            args=(process.stdout, log_queue, print_event),
        )
        output_thread.daemon = True
        output_thread.start()

        try:
            # Watchdog loop: poll child liveness, GPU memory, and log content.
            while True:
                if process.poll() is not None:
                    print("\nApplication exited with code:", process.returncode)
                    break
                gpu_usage = get_gpu_memory_usage()
                if gpu_usage > max_gpu_usage:
                    print(f"\nGPU memory usage exceeded threshold ({gpu_usage:.1f}%)")
                    break
                if check_for_cuda_errors(log_queue):
                    print("\nCUDA error detected in application output")
                    break
                time.sleep(check_interval)

            print("\nWaiting 1.5 seconds before restart...")
            print(Fore.RED + f"{30 * '-'}")
            print(Fore.RED + "RESTARTING...")  # fixed typo ("RESTATRING")
            print(Fore.RED + f"{30 * '-'}")
            print(Fore.WHITE)
            # Best-effort audible alert; hard-coded path is deployment-specific.
            os.system("aplay /home/rog/repos/Tracker/NE-Smart-Tracker/Oxygen-Sys-Warning.wav")

            # Kill the whole process tree so no orphaned workers keep the GPU.
            try:
                if process.poll() is None:
                    parent = psutil.Process(process.pid)
                    for child in parent.children(recursive=True):
                        child.kill()
                    parent.kill()
            except psutil.NoSuchProcess:
                pass

            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            time.sleep(1.5)
        except KeyboardInterrupt:
            print("\nStopping watchdog...")
            print_event.clear()
            # Kill the child BEFORE joining the reader thread: the original
            # joined first, which could hang forever on a live pipe.
            try:
                process.kill()
            except OSError:
                pass
            output_thread.join(timeout=5)
            break
if __name__ == "__main__":
    # Watchdog configuration -- adjust for your deployment.
    APP_COMMAND = ["python", "app.py"]  # command line of the supervised app
    MAX_GPU_USAGE = 90                  # restart above this GPU-memory percentage
    CHECK_INTERVAL = 0.5                # seconds between watchdog polls
    run_application(APP_COMMAND, MAX_GPU_USAGE, CHECK_INTERVAL)