1 changed file with 136 additions and 0 deletions
watchdogs.py
@@ -0,0 +1,136 @@
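"""Watchdog that launches a command, mirrors its output live, and restarts it
whenever the process exits, GPU memory use crosses a threshold, or a CUDA
error appears in the output."""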
import os
import re
import subprocess
import time
from queue import Queue, Empty
from threading import Thread, Event

import psutil
import torch
from colorama import Fore, init

init()  # initialize colorama so the Fore color codes below work on all platforms


def get_gpu_memory_usage():
    """Get current GPU memory usage as a percentage of total device memory."""
    if torch.cuda.is_available():
        # mem_get_info() reports device-wide (free, total) bytes, so memory used
        # by the monitored child process is counted; memory_allocated() would only
        # track tensors allocated inside this watchdog process and always read ~0.
        free_mem, total_mem = torch.cuda.mem_get_info(torch.cuda.current_device())
        return ((total_mem - free_mem) / total_mem) * 100
    return 0


def check_for_cuda_errors(log_queue):
    """Check application output for CUDA-related errors."""
    cuda_error_patterns = [
        r"CUDA error",
        r"out of memory",
        r"cudaError",
        r"RuntimeError: CUDA",
        r"CUDA runtime error",
        r"CUDA out of memory",
        r"CUDA kernel failed",
    ]

    try:
        while True:
            line = log_queue.get_nowait()
            for pattern in cuda_error_patterns:
                if re.search(pattern, line, re.IGNORECASE):
                    return True
    except Empty:
        pass
    return False


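# read_output() runs on its own thread because readline() blocks: a dedicated
# reader keeps the monitoring loop responsive while each line is both echoed
# live and queued for CUDA-error scanning.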
def read_output(pipe, log_queue, print_event):
    """Read output from subprocess and distribute it."""
    try:
        for line in iter(pipe.readline, ''):
            log_queue.put(line)
            if print_event.is_set():
                print(line, end='', flush=True)
    except ValueError:
        pass  # pipe was closed mid-read, e.g. after the process was killed


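# run_application() is the supervisor loop: launch the command, watch for an
# exit, GPU memory pressure, or CUDA errors, then kill the process tree and
# relaunch. Ctrl+C is the intended way to stop the watchdog.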
def run_application(command, max_gpu_usage=90, check_interval=10):
    """Run application with watchdog functionality and live output."""
    print_event = Event()
    print_event.set()  # Enable printing by default

    while True:
        print(f"\n{'=' * 40}")
        print(f"Starting application: {' '.join(command)}")
        print(f"{'=' * 40}\n")

        log_queue = Queue()
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge stderr so CUDA errors are scanned too
            bufsize=1,                 # line-buffered; valid because text mode is on
            universal_newlines=True,
        )

        # Daemon thread: a readline() blocked on a dead pipe can never
        # prevent the interpreter from exiting
        output_thread = Thread(
            target=read_output,
            args=(process.stdout, log_queue, print_event),
        )
        output_thread.daemon = True
        output_thread.start()

        try:
            while True:
                if process.poll() is not None:
                    print("\nApplication exited with code:", process.returncode)
                    break

                gpu_usage = get_gpu_memory_usage()
                if gpu_usage > max_gpu_usage:
                    print(f"\nGPU memory usage exceeded threshold ({gpu_usage:.1f}%)")
                    break

                if check_for_cuda_errors(log_queue):
                    print("\nCUDA error detected in application output")
                    break

                time.sleep(check_interval)

print("\nWaiting 1.5 seconds before restart...") |
||||
|
print(Fore.RED + f"{30 * '-'}") |
||||
|
print(Fore.RED + "RESTATRING...") |
||||
|
print(Fore.RED + f"{30 * '-'}") |
||||
|
print(Fore.WHITE) |
||||
|
os.system("aplay /home/rog/repos/Tracker/NE-Smart-Tracker/Oxygen-Sys-Warning.wav") |
||||
|
# Clean up |
||||
|
try: |
||||
|
if process.poll() is None: |
||||
|
parent = psutil.Process(process.pid) |
||||
|
for child in parent.children(recursive=True): |
||||
|
child.kill() |
||||
|
parent.kill() |
||||
|
except psutil.NoSuchProcess: |
||||
|
pass |
||||
|
|
||||
|
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            time.sleep(1.5)

        except KeyboardInterrupt:
            print("\nStopping watchdog...")
            print_event.clear()
            # Kill the process before joining the reader thread; otherwise
            # join() can block forever on a readline() that never returns
            try:
                process.kill()
            except OSError:
                pass
            output_thread.join()
            break


if __name__ == "__main__":
    # Configure these parameters
    APP_COMMAND = ["python", "app.py"]  # Your application command
    MAX_GPU_USAGE = 90    # Percentage threshold for GPU memory usage
    CHECK_INTERVAL = 0.5  # Seconds between checks (overrides the function's 10 s default)

    run_application(APP_COMMAND, MAX_GPU_USAGE, CHECK_INTERVAL)
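# A typical run, assuming app.py next to this script is the program to supervise:
#   $ python watchdogs.py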