1 changed file with 136 additions and 0 deletions
watchdogs.py
@@ -0,0 +1,136 @@
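"""Watchdog: run a command, mirror its output, and restart it whenever the process
exits, GPU memory usage crosses a threshold, or a CUDA error shows up in its logs."""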
import os
import re
import subprocess
import time
from queue import Queue, Empty
from threading import Thread, Event

import psutil
import torch
from colorama import Fore, Style, init

init()  # Initialize colorama so the colored restart banner also renders on Windows consoles

def get_gpu_memory_usage():
    """Get current device-wide GPU memory usage as a percentage."""
    if torch.cuda.is_available():
        device = torch.cuda.current_device()
        # mem_get_info() reports free/total bytes for the whole device, so it also
        # sees memory held by the monitored child process; memory_allocated() would
        # only count tensors allocated by this watchdog process itself.
        free_mem, total_mem = torch.cuda.mem_get_info(device)
        return ((total_mem - free_mem) / total_mem) * 100
    return 0


def check_for_cuda_errors(log_queue):
    """Check application output for CUDA-related errors."""
    cuda_error_patterns = [
        r"CUDA error",
        r"out of memory",
        r"cudaError",
        r"RuntimeError: CUDA",
        r"CUDA runtime error",
        r"CUDA out of memory",
        r"CUDA kernel failed",
    ]

    try:
        # Drain everything queued since the last check; Empty ends the loop.
        while True:
            line = log_queue.get_nowait()
            for pattern in cuda_error_patterns:
                if re.search(pattern, line, re.IGNORECASE):
                    return True
    except Empty:
        pass
    return False


def read_output(pipe, log_queue, print_event):
    """Read output from the subprocess and distribute it."""
    try:
        for line in iter(pipe.readline, ''):
            log_queue.put(line)
            if print_event.is_set():
                print(line, end='', flush=True)
    except (ValueError, OSError):
        # The pipe was closed while we were reading; the process is gone.
        pass


def run_application(command, max_gpu_usage=90, check_interval=10):
    """Run application with watchdog functionality and live output."""
    print_event = Event()
    print_event.set()  # Enable printing by default

    while True:
        print(f"\n{'=' * 40}")
        print(f"Starting application: {' '.join(command)}")
        print(f"{'=' * 40}\n")

        log_queue = Queue()
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            bufsize=1,
            universal_newlines=True
        )

        output_thread = Thread(
            target=read_output,
            args=(process.stdout, log_queue, print_event),
            daemon=True
        )
        output_thread.start()

        try:
            while True:
                if process.poll() is not None:
                    print("\nApplication exited with code:", process.returncode)
                    break

                gpu_usage = get_gpu_memory_usage()
                if gpu_usage > max_gpu_usage:
                    print(f"\nGPU memory usage exceeded threshold ({gpu_usage:.1f}%)")
                    break

                if check_for_cuda_errors(log_queue):
                    print("\nCUDA error detected in application output")
                    break

                time.sleep(check_interval)

            print("\nWaiting 1.5 seconds before restart...")
            print(Fore.RED + "-" * 30)
            print(Fore.RED + "RESTARTING...")
            print(Fore.RED + "-" * 30)
            print(Style.RESET_ALL, end="")
            os.system("aplay /home/rog/repos/Tracker/NE-Smart-Tracker/Oxygen-Sys-Warning.wav")

            # Clean up: kill the whole process tree if it is still running
            try:
                if process.poll() is None:
                    parent = psutil.Process(process.pid)
                    for child in parent.children(recursive=True):
                        child.kill()
                    parent.kill()
            except psutil.NoSuchProcess:
                pass

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            time.sleep(1.5)

        except KeyboardInterrupt:
            print("\nStopping watchdog...")
            print_event.clear()
            # Kill the process first so the reader thread's pipe closes and join() can return
            try:
                process.kill()
            except OSError:
                pass
            output_thread.join(timeout=5)
            break


if __name__ == "__main__":
    # Configure these parameters
    APP_COMMAND = ["python", "app.py"]  # Your application command
    MAX_GPU_USAGE = 90                  # Percentage threshold for GPU memory usage
    CHECK_INTERVAL = 0.5                # Seconds between checks

    run_application(APP_COMMAND, MAX_GPU_USAGE, CHECK_INTERVAL)
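    # A hypothetical alternative invocation (command, threshold, and interval are
    # illustrative only), e.g. wrapping a training script and polling once per second:
    # run_application(["python", "train.py"], max_gpu_usage=85, check_interval=1.0)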