
added watchdogs.py

video-streaming
s_kiani 1 week ago
parent commit eadb5f36e8
1 changed file with 136 additions and 0 deletions

watchdogs.py (+136)

@@ -0,0 +1,136 @@
import subprocess
import time
import re
import os
import psutil
import torch
from threading import Thread, Event
from queue import Queue, Empty
from colorama import Fore, Style, init

init()  # Initialize colorama so the ANSI colors also work on Windows


def get_gpu_memory_usage():
    """Get current GPU memory usage percentage."""
    if torch.cuda.is_available():
        device = torch.cuda.current_device()
        total_mem = torch.cuda.get_device_properties(device).total_memory
        allocated_mem = torch.cuda.memory_allocated(device)
        return (allocated_mem / total_mem) * 100
    return 0
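

# Note: torch.cuda.memory_allocated() only counts tensors allocated inside
# this watchdog process, not inside the monitored subprocess, so the check
# above may stay near 0% while the child fills the GPU. A device-wide
# reading is possible with torch.cuda.mem_get_info (available in recent
# PyTorch releases); a minimal sketch of that alternative:
def get_device_memory_usage():
    """Get device-wide GPU memory usage percentage via cudaMemGetInfo."""
    if torch.cuda.is_available():
        free_mem, total_mem = torch.cuda.mem_get_info()
        return ((total_mem - free_mem) / total_mem) * 100
    return 0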


def check_for_cuda_errors(log_queue):
    """Check application output for CUDA-related errors."""
    cuda_error_patterns = [
        r"CUDA error",
        r"out of memory",
        r"cudaError",
        r"RuntimeError: CUDA",
        r"CUDA runtime error",
        r"CUDA out of memory",
        r"CUDA kernel failed",
    ]
    try:
        # Drain whatever output has accumulated since the last check
        # without blocking the watchdog loop
        while True:
            line = log_queue.get_nowait()
            for pattern in cuda_error_patterns:
                if re.search(pattern, line, re.IGNORECASE):
                    return True
    except Empty:
        pass
    return False


def read_output(pipe, log_queue, print_event):
    """Read output from the subprocess and distribute it."""
    try:
        for line in iter(pipe.readline, ''):
            log_queue.put(line)
            if print_event.is_set():
                print(line, end='', flush=True)
    except (ValueError, OSError):
        pass  # The pipe was closed, e.g. when the process is killed


def run_application(command, max_gpu_usage=90, check_interval=10):
    """Run the application with watchdog functionality and live output."""
    print_event = Event()
    print_event.set()  # Enable printing by default
    while True:
        print(f"\n{'=' * 40}")
        print(f"Starting application: {' '.join(command)}")
        print(f"{'=' * 40}\n")
        log_queue = Queue()
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # Merge stderr into stdout
            bufsize=1,
            universal_newlines=True
        )
        # Daemon thread so a blocked read cannot keep the watchdog alive
        output_thread = Thread(
            target=read_output,
            args=(process.stdout, log_queue, print_event)
        )
        output_thread.daemon = True
        output_thread.start()
        try:
            while True:
                if process.poll() is not None:
                    print("\nApplication exited with code:", process.returncode)
                    break
                gpu_usage = get_gpu_memory_usage()
                if gpu_usage > max_gpu_usage:
                    print(f"\nGPU memory usage exceeded threshold ({gpu_usage:.1f}%)")
                    break
                if check_for_cuda_errors(log_queue):
                    print("\nCUDA error detected in application output")
                    break
                time.sleep(check_interval)
            print("\nWaiting 1.5 seconds before restart...")
            print(Fore.RED + f"{30 * '-'}")
            print(Fore.RED + "RESTARTING...")
            print(Fore.RED + f"{30 * '-'}")
            print(Style.RESET_ALL)
            os.system("aplay /home/rog/repos/Tracker/NE-Smart-Tracker/Oxygen-Sys-Warning.wav")
            # Clean up: kill the whole process tree, not just the parent
            try:
                if process.poll() is None:
                    parent = psutil.Process(process.pid)
                    for child in parent.children(recursive=True):
                        child.kill()
                    parent.kill()
            except psutil.NoSuchProcess:
                pass
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            time.sleep(1.5)
        except KeyboardInterrupt:
            print("\nStopping watchdog...")
            print_event.clear()
            # Kill the child first so its pipe closes and the reader
            # thread can finish before we join it
            try:
                process.kill()
            except Exception:
                pass
            output_thread.join()
            break


if __name__ == "__main__":
    # Configure these parameters
    APP_COMMAND = ["python", "app.py"]  # Your application command
    MAX_GPU_USAGE = 90  # Percentage threshold for GPU memory usage
    CHECK_INTERVAL = 0.5  # Seconds between checks

    run_application(APP_COMMAND, MAX_GPU_USAGE, CHECK_INTERVAL)
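
Pointing the watchdog at a different entry point should only require editing APP_COMMAND before launching, for example (hypothetical target script and flag):

APP_COMMAND = ["python", "train.py", "--stream"]

Running `python watchdogs.py` then starts the target, mirrors its output live, and restarts it whenever it exits, exceeds the GPU memory threshold, or prints a CUDA error.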