1 changed files with 136 additions and 0 deletions
			
			
		- 
					136watchdogs.py
| @ -0,0 +1,136 @@ | |||||
|  | import subprocess | ||||
|  | import time | ||||
|  | import re | ||||
|  | import psutil | ||||
|  | import torch | ||||
|  | import sys | ||||
|  | from threading import Thread, Event | ||||
|  | from queue import Queue, Empty | ||||
|  | from colorama import Fore, Style, init | ||||
|  | import os | ||||
|  | 
 | ||||
def get_gpu_memory_usage():
    """Return current device-wide GPU memory usage as a percentage (0-100).

    Uses ``torch.cuda.mem_get_info``, which reports free/total memory for the
    whole device, so allocations made by *other* processes (e.g. the watched
    subprocess) are counted.  The previous ``torch.cuda.memory_allocated``
    only reports tensors allocated by *this* process, which is ~0 in the
    watchdog itself and therefore could never trip the restart threshold.

    Returns:
        float: used-memory percentage, or 0 when no CUDA device is available.
    """
    if not torch.cuda.is_available():
        return 0
    device = torch.cuda.current_device()
    free_mem, total_mem = torch.cuda.mem_get_info(device)
    return ((total_mem - free_mem) / total_mem) * 100
|  | 
 | ||||
|  | 
 | ||||
# Known CUDA failure signatures, compiled once at import time instead of
# rebuilding the pattern list and recompiling each regex for every log line
# on every poll of the watchdog loop.
_CUDA_ERROR_RE = re.compile(
    r"CUDA error"
    r"|out of memory"
    r"|cudaError"
    r"|RuntimeError: CUDA"
    r"|CUDA runtime error"
    r"|CUDA out of memory"
    r"|CUDA kernel failed",
    re.IGNORECASE,
)


def check_for_cuda_errors(log_queue):
    """Drain *log_queue* and report whether any line looks like a CUDA error.

    Args:
        log_queue: queue.Queue of output lines from the watched process.

    Returns:
        bool: True as soon as a matching line is found (remaining lines stay
        queued); False once the queue is empty without a match.
    """
    try:
        while True:
            # get_nowait raises queue.Empty when drained -- that ends the loop.
            if _CUDA_ERROR_RE.search(log_queue.get_nowait()):
                return True
    except Empty:
        pass
    return False
|  | 
 | ||||
|  | 
 | ||||
def read_output(pipe, log_queue, print_event):
    """Forward every line from *pipe* into *log_queue*.

    Runs in a daemon thread.  Each line is also echoed to stdout while
    *print_event* is set, so console mirroring can be toggled live.

    Args:
        pipe: text-mode readable stream (the child's stdout).
        log_queue: queue.Queue receiving each raw line.
        print_event: threading.Event gating the echo to stdout.
    """
    try:
        for line in iter(pipe.readline, ''):
            log_queue.put(line)
            if print_event.is_set():
                print(line, end='', flush=True)
    except (ValueError, OSError):
        # ValueError: read on a pipe closed after the child is killed;
        # OSError: underlying descriptor torn down mid-read.  The original
        # bare `except:` also swallowed KeyboardInterrupt/SystemExit --
        # narrowed deliberately to just pipe-teardown errors.
        pass
|  | 
 | ||||
def _kill_process_tree(process):
    """Kill *process* and all of its children, if any are still running."""
    try:
        if process.poll() is None:
            parent = psutil.Process(process.pid)
            for child in parent.children(recursive=True):
                child.kill()
            parent.kill()
    except psutil.NoSuchProcess:
        # Process (or a child) already exited between the poll and the kill.
        pass


def run_application(command, max_gpu_usage=90, check_interval=10,
                    alert_sound="/home/rog/repos/Tracker/NE-Smart-Tracker/Oxygen-Sys-Warning.wav"):
    """Run *command* under a watchdog with live console output.

    The application is restarted whenever it exits, device-wide GPU memory
    exceeds *max_gpu_usage* percent, or a CUDA error appears in its output.
    Ctrl-C stops the watchdog and tears the child down.

    Args:
        command: argument list for subprocess.Popen, e.g. ["python", "app.py"].
        max_gpu_usage: GPU memory percentage that triggers a restart.
        check_interval: seconds between health checks.
        alert_sound: path of a .wav file played (via ``aplay``) before each
            restart.  Default kept for backward compatibility with the
            previously hard-coded path.
    """
    print_event = Event()
    print_event.set()  # mirror child output to the console by default

    while True:
        print(f"\n{'=' * 40}")
        print(f"Starting application: {' '.join(command)}")
        print(f"{'=' * 40}\n")

        log_queue = Queue()
        # stderr is merged into stdout so CUDA errors (usually on stderr)
        # flow through the same log queue the error scanner drains.
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            bufsize=1,
            universal_newlines=True
        )

        output_thread = Thread(
            target=read_output,
            args=(process.stdout, log_queue, print_event),
            daemon=True,
        )
        output_thread.start()

        try:
            while True:
                if process.poll() is not None:
                    print("\nApplication exited with code:", process.returncode)
                    break

                gpu_usage = get_gpu_memory_usage()
                if gpu_usage > max_gpu_usage:
                    print(f"\nGPU memory usage exceeded threshold ({gpu_usage:.1f}%)")
                    break

                if check_for_cuda_errors(log_queue):
                    print("\nCUDA error detected in application output")
                    break

                time.sleep(check_interval)

            print("\nWaiting 1.5 seconds before restart...")
            print(Fore.RED + f"{30 * '-'}")
            print(Fore.RED + "RESTARTING...")  # typo fixed: was "RESTATRING..."
            print(Fore.RED + f"{30 * '-'}")
            # Fore.WHITE forced white text but never reset the style;
            # RESET_ALL restores the terminal's default colors.
            print(Style.RESET_ALL)
            # List form with shell=False instead of os.system's shell string.
            # check=False: a missing player/sound file is best-effort only.
            subprocess.run(["aplay", alert_sound], check=False)

            _kill_process_tree(process)

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            time.sleep(1.5)

        except KeyboardInterrupt:
            print("\nStopping watchdog...")
            print_event.clear()
            # Kill the child BEFORE joining the reader thread: readline()
            # blocks while the pipe is alive, so joining first could hang
            # the shutdown indefinitely.
            try:
                process.kill()
            except OSError:
                pass
            output_thread.join(timeout=5)
            break
|  | 
 | ||||
if __name__ == "__main__":
    # Watchdog configuration -- edit these for the target application.
    app_command = ["python", "app.py"]   # command line of the monitored app
    gpu_limit_pct = 90                   # restart when GPU memory exceeds this %
    poll_seconds = 0.5                   # delay between health checks

    run_application(app_command, gpu_limit_pct, poll_seconds)
						Write
						Preview
					
					
					Loading…
					
					Cancel
						Save
					
		Reference in new issue