1 changed files with 136 additions and 0 deletions
			
			
		- 
					136watchdogs.py
| @ -0,0 +1,136 @@ | |||
| import subprocess | |||
| import time | |||
| import re | |||
| import psutil | |||
| import torch | |||
| import sys | |||
| from threading import Thread, Event | |||
| from queue import Queue, Empty | |||
| from colorama import Fore, Style, init | |||
| import os | |||
| 
 | |||
def get_gpu_memory_usage():
    """Return the percentage of GPU memory currently allocated by torch.

    Returns 0 when CUDA is unavailable. Note this measures memory this
    process allocated through torch, not device-wide usage.
    """
    if not torch.cuda.is_available():
        return 0
    dev = torch.cuda.current_device()
    capacity = torch.cuda.get_device_properties(dev).total_memory
    used = torch.cuda.memory_allocated(dev)
    return (used / capacity) * 100
| 
 | |||
| 
 | |||
# Compiled once at import time: every CUDA failure signature we watch for,
# folded into a single case-insensitive alternation. The original rebuilt
# and re-scanned seven separate patterns for every line of output.
_CUDA_ERROR_RE = re.compile(
    r"CUDA error"
    r"|out of memory"
    r"|cudaError"
    r"|RuntimeError: CUDA"
    r"|CUDA runtime error"
    r"|CUDA out of memory"
    r"|CUDA kernel failed",
    re.IGNORECASE,
)


def check_for_cuda_errors(log_queue):
    """Drain *log_queue* and report whether any line looks like a CUDA error.

    Args:
        log_queue: queue.Queue of output lines from the supervised process.

    Returns:
        True as soon as a line matches a known CUDA error pattern
        (case-insensitive); False once the queue is empty with no match.
        Lines consumed before a match are discarded.
    """
    try:
        while True:
            line = log_queue.get_nowait()
            if _CUDA_ERROR_RE.search(line):
                return True
    except Empty:
        pass
    return False
| 
 | |||
| 
 | |||
def read_output(pipe, log_queue, print_event):
    """Forward lines from *pipe* into *log_queue*, echoing while *print_event* is set.

    Runs on a daemon thread; returns when the pipe reaches EOF or is closed
    underneath us (e.g. the child process was killed).

    Args:
        pipe: text-mode file object (the child's stdout).
        log_queue: queue.Queue receiving every line read.
        print_event: threading.Event; while set, lines are also echoed
            to our stdout unmodified.
    """
    try:
        for line in iter(pipe.readline, ''):
            log_queue.put(line)
            if print_event.is_set():
                print(line, end='', flush=True)
    # Was a bare `except:` that swallowed everything (even KeyboardInterrupt/
    # SystemExit). Only a closed/broken pipe is expected here.
    except (ValueError, OSError):
        pass
| 
 | |||
def run_application(command, max_gpu_usage=90, check_interval=10,
                    alert_sound="/home/rog/repos/Tracker/NE-Smart-Tracker/Oxygen-Sys-Warning.wav"):
    """Run *command* under a watchdog, restarting it on GPU trouble.

    The child's merged stdout/stderr is streamed live and scanned for CUDA
    errors. The process is restarted when it exits on its own, when GPU
    memory usage exceeds *max_gpu_usage* percent, or when a CUDA error
    appears in its output. Ctrl-C stops the watchdog entirely.

    Args:
        command: argv list for subprocess.Popen, e.g. ["python", "app.py"].
        max_gpu_usage: GPU-memory percentage that triggers a restart.
        check_interval: seconds between watchdog checks.
        alert_sound: path to a .wav played (via aplay, best-effort) before
            each restart. Default preserves the original hard-coded path.
    """
    print_event = Event()
    print_event.set()  # echo child output to our stdout by default

    while True:
        print(f"\n{'=' * 40}")
        print(f"Starting application: {' '.join(command)}")
        print(f"{'=' * 40}\n")

        log_queue = Queue()
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge stderr so error scanning sees it
            bufsize=1,                 # line-buffered (text mode)
            universal_newlines=True
        )

        output_thread = Thread(
            target=read_output,
            args=(process.stdout, log_queue, print_event)
        )
        output_thread.daemon = True
        output_thread.start()

        try:
            # Monitor until the app dies, the GPU fills up, or a CUDA
            # error shows up in its output.
            while True:
                if process.poll() is not None:
                    print("\nApplication exited with code:", process.returncode)
                    break

                gpu_usage = get_gpu_memory_usage()
                if gpu_usage > max_gpu_usage:
                    print(f"\nGPU memory usage exceeded threshold ({gpu_usage:.1f}%)")
                    break

                if check_for_cuda_errors(log_queue):
                    print("\nCUDA error detected in application output")
                    break

                time.sleep(check_interval)

            print("\nWaiting 1.5 seconds before restart...")
            print(Fore.RED + f"{30 * '-'}")
            print(Fore.RED + "RESTARTING...")  # fixed typo: was "RESTATRING..."
            print(Fore.RED + f"{30 * '-'}")
            print(Fore.WHITE)
            # Audible alert. List form runs aplay without a shell (was
            # os.system with an interpolatable string); missing aplay or a
            # bad path must not crash the watchdog.
            try:
                subprocess.run(["aplay", alert_sound], check=False)
            except OSError:
                pass
            # Kill the app and all of its children so GPU memory is freed.
            try:
                if process.poll() is None:
                    parent = psutil.Process(process.pid)
                    for child in parent.children(recursive=True):
                        child.kill()
                    parent.kill()
            except psutil.NoSuchProcess:
                pass

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            time.sleep(1.5)

        except KeyboardInterrupt:
            print("\nStopping watchdog...")
            print_event.clear()
            output_thread.join()
            try:
                process.kill()
            except Exception:  # was a bare except; process may already be gone
                pass
            break
| 
 | |||
| if __name__ == "__main__": | |||
|     # Configure these parameters | |||
|     APP_COMMAND = ["python", "app.py"]  # Your application command | |||
|     MAX_GPU_USAGE = 90  # Percentage threshold for GPU memory usage | |||
|     CHECK_INTERVAL = 0.5  # Seconds between checks | |||
| 
 | |||
|     run_application(APP_COMMAND, MAX_GPU_USAGE, CHECK_INTERVAL) | |||
						Write
						Preview
					
					
					Loading…
					
					Cancel
						Save
					
		Reference in new issue