#!/usr/bin/env python3
import os
import torch
import numpy as np
import glob
import matplotlib.pyplot as plt
from pathlib import Path
import sys
import json
from tqdm import tqdm
import inspect

# Add the project root to path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# Import model wrappers
from pytracking.features.net_wrappers import DiMPTorchScriptWrapper


class ModelComparison:
    def __init__(self, model_dir='exported_weights', num_samples=1000):
        self.model_dir = model_dir
        self.num_samples = num_samples
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Initialize comparison output directories
        self.comparison_dir = Path('test') / 'comparison'
        self.comparison_dir.mkdir(parents=True, exist_ok=True)
        self.plots_dir = self.comparison_dir / 'plots'  # Created in run_all_tests()

        # Initialize models
        self._init_models()

    def _init_models(self):
        """Initialize Python models."""
        print("Loading Python models...")
        # Load DiMP components (expected weight sub-directories are listed in
        # EXPECTED_WEIGHT_SUBDIRS below)
        self.models = DiMPTorchScriptWrapper(
            model_dir=self.model_dir,
            device=self.device,
            backbone_sd='backbone',        # Directory with backbone weights
            classifier_sd='classifier',    # Directory with classifier weights
            bbregressor_sd='bb_regressor'  # Directory with bbox regressor weights
        )
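    # Illustrative note (not part of the original script): the wrapper above is pointed at
    # sub-directories of `model_dir`. The layout implied by the constructor arguments is:
    #   exported_weights/
    #       backbone/       # backbone weights
    #       classifier/     # classifier weights
    #       bb_regressor/   # bbox regressor weights
    # The exact file contents of each directory depend on DiMPTorchScriptWrapper.
    EXPECTED_WEIGHT_SUBDIRS = ('backbone', 'classifier', 'bb_regressor')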
    def compare_classifier(self):
        """Compare classifier model outputs between Python and C++."""
        print("\nComparing classifier outputs...")
        # Use Path objects for consistency
        input_dir_path = Path('test') / 'input_samples' / 'classifier'
        cpp_output_dir_path = Path('test') / 'output' / 'classifier'
        if not input_dir_path.exists() or not cpp_output_dir_path.exists():
            print(f"Classifier input or C++ output directory not found ({input_dir_path}, {cpp_output_dir_path}). Skipping.")
            return

        # Per-sample error stats accumulate in self.all_errors_stats, which is initialized per
        # run in run_all_tests() (its shape is illustrated by _example_all_errors_stats below).

        # Compare training samples
        print("\nClassifier - Comparing Training Samples...")
        for i in tqdm(range(self.num_samples), desc="Training samples"):
            current_errors = {}  # Errors for this sample
            sample_dir = input_dir_path / f'sample_{i}'
            cpp_out_sample_dir = cpp_output_dir_path / f'sample_{i}'
            py_clf_feat = None
            cpp_clf_feat = None
            if not sample_dir.exists() or not cpp_out_sample_dir.exists():
                # No explicit error entry here; _compare_tensor_data handles the None tensors.
                print(f"Warning: Skipping classifier train sample {i}, files not found at {sample_dir} or {cpp_out_sample_dir}.")
            else:
                feat_path = sample_dir / 'backbone_feat.pt'
                feat = self.load_cpp_tensor(feat_path, self.device)
                if feat is None:
                    print(f"Critical: Failed to load input tensor {feat_path} for classifier train sample {i}.")
                else:
                    try:
                        with torch.no_grad():
                            py_clf_feat = self.models.classifier.extract_classification_feat(feat)
                    except Exception as e:
                        print(f"ERROR: Python model extract_classification_feat (train) failed for sample {i}: {e}")

                cpp_clf_feat_path = cpp_out_sample_dir / 'clf_features.pt'
                cpp_clf_feat = self.load_cpp_tensor(cpp_clf_feat_path, self.device)
                if cpp_clf_feat is None:
                    print(f"Warning: Failed to load C++ output tensor {cpp_clf_feat_path} for classifier train sample {i}.")

            self._compare_tensor_data(py_clf_feat, cpp_clf_feat, "Classifier Features Train", i, current_errors)
            if current_errors:
                self.all_errors_stats[f"Clf_Train_Sample_{i}"] = current_errors

        # Compare test samples
        print("\nClassifier - Comparing Test Samples...")
        for i in tqdm(range(self.num_samples), desc="Test samples"):
            current_errors = {}  # Errors for this sample
            test_sample_input_dir = input_dir_path / f'test_{i}'
            cpp_test_out_sample_dir = cpp_output_dir_path / f'test_{i}'
            py_clf_feat_test = None
            cpp_clf_feat_test = None
            if not test_sample_input_dir.exists() or not cpp_test_out_sample_dir.exists():
                print(f"Warning: Skipping classifier test sample {i}, files not found at {test_sample_input_dir} or {cpp_test_out_sample_dir}.")
            else:
                test_feat_path = test_sample_input_dir / 'test_feat.pt'
                test_feat = self.load_cpp_tensor(test_feat_path, self.device)
                if test_feat is None:
                    print(f"Critical: Failed to load input tensor {test_feat_path} for classifier test sample {i}.")
                else:
                    try:
                        with torch.no_grad():
                            py_clf_feat_test = self.models.classifier.extract_classification_feat(test_feat)
                    except Exception as e:
                        print(f"ERROR: Python model extract_classification_feat (test) failed for sample {i}: {e}")

                cpp_clf_feat_test_path = cpp_test_out_sample_dir / 'clf_feat_test.pt'
                cpp_clf_feat_test = self.load_cpp_tensor(cpp_clf_feat_test_path, self.device)
                if cpp_clf_feat_test is None:
                    print(f"Warning: Failed to load C++ output tensor {cpp_clf_feat_test_path} for classifier test sample {i}.")

            self._compare_tensor_data(py_clf_feat_test, cpp_clf_feat_test, "Classifier Features Test", i, current_errors)
            if current_errors:
                self.all_errors_stats[f"Clf_Test_Sample_{i}"] = current_errors

        # Per-comparison statistics and plots are produced later by generate_html_report().
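    # Illustrative sketch (not part of the original comparison flow): the shape of the
    # accumulator that compare_classifier()/compare_bb_regressor() fill and
    # generate_html_report() consumes. Each comparison name maps to the 11-metric tuple
    # produced by _compare_tensor_data(); the concrete numbers below are made up.
    @staticmethod
    def _example_all_errors_stats():
        return {
            "Clf_Train_Sample_0": {
                "Classifier Features Train": (
                    1.2e-6,            # mae
                    3.4e-5,            # max_err
                    np.zeros(4),       # diff_arr (flattened abs errors, used for the histogram)
                    0.05,              # mean_py_val
                    2.0e-6,            # std_abs_err
                    10.3, 10.3, 1e-4,  # l2_py, l2_cpp, l2_diff
                    0.9999, 0.9999,    # cos_sim, pearson
                    3.1e-5,            # mre
                ),
            },
        }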
    def compare_bb_regressor(self):
        """Compare bb_regressor model outputs between Python and C++."""
        print("\nComparing bb_regressor outputs...")
        input_dir = Path('test') / 'input_samples' / 'bb_regressor'
        cpp_output_dir = Path('test') / 'output' / 'bb_regressor'
        if not input_dir.exists() or not cpp_output_dir.exists():
            print(f"BB Regressor input or C++ output directory not found ({input_dir}, {cpp_output_dir}). Skipping.")
            return

        for i in tqdm(range(self.num_samples), desc="BB Regressor samples"):
            sample_dir = input_dir / f'sample_{i}'
            cpp_output_sample_dir = cpp_output_dir / f'sample_{i}'

            # Load the BB regressor input tensors for this sample
            feat_layer2_path = sample_dir / 'feat_layer2.pt'
            feat_layer3_path = sample_dir / 'feat_layer3.pt'
            init_bbox_path = sample_dir / 'init_bbox.pt'
            proposals_path = sample_dir / 'proposals.pt'

            feat_layer2 = self.load_cpp_tensor(feat_layer2_path, self.device)
            feat_layer3 = self.load_cpp_tensor(feat_layer3_path, self.device)
            init_bbox = self.load_cpp_tensor(init_bbox_path, self.device)
            proposals = self.load_cpp_tensor(proposals_path, self.device)

            if any(t is None for t in [feat_layer2, feat_layer3, init_bbox, proposals]):
                print(f"Critical: Failed to load one or more BB Regressor input tensors for sample {i}. Skipping.")
                continue

            backbone_feat_tuple = (feat_layer2, feat_layer3)  # Define the tuple for clarity

            # Get IoU features from the Python model.
            # self.models.get_backbone_bbreg_feat calls self.bb_regressor.get_iou_feat
            with torch.no_grad():
                py_iou_feat = self.models.get_backbone_bbreg_feat({"layer2": feat_layer2, "layer3": feat_layer3})

            # Get modulation vectors (see _example_squeeze_init_bbox below for the squeeze step)
            squeezed_init_bbox = init_bbox
            if init_bbox is not None and init_bbox.dim() == 3 and init_bbox.shape[1] == 1:
                squeezed_init_bbox = init_bbox.squeeze(1)
            with torch.no_grad():
                # Pass the original backbone features to get_modulation
                py_modulation = self.models.bb_regressor.get_modulation(backbone_feat_tuple, squeezed_init_bbox)

            # DEBUG: print shapes
            print(f"Sample {i}: py_iou_feat[0] shape: {py_iou_feat[0].shape}, py_modulation[0] shape: {py_modulation[0].shape}")
            print(f"Sample {i}: py_iou_feat[1] shape: {py_iou_feat[1].shape}, py_modulation[1] shape: {py_modulation[1].shape}")

            # Predict IoU (Python model)
            py_iou_pred = None
            try:
                with torch.no_grad():
                    py_iou_pred = self.models.bb_regressor.predict_iou(py_modulation, py_iou_feat, proposals)
            except RuntimeError as e:
                print(f"WARNING: Python model self.models.bb_regressor.predict_iou failed for sample {i}: {e}")

            # Load C++ outputs
            cpp_iou_pred_path = cpp_output_sample_dir / 'iou_pred.pt'
            cpp_modulation_0_path = cpp_output_sample_dir / 'modulation_0.pt'
            cpp_modulation_1_path = cpp_output_sample_dir / 'modulation_1.pt'
            cpp_feat_0_path = cpp_output_sample_dir / 'iou_feat_0.pt'
            cpp_feat_1_path = cpp_output_sample_dir / 'iou_feat_1.pt'

            cpp_iou_pred = self.load_cpp_tensor(cpp_iou_pred_path, self.device)
            cpp_modulation_0 = self.load_cpp_tensor(cpp_modulation_0_path, self.device)
            cpp_modulation_1 = self.load_cpp_tensor(cpp_modulation_1_path, self.device)
            cpp_feat_0 = self.load_cpp_tensor(cpp_feat_0_path, self.device)
            cpp_feat_1 = self.load_cpp_tensor(cpp_feat_1_path, self.device)

            current_errors = {}  # Errors for this sample, used by the HTML report

            # Compare IoU features (py_iou_feat vs cpp_feat_0/1).
            # _compare_tensor_data handles None inputs appropriately.
            py_iou_f0 = py_iou_feat[0] if py_iou_feat and len(py_iou_feat) > 0 else None
            py_iou_f1 = py_iou_feat[1] if py_iou_feat and len(py_iou_feat) > 1 else None
            self._compare_tensor_data(py_iou_f0, cpp_feat_0, "BBReg PyIoUFeat0 vs CppIoUFeat0", i, current_errors)
            self._compare_tensor_data(py_iou_f1, cpp_feat_1, "BBReg PyIoUFeat1 vs CppIoUFeat1", i, current_errors)

            # Compare modulation vectors (py_modulation vs cpp_modulation_0/1)
            py_mod_0 = py_modulation[0] if py_modulation and len(py_modulation) > 0 else None
            py_mod_1 = py_modulation[1] if py_modulation and len(py_modulation) > 1 else None
            self._compare_tensor_data(py_mod_0, cpp_modulation_0, "BBReg PyMod0 vs CppMod0", i, current_errors)
            self._compare_tensor_data(py_mod_1, cpp_modulation_1, "BBReg PyMod1 vs CppMod1", i, current_errors)

            # Compare the final IoU prediction; None values are handled by _compare_tensor_data.
            self._compare_tensor_data(py_iou_pred, cpp_iou_pred, "BBReg IoUPred", i, current_errors)

            if current_errors:
                # Add to the overall statistics if any comparisons were attempted.
                # The HTML report shows N/A for failed or skipped comparisons.
                self.all_errors_stats[f"BBReg_Sample_{i}"] = current_errors

        if not self.all_errors_stats:
            # Nothing was compared at all (note: this dict also holds classifier results).
            print("No BB Regressor comparisons were performed for this model type.")
            return
        # Averaging and plotting are handled by generate_html_report() using all_errors_stats.
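    # Illustrative sketch (not part of the original comparison flow): get_modulation() expects
    # boxes of shape (batch, 4), so an init_bbox saved as (batch, 1, 4) is squeezed first,
    # exactly as in compare_bb_regressor() above. The tensor values here are placeholders.
    @staticmethod
    def _example_squeeze_init_bbox():
        init_bbox = torch.zeros(1, 1, 4)      # a single (x, y, w, h) box with an extra dim
        if init_bbox.dim() == 3 and init_bbox.shape[1] == 1:
            init_bbox = init_bbox.squeeze(1)  # -> shape (1, 4)
        return init_bbox.shape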
    def generate_html_report(self):
        print("\nGenerating HTML report...")
        report_path = self.comparison_dir / "report.html"

        # Group results by model and comparison type. Structure of report_data:
        #   "Model_Type Component_Name": {
        #       "samples": {0: {"mae": X, "max_err": Y, "mean_py_val": Z, "std_abs_err": S, "plot_path": "..."}, 1: {...}},
        #       "overall_mae_mean": A, "overall_mae_std": B, "overall_max_err_mean": C, ...
        #   }
        report_data = {}

        for sample_key, comparisons in self.all_errors_stats.items():
            # sample_key examples: "Clf_Train_Sample_0", "Clf_Test_Sample_0", "BBReg_Sample_0"
            parts = sample_key.split("_")
            model_prefix = parts[0]  # "Clf" or "BBReg"
            sample_type_str = ""
            sample_idx = -1
            if model_prefix == "Clf":
                sample_type_str = parts[1]  # "Train" or "Test"
                sample_idx = int(parts[-1])
                model_name_key = f"Classifier {sample_type_str}"
            elif model_prefix == "BBReg":
                sample_idx = int(parts[-1])
                model_name_key = "BB Regressor"
            else:
                print(f"WARNING: Unknown sample key format in all_errors_stats: {sample_key}")
                continue

            for comparison_name, stats in comparisons.items():
                # comparison_name examples: "Classifier Features Train", "BBReg PyIoUFeat0 vs CppIoUFeat0"
                # Unpack all 11 metrics
                (mae, max_err, diff_arr, mean_py_val, std_abs_err,
                 l2_py, l2_cpp, l2_diff, cos_sim, pearson, mre) = stats

                full_comparison_key = f"{model_name_key} - {comparison_name}"
                if full_comparison_key not in report_data:
                    report_data[full_comparison_key] = {
                        "samples": {},
                        "all_maes": [], "all_max_errs": [],
                        "all_mean_py_vals": [], "all_std_abs_errs": [],
                        "all_l2_py_vals": [], "all_l2_cpp_vals": [], "all_l2_diff_vals": [],
                        "all_cos_sim_vals": [], "all_pearson_vals": [], "all_mre_vals": []
                    }

                plot_filename = None
                if diff_arr is not None and len(diff_arr) > 0 and not np.all(np.isnan(diff_arr)):
                    plot_filename = (f"{model_prefix}_{sample_type_str}_sample{sample_idx}_"
                                     f"{comparison_name.replace(' ', '_').replace('/', '_')}_hist.png")
                    plot_abs_path = self.plots_dir / plot_filename
                    self._generate_single_plot(diff_arr, comparison_name, plot_abs_path,
                                               mean_py_val, std_abs_err, mae, max_err)

                report_data[full_comparison_key]["samples"][sample_idx] = {
                    "mae": mae, "max_err": max_err,
                    "mean_py_val": mean_py_val, "std_abs_err": std_abs_err,
                    "l2_py": l2_py, "l2_cpp": l2_cpp, "l2_diff": l2_diff,
                    "cos_sim": cos_sim, "pearson": pearson, "mre": mre,
                    "plot_path": plot_filename  # Relative path used by the HTML report
                }

                # Collect the per-sample values that are not NaN for the overall averages.
                metric_lists = [
                    ("all_maes", mae), ("all_max_errs", max_err),
                    ("all_mean_py_vals", mean_py_val), ("all_std_abs_errs", std_abs_err),
                    ("all_l2_py_vals", l2_py), ("all_l2_cpp_vals", l2_cpp), ("all_l2_diff_vals", l2_diff),
                    ("all_cos_sim_vals", cos_sim), ("all_pearson_vals", pearson), ("all_mre_vals", mre),
                ]
                for list_name, value in metric_lists:
                    if not np.isnan(value):
                        report_data[full_comparison_key][list_name].append(value)

        # Calculate overall stats per comparison key
        for comp_key, data in report_data.items():
            data["overall_mae_mean"] = np.mean(data["all_maes"]) if data["all_maes"] else float('nan')
            data["overall_mae_std"] = np.std(data["all_maes"]) if data["all_maes"] else float('nan')
            data["overall_max_err_mean"] = np.mean(data["all_max_errs"]) if data["all_max_errs"] else float('nan')
            data["overall_mean_py_val_mean"] = np.mean(data["all_mean_py_vals"]) if data["all_mean_py_vals"] else float('nan')
            data["overall_std_abs_err_mean"] = np.mean(data["all_std_abs_errs"]) if data["all_std_abs_errs"] else float('nan')
            data["overall_l2_py_mean"] = np.mean(data["all_l2_py_vals"]) if data["all_l2_py_vals"] else float('nan')
            data["overall_l2_cpp_mean"] = np.mean(data["all_l2_cpp_vals"]) if data["all_l2_cpp_vals"] else float('nan')
            data["overall_l2_diff_mean"] = np.mean(data["all_l2_diff_vals"]) if data["all_l2_diff_vals"] else float('nan')
            data["overall_cos_sim_mean"] = np.mean(data["all_cos_sim_vals"]) if data["all_cos_sim_vals"] else float('nan')
            data["overall_pearson_mean"] = np.mean(data["all_pearson_vals"]) if data["all_pearson_vals"] else float('nan')
            data["overall_mre_mean"] = np.mean(data["all_mre_vals"]) if data["all_mre_vals"] else float('nan')

        # HTML generation. The report opens with a header and an explanation of the metrics.
        html_content = f"""<html>
<head><title>Model Comparison Report</title></head>
<body>
<h1>Model Comparison Report</h1>
<p>Number of samples per model component: {self.num_samples}</p>

<h2>Understanding the Metrics:</h2>
<dl>
<dt>Mean MAE (Mean Absolute Error)</dt>
<dd>Calculation: Average of the absolute differences between corresponding elements of the Python and C++ tensors (mean(abs(py - cpp))). The "Mean MAE" in the summary table is the average of these MAEs over all samples for a given comparison.<br>
Range & Interpretation: 0 to ∞. Closer to 0 indicates better agreement. This metric shows the average magnitude of error.</dd>
<dt>Std MAE (Standard Deviation of MAE)</dt>
<dd>Calculation: Standard deviation of the MAE values calculated for each sample within a comparison group.<br>
Range & Interpretation: 0 to ∞. A smaller value indicates that the MAE is consistent across samples. A larger value suggests variability in agreement from sample to sample.</dd>
<dt>Mean Max Error</dt>
<dd>Calculation: Average of the maximum absolute differences found between Python and C++ tensors for each sample (mean(max(abs(py - cpp))) over samples).<br>
Range & Interpretation: 0 to ∞. Closer to 0 is better. Indicates the average of the worst-case discrepancies per sample.</dd>
<dt>Mean Py Val (Mean Python Tensor Value)</dt>
<dd>Calculation: Average of the mean values of the Python reference tensors over all samples (mean(mean(py_tensor_sample_N))).<br>
Range & Interpretation: Problem-dependent. Provides context about the typical magnitude of the Python model's output values.</dd>
<dt>Mean Std Abs Err (Mean Standard Deviation of Absolute Errors)</dt>
<dd>Calculation: Average of the standard deviations of the absolute error arrays (abs(py - cpp)) for each sample. The "Err Std" in plot titles is this value for that specific sample.<br>
Range & Interpretation: 0 to ∞. A smaller value indicates that the errors are concentrated around their mean (MAE), implying less spread in error magnitudes within a sample.</dd>
<dt>Mean L2 Py (Mean L2 Norm of Python Tensor)</dt>
<dd>Calculation: Average of the L2 norms (Euclidean norm) of the flattened Python tensors over all samples.<br>
Range & Interpretation: 0 to ∞. Represents the average magnitude or "length" of the Python output vectors.</dd>
<dt>Mean L2 Cpp (Mean L2 Norm of C++ Tensor)</dt>
<dd>Calculation: Average of the L2 norms of the flattened C++ tensors over all samples.<br>
Range & Interpretation: 0 to ∞. Represents the average magnitude of the C++ output vectors. Should be comparable to Mean L2 Py if the models agree in scale.</dd>
<dt>Mean L2 Diff (Mean L2 Norm of Difference)</dt>
<dd>Calculation: Average of the L2 norms of the flattened difference tensors (py - cpp) over all samples.<br>
Range & Interpretation: 0 to ∞. Closer to 0 indicates better agreement. This is the average magnitude of the difference vectors.</dd>
<dt>Mean Cosine Sim (Mean Cosine Similarity)</dt>
<dd>Calculation: Average of the cosine similarities between the flattened Python and C++ tensors over all samples. Cosine similarity is dot(py, cpp) / (norm(py) * norm(cpp)).<br>
Range & Interpretation: -1 to 1 (typically 0 to 1 for non-negative features). Closer to 1 indicates that the tensors point in the same direction (high similarity in orientation, ignoring magnitude). Values near 0 suggest orthogonality, and near -1 suggest opposite directions.</dd>
<dt>Mean Pearson Corr (Mean Pearson Correlation Coefficient)</dt>
<dd>Calculation: Average of the Pearson correlation coefficients between the flattened Python and C++ tensors over all samples. Measures linear correlation.<br>
Range & Interpretation: -1 to 1. Closer to 1 indicates strong positive linear correlation, closer to -1 strong negative linear correlation, and values near 0 weak or no linear correlation.</dd>
<dt>Mean MRE (Mean Relative Error)</dt>
<dd>Calculation: Average of the mean relative errors per sample, where relative error is mean(abs(py - cpp) / (abs(py) + epsilon)). Epsilon is a small value to prevent division by zero.<br>
Range & Interpretation: 0 to ∞. Closer to 0 is better. This metric normalizes the absolute error by the magnitude of the Python reference values, which is useful for understanding error relative to signal strength.</dd>
</dl>
"""

        sorted_report_keys = sorted(report_data.keys())

        # Overall summary table
        html_content += "<h2>Overall Comparison Statistics</h2>\n"
        html_content += ("<table border='1'>\n<tr>"
                         "<th>Comparison Key</th><th>Mean MAE</th><th>Std MAE</th><th>Mean Max Error</th>"
                         "<th>Mean Py Val</th><th>Mean Std Abs Err</th><th>Mean L2 Py</th><th>Mean L2 Cpp</th>"
                         "<th>Mean L2 Diff</th><th>Mean Cosine Sim</th><th>Mean Pearson Corr</th><th>Mean MRE</th></tr>\n")
        for comp_key in sorted_report_keys:
            data = report_data[comp_key]
            html_content += f"""<tr>
<td>{comp_key}</td>
<td>{f"{data['overall_mae_mean']:.4e}" if not np.isnan(data['overall_mae_mean']) else 'N/A'}</td>
<td>{f"{data['overall_mae_std']:.4e}" if not np.isnan(data['overall_mae_std']) else 'N/A'}</td>
<td>{f"{data['overall_max_err_mean']:.4e}" if not np.isnan(data['overall_max_err_mean']) else 'N/A'}</td>
<td>{f"{data['overall_mean_py_val_mean']:.4e}" if not np.isnan(data['overall_mean_py_val_mean']) else 'N/A'}</td>
<td>{f"{data['overall_std_abs_err_mean']:.4e}" if not np.isnan(data['overall_std_abs_err_mean']) else 'N/A'}</td>
<td>{f"{data['overall_l2_py_mean']:.4e}" if not np.isnan(data['overall_l2_py_mean']) else 'N/A'}</td>
<td>{f"{data['overall_l2_cpp_mean']:.4e}" if not np.isnan(data['overall_l2_cpp_mean']) else 'N/A'}</td>
<td>{f"{data['overall_l2_diff_mean']:.4e}" if not np.isnan(data['overall_l2_diff_mean']) else 'N/A'}</td>
<td>{f"{data['overall_cos_sim_mean']:.4f}" if not np.isnan(data['overall_cos_sim_mean']) else 'N/A'}</td>
<td>{f"{data['overall_pearson_mean']:.4f}" if not np.isnan(data['overall_pearson_mean']) else 'N/A'}</td>
<td>{f"{data['overall_mre_mean']:.4e}" if not np.isnan(data['overall_mre_mean']) else 'N/A'}</td>
</tr>
"""
        html_content += "</table>\n"

        # Per-comparison detail sections with one row per sample
        for comp_key in sorted_report_keys:
            data = report_data[comp_key]
            html_content += f"<h2>Details for: {comp_key}</h2>\n"
            html_content += f"""<p>Overall Mean MAE: {f'{data["overall_mae_mean"]:.4e}' if not np.isnan(data['overall_mae_mean']) else 'N/A'}</p>\n"""
            html_content += ("<table border='1'>\n<tr>"
                             "<th>Sample Index</th><th>MAE</th><th>Max Error</th><th>Mean Py Val</th><th>Std Abs Err</th>"
                             "<th>L2 Py</th><th>L2 Cpp</th><th>L2 Diff</th><th>Cosine Sim</th><th>Pearson Corr</th>"
                             "<th>MRE</th><th>Error Distribution Plot</th></tr>\n")
            for sample_idx in sorted(data["samples"].keys()):
                sample_data = data["samples"][sample_idx]
                plot_path_html = f'./plots/{sample_data["plot_path"]}' if sample_data["plot_path"] else "N/A"
                img_tag = f'<img src="{plot_path_html}" alt="Error histogram">' if sample_data["plot_path"] else "N/A"
                html_content += f"""<tr>
<td>{sample_idx}</td>
<td>{f"{sample_data['mae']:.4e}" if not np.isnan(sample_data['mae']) else 'N/A'}</td>
<td>{f"{sample_data['max_err']:.4e}" if not np.isnan(sample_data['max_err']) else 'N/A'}</td>
<td>{f"{sample_data['mean_py_val']:.4e}" if not np.isnan(sample_data['mean_py_val']) else 'N/A'}</td>
<td>{f"{sample_data['std_abs_err']:.4e}" if not np.isnan(sample_data['std_abs_err']) else 'N/A'}</td>
<td>{f"{sample_data['l2_py']:.4e}" if not np.isnan(sample_data['l2_py']) else 'N/A'}</td>
<td>{f"{sample_data['l2_cpp']:.4e}" if not np.isnan(sample_data['l2_cpp']) else 'N/A'}</td>
<td>{f"{sample_data['l2_diff']:.4e}" if not np.isnan(sample_data['l2_diff']) else 'N/A'}</td>
<td>{f"{sample_data['cos_sim']:.4f}" if not np.isnan(sample_data['cos_sim']) else 'N/A'}</td>
<td>{f"{sample_data['pearson']:.4f}" if not np.isnan(sample_data['pearson']) else 'N/A'}</td>
<td>{f"{sample_data['mre']:.4e}" if not np.isnan(sample_data['mre']) else 'N/A'}</td>
<td>{img_tag}</td>
</tr>
"""
            html_content += "</table>\n"

        html_content += """
</body>
</html>
"""
        with open(report_path, 'w') as f:
            f.write(html_content)
        print(f"HTML report generated at {report_path}")

    def _generate_single_plot(self, error_array, title, plot_path, mean_val, std_abs_err, mae, max_err):
        if error_array is None or len(error_array) == 0 or np.all(np.isnan(error_array)):
            # Nothing to plot: the error array is empty or all NaNs.
            return
        plt.figure(figsize=(8, 6))
        plt.hist(error_array, bins=50, color='skyblue', edgecolor='black')
        stats_text = f"Ref Mean: {mean_val:.3e} | MAE: {mae:.3e} | MaxErr: {max_err:.3e} | Err Std: {std_abs_err:.3e}"
        plt.title(f"{title}\n{stats_text}", fontsize=10)
        plt.xlabel("Error Value")
        plt.ylabel("Frequency")
        plt.grid(True, linestyle='--', alpha=0.7)
        try:
            plt.tight_layout()
            plt.savefig(plot_path)
        except Exception as e:
            print(f"ERROR: Failed to save plot {plot_path}: {e}")
        plt.close()

    def run_all_tests(self):
        self.all_errors_stats = {}  # Initialize/clear for the new run
        self.plots_dir.mkdir(parents=True, exist_ok=True)  # Ensure plots_dir exists
        self.compare_classifier()
        self.compare_bb_regressor()
        self.generate_html_report()
        print("All tests completed!")
    def load_cpp_tensor(self, path, device):
        """Load a tensor saved either as a plain tensor or wrapped in a TorchScript module.

        A round-trip smoke test for the common case is sketched in
        _load_cpp_tensor_smoke_test() below.
        """
        path_str = str(path)  # Ensure path is a string
        try:
            # Attempt 1: load as a plain tensor, assuming it is not a TorchScript module.
            # This is the most common and safest way to load tensors saved from PyTorch
            # (Python or C++).
            tensor = torch.load(path_str, map_location=device, weights_only=True)
            return tensor
        except RuntimeError as e_weights_only:
            # Handle cases where weights_only=True is not appropriate (e.g. TorchScript archives).
            if "TorchScript archive" in str(e_weights_only) or \
               "PytorchStreamReader failed" in str(e_weights_only) or \
               "weights_only" in str(e_weights_only):  # Broader check for weights_only issues
                try:
                    # Attempt 2: load with weights_only=False.
                    loaded_obj = torch.load(path_str, map_location=device, weights_only=False)
                    if isinstance(loaded_obj, torch.Tensor):
                        return loaded_obj
                    # Check for _actual_script_module wrapping a deeply nested tensor.
                    elif hasattr(loaded_obj, '_actual_script_module') and hasattr(loaded_obj._actual_script_module, 'forward'):
                        try:
                            potential_tensor = loaded_obj._actual_script_module.forward()
                            if isinstance(potential_tensor, torch.Tensor):
                                return potential_tensor
                        except Exception as e_deep_forward:
                            print(f"Warning: Calling _actual_script_module.forward() from {path_str} failed: {e_deep_forward}")
                    # General ScriptModule handling (RecursiveScriptModule or any object with forward).
                    elif isinstance(loaded_obj, torch.jit.RecursiveScriptModule) or hasattr(loaded_obj, 'forward'):
                        # Attempt 2a: greedily find the first tensor attribute.
                        for attr_name in dir(loaded_obj):
                            if attr_name.startswith('__'):
                                continue
                            try:
                                attr_val = getattr(loaded_obj, attr_name)
                                if isinstance(attr_val, torch.Tensor):
                                    return attr_val
                            except Exception:
                                pass  # Ignore errors from getattr
                        # Attempt 2b: call forward() if it exists and no tensor attribute was found.
                        if hasattr(loaded_obj, 'forward') and callable(loaded_obj.forward):
                            sig = inspect.signature(loaded_obj.forward)
                            if not sig.parameters:  # Only call if forward() takes no arguments
                                try:
                                    potential_tensor = loaded_obj.forward()
                                    if isinstance(potential_tensor, torch.Tensor):
                                        return potential_tensor
                                except Exception as e_forward:
                                    print(f"Warning: Calling forward() on ScriptModule from {path_str} failed: {e_forward}")
                        # Attempt 2c: check the state_dict.
                        try:
                            sd = loaded_obj.state_dict()
                            if len(sd) == 1:
                                tensor_name = list(sd.keys())[0]
                                potential_tensor = sd[tensor_name]
                                if isinstance(potential_tensor, torch.Tensor):
                                    print(f"INFO: Extracted tensor '{tensor_name}' from single-entry state_dict of ScriptModule at {path_str}")
                                    return potential_tensor
                            elif len(sd) > 1:
                                # Heuristic: with multiple entries, take the first tensor found.
                                for tensor_name, potential_tensor in sd.items():
                                    if isinstance(potential_tensor, torch.Tensor):
                                        print(f"INFO: Extracted tensor '{tensor_name}' (from multiple) from state_dict of ScriptModule at {path_str}")
                                        return potential_tensor
                                print(f"Warning: ScriptModule at {path_str} has multiple state_dict entries: {list(sd.keys())} but none were straightforwardly returned as the primary tensor.")
                            # else: the state_dict is empty, or no tensors were found above.
                        except Exception as e_sd:
                            print(f"Warning: Error accessing/processing state_dict for ScriptModule at {path_str}: {e_sd}")
                        print(f"ERROR: Could not extract tensor from ScriptModule at {path_str} after trying attributes, forward(), and state_dict(). Dir: {dir(loaded_obj)}")
                        return None
                    else:
                        print(f"ERROR: Loaded object from {path_str} (with weights_only=False) is not a Tensor or recognized ScriptModule. Type: {type(loaded_obj)}.")
                        return None
                except Exception as e_load_false:
                    print(f"ERROR: weights_only=False also failed for {path_str}. Last error: {e_load_false}")
                    return None
            else:
                # Some other error occurred with weights_only=True.
                print(f"ERROR: Loading tensor from {path_str} with weights_only=True failed with an unexpected error: {e_weights_only}")
                return None
        except Exception as e_generic:
            print(f"ERROR: A generic error occurred while loading tensor from {path_str}: {e_generic}")
            return None
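    # Illustrative sketch (not part of the original script): the common case load_cpp_tensor()
    # is built for is a plain tensor written with torch.save(), which the weights_only=True
    # fast path above handles directly. Assumes a PyTorch version that supports the
    # weights_only argument of torch.load(); the method and file names here are made up.
    def _load_cpp_tensor_smoke_test(self):
        import tempfile
        with tempfile.TemporaryDirectory() as tmp_dir:
            path = Path(tmp_dir) / 'example.pt'
            torch.save(torch.randn(2, 3), path)          # round-trip a plain tensor
            loaded = self.load_cpp_tensor(path, self.device)
            assert loaded is not None and loaded.shape == (2, 3)
        return True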
    def _compare_tensor_data(self, tensor1, tensor2, name, sample_idx, current_errors):
        """Compare two tensors and store the 11 error metrics in current_errors[name].

        Metric order: mae, max_err, diff_arr, mean_py_val, std_abs_err,
                      l2_py, l2_cpp, l2_diff, cos_sim, pearson, mre.
        A standalone toy example of these metrics is sketched in _metrics_example()
        at the end of this file.
        """
        if tensor1 is None or tensor2 is None:
            py_mean = float('nan')
            py_l2 = float('nan')
            if tensor1 is not None:  # The Python tensor exists
                t1_cpu_temp = tensor1.cpu().detach().numpy().astype(np.float32)
                py_mean = np.mean(t1_cpu_temp)
                py_l2 = np.linalg.norm(t1_cpu_temp.flatten())
            # If only tensor2 is None, the C++ L2 norm and the comparison metrics cannot be
            # computed. If only tensor1 is None, py_mean and py_l2 stay NaN.
            current_errors[name] = (
                float('nan'), float('nan'), [], py_mean, float('nan'),
                py_l2, float('nan'), float('nan'), float('nan'), float('nan'), float('nan')
            )
            print(f"Warning: Cannot compare '{name}' for sample {sample_idx}, one or both tensors are None.")
            return

        t1_cpu = tensor1.cpu().detach().numpy().astype(np.float32)
        t2_cpu = tensor2.cpu().detach().numpy().astype(np.float32)

        if t1_cpu.shape != t2_cpu.shape:
            print(f"Warning: Shape mismatch for '{name}' sample {sample_idx}. Py: {t1_cpu.shape}, Cpp: {t2_cpu.shape}. Skipping most comparisons.")
            current_errors[name] = (
                float('nan'), float('nan'), [], np.mean(t1_cpu), float('nan'),                      # MAE, MaxErr, diff_arr, MeanPy, StdAbsErr
                np.linalg.norm(t1_cpu.flatten()), np.linalg.norm(t2_cpu.flatten()), float('nan'),   # L2Py, L2Cpp, L2Diff
                float('nan'), float('nan'), float('nan')                                            # CosSim, Pearson, MRE
            )
            return

        # From here on, shapes match and both tensors are present.
        t1_flat = t1_cpu.flatten()
        t2_flat = t2_cpu.flatten()
        abs_diff_elements = np.abs(t1_cpu - t2_cpu)

        mae = np.mean(abs_diff_elements)
        max_err = np.max(abs_diff_elements)
        diff_arr_for_hist = abs_diff_elements.flatten()  # For the histogram
        mean_py_val = np.mean(t1_cpu)
        std_abs_err = np.std(diff_arr_for_hist)
        l2_norm_py = np.linalg.norm(t1_flat)
        l2_norm_cpp = np.linalg.norm(t2_flat)
        l2_norm_diff = np.linalg.norm(t1_flat - t2_flat)

        # Cosine similarity
        dot_product = np.dot(t1_flat, t2_flat)
        if l2_norm_py == 0 or l2_norm_cpp == 0:
            cosine_sim = float('nan')
        else:
            cosine_sim = dot_product / (l2_norm_py * l2_norm_cpp)

        # Pearson correlation coefficient
        if len(t1_flat) < 2:
            pearson_corr = float('nan')
        else:
            std_t1 = np.std(t1_flat)
            std_t2 = np.std(t2_flat)
            if std_t1 == 0 or std_t2 == 0:  # At least one tensor is constant
                if std_t1 == 0 and std_t2 == 0 and np.allclose(t1_flat, t2_flat):
                    pearson_corr = 1.0  # Both constant and identical
                else:
                    pearson_corr = float('nan')  # Otherwise undefined or not meaningfully 1
            else:
                try:
                    corr_matrix = np.corrcoef(t1_flat, t2_flat)
                    if corr_matrix.ndim == 2:
                        pearson_corr = corr_matrix[0, 1]
                    else:
                        # Should only happen for effectively constant inputs, handled above.
                        pearson_corr = float(corr_matrix) if np.isscalar(corr_matrix) else float('nan')
                except Exception:
                    pearson_corr = float('nan')

        # Mean relative error (MRE).
        # Where the reference t1_cpu is zero (or very small), a plain ratio would be infinite or
        # extreme; using (abs(t1_cpu) + epsilon) in the denominator keeps the ratio finite.
        epsilon_rel_err = 1e-9
        mean_rel_err = np.mean(abs_diff_elements / (np.abs(t1_cpu) + epsilon_rel_err))

        current_errors[name] = (
            mae, max_err, diff_arr_for_hist, mean_py_val, std_abs_err,
            l2_norm_py, l2_norm_cpp, l2_norm_diff, cosine_sim, pearson_corr, mean_rel_err
        )
        # Optional: print detailed error for specific high-error cases, e.g.
        # if mae > 1e-4:
        #     print(f"High MAE for {name}, sample {sample_idx}: {mae:.6f}")
        # The function modifies current_errors in place and returns None.


if __name__ == "__main__":
    # Parse command line arguments
    import argparse

    parser = argparse.ArgumentParser(description="Compare Python and C++ model implementations")
    parser.add_argument("--num-samples", type=int, default=1000,
                        help="Number of test samples (default: 1000)")
    args = parser.parse_args()

    # Run comparison
    comparison = ModelComparison(num_samples=args.num_samples)
    comparison.run_all_tests()
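# Illustrative sketch (not part of the original script): the metric definitions listed in the
# HTML report, computed on two tiny arrays. It mirrors ModelComparison._compare_tensor_data()
# but is standalone; the function name and the example values are made up.
def _metrics_example():
    py = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)   # "Python" reference output
    cpp = np.array([1.0, 2.1, 2.9, 4.0], dtype=np.float32)  # "C++" output to compare
    abs_diff = np.abs(py - cpp)
    eps = 1e-9  # same role as epsilon_rel_err above
    return {
        'mae': float(np.mean(abs_diff)),
        'max_err': float(np.max(abs_diff)),
        'std_abs_err': float(np.std(abs_diff)),
        'l2_py': float(np.linalg.norm(py)),
        'l2_cpp': float(np.linalg.norm(cpp)),
        'l2_diff': float(np.linalg.norm(py - cpp)),
        'cos_sim': float(np.dot(py, cpp) / (np.linalg.norm(py) * np.linalg.norm(cpp))),
        'pearson': float(np.corrcoef(py, cpp)[0, 1]),
        'mre': float(np.mean(abs_diff / (np.abs(py) + eps))),
    }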