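Fix ResNet BatchNorm buffer loading and enhance BN1 debugging. Corrected loading of the BatchNorm running_mean, running_var, and num_batches_tracked buffers in the C++ ResNet BottleneckImpl and ResNetImpl to use direct member assignment instead of assigning through named_buffers(). (named_buffers() returns a dictionary by value, so assigning into it replaced the tensor only in that temporary and left the module's own buffers at their default values.) This resolved discrepancies with Python's BatchNorm behavior. In eval mode, the C++ BatchNorm layers are now applied through an explicit float64 computation that is cast back to the input dtype. Added detailed intermediate output saving for bn1 in both the C++ ResNet and the Python comparison script to facilitate debugging. Ensured the Python comparison script correctly loads and compares these new ResNet intermediate tensors. This series of changes led to numerical equivalence for the ResNet conv1, bn1, and subsequently layer1-4 outputs between Python and C++.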

5 months ago · c0e5aa7d0a
3 changed files with 725 additions and 303 deletions
--- a/cimp/resnet/resnet.cpp
+++ b/cimp/resnet/resnet.cpp
@ -71,9 +71,9 @@ BottleneckImpl::BottleneckImpl(const std::string& base_weights_dir,
    conv1->weight = load_named_tensor(base_weights_dir, block_param_prefix + "conv1.weight", device);
    bn1->weight = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.weight", device);
    bn1->bias = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.bias", device);
-    bn1->named_buffers()["running_mean"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.running_mean", device);
-    bn1->named_buffers()["running_var"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.running_var", device);
-    bn1->named_buffers()["num_batches_tracked"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.num_batches_tracked", device);
+    bn1->running_mean = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.running_mean", device);
+    bn1->running_var = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.running_var", device);
+    bn1->num_batches_tracked = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.num_batches_tracked", device);
    register_module("conv1", conv1);
    register_module("bn1", bn1);

@ -83,9 +83,9 @@ BottleneckImpl::BottleneckImpl(const std::string& base_weights_dir,
    conv2->weight = load_named_tensor(base_weights_dir, block_param_prefix + "conv2.weight", device);
    bn2->weight = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.weight", device);
    bn2->bias = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.bias", device);
-    bn2->named_buffers()["running_mean"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.running_mean", device);
-    bn2->named_buffers()["running_var"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.running_var", device);
-    bn2->named_buffers()["num_batches_tracked"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.num_batches_tracked", device);
+    bn2->running_mean = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.running_mean", device);
+    bn2->running_var = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.running_var", device);
+    bn2->num_batches_tracked = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.num_batches_tracked", device);
    register_module("conv2", conv2);
    register_module("bn2", bn2);

@ -95,9 +95,9 @@ BottleneckImpl::BottleneckImpl(const std::string& base_weights_dir,
    conv3->weight = load_named_tensor(base_weights_dir, block_param_prefix + "conv3.weight", device);
    bn3->weight = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.weight", device);
    bn3->bias = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.bias", device);
-    bn3->named_buffers()["running_mean"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.running_mean", device);
-    bn3->named_buffers()["running_var"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.running_var", device);
-    bn3->named_buffers()["num_batches_tracked"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.num_batches_tracked", device);
+    bn3->running_mean = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.running_mean", device);
+    bn3->running_var = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.running_var", device);
+    bn3->num_batches_tracked = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.num_batches_tracked", device);
    register_module("conv3", conv3);
    register_module("bn3", bn3);

@ -118,17 +118,85 @@ BottleneckImpl::BottleneckImpl(const std::string& base_weights_dir,
 // Forward method implementation for BottleneckImpl
 torch::Tensor BottleneckImpl::forward(torch::Tensor x) {
    torch::Tensor identity = x;
+    torch::ScalarType original_dtype = x.scalar_type();

+    // conv1 -> bn1 -> relu
    x = conv1->forward(x);
-    x = bn1->forward(x);
+
+    if (!this->is_training() && bn1) {
+        const auto& bn_module = *bn1;
+        torch::Tensor input_double = x.to(torch::kFloat64);
+        torch::Tensor weight_double = bn_module.weight.defined() ? bn_module.weight.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor bias_double = bn_module.bias.defined() ? bn_module.bias.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor running_mean_double = bn_module.running_mean.to(torch::kFloat64);
+        torch::Tensor running_var_double = bn_module.running_var.to(torch::kFloat64);
+        double eps_double = bn_module.options.eps();
+
+        auto c = x.size(1);
+        running_mean_double = running_mean_double.reshape({1, c, 1, 1});
+        running_var_double = running_var_double.reshape({1, c, 1, 1});
+        if (weight_double.defined()) weight_double = weight_double.reshape({1, c, 1, 1});
+        if (bias_double.defined()) bias_double = bias_double.reshape({1, c, 1, 1});
+        
+        torch::Tensor out_double = (input_double - running_mean_double) / (torch::sqrt(running_var_double + eps_double));
+        if (weight_double.defined()) out_double = out_double * weight_double;
+        if (bias_double.defined()) out_double = out_double + bias_double;
+        x = out_double.to(original_dtype);
+    } else if (bn1) {
+        x = bn1->forward(x);
+    }
    x = relu->forward(x);

+    // conv2 -> bn2 -> relu
    x = conv2->forward(x);
-    x = bn2->forward(x);
+    if (!this->is_training() && bn2) {
+        const auto& bn_module = *bn2;
+        torch::Tensor input_double = x.to(torch::kFloat64);
+        torch::Tensor weight_double = bn_module.weight.defined() ? bn_module.weight.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor bias_double = bn_module.bias.defined() ? bn_module.bias.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor running_mean_double = bn_module.running_mean.to(torch::kFloat64);
+        torch::Tensor running_var_double = bn_module.running_var.to(torch::kFloat64);
+        double eps_double = bn_module.options.eps();
+
+        auto c = x.size(1);
+        running_mean_double = running_mean_double.reshape({1, c, 1, 1});
+        running_var_double = running_var_double.reshape({1, c, 1, 1});
+        if (weight_double.defined()) weight_double = weight_double.reshape({1, c, 1, 1});
+        if (bias_double.defined()) bias_double = bias_double.reshape({1, c, 1, 1});
+
+        torch::Tensor out_double = (input_double - running_mean_double) / (torch::sqrt(running_var_double + eps_double));
+        if (weight_double.defined()) out_double = out_double * weight_double;
+        if (bias_double.defined()) out_double = out_double + bias_double;
+        x = out_double.to(original_dtype);
+    } else if (bn2) {
+        x = bn2->forward(x);
+    }
    x = relu->forward(x);

+    // conv3 -> bn3
    x = conv3->forward(x);
-    x = bn3->forward(x);
+    if (!this->is_training() && bn3) {
+        const auto& bn_module = *bn3;
+        torch::Tensor input_double = x.to(torch::kFloat64);
+        torch::Tensor weight_double = bn_module.weight.defined() ? bn_module.weight.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor bias_double = bn_module.bias.defined() ? bn_module.bias.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor running_mean_double = bn_module.running_mean.to(torch::kFloat64);
+        torch::Tensor running_var_double = bn_module.running_var.to(torch::kFloat64);
+        double eps_double = bn_module.options.eps();
+        
+        auto c = x.size(1);
+        running_mean_double = running_mean_double.reshape({1, c, 1, 1});
+        running_var_double = running_var_double.reshape({1, c, 1, 1});
+        if (weight_double.defined()) weight_double = weight_double.reshape({1, c, 1, 1});
+        if (bias_double.defined()) bias_double = bias_double.reshape({1, c, 1, 1});
+
+        torch::Tensor out_double = (input_double - running_mean_double) / (torch::sqrt(running_var_double + eps_double));
+        if (weight_double.defined()) out_double = out_double * weight_double;
+        if (bias_double.defined()) out_double = out_double + bias_double;
+        x = out_double.to(original_dtype);
+    } else if (bn3) {
+        x = bn3->forward(x);
+    }

    if (this->projection_shortcut) {
        identity = this->projection_shortcut->forward(identity);
@ -150,18 +218,16 @@ ResNetImpl::ResNetImpl(const std::string& base_weights_dir_path,
    conv1 = torch::nn::Conv2d(torch::nn::Conv2dOptions(3, 64, 7).stride(2).padding(3).bias(false));
    bn1 = torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(64).eps(static_cast<float>(1e-5)).momentum(0.1).affine(true).track_running_stats(true));
    this->conv1->weight = load_named_tensor(this->_base_weights_dir, "conv1.weight", device);
+    
+    // Directly assign to the public member tensors of the bn1 module
    this->bn1->weight = load_named_tensor(this->_base_weights_dir, "bn1.weight", device);
    this->bn1->bias = load_named_tensor(this->_base_weights_dir, "bn1.bias", device);
-    
-    this->bn1->named_buffers()["running_mean"] = load_named_tensor(this->_base_weights_dir, "bn1.running_mean", device);
-    this->bn1->named_buffers()["running_var"] = load_named_tensor(this->_base_weights_dir, "bn1.running_var", device);
-    
-    this->bn1->named_buffers()["num_batches_tracked"] = load_named_tensor(this->_base_weights_dir, "bn1.num_batches_tracked", device);
-    register_module("conv1", conv1);
-    register_module("bn1", bn1);
+    this->bn1->running_mean = load_named_tensor(this->_base_weights_dir, "bn1.running_mean", device);
+    this->bn1->running_var = load_named_tensor(this->_base_weights_dir, "bn1.running_var", device);
+    this->bn1->num_batches_tracked = load_named_tensor(this->_base_weights_dir, "bn1.num_batches_tracked", device);

-    std::cout << "CPP ResNetImpl::bn1 running_mean sum: " << std::fixed << std::setprecision(10) << this->bn1->running_mean.sum().item<double>() << std::endl;
-    std::cout << "CPP ResNetImpl::bn1 running_var sum: " << std::fixed << std::setprecision(10) << this->bn1->running_var.sum().item<double>() << std::endl;
+    register_module("conv1", conv1);
+    register_module("bn1", bn1); // bn1 is already populated correctly

    relu = torch::nn::ReLU(torch::nn::ReLUOptions().inplace(true));
    maxpool = torch::nn::MaxPool2d(torch::nn::MaxPool2dOptions(3).stride(2).padding(1));
@ -195,9 +261,9 @@ torch::nn::Sequential ResNetImpl::_make_layer(int64_t planes_for_block, int64_t
        conv_down->weight = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "0.weight", device);
        bn_down->weight = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.weight", device);
        bn_down->bias = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.bias", device);
-        bn_down->named_buffers()["running_mean"] = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.running_mean", device);
-        bn_down->named_buffers()["running_var"] = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.running_var", device);
-        bn_down->named_buffers()["num_batches_tracked"] = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.num_batches_tracked", device);
+        bn_down->running_mean = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.running_mean", device);
+        bn_down->running_var = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.running_var", device);
+        bn_down->num_batches_tracked = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.num_batches_tracked", device);

        ds_seq->push_back(conv_down);
        ds_seq->push_back(bn_down);
@ -229,9 +295,50 @@ std::map<std::string, torch::Tensor> ResNetImpl::forward(torch::Tensor x) {
    };

    x = conv1->forward(x);
-    if (should_output("conv1_output")) outputs["conv1_output"] = x; 
-    
-    x = bn1->forward(x);
+    if (should_output("conv1_output")) outputs["conv1_output"] = x;
+    if (should_output("debug_resnet_conv1_output_for_bn1_input")) {
+        outputs["debug_resnet_conv1_output_for_bn1_input"] = x.clone(); 
+    }
+    torch::ScalarType original_dtype_resnet_bn1 = x.scalar_type();
+
+    // Apply bn1
+    if (!this->is_training() && bn1) {
+        const auto& bn_module = *bn1;
+        torch::Tensor input_double = x.to(torch::kFloat64);
+        torch::Tensor weight_double = bn_module.weight.defined() ? bn_module.weight.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor bias_double = bn_module.bias.defined() ? bn_module.bias.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor running_mean_double = bn_module.running_mean.to(torch::kFloat64);
+        torch::Tensor running_var_double = bn_module.running_var.to(torch::kFloat64);
+        double eps_double = bn_module.options.eps();
+
+        auto c = x.size(1);
+        torch::Tensor reshaped_running_mean = running_mean_double.reshape({1, c, 1, 1});
+        torch::Tensor reshaped_running_var = running_var_double.reshape({1, c, 1, 1});
+        torch::Tensor reshaped_weight = weight_double.defined() ? weight_double.reshape({1, c, 1, 1}) : torch::Tensor();
+        torch::Tensor reshaped_bias = bias_double.defined() ? bias_double.reshape({1, c, 1, 1}) : torch::Tensor();
+        
+        torch::Tensor centered_x = input_double - reshaped_running_mean;
+        if (should_output("bn1_centered_x")) outputs["bn1_centered_x"] = centered_x.clone();
+
+        torch::Tensor variance_plus_eps = reshaped_running_var + eps_double;
+        if (should_output("bn1_variance_plus_eps")) outputs["bn1_variance_plus_eps"] = variance_plus_eps.clone();
+
+        torch::Tensor inv_std = torch::rsqrt(variance_plus_eps); // Using rsqrt for potential match
+        if (should_output("bn1_inv_std")) outputs["bn1_inv_std"] = inv_std.clone();
+        
+        torch::Tensor normalized_x = centered_x * inv_std;
+        if (should_output("bn1_normalized_x")) outputs["bn1_normalized_x"] = normalized_x.clone();
+
+        torch::Tensor out_double = normalized_x;
+        if (reshaped_weight.defined()) out_double = out_double * reshaped_weight;
+        if (reshaped_bias.defined()) out_double = out_double + reshaped_bias;
+        
+        x = out_double.to(original_dtype_resnet_bn1);
+    } else if (bn1) { // Training mode or if manual is disabled
+        x = bn1->forward(x);
+    }
+    // End apply bn1
+
    if (should_output("bn1_output")) outputs["bn1_output"] = x; 
    
    x = relu->forward(x); 
--- a/test/compare_models.py
+++ b/test/compare_models.py
@ -37,13 +37,23 @@ def get_model_configs(root_dir_param):
    return {
        # ... (existing model_configs definitions)
        'ResNet': {
-            'python_model_loader': lambda: DiMPTorchScriptWrapper(os.path.join(root_dir_param, 'pytracking_models/dimp50_ Ausdruck_ep0050.pth.tar')),
+            'python_model_loader': lambda: DiMPTorchScriptWrapper(os.path.join(root_dir_param, 'pytracking_models/dimp50_ausdruck_ep0050.pth.tar')),
            'cpp_output_subdir': 'resnet',
-            'python_output_subdir': 'resnet_py', # If Python outputs are saved separately
+            'python_output_subdir': 'resnet_py', 
            'outputs_to_compare': {
-                'Conv1': 'conv1_output.pt', # ADDED
-                'BN1': 'bn1_output.pt', # ADDED
-                'ReLU1': 'relu1_output.pt', # ADDED for completeness before MaxPool
+                'Conv1': ('conv1_output.pt', 'conv1'), 
+                'Debug ResNet Conv1->BN1 Input': ('debug_resnet_conv1_output_for_bn1_input.pt', 'conv1_pre_bn'),
+                
+                # BN1 final output (manual C++ vs manual Python pre-ReLU)
+                'BN1': ('bn1_output.pt', 'bn1_post_relu_pre'), 
+
+                # BN1 Intermediate comparisons
+                'BN1 Centered X': ('bn1_centered_x.pt', 'bn1_centered_x_py'),
+                'BN1 Var+Eps': ('bn1_variance_plus_eps.pt', 'bn1_variance_plus_eps_py'),
+                'BN1 InvStd': ('bn1_inv_std.pt', 'bn1_inv_std_py'),
+                'BN1 Normalized X': ('bn1_normalized_x.pt', 'bn1_normalized_x_py'),
+
+                'ReLU1': ('relu1_output.pt', 'conv1'), 
                'MaxPool': 'maxpool_output.pt',
                'Features': 'features.pt', 
                'Layer1': 'layer1.pt',
@ -523,6 +533,9 @@ class ComparisonRunner:
            cpp_mod_vec0_path = cpp_output_bb_reg_dir_path / f'sample_{i}_mod_vec0.pt'
            cpp_mod_vec1_path = cpp_output_bb_reg_dir_path / f'sample_{i}_mod_vec1.pt'
            cpp_iou_scores_path = cpp_output_bb_reg_dir_path / f'sample_{i}_iou_scores.pt'
+            # Paths for debug C++ outputs
+            cpp_debug_conv3_1t_path = cpp_output_bb_reg_dir_path / f'sample_{i}_debug_conv3_1t_output.pt'
+            cpp_debug_conv4_1t_path = cpp_output_bb_reg_dir_path / f'sample_{i}_debug_conv4_1t_output.pt'

            # Load initial inputs for Python model
            py_image_tensor = self.load_cpp_tensor(py_image_input_path, self.device)
@ -549,6 +562,31 @@ class ComparisonRunner:
            else:
                print(f"Warning: Skipping Python BB Regressor for sample {i}, image input not found at {py_image_input_path}")

+            # ---- Intermediate debug outputs for conv3_1t and conv4_1t ----
+            py_debug_conv3_1t_out = None
+            py_debug_conv4_1t_out = None
+
+            if py_feat_layer2 is not None:
+                try:
+                    _feat2_for_debug_conv3_1t = py_feat_layer2
+                    if _feat2_for_debug_conv3_1t.dim() == 5:
+                        _feat2_for_debug_conv3_1t = _feat2_for_debug_conv3_1t.reshape(-1, *_feat2_for_debug_conv3_1t.shape[-3:])
+                    with torch.no_grad(): # Ensure no_grad context
+                        py_debug_conv3_1t_out = self.bb_regressor_from_source.conv3_1t(_feat2_for_debug_conv3_1t)
+                except Exception as e:
+                    print(f"ERROR calculating Python Debug_Conv3_1t for sample {i}: {e}")
+            
+            if py_feat_layer3 is not None:
+                try:
+                    _feat3_for_debug_conv4_1t = py_feat_layer3
+                    if _feat3_for_debug_conv4_1t.dim() == 5:
+                        _feat3_for_debug_conv4_1t = _feat3_for_debug_conv4_1t.reshape(-1, *_feat3_for_debug_conv4_1t.shape[-3:])
+                    with torch.no_grad(): # Ensure no_grad context
+                        py_debug_conv4_1t_out = self.bb_regressor_from_source.conv4_1t(_feat3_for_debug_conv4_1t)
+                except Exception as e:
+                    print(f"ERROR calculating Python Debug_Conv4_1t for sample {i}: {e}")
+            # ---- End intermediate debug outputs ----
+
            # Get Python IoU features
            py_iou_feat_list = [None, None] # Initialize as a list of two Nones
            if py_feat_layer2 is not None and py_feat_layer3 is not None:
@ -622,8 +660,13 @@ class ComparisonRunner:
            cpp_mod_vec0 = self.load_cpp_tensor(cpp_mod_vec0_path, self.device)
            cpp_mod_vec1 = self.load_cpp_tensor(cpp_mod_vec1_path, self.device)
            cpp_iou_scores = self.load_cpp_tensor(cpp_iou_scores_path, self.device)
+            # Load debug C++ tensors
+            cpp_debug_conv3_1t_tensor = self.load_cpp_tensor(cpp_debug_conv3_1t_path, self.device)
+            cpp_debug_conv4_1t_tensor = self.load_cpp_tensor(cpp_debug_conv4_1t_path, self.device)

            # Comparisons
+            self._compare_tensor_data(py_debug_conv3_1t_out, cpp_debug_conv3_1t_tensor, "BBReg Debug_Conv3_1t", i, current_errors)
+            self._compare_tensor_data(py_debug_conv4_1t_out, cpp_debug_conv4_1t_tensor, "BBReg Debug_Conv4_1t", i, current_errors)
            self._compare_tensor_data(py_iou_feat_list[0], cpp_iou_feat0, "BBReg PyIoUFeat0 vs CppIoUFeat0", i, current_errors)
            self._compare_tensor_data(py_iou_feat_list[1], cpp_iou_feat1, "BBReg PyIoUFeat1 vs CppIoUFeat1", i, current_errors)
            self._compare_tensor_data(py_modulation_list[0], cpp_mod_vec0, "BBReg PyMod0 vs CppMod0", i, current_errors)
@ -633,224 +676,311 @@ class ComparisonRunner:
            if current_errors: self.all_comparison_stats[f"BBReg_Sample_{i}"] = current_errors

    def compare_resnet_outputs(self):
-        print("Comparing ResNet outputs...")
-        print("\n--- Types at START of compare_resnet_outputs: ---")
-        if 'ResNet' in self.models: print(f"  self.models['ResNet'] type: {type(self.models['ResNet'])}")
-        if 'Classifier' in self.models: print(f"  self.models['Classifier'] type: {type(self.models['Classifier'])}")
-        if 'BBRegressor' in self.models: print(f"  self.models['BBRegressor'] type: {type(self.models['BBRegressor'])}")
+        print("\\n--- Comparing ResNet Outputs ---")
+        if not self.models.get('ResNet'):
+            print("PYTHON: ResNet model not loaded, skipping ResNet comparison.")
+            return

-        py_input_common_dir = os.path.join(self.root_dir, 'test', 'input_samples', 'common')
-        cpp_output_resnet_dir = os.path.join(self.cpp_output_dir, 'resnet')
-        # Ensure self.py_resnet_output_dir is defined, e.g., in __init__ or where other py output dirs are
-        if not hasattr(self, 'py_resnet_output_dir') or not self.py_resnet_output_dir:
-            self.py_resnet_output_dir = Path(self.python_output_dir) / 'resnet'
-            self.py_resnet_output_dir.mkdir(parents=True, exist_ok=True)
+        resnet_model = self.models['ResNet']
+        config = self.model_configs['ResNet']
+        cpp_resnet_dir = os.path.join(self.cpp_output_dir, config['cpp_output_subdir'])
+        
+        python_resnet_save_dir = os.path.join(self.python_output_dir, config.get('python_output_subdir', config['cpp_output_subdir']))
+        if not os.path.exists(python_resnet_save_dir):
+            os.makedirs(python_resnet_save_dir, exist_ok=True)
+
+        num_samples_to_process = self.num_samples
+        if num_samples_to_process == -1: # If -1, determine from available C++ output files
+            # This logic can be complex if C++ output is sparse. For now, let's assume if -1 it means process all *common* inputs.
+            # A safer way for -1 would be to count common input samples first.
+            common_input_glob = os.path.join(self.root_dir, "test", "input_samples", "common", "sample_*_image.pt")
+            num_samples_to_process = len(glob.glob(common_input_glob))
+            print(f"INFO: num_samples set to -1, determined {num_samples_to_process} common input samples.")
+
+        processed_samples_count = 0 # Renamed from processed_samples to avoid conflict
+        
+        sample_input_base_dir = os.path.join(self.root_dir, "test", "input_samples", "common")
+        
+        # Loop exactly self.num_samples times (or detected count if -1)
+        for sample_idx in tqdm(range(num_samples_to_process), desc="Comparing ResNet samples"):
+            current_errors = {} # Initialize for each sample
+            python_intermediate_outputs_cache = {} # Reset for each sample
+
+            # Construct the input file path based on sample_idx
+            sample_input_file_path = os.path.join(sample_input_base_dir, f"sample_{sample_idx}_image.pt")
+
+            if not os.path.exists(sample_input_file_path):
+                print(f"Warning: Input sample file {sample_input_file_path} not found for sample index {sample_idx}. Skipping ResNet sample.")
+                empty_errors_for_skipped_sample = {}
+                for output_key_config in config['outputs_to_compare'].keys():
+                    self._compare_tensor_data(None, None, output_key_config, sample_idx, empty_errors_for_skipped_sample)
+                if empty_errors_for_skipped_sample:
+                     self.all_comparison_stats[f"ResNet_Sample_{sample_idx}"] = empty_errors_for_skipped_sample
+                continue
+            
+            # --- START REINSTATED INPUT LOADING AND PREPROCESSING ---
+            input_tensor = self.load_cpp_tensor(sample_input_file_path, self.device, is_image=True)
+
+            if input_tensor is None:
+                print(f"Warning: Failed to load a valid tensor for ResNet input sample {sample_input_file_path} (sample {sample_idx}) using self.load_cpp_tensor. Skipping.")
+                # Populate NaNs for all expected outputs for this sample
+                empty_errors_for_skipped_sample = {}
+                for output_key_config in config['outputs_to_compare'].keys():
+                    self._compare_tensor_data(None, None, output_key_config, sample_idx, empty_errors_for_skipped_sample)
+                if empty_errors_for_skipped_sample:
+                    self.all_comparison_stats[f"ResNet_Sample_{sample_idx}"] = empty_errors_for_skipped_sample
+                continue

-        # Define Path objects for directory checks
-        py_input_common_dir_path = Path(py_input_common_dir)
-        cpp_output_resnet_dir_path = Path(cpp_output_resnet_dir)
-
-        comparison_configs = [
-            ("ResNet Conv1 Output (Pre-BN)", "_conv1_output_py.pt", "_conv1_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Conv1", "_conv1_output.pt", "_conv1_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir), # Assumes Py also saved conv1 output if it was meant to be same as C++ pre-bn
-            ("ResNet BN1", "_bn1_output.pt", "_bn1_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet ReLU1", "_relu1_output.pt", "_relu1_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet MaxPool", "_maxpool_output.pt", "_maxpool_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Layer1.0 Block Output", "_layer1_0_block_output.pt", "_layer1_0_block_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Layer1.0 Shortcut Output", "_layer1_0_shortcut_output.pt", "_layer1_0_shortcut_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Layer1", "_layer1_output.pt", "_layer1_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Layer2", "_layer2_output.pt", "_layer2_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Layer3", "_layer3_output.pt", "_layer3_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Layer4", "_layer4_output.pt", "_layer4_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Features", "_features_output.pt", "_features_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir) 
-        ]
-
-        if not py_input_common_dir_path.exists() or not cpp_output_resnet_dir_path.exists():
-            print(f"ResNet input ({py_input_common_dir_path}) or C++ ResNet output dir ({cpp_output_resnet_dir_path}) not found. Skipping ResNet comparison.")
-            # Populate NaN for all expected ResNet comparisons if dirs are missing
-            for i in range(self.num_samples):
-                sample_key_base = f"ResNet_Sample_{i}"
-                current_errors = {}
-                self._compare_tensor_data(None, None, "ResNet Layer1", i, current_errors)
-                self._compare_tensor_data(None, None, "ResNet Layer2", i, current_errors)
-                self._compare_tensor_data(None, None, "ResNet Layer3", i, current_errors)
-                self._compare_tensor_data(None, None, "ResNet Layer4", i, current_errors)
-                self._compare_tensor_data(None, None, "ResNet Features", i, current_errors)
-                self.all_comparison_stats[sample_key_base] = current_errors
-            return
+            if not isinstance(input_tensor, torch.Tensor):
+                print(f"Warning: self.load_cpp_tensor for {sample_input_file_path} did not return a Tensor (got {type(input_tensor)}). Skipping sample {sample_idx}.")
+                # Populate NaNs for all expected outputs for this sample
+                empty_errors_for_skipped_sample = {}
+                for output_key_config in config['outputs_to_compare'].keys():
+                    self._compare_tensor_data(None, None, output_key_config, sample_idx, empty_errors_for_skipped_sample)
+                if empty_errors_for_skipped_sample:
+                    self.all_comparison_stats[f"ResNet_Sample_{sample_idx}"] = empty_errors_for_skipped_sample
+                continue

-        for i in tqdm(range(self.num_samples), desc="ResNet samples"):
-            current_errors = {} # For this sample
+            # Preprocess the input tensor for Python's ResNet
+            if hasattr(self.python_wrapper, 'preprocess_image'):
+                processed_input_tensor = self.python_wrapper.preprocess_image(input_tensor.clone()) # Use clone
+            else:
+                print("Warning: python_wrapper.preprocess_image not found. Using input_tensor as is.")
+                processed_input_tensor = input_tensor.to(self.device) # Ensure device
+            # --- END REINSTATED INPUT LOADING AND PREPROCESSING ---

-            py_image_input_path = py_input_common_dir_path / f'sample_{i}_image.pt'
-            py_image_tensor = self.load_cpp_tensor(py_image_input_path, self.device)
+            # Initialize dictionaries to store Python-side outputs for the current sample
+            python_outputs = {} # To store outputs from the Python model for this sample

-            py_conv1_out, py_bn1_out, py_relu1_out, py_maxpool_out, py_layer1_out, py_layer2_out, py_layer3_out, py_layer4_out, py_features_out = None, None, None, None, None, None, None, None, None # ADDED py_conv1_out, py_bn1_out, py_relu1_out
-            py_layer1_0_shortcut_out = None
+            try:
+                # Python ResNet forward pass (assuming it's a JIT model or similar)
+                # The output of a JIT ResNet model might be a dictionary or a list/tuple of tensors
+                # We need to ensure we can map these to the 'outputs_to_compare' keys
+                print(f"PYTHON ResNet forward pass for sample {sample_idx}...")
+                
+                # For ResNet, the output is a dictionary from its forward method.
+                # output_layers = list(config['outputs_to_compare'].keys()) # This might be too broad initially
+                
+                # Define the layers we actually need from the Python ResNet forward pass.
+                # These should match the keys used in the Python ResNet's forward method.
+                # e.g., ['layer1', 'layer2', 'layer3', 'layer4', 'conv1_output', 'bn1_output', etc.]
+                # For now, let's define specific layers needed for the comparison.
+                # The JIT ResNet model we have should output a dictionary.
+
+                py_output_layers_needed = ['conv1', 'layer1', 'layer2', 'layer3', 'layer4']
+                # Add 'conv1_pre_bn' if we need to compare the input to BN1
+                if 'Debug ResNet Conv1->BN1 Input' in config['outputs_to_compare']:
+                    py_output_layers_needed.append('conv1_pre_bn')
+                
+                # If we are comparing the direct C++ BN1 output, we need 'bn1_output' from Python
+                if 'BN1' in config['outputs_to_compare']:
+                    py_output_layers_needed.append('bn1_output') 
+                
+                # If we are comparing the C++ ReLU1 output (after BN1 and ReLU), we need 'bn1_post_relu_pre' from Python
+                if 'ReLU1' in config['outputs_to_compare']:
+                    py_output_layers_needed.append('bn1_post_relu_pre')
+
+                # Add Python-side BN1 intermediate layer names if they are in outputs_to_compare
+                # The config value (cpp_output_filename_or_tuple) is not directly used here for this part,
+                # we care about the py_dict_key that will be derived from the C++ key.
+                bn1_intermediate_py_keys_to_request = []
+                if 'BN1 Centered X' in config['outputs_to_compare']:
+                    bn1_intermediate_py_keys_to_request.append('bn1_centered_x_py')
+                if 'BN1 Var+Eps' in config['outputs_to_compare']:
+                    bn1_intermediate_py_keys_to_request.append('bn1_variance_plus_eps_py')
+                if 'BN1 InvStd' in config['outputs_to_compare']:
+                    bn1_intermediate_py_keys_to_request.append('bn1_inv_std_py')
+                if 'BN1 Normalized X' in config['outputs_to_compare']:
+                    bn1_intermediate_py_keys_to_request.append('bn1_normalized_x_py')
+                
+                for py_key in bn1_intermediate_py_keys_to_request:
+                    if py_key not in py_output_layers_needed:
+                        py_output_layers_needed.append(py_key)

-            if py_image_tensor is not None:
-                # Save Python's preprocessed input to conv1
-                # This py_image_tensor is already preprocessed by DiMPTorchScriptWrapper.extract_backbone -> preprocess_image
-                # which is called before this compare_resnet_outputs function if we follow the logic for py_feat_layer2, py_feat_layer3 in compare_bb_regressor
-                # However, here in compare_resnet_outputs, py_image_tensor comes from load_cpp_tensor(py_image_input_path, ...)
-                # which is the RAW image. Preprocessing for python side happens inside self.python_wrapper.extract_backbone
-                # or when we manually call py_model_resnet.conv1(py_image_tensor)
-                # Let's get the preprocessed image from the wrapper as that's the true input to Python's ResNet
+                # Add 'fc' if configured, though not typically used in these comparisons
+                if 'fc' in config['outputs_to_compare']:
+                     py_output_layers_needed.append('fc')
                
-                # The input to python_wrapper.extract_backbone is the raw image tensor
-                # It then calls self.preprocess_image(im) and then self.net.extract_backbone_features(im, layers)
-                # So, py_image_tensor IS the raw image. We need to get the preprocessed one.
-
-                preprocessed_py_image_for_conv1 = None
-                if self.python_wrapper:
-                    # Manually preprocess for saving, mimicking what extract_backbone would do before its first conv
-                    preprocessed_py_image_for_conv1 = self.python_wrapper.preprocess_image(py_image_tensor.clone()) # Clone to avoid in-place modification of py_image_tensor
-                    py_preprocessed_save_path = Path(self.cpp_output_dir) / 'resnet' / f'sample_{i}_image_preprocessed_python.pt'
-                    # Ensure self.cpp_output_dir / resnet exists
-                    (Path(self.cpp_output_dir) / 'resnet').mkdir(parents=True, exist_ok=True)
-                    torch.save(preprocessed_py_image_for_conv1.cpu(), str(py_preprocessed_save_path))
-                    print(f"Saved Python preprocessed image for sample {i} to {py_preprocessed_save_path}")
+                # Deduplicate, just in case (though construction above should be fine)
+                py_output_layers_needed = list(OrderedDict.fromkeys(py_output_layers_needed))
+                
+                print(f"DEBUG: Requesting these layers from Python ResNet: {py_output_layers_needed}")
+
+                # Call the Python ResNet forward
+                # The `self.models['ResNet']` should be the loaded JIT model
+                # It expects the output_layers argument.
+                # The DiMPTorchScriptWrapper's backbone should also support this.
+                if hasattr(resnet_model, 'forward') and callable(getattr(resnet_model, 'forward')) and 'output_layers' in inspect.signature(resnet_model.forward).parameters:
+                    python_model_outputs_dict = resnet_model.forward(processed_input_tensor, output_layers=py_output_layers_needed)
+                elif hasattr(self.python_wrapper, 'extract_backbone') and callable(getattr(self.python_wrapper, 'extract_backbone')):
+                    # This is the case if ResNet is accessed via the DiMPTorchScriptWrapper's extract_backbone,
+                    # which internally calls the backbone's forward with output_layers.
+                    python_model_outputs_dict = self.python_wrapper.extract_backbone(input_tensor.clone()) # extract_backbone handles preprocessing
+                else:
+                    print(f"ERROR: Cannot call forward on Python ResNet model. Type: {type(resnet_model)}")
+                    continue
+
+                # DEBUG: Print keys from Python model output
+                if isinstance(python_model_outputs_dict, dict):
+                    print(f"DEBUG RN_CMP: Keys from python_model_outputs_dict (sample {sample_idx}): {list(python_model_outputs_dict.keys())}")
                else:
-                    print("ERROR: self.python_wrapper not available to get preprocessed image for Python.")
+                    print(f"DEBUG RN_CMP: python_model_outputs_dict is not a dict (sample {sample_idx}), type: {type(python_model_outputs_dict)}")
+
+                # Populate python_outputs directly from python_model_outputs_dict (keys are the Python layer names).
+                # The mapping from 'outputs_to_compare' keys to these Python keys is done later, per output, via py_dict_key.
+                if isinstance(python_model_outputs_dict, dict):
+                    python_outputs = python_model_outputs_dict
+                    # If 'features' is an alias for 'layer4' in Python output
+                    if 'layer4' in python_outputs and 'features' not in python_outputs:
+                         python_outputs['features'] = python_outputs['layer4']
+                    if 'conv1_output' in python_outputs:
+                        python_intermediate_outputs_cache['conv1_output'] = python_outputs['conv1_output']
+
+                else:
+                    print(f"ERROR: Python ResNet output is not a dict. Got {type(python_model_outputs_dict)}")
+                    # Handle tuple/list output if necessary, mapping by order or specific logic.
+                    # For now, we assume dict output from our ResNet.
+                    continue
+
+
+            except Exception as e:
+                print(f"Error during Python ResNet forward pass for sample {sample_idx}: {e}")
+                import traceback
+                traceback.print_exc()
+                continue # Skip to next sample
+
+            for output_key, cpp_output_filename_or_tuple in config['outputs_to_compare'].items():
+                is_python_specific_name = isinstance(cpp_output_filename_or_tuple, tuple)
+                cpp_output_filename = cpp_output_filename_or_tuple[0] if is_python_specific_name else cpp_output_filename_or_tuple
+                
+                # Path construction for C++ ResNet tensors:
+                # cpp_output_filename (from the config) is expected NOT to contain the sample index,
+                # so the "sample_<idx>_" prefix is prepended below before joining with cpp_resnet_dir.
+                # The C++ code saves ResNet outputs as sample_X_LAYERNAME.pt directly in cpp_resnet_dir,
+                # not in a per-sample subdirectory for ResNet outputs.
+                # Reference: test_models.cpp -> save_resnet_outputs ->
+                # file_path = resnet_output_dir + "/sample_" + std::to_string(sample_idx) + "_" + output_name;
+                # This means filenames are like "sample_0_conv1_output.pt" directly in "../test/output/resnet/"
+                
+                correct_cpp_tensor_filename = f"sample_{sample_idx}_{cpp_output_filename}"
+                cpp_tensor_path = os.path.join(cpp_resnet_dir, correct_cpp_tensor_filename)
+
+                # <<< START ADDED DEBUG PRINTS >>>
+                print(f"DEBUG RN_CMP: Attempting to load C++ tensor for '{output_key}' (sample {sample_idx}) from: {cpp_tensor_path}")
+                # <<< END ADDED DEBUG PRINTS >>>

                try:
-                    with torch.no_grad():
-                        py_model_resnet = self.models.get('ResNet')
-                        if py_model_resnet:
-                            current_features = preprocessed_py_image_for_conv1
-
-                            py_conv1_out = py_model_resnet.conv1(current_features)
-                            # Ensure self.py_resnet_output_dir is defined and is a Path object
-                            if not hasattr(self, 'py_resnet_output_dir') or not self.py_resnet_output_dir:
-                                self.py_resnet_output_dir = Path(self.python_output_dir) / 'resnet'
-                                self.py_resnet_output_dir.mkdir(parents=True, exist_ok=True)
-                            py_conv1_out_path = self.py_resnet_output_dir / f'sample_{i}_conv1_output_py.pt'
-                            torch.save(py_conv1_out.cpu(), str(py_conv1_out_path))
-
-                            # --- BN1 on CPU for debugging (Python) ---
-                            py_bn1_out = py_model_resnet.bn1(py_conv1_out)  # Original line
-
-                            py_relu1_out = py_model_resnet.relu(py_bn1_out) 
-                            py_maxpool_out = py_model_resnet.maxpool(py_relu1_out) 
-                            x_for_py_layer1_input = py_maxpool_out
-
-                            # Output of the first bottleneck block in layer1
-                            py_layer1_0_block_out_tensor = None # Initialize to avoid ref before assignment if try fails
-                            if hasattr(py_model_resnet, 'layer1') and len(py_model_resnet.layer1) > 0:
-                                try:
-                                    py_layer1_0_block_out_tensor = py_model_resnet.layer1[0](x_for_py_layer1_input) # REMOVED .clone() for consistency with best Layer1.0 result
-                                    # Ensure cpp_resnet_sample_dir is defined, if not, use a fallback or define it earlier
-                                    # Assuming cpp_resnet_sample_dir is defined like: cpp_resnet_sample_dir = Path(self.cpp_output_dir) / 'resnet'
-                                    # Which should be: cpp_resnet_dir = Path(self.cpp_output_dir) / 'resnet' # as per usage elsewhere
-                                    # And then: cpp_resnet_sample_dir = cpp_resnet_dir # if sample specific subdirs are not used for this
-                                    # For safety, let's use the already established cpp_output_resnet_dir path from later in the code
-                                    # cpp_output_resnet_dir = os.path.join(self.cpp_output_dir, 'resnet')
-                                    # Need to ensure cpp_output_resnet_dir is a Path object if used with /
-                                    # From later code: cpp_output_resnet_dir_path = Path(self.cpp_output_dir) / 'resnet' 
-
-                                    current_cpp_resnet_dir = Path(self.cpp_output_dir) / 'resnet' # Define it based on existing patterns
-                                    current_cpp_resnet_dir.mkdir(parents=True, exist_ok=True) # Ensure directory exists
-
-                                    py_layer1_0_block_save_path = current_cpp_resnet_dir / f'sample_{i}_layer1_0_block_output.pt'
-                                    torch.save(py_layer1_0_block_out_tensor.cpu(), str(py_layer1_0_block_save_path))
-                                    # print(f"DEBUG: Saved Python layer1[0] block output for sample {i} to {py_layer1_0_block_save_path}")
-                                except Exception as e_block:
-                                    print(f"ERROR: Failed to get/save Python layer1[0] block output for sample {i}: {e_block}")
-
-                            # Shortcut for layer1.0 (if exists)
-                            if hasattr(py_model_resnet, 'layer1') and len(py_model_resnet.layer1) > 0 and \
-                               hasattr(py_model_resnet.layer1[0], 'downsample') and py_model_resnet.layer1[0].downsample is not None:
-                                py_layer1_0_shortcut_out = py_model_resnet.layer1[0].downsample(x_for_py_layer1_input.clone())
-                            
-                            # Get full backbone outputs using the wrapper (which uses the raw image_tensor and preprocesses internally)
-                            # This ensures layer1, layer2, etc., are from the standard path.
-                            if self.python_wrapper:
-                                py_backbone_outputs = self.python_wrapper.extract_backbone(py_image_tensor) # py_image_tensor is raw
-                            else:
-                                print("ERROR: self.python_wrapper is None, cannot extract backbone features for ResNet outputs.")
-                                py_backbone_outputs = {} 
-
-                            py_layer1_out = py_backbone_outputs.get('layer1')
-                            py_layer2_out = py_backbone_outputs.get('layer2')
-                            py_layer3_out = py_backbone_outputs.get('layer3')
-                            py_layer4_out = py_backbone_outputs.get('layer4')
-                            py_features_out = py_backbone_outputs.get('layer4') # Typically layer4 is the final feature map
-                        else:
-                            print("ERROR: Python ResNet model not found in self.models")
-                except Exception as e:
-                    print(f"ERROR: Python ResNet backbone/shortcut processing failed for sample {i}: {e}")
-            else:
-                print(f"Warning: Skipping Python ResNet for sample {i}, image input not found at {py_image_input_path}")
+                    cpp_tensor = self.load_cpp_tensor(cpp_tensor_path, self.device)
+                    # <<< START ADDED DEBUG PRINTS >>>
+                    loaded_status = "None"
+                    if cpp_tensor is not None:
+                        loaded_status = f"Tensor with shape {cpp_tensor.shape}, dtype {cpp_tensor.dtype}, device {cpp_tensor.device}"
+                    print(f"DEBUG RN_CMP: Loaded C++ tensor for '{output_key}' (sample {sample_idx}): {loaded_status}")
+                    # <<< END ADDED DEBUG PRINTS >>>
+
+                    if cpp_tensor is None:
+                        print(f"Warning: C++ tensor {cpp_output_filename} for sample {sample_idx} ('{output_key}') is None or loading failed. Skipping comparison for this output.")
+                        # _compare_tensor_data will be called with cpp_tensor=None, which handles NaN population
+                        # Fall through to _compare_tensor_data to record NaNs
+                        # continue # This would skip the _compare_tensor_data call entirely
+
+                    # Get the corresponding Python tensor
+                    python_tensor = None
+                    python_output_save_path = os.path.join(python_resnet_save_dir, f"sample_{sample_idx}", cpp_output_filename) # Save with same name as C++ for consistency
+
+                    # Map the 'output_key' from config to the key used in 'python_outputs' dictionary
+                    # This requires knowing how 'outputs_to_compare' keys map to Python model output dict keys.
+                    # Example: 'Conv1' maps to 'conv1_pre_bn', 'BN1' to 'bn1_output', 'Features' to 'layer4', etc.
+                    
+                    py_dict_key = None
+                    if output_key == 'Conv1': 
+                        py_dict_key = 'conv1_pre_bn' # Python ResNet outputs combined conv1+bn1+relu as 'conv1'
+                    elif output_key == 'Debug ResNet Conv1->BN1 Input':
+                        py_dict_key = 'conv1_pre_bn' # Our new specific output layer
+                    elif output_key == 'BN1': 
+                        py_dict_key = 'bn1_output' # CHANGED to use the new hook
+                    elif output_key == 'BN1 Centered X':
+                        py_dict_key = 'bn1_centered_x_py'
+                    elif output_key == 'BN1 Var+Eps':
+                        py_dict_key = 'bn1_variance_plus_eps_py'
+                    elif output_key == 'BN1 InvStd':
+                        py_dict_key = 'bn1_inv_std_py'
+                    elif output_key == 'BN1 Normalized X':
+                        py_dict_key = 'bn1_normalized_x_py'
+                    elif output_key == 'ReLU1':
+                        py_dict_key = 'bn1_post_relu_pre' # Output of Python's BN1 + ReLU
+                    elif output_key == 'MaxPool':
+                        # MaxPool is applied *after* 'conv1' (conv1+bn1+relu) block in Python ResNet.
+                        # However, the Python ResNet forward doesn't have a separate 'maxpool' output key.
+                        # The output of layer1 is *after* maxpool.
+                        # C++ saves maxpool_output.pt *before* layer1.
+                        # This means we need to save python_outputs['conv1'] (after conv1,bn1,relu) then apply maxpool to it manually for comparison.
+                        # OR, recognize that C++ output for maxpool is input to layer1.
+                        # For now, this is tricky. Let's see if layer1 input in C++ matches python maxpool output.
+                        # The Python output named 'layer1' is after the nn.Sequential that IS layer1.
+                        # The input to C++ layer1 is the output of C++ maxpool.
+                        # The input to Python model.layer1 is the output of model.maxpool(model.relu(model.bn1(model.conv1(x)))).
+                        # So, Python's 'conv1' output, when passed through an nn.MaxPool2d, should match C++ 'maxpool_output.pt'.
+                        print(f"Warning: Direct Python equivalent for C++ 'MaxPool' output is complex. Requires manual maxpool application to Python's 'conv1' output. Skipping {output_key} for now.")
+                        continue # Skip this key for now
+                    elif output_key == 'Layer1': py_dict_key = 'layer1'
+                    elif output_key == 'Layer2': py_dict_key = 'layer2'
+                    elif output_key == 'Layer3': py_dict_key = 'layer3'
+                    elif output_key == 'Layer4': py_dict_key = 'layer4'
+                    elif output_key == 'Features': py_dict_key = 'layer4' # 'Features' is an alias for 'layer4'
+                    elif output_key == 'Layer1.0 Shortcut':
+                        # Shortcut outputs are not available from the Python ResNet forward method.
+                        print(f"Warning: Shortcut output '{output_key}' cannot be directly fetched from Python ResNet. Skipping.")
+                        continue
+                    else:
+                        print(f"Warning: Unknown output_key '{output_key}' in ResNet config for Python tensor mapping. Skipping.")
+                        continue

-            # Load C++ ResNet outputs
+                    if py_dict_key and py_dict_key in python_outputs:
+                        python_tensor = python_outputs[py_dict_key]
+                    else:
+                        # DEBUG: Print info if key is not found
+                        print(f"DEBUG RN_CMP: py_dict_key '{py_dict_key}' not found in python_outputs (keys: {list(python_outputs.keys())}) for output_key '{output_key}', sample {sample_idx}")
+                    
+                    if python_tensor is None:
+                        print(f"Warning: Python tensor for {output_key} is None for sample {sample_idx}. Skipping.")
+                        continue

-            # NEW: Debug directory listing
-            print(f"DEBUG: Listing contents of {cpp_output_resnet_dir_path} before loading tensors for sample {i}:")
-            try:
-                if cpp_output_resnet_dir_path.exists() and cpp_output_resnet_dir_path.is_dir():
-                    for item_path in cpp_output_resnet_dir_path.iterdir():
-                        print(f"  - {item_path.name}")
-                else:
-                    print(f"  Directory {cpp_output_resnet_dir_path} does not exist or is not a directory.")
-            except Exception as e_list:
-                print(f"  ERROR listing directory: {e_list}")
-            # END NEW # Removing this marker
-
-            time.sleep(0.5) # INCREASED to 0.5s delay to allow filesystem to sync
-
-            # Debug blocks for directory listing and direct open test were here and are now fully removed.
-
-            cpp_layer1_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_layer1.pt')
-            cpp_layer2_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_layer2.pt')
-            cpp_layer3_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_layer3.pt')
-            cpp_layer4_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_layer4.pt')
-            cpp_features_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_features.pt')
-            cpp_layer1_0_shortcut_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_layer1_0_shortcut_output.pt')
-            cpp_maxpool_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_maxpool_output.pt')
-            cpp_conv1_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_conv1_output.pt') # ADDED
-            cpp_bn1_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_bn1_output.pt') # ADDED
-            cpp_relu1_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_relu1_output.pt') # ADDED
-            cpp_layer1_0_block_output_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_layer1_0_block_output.pt') # ADDED
-
-            cpp_layer1_out = self.load_cpp_tensor(cpp_layer1_path, self.device)
-            cpp_layer2_out = self.load_cpp_tensor(cpp_layer2_path, self.device)
-            cpp_layer3_out = self.load_cpp_tensor(cpp_layer3_path, self.device)
-            cpp_layer4_out = self.load_cpp_tensor(cpp_layer4_path, self.device)
-            cpp_features_out = self.load_cpp_tensor(cpp_features_path, self.device)
-            cpp_layer1_0_shortcut_out = self.load_cpp_tensor(cpp_layer1_0_shortcut_path, self.device)
-            cpp_maxpool_out = self.load_cpp_tensor(cpp_maxpool_path, self.device)
-            cpp_conv1_out = self.load_cpp_tensor(cpp_conv1_path, self.device) # ADDED
-            cpp_bn1_out = self.load_cpp_tensor(cpp_bn1_path, self.device) # ADDED
-            cpp_relu1_out = self.load_cpp_tensor(cpp_relu1_path, self.device) # ADDED
-            cpp_layer1_0_block_output_tensor = self.load_cpp_tensor(cpp_layer1_0_block_output_path, self.device) # ADDED
-
-            # Load the Python pre-BN conv1 output that was saved earlier
-            py_conv1_out_pre_bn_tensor = None
-            # Ensure self.py_resnet_output_dir is defined (it should be if the save operation worked)
-            if hasattr(self, 'py_resnet_output_dir') and self.py_resnet_output_dir:
-                py_conv1_out_pre_bn_path = self.py_resnet_output_dir / f'sample_{i}_conv1_output_py.pt'
-                if py_conv1_out_pre_bn_path.exists():
-                    try:
-                        py_conv1_out_pre_bn_tensor = torch.load(str(py_conv1_out_pre_bn_path), map_location=self.device)
-                    except Exception as e_load_py_conv1:
-                        print(f"Error loading Python conv1_output_py (pre-BN) for sample {i}: {e_load_py_conv1}")
-            else:
-                print(f"Warning: self.py_resnet_output_dir not defined, cannot load py_conv1_output_py.pt for sample {i}")
+                    # Save the Python tensor (always, for record-keeping)
+                    os.makedirs(os.path.dirname(python_output_save_path), exist_ok=True)
+                    torch.save(python_tensor.cpu(), python_output_save_path)
+                    # print(f"Saved Python tensor for {output_key} (sample {sample_idx}) to {python_output_save_path}")

-            # Comparisons
-            self._compare_tensor_data(py_conv1_out_pre_bn_tensor, cpp_conv1_out, "ResNet Conv1 Output (Pre-BN)", i, current_errors)
-            self._compare_tensor_data(py_conv1_out, cpp_conv1_out, "ResNet Conv1", i, current_errors)
-            self._compare_tensor_data(py_bn1_out, cpp_bn1_out, "ResNet BN1", i, current_errors)
-            self._compare_tensor_data(py_relu1_out, cpp_relu1_out, "ResNet ReLU1", i, current_errors)
-            self._compare_tensor_data(py_maxpool_out, cpp_maxpool_out, "ResNet MaxPool", i, current_errors)
-            self._compare_tensor_data(py_layer1_out, cpp_layer1_out, "ResNet Layer1", i, current_errors)
-            self._compare_tensor_data(py_layer2_out, cpp_layer2_out, "ResNet Layer2", i, current_errors)
-            self._compare_tensor_data(py_layer3_out, cpp_layer3_out, "ResNet Layer3", i, current_errors)
-            self._compare_tensor_data(py_layer4_out, cpp_layer4_out, "ResNet Layer4", i, current_errors)
-            self._compare_tensor_data(py_features_out, cpp_features_out, "ResNet Features", i, current_errors)
-            self._compare_tensor_data(py_layer1_0_shortcut_out, cpp_layer1_0_shortcut_out, "ResNet Layer1.0 Shortcut", i, current_errors)
-
-            if current_errors: self.all_comparison_stats[f"ResNet_Sample_{i}"] = current_errors
+
+                    # Perform comparison
+                    self._compare_tensor_data(python_tensor.to(self.device) if python_tensor is not None else None, 
+                                          cpp_tensor, 
+                                          output_key, 
+                                          sample_idx, 
+                                          current_errors) # current_errors is populated in place
+                    
+                    # The line above was changed to handle python_tensor being None before .to(self.device)
+                    # current_errors is populated by _compare_tensor_data directly.
+                    # self.all_comparison_stats is updated after this inner loop completes for the sample.
+
+                except FileNotFoundError:
+                    print(f"Warning: C++ output file not found: {cpp_tensor_path}. Skipping for sample {sample_idx}, output {output_key}.")
+                    # Populate NaNs for this missing C++ file
+                    self._compare_tensor_data(None, None, output_key, sample_idx, current_errors)
+                except Exception as e:
+                    print(f"Error comparing {output_key} for sample {sample_idx}: {e}")
+                    import traceback
+                    traceback.print_exc()
+                    # Populate NaNs on error
+                    self._compare_tensor_data(None, None, output_key, sample_idx, current_errors)
+            
+            # After processing all output_keys for this sample, store the collected current_errors
+            if current_errors: # If any comparisons were attempted (even if they resulted in NaNs)
+                self.all_comparison_stats[f"ResNet_Sample_{sample_idx}"] = current_errors
+
+            # processed_samples += 1 # This variable is no longer used as loop is range-based
+        print("--- ResNet Output Comparison Complete ---")

    def generate_html_report(self):
        print("\nGenerating HTML report...")
@ -1200,73 +1330,247 @@ class ComparisonRunner:
            print("Preprocessed input comparison: ISSUES FOUND (details above).")

    def load_cpp_tensor(self, file_path_str, device, is_image=False):
-        file_path_obj = Path(file_path_str) # Convert to Path object early
-
-        # Removed debug print: print(f"DEBUG: load_cpp_tensor: Checking existence of Path object: '{file_path_obj}' (from string '{file_path_str}')")
-
-        if not file_path_obj.exists(): # Use Path object for exists check
-            print(f"ERROR: C++ tensor file not found (Path.exists check): {file_path_obj}")
+        if not os.path.exists(file_path_str):
            return None
+        
+        attempt_jit_extraction = False
+        loaded_object_from_direct_load = None
+
        try:
-            # Try loading as a JIT ScriptModule first (common for exported tensors that might have attributes)
-            # This also handles plain tensors saved with torch.save if they are not ScriptModules
-            loaded_obj = torch.jit.load(str(file_path_obj), map_location=device) # Convert Path to str for torch.jit.load
-            actual_tensor = None
-
-            if isinstance(loaded_obj, torch.jit.ScriptModule):
-                # Attempt to get tensor attribute directly, common for simple JIT-saved tensors
-                # Check for common weight/tensor attributes first
-                if hasattr(loaded_obj, 'tensor'): # Explicit "tensor" attribute
-                    if isinstance(loaded_obj.tensor, torch.Tensor):
-                        actual_tensor = loaded_obj.tensor
-                elif hasattr(loaded_obj, 'weight') and isinstance(loaded_obj.weight, torch.Tensor): # Common for conv/linear
-                     actual_tensor = loaded_obj.weight
-                # Heuristic: if it has parameters and only one, assume that's the one.
-                elif len(list(loaded_obj.parameters())) == 1:
-                    actual_tensor = list(loaded_obj.parameters())[0]
-                # Heuristic: if it has attributes that are tensors, try to find the primary one
-                else:
-                    tensor_attrs = [getattr(loaded_obj, attr) for attr in dir(loaded_obj) if isinstance(getattr(loaded_obj, attr, None), torch.Tensor)]
-                    if len(tensor_attrs) == 1:
-                        actual_tensor = tensor_attrs[0]
-                    elif len(tensor_attrs) > 1:
-                         # If multiple tensor attributes, try to find one that matches common patterns or is simply 'output'
-                        if hasattr(loaded_obj, 'output') and isinstance(loaded_obj.output, torch.Tensor):
-                            actual_tensor = loaded_obj.output
-                        else: # Heuristic: take the largest tensor if multiple exist and no clear primary one
-                            actual_tensor = max(tensor_attrs, key=lambda t: t.numel())
-                            # print(f"WARNING: Multiple tensor attributes in ScriptModule from {file_path_obj}, using largest: {actual_tensor.shape}")
-
-                if actual_tensor is None:
-                    print(f"ERROR: C++ tensor from {file_path_obj} is a ScriptModule, but couldn't extract a single tensor. StateDict keys: {list(loaded_obj.state_dict().keys()) if hasattr(loaded_obj, 'state_dict') else 'N/A'}")
-                    return None
+            # Attempt direct load first
+            loaded_object_from_direct_load = torch.load(file_path_str, map_location=device, weights_only=False)
+            
+            if isinstance(loaded_object_from_direct_load, torch.Tensor):
+                return loaded_object_from_direct_load.to(device) # Successfully loaded a tensor directly
+            else:
+                # Loaded something, but it's not a tensor. It's likely a JIT module.
+                attempt_jit_extraction = True
+                print(f"INFO: Initial torch.load of {file_path_str} yielded a non-Tensor (type: {type(loaded_object_from_direct_load)}). Will attempt JIT extraction.")
+
+        except Exception as e_initial_load:
+            # Initial load failed (e.g., it's a JIT module not readable by plain torch.load, or other error)
+            attempt_jit_extraction = True 
+            print(f"INFO: Initial torch.load failed for {file_path_str}: {e_initial_load}. Will attempt JIT extraction.")
+        
+        # Common JIT tensor extraction logic
+        def extract_tensor_from_jit_module(module_path, jit_loaded_obj, dev):
+            print(f"DEBUG JIT EXTRACTION: For {module_path}, loaded_obj type: {type(jit_loaded_obj)}")
+            print(f"DEBUG JIT EXTRACTION: str(loaded_obj): {str(jit_loaded_obj)}")
+            # print(f"DEBUG JIT EXTRACTION: dir(loaded_obj): {dir(jit_loaded_obj)}") # Verbose

-            elif isinstance(loaded_obj, torch.Tensor):
-                actual_tensor = loaded_obj
+            extracted_tensor = None
+
+            # 1. Try calling if 'forward' method exists
+            if hasattr(jit_loaded_obj, 'forward') and callable(getattr(jit_loaded_obj, 'forward')):
+                print(f"DEBUG JIT EXTRACTION: Attempting jit_loaded_obj.forward()")
+                try:
+                    extracted_tensor = jit_loaded_obj.forward()
+                    if not isinstance(extracted_tensor, torch.Tensor):
+                        print(f"DEBUG JIT EXTRACTION: jit_loaded_obj.forward() did not return a tensor, got {type(extracted_tensor)}. Trying with dummy input.")
+                        extracted_tensor = None # Reset before trying with dummy
+                        try:
+                            print(f"DEBUG JIT EXTRACTION: Attempting jit_loaded_obj.forward(torch.empty(0))")
+                            extracted_tensor = jit_loaded_obj.forward(torch.empty(0, device=dev))
+                            if not isinstance(extracted_tensor, torch.Tensor):
+                                print(f"DEBUG JIT EXTRACTION: jit_loaded_obj.forward(dummy) also did not return a tensor, got {type(extracted_tensor)}")
+                                extracted_tensor = None
+                        except Exception as e_fwd_dummy:
+                            print(f"DEBUG JIT EXTRACTION: Error calling jit_loaded_obj.forward(dummy): {e_fwd_dummy}")
+                            extracted_tensor = None
+                except Exception as e_fwd: # This covers cases where forward exists but call fails (e.g. signature mismatch)
+                    print(f"DEBUG JIT EXTRACTION: Error calling jit_loaded_obj.forward(): {e_fwd}. Trying with dummy input as fallback.")
+                    extracted_tensor = None # Reset
+                    try:
+                        print(f"DEBUG JIT EXTRACTION: Attempting jit_loaded_obj.forward(torch.empty(0)) after error.")
+                        extracted_tensor = jit_loaded_obj.forward(torch.empty(0, device=dev))
+                        if not isinstance(extracted_tensor, torch.Tensor):
+                            print(f"DEBUG JIT EXTRACTION: jit_loaded_obj.forward(dummy) after error also did not return a tensor, got {type(extracted_tensor)}")
+                            extracted_tensor = None
+                    except Exception as e_fwd_dummy_after_error:
+                        print(f"DEBUG JIT EXTRACTION: Error calling jit_loaded_obj.forward(dummy) after initial fwd error: {e_fwd_dummy_after_error}")
+                        extracted_tensor = None
+            
+            # 1b. Try calling the module directly if forward attribute exists (covers some cases)
+            # This is after trying explicit .forward() as direct call might have side effects or different interpretation
+            if extracted_tensor is None and callable(jit_loaded_obj) and hasattr(jit_loaded_obj, 'forward'):
+                 print(f"DEBUG JIT EXTRACTION: Attempting callable jit_loaded_obj()")
+                 try:
+                    extracted_tensor = jit_loaded_obj()
+                    if not isinstance(extracted_tensor, torch.Tensor):
+                        print(f"DEBUG JIT EXTRACTION: callable jit_loaded_obj() did not return a tensor, got {type(extracted_tensor)}")
+                        extracted_tensor = None
+                 except Exception as e_call_obj:
+                    print(f"DEBUG JIT EXTRACTION: Error calling callable jit_loaded_obj() (it had a forward attr): {e_call_obj}")
+                    extracted_tensor = None
+
+
+            # 2. Check if 'forward' attribute *itself* is a tensor
+            if extracted_tensor is None and hasattr(jit_loaded_obj, 'forward') and isinstance(getattr(jit_loaded_obj, 'forward'), torch.Tensor):
+                print(f"DEBUG JIT EXTRACTION: jit_loaded_obj.forward IS a tensor.")
+                extracted_tensor = getattr(jit_loaded_obj, 'forward')
+
+            # 3. Look for common direct tensor attributes
+            if extracted_tensor is None and hasattr(jit_loaded_obj, 'tensor') and isinstance(getattr(jit_loaded_obj, 'tensor'), torch.Tensor):
+                print(f"DEBUG JIT EXTRACTION: Found tensor in jit_loaded_obj.tensor")
+                extracted_tensor = jit_loaded_obj.tensor
+            
+            if extracted_tensor is None and hasattr(jit_loaded_obj, 'data') and isinstance(getattr(jit_loaded_obj, 'data'), torch.Tensor):
+                print(f"DEBUG JIT EXTRACTION: Found tensor in jit_loaded_obj.data")
+                extracted_tensor = jit_loaded_obj.data
+            
+            if extracted_tensor is None and hasattr(jit_loaded_obj, 'tensor_data') and isinstance(getattr(jit_loaded_obj, 'tensor_data'), torch.Tensor):
+                print(f"DEBUG JIT EXTRACTION: Found tensor in jit_loaded_obj.tensor_data")
+                extracted_tensor = jit_loaded_obj.tensor_data
+
+            # 4. Iterate through named_buffers (common for wrapped tensors)
+            if extracted_tensor is None:
+                print(f"DEBUG JIT EXTRACTION: Iterating named_buffers for a tensor...")
+                try:
+                    for name, buffer_tensor in jit_loaded_obj.named_buffers():
+                        if isinstance(buffer_tensor, torch.Tensor):
+                            print(f"DEBUG JIT EXTRACTION: Found tensor in named_buffers: {name}")
+                            extracted_tensor = buffer_tensor
+                            break
+                except Exception as e_buffers:
+                    print(f"DEBUG JIT EXTRACTION: Error iterating named_buffers: {e_buffers}")
+
+
+            # 5. Iterate through named_parameters
+            if extracted_tensor is None:
+                print(f"DEBUG JIT EXTRACTION: Iterating named_parameters for a tensor...")
+                try:
+                    for name, param_tensor in jit_loaded_obj.named_parameters():
+                        if isinstance(param_tensor, torch.Tensor):
+                            print(f"DEBUG JIT EXTRACTION: Found tensor in named_parameters: {name}")
+                            extracted_tensor = param_tensor
+                            break
+                except Exception as e_params:
+                    print(f"DEBUG JIT EXTRACTION: Error iterating named_parameters: {e_params}")
+            
+            # 6. Iterate through members (attributes) using inspect.getmembers - potentially fragile
+            if extracted_tensor is None:
+                print(f"DEBUG JIT EXTRACTION: Attempting to iterate members using inspect.getmembers...")
+                try:
+                    for name, member_obj in inspect.getmembers(jit_loaded_obj):
+                        if isinstance(member_obj, torch.Tensor):
+                            # Avoid re-picking already checked common names if they are somehow also members
+                            if name not in ['tensor', 'data', 'tensor_data', 'forward']:
+                                print(f"DEBUG JIT EXTRACTION: Found tensor in member (inspect.getmembers): {name}")
+                                extracted_tensor = member_obj
+                                break
+                except RuntimeError as e_inspect:
+                    # Specifically catch RuntimeError that was observed: "Method 'forward' is not defined"
+                    print(f"DEBUG JIT EXTRACTION: inspect.getmembers failed with RuntimeError: {e_inspect}. Skipping this method.")
+                except Exception as e_inspect_other:
+                    print(f"DEBUG JIT EXTRACTION: inspect.getmembers failed with other Exception: {e_inspect_other}. Skipping this method.")
+
+            # 7. Iterate through named_children and inspect
+            if extracted_tensor is None:
+                print(f"DEBUG JIT EXTRACTION: Iterating named_children...")
+                try:
+                    for child_name, child_module in jit_loaded_obj.named_children():
+                        print(f"DEBUG JIT EXTRACTION: Inspecting child: {child_name} of type {type(child_module)}")
+                        # Try common ways to get tensor from child
+                        if hasattr(child_module, 'forward') and callable(getattr(child_module, 'forward')) :
+                            try:
+                                temp_tensor = child_module.forward()
+                                if isinstance(temp_tensor, torch.Tensor):
+                                    print(f"DEBUG JIT EXTRACTION: Found tensor by calling child {child_name}.forward()")
+                                    extracted_tensor = temp_tensor; break
+                            except: pass
+                        
+                        if extracted_tensor is None and callable(child_module) and hasattr(child_module, 'forward'): # Added hasattr forward here
+                           try:
+                               temp_tensor = child_module()
+                               if isinstance(temp_tensor, torch.Tensor):
+                                   print(f"DEBUG JIT EXTRACTION: Found tensor by calling child {child_name}()")
+                                   extracted_tensor = temp_tensor; break
+                           except: pass
+
+                        if extracted_tensor is None and hasattr(child_module, 'forward') and isinstance(getattr(child_module, 'forward'), torch.Tensor):
+                            extracted_tensor = getattr(child_module, 'forward')
+                            print(f"DEBUG JIT EXTRACTION: Found tensor in child {child_name}.forward (as attribute)")
+                            break
+                        
+                        if extracted_tensor is None and hasattr(child_module, 'tensor') and isinstance(getattr(child_module, 'tensor'), torch.Tensor):
+                            extracted_tensor = child_module.tensor
+                            print(f"DEBUG JIT EXTRACTION: Found tensor in child {child_name}.tensor")
+                            break
+                        if extracted_tensor is None and hasattr(child_module, 'data') and isinstance(getattr(child_module, 'data'), torch.Tensor):
+                            extracted_tensor = child_module.data
+                            print(f"DEBUG JIT EXTRACTION: Found tensor in child {child_name}.data")
+                            break
+                        if extracted_tensor is None and hasattr(child_module, 'tensor_data') and isinstance(getattr(child_module, 'tensor_data'), torch.Tensor):
+                            extracted_tensor = child_module.tensor_data
+                            print(f"DEBUG JIT EXTRACTION: Found tensor in child {child_name}.tensor_data")
+                            break
+                        
+                        if extracted_tensor is None: # Check general members of child if direct attributes fail
+                            try:
+                                for name, member_obj in inspect.getmembers(child_module):
+                                     if isinstance(member_obj, torch.Tensor):
+                                        print(f"DEBUG JIT EXTRACTION: Found tensor in member {name} of child {child_name}")
+                                        extracted_tensor = member_obj; break
+                                if extracted_tensor is not None: break
+                            except Exception as e_child_inspect:
+                                print(f"DEBUG JIT EXTRACTION: inspect.getmembers on child {child_name} failed: {e_child_inspect}")
+                                
+                    if extracted_tensor is not None:
+                         print(f"DEBUG JIT EXTRACTION: Tensor found in a child module.")
+                    else:
+                         print(f"DEBUG JIT EXTRACTION: Tensor not found in direct children.")
+                except Exception as e_children:
+                    print(f"DEBUG JIT EXTRACTION: Error iterating named_children: {e_children}")
+
+
+            if isinstance(extracted_tensor, torch.Tensor):
+                print(f"DEBUG JIT EXTRACTION: Successfully extracted tensor of shape {extracted_tensor.shape} from JIT module {module_path}")
+                return extracted_tensor.to(dev)
            else:
-                print(f"ERROR: C++ tensor loaded from {file_path_obj} with torch.jit.load is not a Tensor or ScriptModule. Type: {type(loaded_obj)}")
+                print(f"Warning: JIT EXTRACTION: Could not extract tensor from JIT module: {module_path}. Final extracted_type: {type(extracted_tensor)}. THIS FILE WILL BE SKIPPED.")
                return None
+
+        if attempt_jit_extraction:
+            # attempt_jit_extraction is set when the initial torch.load either raised or returned a non-Tensor.
+            # In the non-Tensor case the already-loaded object is reused below; otherwise the file is re-read with torch.jit.load.
+            # The critical part is that C++ outputs are *always* JIT modules now if not raw tensors.
            
-            tensor = actual_tensor.to(device).float() # Ensure tensor is on the correct device and float
-            return tensor
-        except Exception as e:
-            # If torch.jit.load fails (e.g. it's a plain tensor not loadable by JIT), try torch.load
-            # This also catches errors from the processing above if actual_tensor remains None
-            # print(f"INFO: torch.jit.load failed for {file_path_obj} ({e}), attempting torch.load as fallback.")
-            try:
-                tensor = torch.load(str(file_path_obj), map_location=device) # Convert Path to str for torch.load
-                if not isinstance(tensor, torch.Tensor):
-                    print(f"ERROR: Fallback torch.load for {file_path_obj} did not return a tensor. Type: {type(tensor)}")
+            jit_module_to_process = None
+            if loaded_object_from_direct_load is not None and not isinstance(loaded_object_from_direct_load, torch.Tensor):
+                # This means torch.load succeeded but returned a JIT module directly
+                # (common for files saved with torch.jit.save that are actually modules)
+                print(f"DEBUG JIT: Using object from initial torch.load (type: {type(loaded_object_from_direct_load)}) for JIT extraction for {file_path_str}.")
+                jit_module_to_process = loaded_object_from_direct_load
+            else:
+                # Reached when the initial torch.load raised (or returned None), so there is no loaded object to reuse.
+                # A Tensor result would already have been returned above; is_image plays no part in this branching.
+                # Load the file afresh as a TorchScript module with torch.jit.load.
+                try:
+                    print(f"DEBUG JIT: Attempting torch.jit.load for {file_path_str} as fallback/primary JIT path.")
+                    jit_module_to_process = torch.jit.load(file_path_str, map_location=device)
+                except Exception as e_jit_load_explicit:
+                    print(f"Error: torch.jit.load also failed for {file_path_str}: {e_jit_load_explicit}. Traceback: {traceback.format_exc()}. SKIPPING.")
                    return None
-                return tensor.to(device).float() # Ensure tensor is on the correct device and float
-            except Exception as e2:
-                print(f"ERROR: Failed to load C++ tensor from {file_path_obj}. JIT load error: {e}. Torch load error: {e2}")
-                import traceback
-                traceback.print_exc()
+            
+            if jit_module_to_process is not None:
+                final_tensor = extract_tensor_from_jit_module(file_path_str, jit_module_to_process, device)
+                if final_tensor is not None:
+                    return final_tensor
+                else:
+                    print(f"Warning: JIT extraction path for {file_path_str} (using {type(jit_module_to_process)}) failed to extract tensor. SKIPPING file.")
+                    return None
+            else:
+                # Only reachable if torch.jit.load itself returned None without raising; a torch.jit.load that raised has already returned None above.
+                print(f"Warning: jit_module_to_process is None for {file_path_str} before calling extraction. SKIPPING file.")
                return None

-    def _compare_tensor_data(self, tensor1, tensor2, name, sample_idx, current_errors):
-        """Compare two tensors and return error metrics."""
+        # Defensive fallback: a Tensor from the initial torch.load is returned earlier, and every path inside the
+        # attempt_jit_extraction branch above also returns, so this point is not expected to be reached.
+        print(f"Warning: load_cpp_tensor is returning None for {file_path_str} after all attempts. This indicates an issue with file content or loading logic for this specific file type when is_image={is_image}.")
+        return None
+
+    def _compare_tensor_data(self, tensor1, tensor2, name, sample_idx, current_errors_dict_to_populate):
+        """Compare two tensors and return error metrics. Modifies current_errors_dict_to_populate in place."""
        num_metrics = 11 # mae, max_err, diff_arr, mean_py_val, std_abs_err, l2_py, l2_cpp, l2_diff, cos_sim, pearson, mre
        nan_metrics_tuple = (
            float('nan'), float('nan'), [], float('nan'), float('nan'), # Original 5
@ -1280,27 +1584,26 @@ class ComparisonRunner:
                t1_cpu_temp = tensor1.cpu().detach().numpy().astype(np.float32)
                py_mean = np.mean(t1_cpu_temp)
                py_l2 = np.linalg.norm(t1_cpu_temp.flatten())
-            # If only tensor2 is None, we can't calculate C++ l2 or comparison metrics
-            # If only tensor1 is None, py_mean and py_l2 remain NaN.
            
-            current_errors[name] = (
+            # Populate current_errors_dict_to_populate directly
+            current_errors_dict_to_populate[name] = (
                float('nan'), float('nan'), [], py_mean, float('nan'),
                py_l2, float('nan'), float('nan'), float('nan'), float('nan'), float('nan')
            )
            print(f"Warning: Cannot compare '{name}' for sample {sample_idx}, one or both tensors are None.")
-            return
+            return # Return None as the function modifies dict in place

        t1_cpu = tensor1.cpu().detach().numpy().astype(np.float32)
        t2_cpu = tensor2.cpu().detach().numpy().astype(np.float32)

        if t1_cpu.shape != t2_cpu.shape:
            print(f"Warning: Shape mismatch for '{name}' sample {sample_idx}. Py: {t1_cpu.shape}, Cpp: {t2_cpu.shape}. Skipping most comparisons.")
-            current_errors[name] = (
+            current_errors_dict_to_populate[name] = (
                float('nan'), float('nan'), [], np.mean(t1_cpu), float('nan'), # MAE, MaxErr, diff_arr, MeanPy, StdAbsErr
                np.linalg.norm(t1_cpu.flatten()), np.linalg.norm(t2_cpu.flatten()), float('nan'), # L2Py, L2Cpp, L2Diff
                float('nan'), float('nan'), float('nan') # CosSim, Pearson, MRE
            )
-            return
+            return # Return None
        
        # All calculations from here assume shapes match and tensors are not None
        t1_flat = t1_cpu.flatten()
@ -1355,7 +1658,8 @@ class ComparisonRunner:
        # Using (abs(t1_cpu) + epsilon) in denominator handles this.
        mean_rel_err = np.mean(abs_diff_elements / (np.abs(t1_cpu) + epsilon_rel_err))

-        current_errors[name] = (
+        # Populate current_errors_dict_to_populate directly
+        current_errors_dict_to_populate[name] = (
            mae, max_err, diff_arr_for_hist, mean_py_val, std_abs_err,
            l2_norm_py, l2_norm_cpp, l2_norm_diff, cosine_sim, pearson_corr, mean_rel_err
        )
--- a/test/test_models.cpp
+++ b/test/test_models.cpp
@ -127,7 +127,10 @@ int main(int argc, char* argv[]) {
        std::vector<std::string> output_layers_resnet = {
            "conv1_output", "bn1_output", "relu1_output", "maxpool_output",
            "layer1", "layer2", "layer3", "layer4", "features", 
-            "layer1_0_shortcut_output", "layer1_0_block_output"
+            "layer1_0_shortcut_output", "layer1_0_block_output",
+            "debug_resnet_conv1_output_for_bn1_input",
+            // New BN1 intermediate outputs
+            "bn1_centered_x", "bn1_variance_plus_eps", "bn1_inv_std", "bn1_normalized_x"
        };
        resnet_model_opt.emplace(cimp::resnet::resnet50(resnet_weights_path, output_layers_resnet, device)); 
        (*resnet_model_opt)->to(device);
@ -291,6 +294,14 @@ int main(int argc, char* argv[]) {
                } else {
                    std::cerr << "  Skipping BBRegressor predict_iou for sample " << i << " (iou_feats or mod_vectors empty)." << std::endl;
                }
+
+                // Save debug intermediate outputs
+                torch::Tensor cpp_conv3_1t_out = (*bb_regressor_model_opt_wrapped).debug_get_conv3_1t_output(resnet_outputs["layer2"].clone());
+                save_tensor_to_file(cpp_conv3_1t_out, (bb_reg_out_dir / (sample_suffix + "_debug_conv3_1t_output.pt")).string());
+
+                torch::Tensor cpp_conv4_1t_out = (*bb_regressor_model_opt_wrapped).debug_get_conv4_1t_output(resnet_outputs["layer3"].clone());
+                save_tensor_to_file(cpp_conv4_1t_out, (bb_reg_out_dir / (sample_suffix + "_debug_conv4_1t_output.pt")).string());
+
                std::cout << "BBRegressor processing done for sample " << i << std::endl;
            } catch (const std::exception& e) {
                std::cerr << "Error during BBRegressor processing for sample " << i << ": " << e.what() << std::endl;