diff --git a/cimp/resnet/resnet.cpp b/cimp/resnet/resnet.cpp
index 6304d7f..1ecf121 100644
--- a/cimp/resnet/resnet.cpp
+++ b/cimp/resnet/resnet.cpp
@@ -71,9 +71,9 @@ BottleneckImpl::BottleneckImpl(const std::string& base_weights_dir,
     conv1->weight = load_named_tensor(base_weights_dir, block_param_prefix + "conv1.weight", device);
     bn1->weight = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.weight", device);
     bn1->bias = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.bias", device);
-    bn1->named_buffers()["running_mean"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.running_mean", device);
-    bn1->named_buffers()["running_var"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.running_var", device);
-    bn1->named_buffers()["num_batches_tracked"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.num_batches_tracked", device);
+    bn1->running_mean = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.running_mean", device);
+    bn1->running_var = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.running_var", device);
+    bn1->num_batches_tracked = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.num_batches_tracked", device);
     register_module("conv1", conv1);
     register_module("bn1", bn1);
 
@@ -83,9 +83,9 @@ BottleneckImpl::BottleneckImpl(const std::string& base_weights_dir,
     conv2->weight = load_named_tensor(base_weights_dir, block_param_prefix + "conv2.weight", device);
     bn2->weight = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.weight", device);
     bn2->bias = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.bias", device);
-    bn2->named_buffers()["running_mean"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.running_mean", device);
-    bn2->named_buffers()["running_var"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.running_var", device);
-    bn2->named_buffers()["num_batches_tracked"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.num_batches_tracked", device);
+    bn2->running_mean = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.running_mean", device);
+    bn2->running_var = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.running_var", device);
+    bn2->num_batches_tracked = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.num_batches_tracked", device);
     register_module("conv2", conv2);
     register_module("bn2", bn2);
 
@@ -95,9 +95,9 @@ BottleneckImpl::BottleneckImpl(const std::string& base_weights_dir,
     conv3->weight = load_named_tensor(base_weights_dir, block_param_prefix + "conv3.weight", device);
     bn3->weight = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.weight", device);
     bn3->bias = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.bias", device);
-    bn3->named_buffers()["running_mean"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.running_mean", device);
-    bn3->named_buffers()["running_var"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.running_var", device);
-    bn3->named_buffers()["num_batches_tracked"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.num_batches_tracked", device);
+    bn3->running_mean = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.running_mean", device);
+    bn3->running_var = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.running_var", device);
+    bn3->num_batches_tracked = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.num_batches_tracked", device);
     register_module("conv3", conv3);
     register_module("bn3", bn3);
 
@@ -118,17 +118,85 @@ BottleneckImpl::BottleneckImpl(const std::string& base_weights_dir,
 // Forward method implementation for BottleneckImpl
 torch::Tensor BottleneckImpl::forward(torch::Tensor x) {
     torch::Tensor identity = x;
+    torch::ScalarType original_dtype = x.scalar_type();
 
+    // conv1 -> bn1 -> relu
     x = conv1->forward(x);
-    x = bn1->forward(x);
+
+    if (!this->is_training() && bn1) {
+        const auto& bn_module = *bn1;
+        torch::Tensor input_double = x.to(torch::kFloat64);
+        torch::Tensor weight_double = bn_module.weight.defined() ? bn_module.weight.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor bias_double = bn_module.bias.defined() ? bn_module.bias.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor running_mean_double = bn_module.running_mean.to(torch::kFloat64);
+        torch::Tensor running_var_double = bn_module.running_var.to(torch::kFloat64);
+        double eps_double = bn_module.options.eps();
+
+        auto c = x.size(1);
+        running_mean_double = running_mean_double.reshape({1, c, 1, 1});
+        running_var_double = running_var_double.reshape({1, c, 1, 1});
+        if (weight_double.defined()) weight_double = weight_double.reshape({1, c, 1, 1});
+        if (bias_double.defined()) bias_double = bias_double.reshape({1, c, 1, 1});
+        
+        torch::Tensor out_double = (input_double - running_mean_double) / (torch::sqrt(running_var_double + eps_double));
+        if (weight_double.defined()) out_double = out_double * weight_double;
+        if (bias_double.defined()) out_double = out_double + bias_double;
+        x = out_double.to(original_dtype);
+    } else if (bn1) {
+        x = bn1->forward(x);
+    }
     x = relu->forward(x);
 
+    // conv2 -> bn2 -> relu
     x = conv2->forward(x);
-    x = bn2->forward(x);
+    if (!this->is_training() && bn2) {
+        const auto& bn_module = *bn2;
+        torch::Tensor input_double = x.to(torch::kFloat64);
+        torch::Tensor weight_double = bn_module.weight.defined() ? bn_module.weight.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor bias_double = bn_module.bias.defined() ? bn_module.bias.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor running_mean_double = bn_module.running_mean.to(torch::kFloat64);
+        torch::Tensor running_var_double = bn_module.running_var.to(torch::kFloat64);
+        double eps_double = bn_module.options.eps();
+
+        auto c = x.size(1);
+        running_mean_double = running_mean_double.reshape({1, c, 1, 1});
+        running_var_double = running_var_double.reshape({1, c, 1, 1});
+        if (weight_double.defined()) weight_double = weight_double.reshape({1, c, 1, 1});
+        if (bias_double.defined()) bias_double = bias_double.reshape({1, c, 1, 1});
+
+        torch::Tensor out_double = (input_double - running_mean_double) / (torch::sqrt(running_var_double + eps_double));
+        if (weight_double.defined()) out_double = out_double * weight_double;
+        if (bias_double.defined()) out_double = out_double + bias_double;
+        x = out_double.to(original_dtype);
+    } else if (bn2) {
+        x = bn2->forward(x);
+    }
     x = relu->forward(x);
 
+    // conv3 -> bn3
     x = conv3->forward(x);
-    x = bn3->forward(x);
+    if (!this->is_training() && bn3) {
+        const auto& bn_module = *bn3;
+        torch::Tensor input_double = x.to(torch::kFloat64);
+        torch::Tensor weight_double = bn_module.weight.defined() ? bn_module.weight.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor bias_double = bn_module.bias.defined() ? bn_module.bias.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor running_mean_double = bn_module.running_mean.to(torch::kFloat64);
+        torch::Tensor running_var_double = bn_module.running_var.to(torch::kFloat64);
+        double eps_double = bn_module.options.eps();
+        
+        auto c = x.size(1);
+        running_mean_double = running_mean_double.reshape({1, c, 1, 1});
+        running_var_double = running_var_double.reshape({1, c, 1, 1});
+        if (weight_double.defined()) weight_double = weight_double.reshape({1, c, 1, 1});
+        if (bias_double.defined()) bias_double = bias_double.reshape({1, c, 1, 1});
+
+        torch::Tensor out_double = (input_double - running_mean_double) / (torch::sqrt(running_var_double + eps_double));
+        if (weight_double.defined()) out_double = out_double * weight_double;
+        if (bias_double.defined()) out_double = out_double + bias_double;
+        x = out_double.to(original_dtype);
+    } else if (bn3) {
+        x = bn3->forward(x);
+    }
 
     if (this->projection_shortcut) {
         identity = this->projection_shortcut->forward(identity);
@@ -150,18 +218,16 @@ ResNetImpl::ResNetImpl(const std::string& base_weights_dir_path,
     conv1 = torch::nn::Conv2d(torch::nn::Conv2dOptions(3, 64, 7).stride(2).padding(3).bias(false));
     bn1 = torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(64).eps(static_cast<float>(1e-5)).momentum(0.1).affine(true).track_running_stats(true));
     this->conv1->weight = load_named_tensor(this->_base_weights_dir, "conv1.weight", device);
+    
+    // Directly assign to the public member tensors of the bn1 module
     this->bn1->weight = load_named_tensor(this->_base_weights_dir, "bn1.weight", device);
     this->bn1->bias = load_named_tensor(this->_base_weights_dir, "bn1.bias", device);
-    
-    this->bn1->named_buffers()["running_mean"] = load_named_tensor(this->_base_weights_dir, "bn1.running_mean", device);
-    this->bn1->named_buffers()["running_var"] = load_named_tensor(this->_base_weights_dir, "bn1.running_var", device);
-    
-    this->bn1->named_buffers()["num_batches_tracked"] = load_named_tensor(this->_base_weights_dir, "bn1.num_batches_tracked", device);
-    register_module("conv1", conv1);
-    register_module("bn1", bn1);
+    this->bn1->running_mean = load_named_tensor(this->_base_weights_dir, "bn1.running_mean", device);
+    this->bn1->running_var = load_named_tensor(this->_base_weights_dir, "bn1.running_var", device);
+    this->bn1->num_batches_tracked = load_named_tensor(this->_base_weights_dir, "bn1.num_batches_tracked", device);
 
-    std::cout << "CPP ResNetImpl::bn1 running_mean sum: " << std::fixed << std::setprecision(10) << this->bn1->running_mean.sum().item<double>() << std::endl;
-    std::cout << "CPP ResNetImpl::bn1 running_var sum: " << std::fixed << std::setprecision(10) << this->bn1->running_var.sum().item<double>() << std::endl;
+    register_module("conv1", conv1);
+    register_module("bn1", bn1); // bn1 is already populated correctly
 
     relu = torch::nn::ReLU(torch::nn::ReLUOptions().inplace(true));
     maxpool = torch::nn::MaxPool2d(torch::nn::MaxPool2dOptions(3).stride(2).padding(1));
@@ -195,9 +261,9 @@ torch::nn::Sequential ResNetImpl::_make_layer(int64_t planes_for_block, int64_t
         conv_down->weight = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "0.weight", device);
         bn_down->weight = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.weight", device);
         bn_down->bias = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.bias", device);
-        bn_down->named_buffers()["running_mean"] = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.running_mean", device);
-        bn_down->named_buffers()["running_var"] = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.running_var", device);
-        bn_down->named_buffers()["num_batches_tracked"] = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.num_batches_tracked", device);
+        bn_down->running_mean = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.running_mean", device);
+        bn_down->running_var = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.running_var", device);
+        bn_down->num_batches_tracked = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.num_batches_tracked", device);
 
         ds_seq->push_back(conv_down);
         ds_seq->push_back(bn_down);
@@ -229,9 +295,50 @@ std::map<std::string, torch::Tensor> ResNetImpl::forward(torch::Tensor x) {
     };
 
     x = conv1->forward(x);
-    if (should_output("conv1_output")) outputs["conv1_output"] = x; 
-    
-    x = bn1->forward(x);
+    if (should_output("conv1_output")) outputs["conv1_output"] = x;
+    if (should_output("debug_resnet_conv1_output_for_bn1_input")) {
+        outputs["debug_resnet_conv1_output_for_bn1_input"] = x.clone(); 
+    }
+    torch::ScalarType original_dtype_resnet_bn1 = x.scalar_type();
+
+    // Apply bn1. In eval mode, normalize by hand in float64 from the stored running statistics, exporting intermediates on request.
+    if (!this->is_training() && bn1) {
+        const auto& bn_module = *bn1;
+        torch::Tensor input_double = x.to(torch::kFloat64);
+        torch::Tensor weight_double = bn_module.weight.defined() ? bn_module.weight.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor bias_double = bn_module.bias.defined() ? bn_module.bias.to(torch::kFloat64) : torch::Tensor();
+        torch::Tensor running_mean_double = bn_module.running_mean.to(torch::kFloat64);
+        torch::Tensor running_var_double = bn_module.running_var.to(torch::kFloat64);
+        double eps_double = bn_module.options.eps();
+
+        auto c = x.size(1); // size of dim 1 of the conv1 output; used as the per-channel length below
+        torch::Tensor reshaped_running_mean = running_mean_double.reshape({1, c, 1, 1}); // {1, c, 1, 1} so it broadcasts against x along dim 1
+        torch::Tensor reshaped_running_var = running_var_double.reshape({1, c, 1, 1});
+        torch::Tensor reshaped_weight = weight_double.defined() ? weight_double.reshape({1, c, 1, 1}) : torch::Tensor();
+        torch::Tensor reshaped_bias = bias_double.defined() ? bias_double.reshape({1, c, 1, 1}) : torch::Tensor();
+        
+        torch::Tensor centered_x = input_double - reshaped_running_mean;
+        if (should_output("bn1_centered_x")) outputs["bn1_centered_x"] = centered_x.clone();
+
+        torch::Tensor variance_plus_eps = reshaped_running_var + eps_double;
+        if (should_output("bn1_variance_plus_eps")) outputs["bn1_variance_plus_eps"] = variance_plus_eps.clone();
+
+        torch::Tensor inv_std = torch::rsqrt(variance_plus_eps); // rsqrt(v) rather than 1 / sqrt(v); NOTE(review): presumably chosen to match the reference output - confirm
+        if (should_output("bn1_inv_std")) outputs["bn1_inv_std"] = inv_std.clone();
+        
+        torch::Tensor normalized_x = centered_x * inv_std;
+        if (should_output("bn1_normalized_x")) outputs["bn1_normalized_x"] = normalized_x.clone();
+
+        torch::Tensor out_double = normalized_x;
+        if (reshaped_weight.defined()) out_double = out_double * reshaped_weight; // affine scale, skipped when the module has no weight
+        if (reshaped_bias.defined()) out_double = out_double + reshaped_bias; // affine shift, skipped when the module has no bias
+        
+        x = out_double.to(original_dtype_resnet_bn1); // cast back to the dtype x had before the float64 computation
+    } else if (bn1) { // Only reached in training mode: fall back to the stock BatchNorm2d forward
+        x = bn1->forward(x);
+    }
+    // End apply bn1
+
     if (should_output("bn1_output")) outputs["bn1_output"] = x; 
     
     x = relu->forward(x); 
diff --git a/test/compare_models.py b/test/compare_models.py
index cee1182..ae11553 100644
--- a/test/compare_models.py
+++ b/test/compare_models.py
@@ -37,13 +37,23 @@ def get_model_configs(root_dir_param):
     return {
         # ... (existing model_configs definitions)
         'ResNet': {
-            'python_model_loader': lambda: DiMPTorchScriptWrapper(os.path.join(root_dir_param, 'pytracking_models/dimp50_ Ausdruck_ep0050.pth.tar')),
+            'python_model_loader': lambda: DiMPTorchScriptWrapper(os.path.join(root_dir_param, 'pytracking_models/dimp50_ausdruck_ep0050.pth.tar')),
             'cpp_output_subdir': 'resnet',
-            'python_output_subdir': 'resnet_py', # If Python outputs are saved separately
+            'python_output_subdir': 'resnet_py', 
             'outputs_to_compare': {
-                'Conv1': 'conv1_output.pt', # ADDED
-                'BN1': 'bn1_output.pt', # ADDED
-                'ReLU1': 'relu1_output.pt', # ADDED for completeness before MaxPool
+                'Conv1': ('conv1_output.pt', 'conv1'), 
+                'Debug ResNet Conv1->BN1 Input': ('debug_resnet_conv1_output_for_bn1_input.pt', 'conv1_pre_bn'),
+                
+                # BN1 output before ReLU: manual C++ BN ('bn1_output.pt') vs. manual Python BN ('bn1_post_relu_pre')
+                'BN1': ('bn1_output.pt', 'bn1_post_relu_pre'), 
+
+                # BN1 Intermediate comparisons
+                'BN1 Centered X': ('bn1_centered_x.pt', 'bn1_centered_x_py'),
+                'BN1 Var+Eps': ('bn1_variance_plus_eps.pt', 'bn1_variance_plus_eps_py'),
+                'BN1 InvStd': ('bn1_inv_std.pt', 'bn1_inv_std_py'),
+                'BN1 Normalized X': ('bn1_normalized_x.pt', 'bn1_normalized_x_py'),
+
+                'ReLU1': ('relu1_output.pt', 'conv1'), 
                 'MaxPool': 'maxpool_output.pt',
                 'Features': 'features.pt', 
                 'Layer1': 'layer1.pt',
@@ -523,6 +533,9 @@ class ComparisonRunner:
             cpp_mod_vec0_path = cpp_output_bb_reg_dir_path / f'sample_{i}_mod_vec0.pt'
             cpp_mod_vec1_path = cpp_output_bb_reg_dir_path / f'sample_{i}_mod_vec1.pt'
             cpp_iou_scores_path = cpp_output_bb_reg_dir_path / f'sample_{i}_iou_scores.pt'
+            # Paths for debug C++ outputs
+            cpp_debug_conv3_1t_path = cpp_output_bb_reg_dir_path / f'sample_{i}_debug_conv3_1t_output.pt'
+            cpp_debug_conv4_1t_path = cpp_output_bb_reg_dir_path / f'sample_{i}_debug_conv4_1t_output.pt'
 
             # Load initial inputs for Python model
             py_image_tensor = self.load_cpp_tensor(py_image_input_path, self.device)
@@ -549,6 +562,31 @@ class ComparisonRunner:
             else:
                 print(f"Warning: Skipping Python BB Regressor for sample {i}, image input not found at {py_image_input_path}")
 
+            # ---- Intermediate debug outputs for conv3_1t and conv4_1t ----
+            py_debug_conv3_1t_out = None
+            py_debug_conv4_1t_out = None
+
+            if py_feat_layer2 is not None:
+                try:
+                    _feat2_for_debug_conv3_1t = py_feat_layer2
+                    if _feat2_for_debug_conv3_1t.dim() == 5:
+                        _feat2_for_debug_conv3_1t = _feat2_for_debug_conv3_1t.reshape(-1, *_feat2_for_debug_conv3_1t.shape[-3:])
+                    with torch.no_grad(): # Ensure no_grad context
+                        py_debug_conv3_1t_out = self.bb_regressor_from_source.conv3_1t(_feat2_for_debug_conv3_1t)
+                except Exception as e:
+                    print(f"ERROR calculating Python Debug_Conv3_1t for sample {i}: {e}")
+            
+            if py_feat_layer3 is not None:
+                try:
+                    _feat3_for_debug_conv4_1t = py_feat_layer3
+                    if _feat3_for_debug_conv4_1t.dim() == 5:
+                        _feat3_for_debug_conv4_1t = _feat3_for_debug_conv4_1t.reshape(-1, *_feat3_for_debug_conv4_1t.shape[-3:])
+                    with torch.no_grad(): # Ensure no_grad context
+                        py_debug_conv4_1t_out = self.bb_regressor_from_source.conv4_1t(_feat3_for_debug_conv4_1t)
+                except Exception as e:
+                    print(f"ERROR calculating Python Debug_Conv4_1t for sample {i}: {e}")
+            # ---- End intermediate debug outputs ----
+
             # Get Python IoU features
             py_iou_feat_list = [None, None] # Initialize as a list of two Nones
             if py_feat_layer2 is not None and py_feat_layer3 is not None:
@@ -622,8 +660,13 @@ class ComparisonRunner:
             cpp_mod_vec0 = self.load_cpp_tensor(cpp_mod_vec0_path, self.device)
             cpp_mod_vec1 = self.load_cpp_tensor(cpp_mod_vec1_path, self.device)
             cpp_iou_scores = self.load_cpp_tensor(cpp_iou_scores_path, self.device)
+            # Load debug C++ tensors
+            cpp_debug_conv3_1t_tensor = self.load_cpp_tensor(cpp_debug_conv3_1t_path, self.device)
+            cpp_debug_conv4_1t_tensor = self.load_cpp_tensor(cpp_debug_conv4_1t_path, self.device)
 
             # Comparisons
+            self._compare_tensor_data(py_debug_conv3_1t_out, cpp_debug_conv3_1t_tensor, "BBReg Debug_Conv3_1t", i, current_errors)
+            self._compare_tensor_data(py_debug_conv4_1t_out, cpp_debug_conv4_1t_tensor, "BBReg Debug_Conv4_1t", i, current_errors)
             self._compare_tensor_data(py_iou_feat_list[0], cpp_iou_feat0, "BBReg PyIoUFeat0 vs CppIoUFeat0", i, current_errors)
             self._compare_tensor_data(py_iou_feat_list[1], cpp_iou_feat1, "BBReg PyIoUFeat1 vs CppIoUFeat1", i, current_errors)
             self._compare_tensor_data(py_modulation_list[0], cpp_mod_vec0, "BBReg PyMod0 vs CppMod0", i, current_errors)
@@ -633,224 +676,311 @@ class ComparisonRunner:
             if current_errors: self.all_comparison_stats[f"BBReg_Sample_{i}"] = current_errors
 
     def compare_resnet_outputs(self):
-        print("Comparing ResNet outputs...")
-        print("\n--- Types at START of compare_resnet_outputs: ---")
-        if 'ResNet' in self.models: print(f"  self.models['ResNet'] type: {type(self.models['ResNet'])}")
-        if 'Classifier' in self.models: print(f"  self.models['Classifier'] type: {type(self.models['Classifier'])}")
-        if 'BBRegressor' in self.models: print(f"  self.models['BBRegressor'] type: {type(self.models['BBRegressor'])}")
+        print("\n--- Comparing ResNet Outputs ---")  # "\n", not "\\n": emit a blank line instead of a literal backslash-n
+        if not self.models.get('ResNet'):
+            print("PYTHON: ResNet model not loaded, skipping ResNet comparison.")
+            return
 
-        py_input_common_dir = os.path.join(self.root_dir, 'test', 'input_samples', 'common')
-        cpp_output_resnet_dir = os.path.join(self.cpp_output_dir, 'resnet')
-        # Ensure self.py_resnet_output_dir is defined, e.g., in __init__ or where other py output dirs are
-        if not hasattr(self, 'py_resnet_output_dir') or not self.py_resnet_output_dir:
-            self.py_resnet_output_dir = Path(self.python_output_dir) / 'resnet'
-            self.py_resnet_output_dir.mkdir(parents=True, exist_ok=True)
+        resnet_model = self.models['ResNet']
+        config = self.model_configs['ResNet']
+        cpp_resnet_dir = os.path.join(self.cpp_output_dir, config['cpp_output_subdir'])
+        
+        python_resnet_save_dir = os.path.join(self.python_output_dir, config.get('python_output_subdir', config['cpp_output_subdir']))
+        if not os.path.exists(python_resnet_save_dir):
+            os.makedirs(python_resnet_save_dir, exist_ok=True)
+
+        num_samples_to_process = self.num_samples
+        if num_samples_to_process == -1: # If -1, determine from available C++ output files
+            # -1 means "process every common input sample": count the sample_*_image.pt files under
+            # test/input_samples/common. C++ outputs may be sparse, so they are not used to derive the count.
+            common_input_glob = os.path.join(self.root_dir, "test", "input_samples", "common", "sample_*_image.pt")
+            num_samples_to_process = len(glob.glob(common_input_glob))
+            print(f"INFO: num_samples set to -1, determined {num_samples_to_process} common input samples.")
+
+        processed_samples_count = 0 # Renamed from processed_samples to avoid conflict
+        
+        sample_input_base_dir = os.path.join(self.root_dir, "test", "input_samples", "common")
+        
+        # Loop exactly self.num_samples times (or detected count if -1)
+        for sample_idx in tqdm(range(num_samples_to_process), desc="Comparing ResNet samples"):
+            current_errors = {} # Initialize for each sample
+            python_intermediate_outputs_cache = {} # Reset for each sample
+
+            # Construct the input file path based on sample_idx
+            sample_input_file_path = os.path.join(sample_input_base_dir, f"sample_{sample_idx}_image.pt")
+
+            if not os.path.exists(sample_input_file_path):
+                print(f"Warning: Input sample file {sample_input_file_path} not found for sample index {sample_idx}. Skipping ResNet sample.")
+                empty_errors_for_skipped_sample = {}
+                for output_key_config in config['outputs_to_compare'].keys():
+                    self._compare_tensor_data(None, None, output_key_config, sample_idx, empty_errors_for_skipped_sample)
+                if empty_errors_for_skipped_sample:
+                     self.all_comparison_stats[f"ResNet_Sample_{sample_idx}"] = empty_errors_for_skipped_sample
+                continue
+            
+            # --- START REINSTATED INPUT LOADING AND PREPROCESSING ---
+            input_tensor = self.load_cpp_tensor(sample_input_file_path, self.device, is_image=True)
+
+            if input_tensor is None:
+                print(f"Warning: Failed to load a valid tensor for ResNet input sample {sample_input_file_path} (sample {sample_idx}) using self.load_cpp_tensor. Skipping.")
+                # Populate NaNs for all expected outputs for this sample
+                empty_errors_for_skipped_sample = {}
+                for output_key_config in config['outputs_to_compare'].keys():
+                    self._compare_tensor_data(None, None, output_key_config, sample_idx, empty_errors_for_skipped_sample)
+                if empty_errors_for_skipped_sample:
+                    self.all_comparison_stats[f"ResNet_Sample_{sample_idx}"] = empty_errors_for_skipped_sample
+                continue
 
-        # Define Path objects for directory checks
-        py_input_common_dir_path = Path(py_input_common_dir)
-        cpp_output_resnet_dir_path = Path(cpp_output_resnet_dir)
-
-        comparison_configs = [
-            ("ResNet Conv1 Output (Pre-BN)", "_conv1_output_py.pt", "_conv1_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Conv1", "_conv1_output.pt", "_conv1_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir), # Assumes Py also saved conv1 output if it was meant to be same as C++ pre-bn
-            ("ResNet BN1", "_bn1_output.pt", "_bn1_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet ReLU1", "_relu1_output.pt", "_relu1_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet MaxPool", "_maxpool_output.pt", "_maxpool_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Layer1.0 Block Output", "_layer1_0_block_output.pt", "_layer1_0_block_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Layer1.0 Shortcut Output", "_layer1_0_shortcut_output.pt", "_layer1_0_shortcut_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Layer1", "_layer1_output.pt", "_layer1_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Layer2", "_layer2_output.pt", "_layer2_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Layer3", "_layer3_output.pt", "_layer3_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Layer4", "_layer4_output.pt", "_layer4_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
-            ("ResNet Features", "_features_output.pt", "_features_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir) 
-        ]
-
-        if not py_input_common_dir_path.exists() or not cpp_output_resnet_dir_path.exists():
-            print(f"ResNet input ({py_input_common_dir_path}) or C++ ResNet output dir ({cpp_output_resnet_dir_path}) not found. Skipping ResNet comparison.")
-            # Populate NaN for all expected ResNet comparisons if dirs are missing
-            for i in range(self.num_samples):
-                sample_key_base = f"ResNet_Sample_{i}"
-                current_errors = {}
-                self._compare_tensor_data(None, None, "ResNet Layer1", i, current_errors)
-                self._compare_tensor_data(None, None, "ResNet Layer2", i, current_errors)
-                self._compare_tensor_data(None, None, "ResNet Layer3", i, current_errors)
-                self._compare_tensor_data(None, None, "ResNet Layer4", i, current_errors)
-                self._compare_tensor_data(None, None, "ResNet Features", i, current_errors)
-                self.all_comparison_stats[sample_key_base] = current_errors
-            return
+            if not isinstance(input_tensor, torch.Tensor):
+                print(f"Warning: self.load_cpp_tensor for {sample_input_file_path} did not return a Tensor (got {type(input_tensor)}). Skipping sample {sample_idx}.")
+                # Populate NaNs for all expected outputs for this sample
+                empty_errors_for_skipped_sample = {}
+                for output_key_config in config['outputs_to_compare'].keys():
+                    self._compare_tensor_data(None, None, output_key_config, sample_idx, empty_errors_for_skipped_sample)
+                if empty_errors_for_skipped_sample:
+                    self.all_comparison_stats[f"ResNet_Sample_{sample_idx}"] = empty_errors_for_skipped_sample
+                continue
 
-        for i in tqdm(range(self.num_samples), desc="ResNet samples"):
-            current_errors = {} # For this sample
+            # Preprocess the input tensor for Python's ResNet
+            if hasattr(self.python_wrapper, 'preprocess_image'):
+                processed_input_tensor = self.python_wrapper.preprocess_image(input_tensor.clone()) # Use clone
+            else:
+                print("Warning: python_wrapper.preprocess_image not found. Using input_tensor as is.")
+                processed_input_tensor = input_tensor.to(self.device) # Ensure device
+            # --- END REINSTATED INPUT LOADING AND PREPROCESSING ---
 
-            py_image_input_path = py_input_common_dir_path / f'sample_{i}_image.pt'
-            py_image_tensor = self.load_cpp_tensor(py_image_input_path, self.device)
+            # Initialize dictionaries to store Python-side outputs for the current sample
+            python_outputs = {} # To store outputs from the Python model for this sample
 
-            py_conv1_out, py_bn1_out, py_relu1_out, py_maxpool_out, py_layer1_out, py_layer2_out, py_layer3_out, py_layer4_out, py_features_out = None, None, None, None, None, None, None, None, None # ADDED py_conv1_out, py_bn1_out, py_relu1_out
-            py_layer1_0_shortcut_out = None
+            try:
+                # Python ResNet forward pass (assuming it's a JIT model or similar)
+                # The output of a JIT ResNet model might be a dictionary or a list/tuple of tensors
+                # We need to ensure we can map these to the 'outputs_to_compare' keys
+                print(f"PYTHON ResNet forward pass for sample {sample_idx}...")
+                
+                # For ResNet, the output is a dictionary from its forward method.
+                # output_layers = list(config['outputs_to_compare'].keys()) # This might be too broad initially
+                
+                # Define the layers we actually need from the Python ResNet forward pass.
+                # These should match the keys used in the Python ResNet's forward method.
+                # e.g., ['layer1', 'layer2', 'layer3', 'layer4', 'conv1_output', 'bn1_output', etc.]
+                # For now, let's define specific layers needed for the comparison.
+                # The JIT ResNet model we have should output a dictionary.
+
+                py_output_layers_needed = ['conv1', 'layer1', 'layer2', 'layer3', 'layer4']
+                # Add 'conv1_pre_bn' if we need to compare the input to BN1
+                if 'Debug ResNet Conv1->BN1 Input' in config['outputs_to_compare']:
+                    py_output_layers_needed.append('conv1_pre_bn')
+                
+                # If we are comparing the direct C++ BN1 output, we need 'bn1_output' from Python
+                if 'BN1' in config['outputs_to_compare']:
+                    py_output_layers_needed.append('bn1_output') 
+                
+                # If we are comparing the C++ ReLU1 output (after BN1 and ReLU), we need 'bn1_post_relu_pre' from Python
+                if 'ReLU1' in config['outputs_to_compare']:
+                    py_output_layers_needed.append('bn1_post_relu_pre')
+
+                # Add Python-side BN1 intermediate layer names if they are in outputs_to_compare
+                # The config value (cpp_output_filename_or_tuple) is not directly used here for this part,
+                # we care about the py_dict_key that will be derived from the C++ key.
+                bn1_intermediate_py_keys_to_request = []
+                if 'BN1 Centered X' in config['outputs_to_compare']:
+                    bn1_intermediate_py_keys_to_request.append('bn1_centered_x_py')
+                if 'BN1 Var+Eps' in config['outputs_to_compare']:
+                    bn1_intermediate_py_keys_to_request.append('bn1_variance_plus_eps_py')
+                if 'BN1 InvStd' in config['outputs_to_compare']:
+                    bn1_intermediate_py_keys_to_request.append('bn1_inv_std_py')
+                if 'BN1 Normalized X' in config['outputs_to_compare']:
+                    bn1_intermediate_py_keys_to_request.append('bn1_normalized_x_py')
+                
+                for py_key in bn1_intermediate_py_keys_to_request:
+                    if py_key not in py_output_layers_needed:
+                        py_output_layers_needed.append(py_key)
 
-            if py_image_tensor is not None:
-                # Save Python's preprocessed input to conv1
-                # This py_image_tensor is already preprocessed by DiMPTorchScriptWrapper.extract_backbone -> preprocess_image
-                # which is called before this compare_resnet_outputs function if we follow the logic for py_feat_layer2, py_feat_layer3 in compare_bb_regressor
-                # However, here in compare_resnet_outputs, py_image_tensor comes from load_cpp_tensor(py_image_input_path, ...)
-                # which is the RAW image. Preprocessing for python side happens inside self.python_wrapper.extract_backbone
-                # or when we manually call py_model_resnet.conv1(py_image_tensor)
-                # Let's get the preprocessed image from the wrapper as that's the true input to Python's ResNet
+                # Add 'fc' if configured, though not typically used in these comparisons
+                if 'fc' in config['outputs_to_compare']:
+                     py_output_layers_needed.append('fc')
                 
-                # The input to python_wrapper.extract_backbone is the raw image tensor
-                # It then calls self.preprocess_image(im) and then self.net.extract_backbone_features(im, layers)
-                # So, py_image_tensor IS the raw image. We need to get the preprocessed one.
-
-                preprocessed_py_image_for_conv1 = None
-                if self.python_wrapper:
-                    # Manually preprocess for saving, mimicking what extract_backbone would do before its first conv
-                    preprocessed_py_image_for_conv1 = self.python_wrapper.preprocess_image(py_image_tensor.clone()) # Clone to avoid in-place modification of py_image_tensor
-                    py_preprocessed_save_path = Path(self.cpp_output_dir) / 'resnet' / f'sample_{i}_image_preprocessed_python.pt'
-                    # Ensure self.cpp_output_dir / resnet exists
-                    (Path(self.cpp_output_dir) / 'resnet').mkdir(parents=True, exist_ok=True)
-                    torch.save(preprocessed_py_image_for_conv1.cpu(), str(py_preprocessed_save_path))
-                    print(f"Saved Python preprocessed image for sample {i} to {py_preprocessed_save_path}")
+                # Deduplicate, just in case (though construction above should be fine)
+                py_output_layers_needed = list(OrderedDict.fromkeys(py_output_layers_needed))
+                
+                print(f"DEBUG: Requesting these layers from Python ResNet: {py_output_layers_needed}")
+
+                # Call the Python ResNet forward
+                # The `self.models['ResNet']` should be the loaded JIT model
+                # It expects the output_layers argument.
+                # The DiMPTorchScriptWrapper's backbone should also support this.
+                if hasattr(resnet_model, 'forward') and callable(getattr(resnet_model, 'forward')) and 'output_layers' in inspect.signature(resnet_model.forward).parameters:
+                    python_model_outputs_dict = resnet_model.forward(processed_input_tensor, output_layers=py_output_layers_needed)
+                elif hasattr(self.python_wrapper, 'extract_backbone') and callable(getattr(self.python_wrapper, 'extract_backbone')):
+                    # This is the case if ResNet is accessed via the DiMPTorchScriptWrapper's extract_backbone,
+                    # which internally calls the backbone's forward with output_layers.
+                    python_model_outputs_dict = self.python_wrapper.extract_backbone(input_tensor.clone()) # extract_backbone handles preprocessing
+                else:
+                    print(f"ERROR: Cannot call forward on Python ResNet model. Type: {type(resnet_model)}")
+                    continue
+
+                # DEBUG: Print keys from Python model output
+                if isinstance(python_model_outputs_dict, dict):
+                    print(f"DEBUG RN_CMP: Keys from python_model_outputs_dict (sample {sample_idx}): {list(python_model_outputs_dict.keys())}")
                 else:
-                    print("ERROR: self.python_wrapper not available to get preprocessed image for Python.")
+                    print(f"DEBUG RN_CMP: python_model_outputs_dict is not a dict (sample {sample_idx}), type: {type(python_model_outputs_dict)}")
+
+                # Populate python_outputs based on the python_model_outputs_dict
+                # This maps the Python output names to the keys used in 'outputs_to_compare'
+                if isinstance(python_model_outputs_dict, dict):
+                    python_outputs = python_model_outputs_dict
+                    # If 'features' is an alias for 'layer4' in Python output
+                    if 'layer4' in python_outputs and 'features' not in python_outputs:
+                         python_outputs['features'] = python_outputs['layer4']
+                    if 'conv1_output' in python_outputs:
+                        python_intermediate_outputs_cache['conv1_output'] = python_outputs['conv1_output']
+
+                else:
+                    print(f"ERROR: Python ResNet output is not a dict. Got {type(python_model_outputs_dict)}")
+                    # Handle tuple/list output if necessary, mapping by order or specific logic.
+                    # For now, we assume dict output from our ResNet.
+                    continue
+
+
+            except Exception as e:
+                print(f"Error during Python ResNet forward pass for sample {sample_idx}: {e}")
+                import traceback
+                traceback.print_exc()
+                continue # Skip to next sample
+
+            for output_key, cpp_output_filename_or_tuple in config['outputs_to_compare'].items():
+                is_python_specific_name = isinstance(cpp_output_filename_or_tuple, tuple)
+                cpp_output_filename = cpp_output_filename_or_tuple[0] if is_python_specific_name else cpp_output_filename_or_tuple
+                
+                # Corrected path construction for C++ ResNet tensors:
+                # The sample index is already part of the cpp_output_filename for ResNet outputs from C++.
+                # (e.g., sample_0_conv1_output.pt)
+                # So, we join cpp_resnet_dir directly with this filename.
+                # However, the C++ code actually saves ResNet outputs as sample_X_LAYERNAME.pt directly in cpp_resnet_dir,
+                # not in a per-sample subdirectory for ResNet outputs.
+                # Let's check how test_models.cpp saves them.
+                # test_models.cpp -> save_resnet_outputs -> file_path = resnet_output_dir + "/sample_" + std::to_string(sample_idx) + "_" + output_name;
+                # This means filenames are like "sample_0_conv1_output.pt" directly in "../test/output/resnet/"
+                
+                correct_cpp_tensor_filename = f"sample_{sample_idx}_{cpp_output_filename}"
+                cpp_tensor_path = os.path.join(cpp_resnet_dir, correct_cpp_tensor_filename)
+
+                # <<< START ADDED DEBUG PRINTS >>>
+                print(f"DEBUG RN_CMP: Attempting to load C++ tensor for '{output_key}' (sample {sample_idx}) from: {cpp_tensor_path}")
+                # <<< END ADDED DEBUG PRINTS >>>
 
                 try:
-                    with torch.no_grad():
-                        py_model_resnet = self.models.get('ResNet')
-                        if py_model_resnet:
-                            current_features = preprocessed_py_image_for_conv1
-
-                            py_conv1_out = py_model_resnet.conv1(current_features)
-                            # Ensure self.py_resnet_output_dir is defined and is a Path object
-                            if not hasattr(self, 'py_resnet_output_dir') or not self.py_resnet_output_dir:
-                                self.py_resnet_output_dir = Path(self.python_output_dir) / 'resnet'
-                                self.py_resnet_output_dir.mkdir(parents=True, exist_ok=True)
-                            py_conv1_out_path = self.py_resnet_output_dir / f'sample_{i}_conv1_output_py.pt'
-                            torch.save(py_conv1_out.cpu(), str(py_conv1_out_path))
-
-                            # --- BN1 on CPU for debugging (Python) ---
-                            py_bn1_out = py_model_resnet.bn1(py_conv1_out)  # Original line
-
-                            py_relu1_out = py_model_resnet.relu(py_bn1_out) 
-                            py_maxpool_out = py_model_resnet.maxpool(py_relu1_out) 
-                            x_for_py_layer1_input = py_maxpool_out
-
-                            # Output of the first bottleneck block in layer1
-                            py_layer1_0_block_out_tensor = None # Initialize to avoid ref before assignment if try fails
-                            if hasattr(py_model_resnet, 'layer1') and len(py_model_resnet.layer1) > 0:
-                                try:
-                                    py_layer1_0_block_out_tensor = py_model_resnet.layer1[0](x_for_py_layer1_input) # REMOVED .clone() for consistency with best Layer1.0 result
-                                    # Ensure cpp_resnet_sample_dir is defined, if not, use a fallback or define it earlier
-                                    # Assuming cpp_resnet_sample_dir is defined like: cpp_resnet_sample_dir = Path(self.cpp_output_dir) / 'resnet'
-                                    # Which should be: cpp_resnet_dir = Path(self.cpp_output_dir) / 'resnet' # as per usage elsewhere
-                                    # And then: cpp_resnet_sample_dir = cpp_resnet_dir # if sample specific subdirs are not used for this
-                                    # For safety, let's use the already established cpp_output_resnet_dir path from later in the code
-                                    # cpp_output_resnet_dir = os.path.join(self.cpp_output_dir, 'resnet')
-                                    # Need to ensure cpp_output_resnet_dir is a Path object if used with /
-                                    # From later code: cpp_output_resnet_dir_path = Path(self.cpp_output_dir) / 'resnet' 
-
-                                    current_cpp_resnet_dir = Path(self.cpp_output_dir) / 'resnet' # Define it based on existing patterns
-                                    current_cpp_resnet_dir.mkdir(parents=True, exist_ok=True) # Ensure directory exists
-
-                                    py_layer1_0_block_save_path = current_cpp_resnet_dir / f'sample_{i}_layer1_0_block_output.pt'
-                                    torch.save(py_layer1_0_block_out_tensor.cpu(), str(py_layer1_0_block_save_path))
-                                    # print(f"DEBUG: Saved Python layer1[0] block output for sample {i} to {py_layer1_0_block_save_path}")
-                                except Exception as e_block:
-                                    print(f"ERROR: Failed to get/save Python layer1[0] block output for sample {i}: {e_block}")
-
-                            # Shortcut for layer1.0 (if exists)
-                            if hasattr(py_model_resnet, 'layer1') and len(py_model_resnet.layer1) > 0 and \
-                               hasattr(py_model_resnet.layer1[0], 'downsample') and py_model_resnet.layer1[0].downsample is not None:
-                                py_layer1_0_shortcut_out = py_model_resnet.layer1[0].downsample(x_for_py_layer1_input.clone())
-                            
-                            # Get full backbone outputs using the wrapper (which uses the raw image_tensor and preprocesses internally)
-                            # This ensures layer1, layer2, etc., are from the standard path.
-                            if self.python_wrapper:
-                                py_backbone_outputs = self.python_wrapper.extract_backbone(py_image_tensor) # py_image_tensor is raw
-                            else:
-                                print("ERROR: self.python_wrapper is None, cannot extract backbone features for ResNet outputs.")
-                                py_backbone_outputs = {} 
-
-                            py_layer1_out = py_backbone_outputs.get('layer1')
-                            py_layer2_out = py_backbone_outputs.get('layer2')
-                            py_layer3_out = py_backbone_outputs.get('layer3')
-                            py_layer4_out = py_backbone_outputs.get('layer4')
-                            py_features_out = py_backbone_outputs.get('layer4') # Typically layer4 is the final feature map
-                        else:
-                            print("ERROR: Python ResNet model not found in self.models")
-                except Exception as e:
-                    print(f"ERROR: Python ResNet backbone/shortcut processing failed for sample {i}: {e}")
-            else:
-                print(f"Warning: Skipping Python ResNet for sample {i}, image input not found at {py_image_input_path}")
+                    cpp_tensor = self.load_cpp_tensor(cpp_tensor_path, self.device)
+                    # <<< START ADDED DEBUG PRINTS >>>
+                    loaded_status = "None"
+                    if cpp_tensor is not None:
+                        loaded_status = f"Tensor with shape {cpp_tensor.shape}, dtype {cpp_tensor.dtype}, device {cpp_tensor.device}"
+                    print(f"DEBUG RN_CMP: Loaded C++ tensor for '{output_key}' (sample {sample_idx}): {loaded_status}")
+                    # <<< END ADDED DEBUG PRINTS >>>
+
+                    if cpp_tensor is None:
+                        print(f"Warning: C++ tensor {cpp_output_filename} for sample {sample_idx} ('{output_key}') is None or loading failed. Skipping comparison for this output.")
+                        # _compare_tensor_data will be called with cpp_tensor=None, which handles NaN population
+                        # Fall through to _compare_tensor_data to record NaNs
+                        # continue # This would skip the _compare_tensor_data call entirely
+
+                    # Get the corresponding Python tensor
+                    python_tensor = None
+                    python_output_save_path = os.path.join(python_resnet_save_dir, f"sample_{sample_idx}", cpp_output_filename) # Save with same name as C++ for consistency
+
+                    # Map the 'output_key' from config to the key used in 'python_outputs' dictionary
+                    # This requires knowing how 'outputs_to_compare' keys map to Python model output dict keys.
+                    # Example: 'Conv1' maps to 'conv1_output', 'Features' to 'features' (which might be 'layer4'), etc.
+                    
+                    py_dict_key = None
+                    if output_key == 'Conv1': 
+                        py_dict_key = 'conv1_pre_bn' # Python ResNet outputs combined conv1+bn1+relu as 'conv1'
+                    elif output_key == 'Debug ResNet Conv1->BN1 Input':
+                        py_dict_key = 'conv1_pre_bn' # Our new specific output layer
+                    elif output_key == 'BN1': 
+                        py_dict_key = 'bn1_output' # CHANGED to use the new hook
+                    elif output_key == 'BN1 Centered X':
+                        py_dict_key = 'bn1_centered_x_py'
+                    elif output_key == 'BN1 Var+Eps':
+                        py_dict_key = 'bn1_variance_plus_eps_py'
+                    elif output_key == 'BN1 InvStd':
+                        py_dict_key = 'bn1_inv_std_py'
+                    elif output_key == 'BN1 Normalized X':
+                        py_dict_key = 'bn1_normalized_x_py'
+                    elif output_key == 'ReLU1':
+                        py_dict_key = 'bn1_post_relu_pre' # Output of Python's BN1 + ReLU
+                    elif output_key == 'MaxPool':
+                        # MaxPool is applied *after* 'conv1' (conv1+bn1+relu) block in Python ResNet.
+                        # However, the Python ResNet forward doesn't have a separate 'maxpool' output key.
+                        # The output of layer1 is *after* maxpool.
+                        # C++ saves maxpool_output.pt *before* layer1.
+                        # This means we need to save python_outputs['conv1'] (after conv1,bn1,relu) then apply maxpool to it manually for comparison.
+                        # OR, recognize that C++ output for maxpool is input to layer1.
+                        # For now, this is tricky. Let's see if layer1 input in C++ matches python maxpool output.
+                        # The Python output named 'layer1' is after the nn.Sequential that IS layer1.
+                        # The input to C++ layer1 is the output of C++ maxpool.
+                        # The input to Python model.layer1 is the output of model.maxpool(model.relu(model.bn1(model.conv1(x)))).
+                        # So, Python's 'conv1' output, when passed through an nn.MaxPool2d, should match C++ 'maxpool_output.pt'.
+                        print(f"Warning: Direct Python equivalent for C++ 'MaxPool' output is complex. Requires manual maxpool application to Python's 'conv1' output. Skipping {output_key} for now.")
+                        continue # Skip this key for now
+                    elif output_key == 'Layer1': py_dict_key = 'layer1'
+                    elif output_key == 'Layer2': py_dict_key = 'layer2'
+                    elif output_key == 'Layer3': py_dict_key = 'layer3'
+                    elif output_key == 'Layer4': py_dict_key = 'layer4'
+                    elif output_key == 'Features': py_dict_key = 'layer4' # 'Features' is an alias for 'layer4'
+                    elif output_key == 'Layer1.0 Shortcut':
+                        # Shortcut outputs are not available from the Python ResNet forward method.
+                        print(f"Warning: Shortcut output '{output_key}' cannot be directly fetched from Python ResNet. Skipping.")
+                        continue
+                    else:
+                        print(f"Warning: Unknown output_key '{output_key}' in ResNet config for Python tensor mapping. Skipping.")
+                        continue
 
-            # Load C++ ResNet outputs
+                    if py_dict_key and py_dict_key in python_outputs:
+                        python_tensor = python_outputs[py_dict_key]
+                    else:
+                        # DEBUG: Print info if key is not found
+                        print(f"DEBUG RN_CMP: py_dict_key '{py_dict_key}' not found in python_outputs (keys: {list(python_outputs.keys())}) for output_key '{output_key}', sample {sample_idx}")
+                    
+                    if python_tensor is None:
+                        print(f"Warning: Python tensor for {output_key} is None for sample {sample_idx}. Skipping.")
+                        continue
 
-            # NEW: Debug directory listing
-            print(f"DEBUG: Listing contents of {cpp_output_resnet_dir_path} before loading tensors for sample {i}:")
-            try:
-                if cpp_output_resnet_dir_path.exists() and cpp_output_resnet_dir_path.is_dir():
-                    for item_path in cpp_output_resnet_dir_path.iterdir():
-                        print(f"  - {item_path.name}")
-                else:
-                    print(f"  Directory {cpp_output_resnet_dir_path} does not exist or is not a directory.")
-            except Exception as e_list:
-                print(f"  ERROR listing directory: {e_list}")
-            # END NEW # Removing this marker
-
-            time.sleep(0.5) # INCREASED to 0.5s delay to allow filesystem to sync
-
-            # Debug blocks for directory listing and direct open test were here and are now fully removed.
-
-            cpp_layer1_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_layer1.pt')
-            cpp_layer2_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_layer2.pt')
-            cpp_layer3_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_layer3.pt')
-            cpp_layer4_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_layer4.pt')
-            cpp_features_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_features.pt')
-            cpp_layer1_0_shortcut_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_layer1_0_shortcut_output.pt')
-            cpp_maxpool_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_maxpool_output.pt')
-            cpp_conv1_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_conv1_output.pt') # ADDED
-            cpp_bn1_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_bn1_output.pt') # ADDED
-            cpp_relu1_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_relu1_output.pt') # ADDED
-            cpp_layer1_0_block_output_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_layer1_0_block_output.pt') # ADDED
-
-            cpp_layer1_out = self.load_cpp_tensor(cpp_layer1_path, self.device)
-            cpp_layer2_out = self.load_cpp_tensor(cpp_layer2_path, self.device)
-            cpp_layer3_out = self.load_cpp_tensor(cpp_layer3_path, self.device)
-            cpp_layer4_out = self.load_cpp_tensor(cpp_layer4_path, self.device)
-            cpp_features_out = self.load_cpp_tensor(cpp_features_path, self.device)
-            cpp_layer1_0_shortcut_out = self.load_cpp_tensor(cpp_layer1_0_shortcut_path, self.device)
-            cpp_maxpool_out = self.load_cpp_tensor(cpp_maxpool_path, self.device)
-            cpp_conv1_out = self.load_cpp_tensor(cpp_conv1_path, self.device) # ADDED
-            cpp_bn1_out = self.load_cpp_tensor(cpp_bn1_path, self.device) # ADDED
-            cpp_relu1_out = self.load_cpp_tensor(cpp_relu1_path, self.device) # ADDED
-            cpp_layer1_0_block_output_tensor = self.load_cpp_tensor(cpp_layer1_0_block_output_path, self.device) # ADDED
-
-            # Load the Python pre-BN conv1 output that was saved earlier
-            py_conv1_out_pre_bn_tensor = None
-            # Ensure self.py_resnet_output_dir is defined (it should be if the save operation worked)
-            if hasattr(self, 'py_resnet_output_dir') and self.py_resnet_output_dir:
-                py_conv1_out_pre_bn_path = self.py_resnet_output_dir / f'sample_{i}_conv1_output_py.pt'
-                if py_conv1_out_pre_bn_path.exists():
-                    try:
-                        py_conv1_out_pre_bn_tensor = torch.load(str(py_conv1_out_pre_bn_path), map_location=self.device)
-                    except Exception as e_load_py_conv1:
-                        print(f"Error loading Python conv1_output_py (pre-BN) for sample {i}: {e_load_py_conv1}")
-            else:
-                print(f"Warning: self.py_resnet_output_dir not defined, cannot load py_conv1_output_py.pt for sample {i}")
+                    # Save the Python tensor (always, for record-keeping)
+                    os.makedirs(os.path.dirname(python_output_save_path), exist_ok=True)
+                    torch.save(python_tensor.cpu(), python_output_save_path)
+                    # print(f"Saved Python tensor for {output_key} (sample {sample_idx}) to {python_output_save_path}")
 
-            # Comparisons
-            self._compare_tensor_data(py_conv1_out_pre_bn_tensor, cpp_conv1_out, "ResNet Conv1 Output (Pre-BN)", i, current_errors)
-            self._compare_tensor_data(py_conv1_out, cpp_conv1_out, "ResNet Conv1", i, current_errors)
-            self._compare_tensor_data(py_bn1_out, cpp_bn1_out, "ResNet BN1", i, current_errors)
-            self._compare_tensor_data(py_relu1_out, cpp_relu1_out, "ResNet ReLU1", i, current_errors)
-            self._compare_tensor_data(py_maxpool_out, cpp_maxpool_out, "ResNet MaxPool", i, current_errors)
-            self._compare_tensor_data(py_layer1_out, cpp_layer1_out, "ResNet Layer1", i, current_errors)
-            self._compare_tensor_data(py_layer2_out, cpp_layer2_out, "ResNet Layer2", i, current_errors)
-            self._compare_tensor_data(py_layer3_out, cpp_layer3_out, "ResNet Layer3", i, current_errors)
-            self._compare_tensor_data(py_layer4_out, cpp_layer4_out, "ResNet Layer4", i, current_errors)
-            self._compare_tensor_data(py_features_out, cpp_features_out, "ResNet Features", i, current_errors)
-            self._compare_tensor_data(py_layer1_0_shortcut_out, cpp_layer1_0_shortcut_out, "ResNet Layer1.0 Shortcut", i, current_errors)
-
-            if current_errors: self.all_comparison_stats[f"ResNet_Sample_{i}"] = current_errors
+
+                    # Perform comparison
+                    self._compare_tensor_data(python_tensor.to(self.device) if python_tensor is not None else None, 
+                                          cpp_tensor, 
+                                          output_key, 
+                                          sample_idx, 
+                                          current_errors) # current_errors is populated in place
+                    
+                    # The line above was changed to handle python_tensor being None before .to(self.device)
+                    # current_errors is populated by _compare_tensor_data directly.
+                    # self.all_comparison_stats is updated after this inner loop completes for the sample.
+
+                except FileNotFoundError:
+                    print(f"Warning: C++ output file not found: {cpp_tensor_path}. Skipping for sample {sample_idx}, output {output_key}.")
+                    # Populate NaNs for this missing C++ file
+                    self._compare_tensor_data(None, None, output_key, sample_idx, current_errors)
+                except Exception as e:
+                    print(f"Error comparing {output_key} for sample {sample_idx}: {e}")
+                    import traceback
+                    traceback.print_exc()
+                    # Populate NaNs on error
+                    self._compare_tensor_data(None, None, output_key, sample_idx, current_errors)
+            
+            # After processing all output_keys for this sample, store the collected current_errors
+            if current_errors: # If any comparisons were attempted (even if they resulted in NaNs)
+                self.all_comparison_stats[f"ResNet_Sample_{sample_idx}"] = current_errors
+
+            # processed_samples += 1 # This variable is no longer used as loop is range-based
+        print("--- ResNet Output Comparison Complete ---")
 
     def generate_html_report(self):
         print("\nGenerating HTML report...")
@@ -1200,73 +1330,247 @@ class ComparisonRunner:
             print("Preprocessed input comparison: ISSUES FOUND (details above).")
 
     def load_cpp_tensor(self, file_path_str, device, is_image=False):
-        file_path_obj = Path(file_path_str) # Convert to Path object early
-
-        # Removed debug print: print(f"DEBUG: load_cpp_tensor: Checking existence of Path object: '{file_path_obj}' (from string '{file_path_str}')")
-
-        if not file_path_obj.exists(): # Use Path object for exists check
-            print(f"ERROR: C++ tensor file not found (Path.exists check): {file_path_obj}")
+        if not os.path.exists(file_path_str):
             return None
+        
+        attempt_jit_extraction = False
+        loaded_object_from_direct_load = None
+
         try:
-            # Try loading as a JIT ScriptModule first (common for exported tensors that might have attributes)
-            # This also handles plain tensors saved with torch.save if they are not ScriptModules
-            loaded_obj = torch.jit.load(str(file_path_obj), map_location=device) # Convert Path to str for torch.jit.load
-            actual_tensor = None
-
-            if isinstance(loaded_obj, torch.jit.ScriptModule):
-                # Attempt to get tensor attribute directly, common for simple JIT-saved tensors
-                # Check for common weight/tensor attributes first
-                if hasattr(loaded_obj, 'tensor'): # Explicit "tensor" attribute
-                    if isinstance(loaded_obj.tensor, torch.Tensor):
-                        actual_tensor = loaded_obj.tensor
-                elif hasattr(loaded_obj, 'weight') and isinstance(loaded_obj.weight, torch.Tensor): # Common for conv/linear
-                     actual_tensor = loaded_obj.weight
-                # Heuristic: if it has parameters and only one, assume that's the one.
-                elif len(list(loaded_obj.parameters())) == 1:
-                    actual_tensor = list(loaded_obj.parameters())[0]
-                # Heuristic: if it has attributes that are tensors, try to find the primary one
-                else:
-                    tensor_attrs = [getattr(loaded_obj, attr) for attr in dir(loaded_obj) if isinstance(getattr(loaded_obj, attr, None), torch.Tensor)]
-                    if len(tensor_attrs) == 1:
-                        actual_tensor = tensor_attrs[0]
-                    elif len(tensor_attrs) > 1:
-                         # If multiple tensor attributes, try to find one that matches common patterns or is simply 'output'
-                        if hasattr(loaded_obj, 'output') and isinstance(loaded_obj.output, torch.Tensor):
-                            actual_tensor = loaded_obj.output
-                        else: # Heuristic: take the largest tensor if multiple exist and no clear primary one
-                            actual_tensor = max(tensor_attrs, key=lambda t: t.numel())
-                            # print(f"WARNING: Multiple tensor attributes in ScriptModule from {file_path_obj}, using largest: {actual_tensor.shape}")
-
-                if actual_tensor is None:
-                    print(f"ERROR: C++ tensor from {file_path_obj} is a ScriptModule, but couldn't extract a single tensor. StateDict keys: {list(loaded_obj.state_dict().keys()) if hasattr(loaded_obj, 'state_dict') else 'N/A'}")
-                    return None
+            # Attempt direct load first
+            loaded_object_from_direct_load = torch.load(file_path_str, map_location=device, weights_only=False)
+            
+            if isinstance(loaded_object_from_direct_load, torch.Tensor):
+                return loaded_object_from_direct_load.to(device) # Successfully loaded a tensor directly
+            else:
+                # Loaded something, but it's not a tensor. It's likely a JIT module.
+                attempt_jit_extraction = True
+                print(f"INFO: Initial torch.load of {file_path_str} yielded a non-Tensor (type: {type(loaded_object_from_direct_load)}). Will attempt JIT extraction.")
+
+        except Exception as e_initial_load:
+            # Initial load failed (e.g., it's a JIT module not readable by plain torch.load, or other error)
+            attempt_jit_extraction = True 
+            print(f"INFO: Initial torch.load failed for {file_path_str}: {e_initial_load}. Will attempt JIT extraction.")
+        
+        # Common JIT tensor extraction logic
+        def extract_tensor_from_jit_module(module_path, jit_loaded_obj, dev):
+            print(f"DEBUG JIT EXTRACTION: For {module_path}, loaded_obj type: {type(jit_loaded_obj)}")
+            print(f"DEBUG JIT EXTRACTION: str(loaded_obj): {str(jit_loaded_obj)}")
+            # print(f"DEBUG JIT EXTRACTION: dir(loaded_obj): {dir(jit_loaded_obj)}") # Verbose
 
-            elif isinstance(loaded_obj, torch.Tensor):
-                actual_tensor = loaded_obj
+            extracted_tensor = None
+
+            # 1. Try calling if 'forward' method exists
+            if hasattr(jit_loaded_obj, 'forward') and callable(getattr(jit_loaded_obj, 'forward')):
+                print(f"DEBUG JIT EXTRACTION: Attempting jit_loaded_obj.forward()")
+                try:
+                    extracted_tensor = jit_loaded_obj.forward()
+                    if not isinstance(extracted_tensor, torch.Tensor):
+                        print(f"DEBUG JIT EXTRACTION: jit_loaded_obj.forward() did not return a tensor, got {type(extracted_tensor)}. Trying with dummy input.")
+                        extracted_tensor = None # Reset before trying with dummy
+                        try:
+                            print(f"DEBUG JIT EXTRACTION: Attempting jit_loaded_obj.forward(torch.empty(0))")
+                            extracted_tensor = jit_loaded_obj.forward(torch.empty(0, device=dev))
+                            if not isinstance(extracted_tensor, torch.Tensor):
+                                print(f"DEBUG JIT EXTRACTION: jit_loaded_obj.forward(dummy) also did not return a tensor, got {type(extracted_tensor)}")
+                                extracted_tensor = None
+                        except Exception as e_fwd_dummy:
+                            print(f"DEBUG JIT EXTRACTION: Error calling jit_loaded_obj.forward(dummy): {e_fwd_dummy}")
+                            extracted_tensor = None
+                except Exception as e_fwd: # This covers cases where forward exists but call fails (e.g. signature mismatch)
+                    print(f"DEBUG JIT EXTRACTION: Error calling jit_loaded_obj.forward(): {e_fwd}. Trying with dummy input as fallback.")
+                    extracted_tensor = None # Reset
+                    try:
+                        print(f"DEBUG JIT EXTRACTION: Attempting jit_loaded_obj.forward(torch.empty(0)) after error.")
+                        extracted_tensor = jit_loaded_obj.forward(torch.empty(0, device=dev))
+                        if not isinstance(extracted_tensor, torch.Tensor):
+                            print(f"DEBUG JIT EXTRACTION: jit_loaded_obj.forward(dummy) after error also did not return a tensor, got {type(extracted_tensor)}")
+                            extracted_tensor = None
+                    except Exception as e_fwd_dummy_after_error:
+                        print(f"DEBUG JIT EXTRACTION: Error calling jit_loaded_obj.forward(dummy) after initial fwd error: {e_fwd_dummy_after_error}")
+                        extracted_tensor = None
+            
+            # 1b. Try calling the module directly if forward attribute exists (covers some cases)
+            # This is after trying explicit .forward() as direct call might have side effects or different interpretation
+            if extracted_tensor is None and callable(jit_loaded_obj) and hasattr(jit_loaded_obj, 'forward'):
+                 print(f"DEBUG JIT EXTRACTION: Attempting callable jit_loaded_obj()")
+                 try:
+                    extracted_tensor = jit_loaded_obj()
+                    if not isinstance(extracted_tensor, torch.Tensor):
+                        print(f"DEBUG JIT EXTRACTION: callable jit_loaded_obj() did not return a tensor, got {type(extracted_tensor)}")
+                        extracted_tensor = None
+                 except Exception as e_call_obj:
+                    print(f"DEBUG JIT EXTRACTION: Error calling callable jit_loaded_obj() (it had a forward attr): {e_call_obj}")
+                    extracted_tensor = None
+
+
+            # 2. Check if 'forward' attribute *itself* is a tensor
+            if extracted_tensor is None and hasattr(jit_loaded_obj, 'forward') and isinstance(getattr(jit_loaded_obj, 'forward'), torch.Tensor):
+                print(f"DEBUG JIT EXTRACTION: jit_loaded_obj.forward IS a tensor.")
+                extracted_tensor = getattr(jit_loaded_obj, 'forward')
+
+            # 3. Look for common direct tensor attributes
+            if extracted_tensor is None and hasattr(jit_loaded_obj, 'tensor') and isinstance(getattr(jit_loaded_obj, 'tensor'), torch.Tensor):
+                print(f"DEBUG JIT EXTRACTION: Found tensor in jit_loaded_obj.tensor")
+                extracted_tensor = jit_loaded_obj.tensor
+            
+            if extracted_tensor is None and hasattr(jit_loaded_obj, 'data') and isinstance(getattr(jit_loaded_obj, 'data'), torch.Tensor):
+                print(f"DEBUG JIT EXTRACTION: Found tensor in jit_loaded_obj.data")
+                extracted_tensor = jit_loaded_obj.data
+            
+            if extracted_tensor is None and hasattr(jit_loaded_obj, 'tensor_data') and isinstance(getattr(jit_loaded_obj, 'tensor_data'), torch.Tensor):
+                print(f"DEBUG JIT EXTRACTION: Found tensor in jit_loaded_obj.tensor_data")
+                extracted_tensor = jit_loaded_obj.tensor_data
+
+            # 4. Iterate through named_buffers (common for wrapped tensors)
+            if extracted_tensor is None:
+                print(f"DEBUG JIT EXTRACTION: Iterating named_buffers for a tensor...")
+                try:
+                    for name, buffer_tensor in jit_loaded_obj.named_buffers():
+                        if isinstance(buffer_tensor, torch.Tensor):
+                            print(f"DEBUG JIT EXTRACTION: Found tensor in named_buffers: {name}")
+                            extracted_tensor = buffer_tensor
+                            break
+                except Exception as e_buffers:
+                    print(f"DEBUG JIT EXTRACTION: Error iterating named_buffers: {e_buffers}")
+
+
+            # 5. Iterate through named_parameters
+            if extracted_tensor is None:
+                print(f"DEBUG JIT EXTRACTION: Iterating named_parameters for a tensor...")
+                try:
+                    for name, param_tensor in jit_loaded_obj.named_parameters():
+                        if isinstance(param_tensor, torch.Tensor):
+                            print(f"DEBUG JIT EXTRACTION: Found tensor in named_parameters: {name}")
+                            extracted_tensor = param_tensor
+                            break
+                except Exception as e_params:
+                    print(f"DEBUG JIT EXTRACTION: Error iterating named_parameters: {e_params}")
+            
+            # 6. Iterate through members (attributes) using inspect.getmembers - potentially fragile
+            if extracted_tensor is None:
+                print(f"DEBUG JIT EXTRACTION: Attempting to iterate members using inspect.getmembers...")
+                try:
+                    for name, member_obj in inspect.getmembers(jit_loaded_obj):
+                        if isinstance(member_obj, torch.Tensor):
+                            # Avoid re-picking already checked common names if they are somehow also members
+                            if name not in ['tensor', 'data', 'tensor_data', 'forward']:
+                                print(f"DEBUG JIT EXTRACTION: Found tensor in member (inspect.getmembers): {name}")
+                                extracted_tensor = member_obj
+                                break
+                except RuntimeError as e_inspect:
+                    # Specifically catch RuntimeError that was observed: "Method 'forward' is not defined"
+                    print(f"DEBUG JIT EXTRACTION: inspect.getmembers failed with RuntimeError: {e_inspect}. Skipping this method.")
+                except Exception as e_inspect_other:
+                    print(f"DEBUG JIT EXTRACTION: inspect.getmembers failed with other Exception: {e_inspect_other}. Skipping this method.")
+
+            # 7. Iterate through named_children and inspect
+            if extracted_tensor is None:
+                print(f"DEBUG JIT EXTRACTION: Iterating named_children...")
+                try:
+                    for child_name, child_module in jit_loaded_obj.named_children():
+                        print(f"DEBUG JIT EXTRACTION: Inspecting child: {child_name} of type {type(child_module)}")
+                        # Try common ways to get tensor from child
+                        if hasattr(child_module, 'forward') and callable(getattr(child_module, 'forward')) :
+                            try:
+                                temp_tensor = child_module.forward()
+                                if isinstance(temp_tensor, torch.Tensor):
+                                    print(f"DEBUG JIT EXTRACTION: Found tensor by calling child {child_name}.forward()")
+                                    extracted_tensor = temp_tensor; break
+                            except: pass
+                        
+                        if extracted_tensor is None and callable(child_module) and hasattr(child_module, 'forward'): # Added hasattr forward here
+                           try:
+                               temp_tensor = child_module()
+                               if isinstance(temp_tensor, torch.Tensor):
+                                   print(f"DEBUG JIT EXTRACTION: Found tensor by calling child {child_name}()")
+                                   extracted_tensor = temp_tensor; break
+                           except: pass
+
+                        if extracted_tensor is None and hasattr(child_module, 'forward') and isinstance(getattr(child_module, 'forward'), torch.Tensor):
+                            extracted_tensor = getattr(child_module, 'forward')
+                            print(f"DEBUG JIT EXTRACTION: Found tensor in child {child_name}.forward (as attribute)")
+                            break
+                        
+                        if extracted_tensor is None and hasattr(child_module, 'tensor') and isinstance(getattr(child_module, 'tensor'), torch.Tensor):
+                            extracted_tensor = child_module.tensor
+                            print(f"DEBUG JIT EXTRACTION: Found tensor in child {child_name}.tensor")
+                            break
+                        if extracted_tensor is None and hasattr(child_module, 'data') and isinstance(getattr(child_module, 'data'), torch.Tensor):
+                            extracted_tensor = child_module.data
+                            print(f"DEBUG JIT EXTRACTION: Found tensor in child {child_name}.data")
+                            break
+                        if extracted_tensor is None and hasattr(child_module, 'tensor_data') and isinstance(getattr(child_module, 'tensor_data'), torch.Tensor):
+                            extracted_tensor = child_module.tensor_data
+                            print(f"DEBUG JIT EXTRACTION: Found tensor in child {child_name}.tensor_data")
+                            break
+                        
+                        if extracted_tensor is None: # Check general members of child if direct attributes fail
+                            try:
+                                for name, member_obj in inspect.getmembers(child_module):
+                                     if isinstance(member_obj, torch.Tensor):
+                                        print(f"DEBUG JIT EXTRACTION: Found tensor in member {name} of child {child_name}")
+                                        extracted_tensor = member_obj; break
+                                if extracted_tensor is not None: break
+                            except Exception as e_child_inspect:
+                                print(f"DEBUG JIT EXTRACTION: inspect.getmembers on child {child_name} failed: {e_child_inspect}")
+                                
+                    if extracted_tensor is not None:
+                         print(f"DEBUG JIT EXTRACTION: Tensor found in a child module.")
+                    else:
+                         print(f"DEBUG JIT EXTRACTION: Tensor not found in direct children.")
+                except Exception as e_children:
+                    print(f"DEBUG JIT EXTRACTION: Error iterating named_children: {e_children}")
+
+
+            if isinstance(extracted_tensor, torch.Tensor):
+                print(f"DEBUG JIT EXTRACTION: Successfully extracted tensor of shape {extracted_tensor.shape} from JIT module {module_path}")
+                return extracted_tensor.to(dev)
             else:
-                print(f"ERROR: C++ tensor loaded from {file_path_obj} with torch.jit.load is not a Tensor or ScriptModule. Type: {type(loaded_obj)}")
+                print(f"Warning: JIT EXTRACTION: Could not extract tensor from JIT module: {module_path}. Final extracted_type: {type(extracted_tensor)}. THIS FILE WILL BE SKIPPED.")
                 return None
+
+        if attempt_jit_extraction:
+            # attempt_jit_extraction is set when torch.load either raised or returned a non-Tensor object.
+            # A non-Tensor result is reused directly below; otherwise the file is re-read with torch.jit.load.
+            # The critical part is that C++ outputs are *always* JIT modules now if not raw tensors.
             
-            tensor = actual_tensor.to(device).float() # Ensure tensor is on the correct device and float
-            return tensor
-        except Exception as e:
-            # If torch.jit.load fails (e.g. it's a plain tensor not loadable by JIT), try torch.load
-            # This also catches errors from the processing above if actual_tensor remains None
-            # print(f"INFO: torch.jit.load failed for {file_path_obj} ({e}), attempting torch.load as fallback.")
-            try:
-                tensor = torch.load(str(file_path_obj), map_location=device) # Convert Path to str for torch.load
-                if not isinstance(tensor, torch.Tensor):
-                    print(f"ERROR: Fallback torch.load for {file_path_obj} did not return a tensor. Type: {type(tensor)}")
+            jit_module_to_process = None
+            if loaded_object_from_direct_load is not None and not isinstance(loaded_object_from_direct_load, torch.Tensor):
+                # torch.load succeeded but returned something other than a Tensor; it is assumed to be a
+                # JIT module (e.g. a file written with torch.jit.save) and is handed to the extractor as-is.
+                print(f"DEBUG JIT: Using object from initial torch.load (type: {type(loaded_object_from_direct_load)}) for JIT extraction for {file_path_str}.")
+                jit_module_to_process = loaded_object_from_direct_load
+            else:
+                # Reached when loaded_object_from_direct_load is None (torch.load raised or yielded None)
+                # or is a Tensor whose .to(device) raised; is_image is not consulted on this path.
+                # In all of these cases the file is loaded afresh with torch.jit.load.
+                try:
+                    print(f"DEBUG JIT: Attempting torch.jit.load for {file_path_str} as fallback/primary JIT path.")
+                    jit_module_to_process = torch.jit.load(file_path_str, map_location=device)
+                except Exception as e_jit_load_explicit:
+                    print(f"Error: torch.jit.load also failed for {file_path_str}: {e_jit_load_explicit}. Traceback: {traceback.format_exc()}. SKIPPING.")
                     return None
-                return tensor.to(device).float() # Ensure tensor is on the correct device and float
-            except Exception as e2:
-                print(f"ERROR: Failed to load C++ tensor from {file_path_obj}. JIT load error: {e}. Torch load error: {e2}")
-                import traceback
-                traceback.print_exc()
+            
+            if jit_module_to_process is not None:
+                final_tensor = extract_tensor_from_jit_module(file_path_str, jit_module_to_process, device)
+                if final_tensor is not None:
+                    return final_tensor
+                else:
+                    print(f"Warning: JIT extraction path for {file_path_str} (using {type(jit_module_to_process)}) failed to extract tensor. SKIPPING file.")
+                    return None
+            else:
+                # Defensive: only reachable if torch.jit.load itself returned None, because a raised error already returned None above.
+                print(f"Warning: jit_module_to_process is None for {file_path_str} before calling extraction. SKIPPING file.")
                 return None
 
-    def _compare_tensor_data(self, tensor1, tensor2, name, sample_idx, current_errors):
-        """Compare two tensors and return error metrics."""
+        # Defensive fallback: a directly loaded tensor has already been returned, and every branch of the
+        # JIT path above returns (a tensor or None), so this point is not expected to be reached.
+        print(f"Warning: load_cpp_tensor is returning None for {file_path_str} after all attempts. This indicates an issue with file content or loading logic for this specific file type when is_image={is_image}.")
+        return None
+
+    def _compare_tensor_data(self, tensor1, tensor2, name, sample_idx, current_errors_dict_to_populate):
+        """Compare two tensors and record the error metrics in current_errors_dict_to_populate[name] (mutated in place)."""
         num_metrics = 11 # mae, max_err, diff_arr, mean_py_val, std_abs_err, l2_py, l2_cpp, l2_diff, cos_sim, pearson, mre
         nan_metrics_tuple = (
             float('nan'), float('nan'), [], float('nan'), float('nan'), # Original 5
@@ -1280,27 +1584,26 @@ class ComparisonRunner:
                 t1_cpu_temp = tensor1.cpu().detach().numpy().astype(np.float32)
                 py_mean = np.mean(t1_cpu_temp)
                 py_l2 = np.linalg.norm(t1_cpu_temp.flatten())
-            # If only tensor2 is None, we can't calculate C++ l2 or comparison metrics
-            # If only tensor1 is None, py_mean and py_l2 remain NaN.
             
-            current_errors[name] = (
+            # Populate current_errors_dict_to_populate directly
+            current_errors_dict_to_populate[name] = (
                 float('nan'), float('nan'), [], py_mean, float('nan'),
                 py_l2, float('nan'), float('nan'), float('nan'), float('nan'), float('nan')
             )
             print(f"Warning: Cannot compare '{name}' for sample {sample_idx}, one or both tensors are None.")
-            return
+            return # Return None as the function modifies dict in place
 
         t1_cpu = tensor1.cpu().detach().numpy().astype(np.float32)
         t2_cpu = tensor2.cpu().detach().numpy().astype(np.float32)
 
         if t1_cpu.shape != t2_cpu.shape:
             print(f"Warning: Shape mismatch for '{name}' sample {sample_idx}. Py: {t1_cpu.shape}, Cpp: {t2_cpu.shape}. Skipping most comparisons.")
-            current_errors[name] = (
+            current_errors_dict_to_populate[name] = (
                 float('nan'), float('nan'), [], np.mean(t1_cpu), float('nan'), # MAE, MaxErr, diff_arr, MeanPy, StdAbsErr
                 np.linalg.norm(t1_cpu.flatten()), np.linalg.norm(t2_cpu.flatten()), float('nan'), # L2Py, L2Cpp, L2Diff
                 float('nan'), float('nan'), float('nan') # CosSim, Pearson, MRE
             )
-            return
+            return # Return None
         
         # All calculations from here assume shapes match and tensors are not None
         t1_flat = t1_cpu.flatten()
@@ -1355,7 +1658,8 @@ class ComparisonRunner:
         # Using (abs(t1_cpu) + epsilon) in denominator handles this.
         mean_rel_err = np.mean(abs_diff_elements / (np.abs(t1_cpu) + epsilon_rel_err))
 
-        current_errors[name] = (
+        # Populate current_errors_dict_to_populate directly
+        current_errors_dict_to_populate[name] = (
             mae, max_err, diff_arr_for_hist, mean_py_val, std_abs_err,
             l2_norm_py, l2_norm_cpp, l2_norm_diff, cosine_sim, pearson_corr, mean_rel_err
         )
diff --git a/test/test_models.cpp b/test/test_models.cpp
index 912167c..9415cfc 100644
--- a/test/test_models.cpp
+++ b/test/test_models.cpp
@@ -127,7 +127,10 @@ int main(int argc, char* argv[]) {
         std::vector<std::string> output_layers_resnet = {
             "conv1_output", "bn1_output", "relu1_output", "maxpool_output",
             "layer1", "layer2", "layer3", "layer4", "features", 
-            "layer1_0_shortcut_output", "layer1_0_block_output"
+            "layer1_0_shortcut_output", "layer1_0_block_output",
+            "debug_resnet_conv1_output_for_bn1_input",
+            // New BN1 intermediate outputs
+            "bn1_centered_x", "bn1_variance_plus_eps", "bn1_inv_std", "bn1_normalized_x"
         };
         resnet_model_opt.emplace(cimp::resnet::resnet50(resnet_weights_path, output_layers_resnet, device)); 
         (*resnet_model_opt)->to(device);
@@ -291,6 +294,14 @@ int main(int argc, char* argv[]) {
                 } else {
                     std::cerr << "  Skipping BBRegressor predict_iou for sample " << i << " (iou_feats or mod_vectors empty)." << std::endl;
                 }
+
+                // Save debug intermediate outputs
+                torch::Tensor cpp_conv3_1t_out = (*bb_regressor_model_opt_wrapped).debug_get_conv3_1t_output(resnet_outputs["layer2"].clone());
+                save_tensor_to_file(cpp_conv3_1t_out, (bb_reg_out_dir / (sample_suffix + "_debug_conv3_1t_output.pt")).string());
+
+                torch::Tensor cpp_conv4_1t_out = (*bb_regressor_model_opt_wrapped).debug_get_conv4_1t_output(resnet_outputs["layer3"].clone());
+                save_tensor_to_file(cpp_conv4_1t_out, (bb_reg_out_dir / (sample_suffix + "_debug_conv4_1t_output.pt")).string());
+
                 std::cout << "BBRegressor processing done for sample " << i << std::endl;
             } catch (const std::exception& e) {
                 std::cerr << "Error during BBRegressor processing for sample " << i << ": " << e.what() << std::endl;