Fix NameError in compare_models; Add ResNet BN1 debug prints; Prepare to address other modules

5 months ago · e61886fb23
1527 changed files with 377 additions and 953 deletions
--- a/build/CMakeCache.txt
+++ b/build/CMakeCache.txt
@ -201,8 +201,8 @@ CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING=
 //Path to a program.
 CMAKE_STRIP:FILEPATH=/usr/bin/strip

-//No help, variable specified on the command line.
-CMAKE_TOOLCHAIN_FILE:UNINITIALIZED=../vcpkg/scripts/buildsystems/vcpkg.cmake
+//The CMake toolchain file
+CMAKE_TOOLCHAIN_FILE:FILEPATH=/media/mht/ADATA/repos/cpp_tracker/vcpkg/scripts/buildsystems/vcpkg.cmake

 //If this value is on, makefiles will be generated without the
 // .SILENT directive, and all commands will be echoed to the console
@ -465,15 +465,6 @@ _VCPKG_INSTALLED_DIR:PATH=/media/mht/ADATA/repos/cpp_tracker/build/vcpkg_install
 //Path to a library.
 c10_LIBRARY:FILEPATH=/home/mht/libtorch_1.8.0_cu111/libtorch/lib/libc10.so

-//Value Computed by CMake
-cpp_tracker_BINARY_DIR:STATIC=/media/mht/ADATA/repos/cpp_tracker/build
-
-//Value Computed by CMake
-cpp_tracker_IS_TOP_LEVEL:STATIC=ON
-
-//Value Computed by CMake
-cpp_tracker_SOURCE_DIR:STATIC=/media/mht/ADATA/repos/cpp_tracker
-

 ########################
 # INTERNAL cache entries
--- a/build/CMakeFiles/3.22.1/CMakeDetermineCompilerABI_CUDA.bin
+++ b/build/CMakeFiles/3.22.1/CMakeDetermineCompilerABI_CUDA.bin
--- a/build/CMakeFiles/CMakeOutput.log
+++ b/build/CMakeFiles/CMakeOutput.log
--- a/build/CMakeFiles/Makefile2
+++ b/build/CMakeFiles/Makefile2
@ -164,8 +164,8 @@ CMakeFiles/resnet.dir/clean:
 # Target rules for target CMakeFiles/tracking_demo.dir

 # All Build rule for target.
-CMakeFiles/tracking_demo.dir/all: CMakeFiles/bb_regressor.dir/all
 CMakeFiles/tracking_demo.dir/all: CMakeFiles/resnet.dir/all
+CMakeFiles/tracking_demo.dir/all: CMakeFiles/bb_regressor.dir/all
 CMakeFiles/tracking_demo.dir/all: CMakeFiles/classifier.dir/all
 	$(MAKE) $(MAKESILENT) -f CMakeFiles/tracking_demo.dir/build.make CMakeFiles/tracking_demo.dir/depend
 	$(MAKE) $(MAKESILENT) -f CMakeFiles/tracking_demo.dir/build.make CMakeFiles/tracking_demo.dir/build
@ -192,8 +192,8 @@ CMakeFiles/tracking_demo.dir/clean:
 # Target rules for target CMakeFiles/test_models.dir

 # All Build rule for target.
-CMakeFiles/test_models.dir/all: CMakeFiles/bb_regressor.dir/all
 CMakeFiles/test_models.dir/all: CMakeFiles/resnet.dir/all
+CMakeFiles/test_models.dir/all: CMakeFiles/bb_regressor.dir/all
 CMakeFiles/test_models.dir/all: CMakeFiles/classifier.dir/all
 	$(MAKE) $(MAKESILENT) -f CMakeFiles/test_models.dir/build.make CMakeFiles/test_models.dir/depend
 	$(MAKE) $(MAKESILENT) -f CMakeFiles/test_models.dir/build.make CMakeFiles/test_models.dir/build
--- a/build/CMakeFiles/bb_regressor.dir/cimp/bb_regressor/bb_regressor.cpp.o
+++ b/build/CMakeFiles/bb_regressor.dir/cimp/bb_regressor/bb_regressor.cpp.o
--- a/build/CMakeFiles/classifier.dir/cimp/classifier/classifier.cpp.o
+++ b/build/CMakeFiles/classifier.dir/cimp/classifier/classifier.cpp.o
--- a/build/install_manifest.txt
+++ b/build/install_manifest.txt
@ -1,2 +0,0 @@
-/media/mht/ADATA/repos/cpp_tracker/bin/tracking_demo
-/media/mht/ADATA/repos/cpp_tracker/bin/test_models
--- a/build/libbb_regressor.a
+++ b/build/libbb_regressor.a
--- a/build/libclassifier.a
+++ b/build/libclassifier.a
--- a/build/tracking_demo
+++ b/build/tracking_demo
--- a/cimp/bb_regressor/bb_regressor.cpp
+++ b/cimp/bb_regressor/bb_regressor.cpp
@ -507,49 +507,32 @@ void BBRegressor::to(torch::Device device) {
 }

 // Get IoU features from backbone features
-std::vector<torch::Tensor> BBRegressor::get_iou_feat(std::vector<torch::Tensor> feat2) {
-    // Convert to double precision for better numerical stability
-    auto feat2_double0 = feat2[0].to(torch::kFloat64);
-    auto feat2_double1 = feat2[1].to(torch::kFloat64);
+std::vector<torch::Tensor> BBRegressor::get_iou_feat(std::vector<torch::Tensor> feat2_input) {
+    torch::Tensor feat3_t_original = feat2_input[0];
+    torch::Tensor feat4_t_original = feat2_input[1];
    
    // Reshape exactly as in Python implementation
-    // In Python: feat2 = [f.reshape(-1, *f.shape[-3:]) if f.dim()==5 else f for f in feat2]
-    if (feat2_double0.dim() == 5) {
-        auto shape = feat2_double0.sizes();
-        feat2_double0 = feat2_double0.reshape({-1, shape[2], shape[3], shape[4]}).contiguous();
+    if (feat3_t_original.dim() == 5) {
+        auto shape = feat3_t_original.sizes();
+        feat3_t_original = feat3_t_original.reshape({-1, shape[2], shape[3], shape[4]});
    }
-    
-    if (feat2_double1.dim() == 5) {
-        auto shape = feat2_double1.sizes();
-        feat2_double1 = feat2_double1.reshape({-1, shape[2], shape[3], shape[4]}).contiguous();
+    if (feat4_t_original.dim() == 5) {
+        auto shape = feat4_t_original.sizes();
+        feat4_t_original = feat4_t_original.reshape({-1, shape[2], shape[3], shape[4]});
    }
    
-    // Convert back to float32 for convolution operations
-    feat2[0] = feat2_double0.to(torch::kFloat32).contiguous();
-    feat2[1] = feat2_double1.to(torch::kFloat32).contiguous();
-    
-    // Apply convolutions exactly as in Python
-    torch::Tensor feat3_t = feat2[0];
-    torch::Tensor feat4_t = feat2[1];
+    // Ensure inputs to conv are contiguous and kFloat32 (ResNet output should be float32)
+    torch::Tensor feat3_t = feat3_t_original.contiguous().to(torch::kFloat32);
+    torch::Tensor feat4_t = feat4_t_original.contiguous().to(torch::kFloat32);
    
-    // Ensure we're in evaluation mode
    torch::NoGradGuard no_grad;
    
-    // Apply convolutions just like Python version
    torch::Tensor c3_t_1 = conv3_1t->forward(feat3_t);
-    c3_t_1 = c3_t_1.contiguous();
-    
    torch::Tensor c3_t = conv3_2t->forward(c3_t_1);
-    c3_t = c3_t.contiguous();
-    
    torch::Tensor c4_t_1 = conv4_1t->forward(feat4_t);
-    c4_t_1 = c4_t_1.contiguous();
-    
    torch::Tensor c4_t = conv4_2t->forward(c4_t_1);
-    c4_t = c4_t.contiguous();
    
-    // Return results
-    return {c3_t, c4_t};
+    return {c3_t.contiguous(), c4_t.contiguous()}; // Ensure output is contiguous and float32
 }

 // Get modulation vectors for the target
--- a/cimp/classifier/classifier.h
+++ b/cimp/classifier/classifier.h
@ -69,6 +69,9 @@ private:
        torch::Tensor weight;
        InstanceL2Norm norm;
        
+        // Constructor to initialize norm with specific scale
+        FeatureExtractor() : norm(true, 1e-5f, 0.011048543456039804f) {}
+
        torch::Tensor forward(torch::Tensor x);
        torch::Tensor extract_feat(torch::Tensor x);
        void load_weights(const std::string& weights_dir, torch::Device device);
--- a/cimp/resnet/resnet.cpp
+++ b/cimp/resnet/resnet.cpp
@ -5,6 +5,7 @@
 #include <optional> // ensure this is included
 #include <fstream> // Added for std::ifstream
 #include <vector>  // Added for std::vector
+#include <iomanip> // Added for std::fixed and std::setprecision

 namespace cimp {
 namespace resnet {
@ -66,34 +67,37 @@ BottleneckImpl::BottleneckImpl(const std::string& base_weights_dir,

    // conv1
    conv1 = torch::nn::Conv2d(torch::nn::Conv2dOptions(inplanes, planes, 1).bias(false));
-    bn1 = torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(planes));
+    bn1 = torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(planes).eps(static_cast<float>(1e-5)).momentum(0.1).affine(true).track_running_stats(true));
    conv1->weight = load_named_tensor(base_weights_dir, block_param_prefix + "conv1.weight", device);
    bn1->weight = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.weight", device);
    bn1->bias = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.bias", device);
    bn1->named_buffers()["running_mean"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.running_mean", device);
    bn1->named_buffers()["running_var"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.running_var", device);
+    bn1->named_buffers()["num_batches_tracked"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn1.num_batches_tracked", device);
    register_module("conv1", conv1);
    register_module("bn1", bn1);

    // conv2
    conv2 = torch::nn::Conv2d(torch::nn::Conv2dOptions(planes, planes, 3).stride(stride_member).padding(1).bias(false));
-    bn2 = torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(planes));
+    bn2 = torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(planes).eps(static_cast<float>(1e-5)).momentum(0.1).affine(true).track_running_stats(true));
    conv2->weight = load_named_tensor(base_weights_dir, block_param_prefix + "conv2.weight", device);
    bn2->weight = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.weight", device);
    bn2->bias = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.bias", device);
    bn2->named_buffers()["running_mean"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.running_mean", device);
    bn2->named_buffers()["running_var"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.running_var", device);
+    bn2->named_buffers()["num_batches_tracked"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn2.num_batches_tracked", device);
    register_module("conv2", conv2);
    register_module("bn2", bn2);

    // conv3
    conv3 = torch::nn::Conv2d(torch::nn::Conv2dOptions(planes, planes * expansion_factor, 1).bias(false));
-    bn3 = torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(planes * expansion_factor));
+    bn3 = torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(planes * expansion_factor).eps(static_cast<float>(1e-5)).momentum(0.1).affine(true).track_running_stats(true));
    conv3->weight = load_named_tensor(base_weights_dir, block_param_prefix + "conv3.weight", device);
    bn3->weight = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.weight", device);
    bn3->bias = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.bias", device);
    bn3->named_buffers()["running_mean"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.running_mean", device);
    bn3->named_buffers()["running_var"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.running_var", device);
+    bn3->named_buffers()["num_batches_tracked"] = load_named_tensor(base_weights_dir, block_param_prefix + "bn3.num_batches_tracked", device);
    register_module("conv3", conv3);
    register_module("bn3", bn3);

@ -144,15 +148,21 @@ ResNetImpl::ResNetImpl(const std::string& base_weights_dir_path,
    : _output_layers(output_layers_param), _base_weights_dir(base_weights_dir_path) {

    conv1 = torch::nn::Conv2d(torch::nn::Conv2dOptions(3, 64, 7).stride(2).padding(3).bias(false));
-    bn1 = torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(64));
+    bn1 = torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(64).eps(static_cast<float>(1e-5)).momentum(0.1).affine(true).track_running_stats(true));
    this->conv1->weight = load_named_tensor(this->_base_weights_dir, "conv1.weight", device);
    this->bn1->weight = load_named_tensor(this->_base_weights_dir, "bn1.weight", device);
    this->bn1->bias = load_named_tensor(this->_base_weights_dir, "bn1.bias", device);
+    
    this->bn1->named_buffers()["running_mean"] = load_named_tensor(this->_base_weights_dir, "bn1.running_mean", device);
    this->bn1->named_buffers()["running_var"] = load_named_tensor(this->_base_weights_dir, "bn1.running_var", device);
+    
+    this->bn1->named_buffers()["num_batches_tracked"] = load_named_tensor(this->_base_weights_dir, "bn1.num_batches_tracked", device);
    register_module("conv1", conv1);
    register_module("bn1", bn1);

+    std::cout << "CPP ResNetImpl::bn1 running_mean sum: " << std::fixed << std::setprecision(10) << this->bn1->running_mean.sum().item<double>() << std::endl;
+    std::cout << "CPP ResNetImpl::bn1 running_var sum: " << std::fixed << std::setprecision(10) << this->bn1->running_var.sum().item<double>() << std::endl;
+
    relu = torch::nn::ReLU(torch::nn::ReLUOptions().inplace(true));
    maxpool = torch::nn::MaxPool2d(torch::nn::MaxPool2dOptions(3).stride(2).padding(1));
    register_module("relu", relu);
@ -178,7 +188,7 @@ torch::nn::Sequential ResNetImpl::_make_layer(int64_t planes_for_block, int64_t
    if (stride_for_first_block != 1 || this->inplanes != planes_for_block * ResNetImpl::expansion) {
        torch::nn::Sequential ds_seq;
        auto conv_down = torch::nn::Conv2d(torch::nn::Conv2dOptions(this->inplanes, planes_for_block * ResNetImpl::expansion, 1).stride(stride_for_first_block).bias(false));
-        auto bn_down = torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(planes_for_block * ResNetImpl::expansion));
+        auto bn_down = torch::nn::BatchNorm2d(torch::nn::BatchNorm2dOptions(planes_for_block * ResNetImpl::expansion).eps(static_cast<float>(1e-5)).momentum(0.1).affine(true).track_running_stats(true));
        
        std::string ds_block_prefix = layer_param_prefix + "0.downsample.";

@ -187,6 +197,7 @@ torch::nn::Sequential ResNetImpl::_make_layer(int64_t planes_for_block, int64_t
        bn_down->bias = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.bias", device);
        bn_down->named_buffers()["running_mean"] = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.running_mean", device);
        bn_down->named_buffers()["running_var"] = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.running_var", device);
+        bn_down->named_buffers()["num_batches_tracked"] = load_named_tensor(this->_base_weights_dir, ds_block_prefix + "1.num_batches_tracked", device);

        ds_seq->push_back(conv_down);
        ds_seq->push_back(bn_down);
@ -212,66 +223,78 @@ torch::nn::Sequential ResNetImpl::_make_layer(int64_t planes_for_block, int64_t

 std::map<std::string, torch::Tensor> ResNetImpl::forward(torch::Tensor x) {
    std::map<std::string, torch::Tensor> outputs;
+
    auto should_output = [&](const std::string& layer_name) {
        return std::find(_output_layers.begin(), _output_layers.end(), layer_name) != _output_layers.end();
    };

-    // Original GPU path for conv1
    x = conv1->forward(x);
    if (should_output("conv1_output")) outputs["conv1_output"] = x; 
    
-    x = bn1->forward(x.clone()); 
+    x = bn1->forward(x);
    if (should_output("bn1_output")) outputs["bn1_output"] = x; 
    
-    x = relu->forward(x.clone()); 
+    x = relu->forward(x); 
    if (should_output("relu1_output")) outputs["relu1_output"] = x; 
    
-    torch::Tensor x_pre_layer1 = maxpool->forward(x.clone()); 
+    torch::Tensor x_pre_layer1 = maxpool->forward(x);
    if (should_output("maxpool_output")) outputs["maxpool_output"] = x_pre_layer1;

-    // Pass x_pre_layer1 to layer1
-    torch::Tensor x_after_layer1 = layer1->forward(x_pre_layer1.clone()); // Use .clone() if layer1 might modify input inplace, good practice
+    // Save output of layer1.0 block if requested
+    if (should_output("layer1_0_block_output")) {
+        if (layer1 && !layer1->is_empty()) {
+            try {
+                // Get the base module pointer
+                std::shared_ptr<torch::nn::Module> base_module_ptr = layer1->ptr(0);
+                // Try to cast it to our BottleneckImpl (which is a torch::nn::Module)
+                auto bottleneck_impl_ptr = std::dynamic_pointer_cast<cimp::resnet::BottleneckImpl>(base_module_ptr);
+                
+                if (bottleneck_impl_ptr) {
+                    // Now call forward on the BottleneckImpl instance
+                    outputs["layer1_0_block_output"] = bottleneck_impl_ptr->forward(x_pre_layer1); 
+                } else {
+                    std::cerr << "ERROR: layer1->ptr(0) could not be dynamically cast to BottleneckImpl! Module type: " 
+                              << (base_module_ptr ? typeid(*base_module_ptr).name() : "null") << std::endl;
+                }
+            } catch (const std::exception& e) {
+                 std::cerr << "EXCEPTION while getting layer1_0_block_output: " << e.what() << std::endl;
+            }
+        }
+    }
+
+    torch::Tensor x_after_layer1 = layer1->forward(x_pre_layer1); 
    if (should_output("layer1")) outputs["layer1"] = x_after_layer1;

    if (should_output("layer1_0_shortcut_output")) {
        if (layer1 && !layer1->is_empty()) {
            try {
-                // Get the first module (Bottleneck) from layer1 Sequential container
                std::shared_ptr<torch::nn::Module> first_block_module_ptr = layer1->ptr(0);
-                // Attempt to dynamically cast to Bottleneck type
                auto bottleneck_module_holder = std::dynamic_pointer_cast<cimp::resnet::BottleneckImpl>(first_block_module_ptr);
                
-                if (bottleneck_module_holder) { // Check if cast was successful
-                    // Accessing projection_shortcut directly from BottleneckImpl
+                if (bottleneck_module_holder) { 
                    if (bottleneck_module_holder->projection_shortcut) {
-                        torch::Tensor shortcut_out = bottleneck_module_holder->projection_shortcut->forward(x_pre_layer1.clone());
+                        torch::Tensor shortcut_out = bottleneck_module_holder->projection_shortcut->forward(x_pre_layer1); 
                        outputs["layer1_0_shortcut_output"] = shortcut_out;
-                    } else {
-                        // std::cout << "DEBUG: layer1.0 projection_shortcut is null." << std::endl;
                    }
-                } else {
-                    // std::cerr << "ERROR: Failed to cast first block of layer1 to BottleneckImpl." << std::endl;
                }
            } catch (const std::exception& e) {
                // std::cerr << "ERROR: Exception while getting layer1_0_shortcut_output: " << e.what() << std::endl;
            }
-        } else {
-            // std::cout << "DEBUG: layer1 is null or empty, cannot get shortcut output." << std::endl;
        }
    }

-    torch::Tensor x_current = x_after_layer1; // Continue with the output of layer1
+    torch::Tensor x_current = x_after_layer1; 

-    x_current = layer2->forward(x_current.clone());
+    x_current = layer2->forward(x_current); 
    if (should_output("layer2")) outputs["layer2"] = x_current;

-    x_current = layer3->forward(x_current.clone());
+    x_current = layer3->forward(x_current); 
    if (should_output("layer3")) outputs["layer3"] = x_current;

-    x_current = layer4->forward(x_current.clone());
+    x_current = layer4->forward(x_current); 
    if (should_output("layer4")) outputs["layer4"] = x_current;
    
-    if (should_output("features")) outputs["features"] = x_current; // 'features' is typically layer4 output
+    if (should_output("features")) outputs["features"] = x_current;

    return outputs;
 }
--- a/ltr/models/bbreg/pycache/atom_iou_net.cpython-311.pyc
+++ b/ltr/models/bbreg/pycache/atom_iou_net.cpython-311.pyc
--- a/pytracking/features/pycache/net_wrappers.cpython-311.pyc
+++ b/pytracking/features/pycache/net_wrappers.cpython-311.pyc
--- a/test/compare_models.py
+++ b/test/compare_models.py
@ -98,6 +98,34 @@ class ComparisonRunner:
        if not os.path.exists(self.comparison_dir):
            os.makedirs(self.comparison_dir)

+        print("PYTHON: Attempting to load 'traced_resnet50.pth'...")
+        try:
+            self.models['ResNet'] = torch.jit.load('traced_resnet50.pth', map_location=self.device)
+            print("PYTHON: Successfully loaded 'traced_resnet50.pth'.")
+            self.models['ResNet'].eval()
+            print("PYTHON: ResNet JIT model set to eval().")
+        except Exception as e:
+            print(f"PYTHON: CRITICAL ERROR loading 'traced_resnet50.pth': {e}")
+            self.models['ResNet'] = None # Ensure it's None if loading failed
+
+        # Print sums of ResNet.bn1 running_mean and running_var from state_dict
+        print("PYTHON: Attempting to access ResNet state_dict (if model loaded)...")
+        if self.models.get('ResNet'):
+            try:
+                resnet_state_dict = self.models['ResNet'].state_dict()
+                print("PYTHON ResNet state_dict keys:", list(resnet_state_dict.keys())) # PRINT ALL KEYS
+                py_bn1_running_mean = resnet_state_dict.get('bn1.running_mean')
+                py_bn1_running_var = resnet_state_dict.get('bn1.running_var')
+                if py_bn1_running_mean is not None and py_bn1_running_var is not None:
+                    print(f"PYTHON ResNet.bn1 running_mean sum (from state_dict): {py_bn1_running_mean.sum().item():.10f}")
+                    print(f"PYTHON ResNet.bn1 running_var sum (from state_dict): {py_bn1_running_var.sum().item():.10f}")
+                else:
+                    print("PYTHON: ResNet.bn1 running_mean or running_var is None in state_dict.")
+            except Exception as e:
+                print(f"PYTHON: Error accessing ResNet.bn1 state_dict: {e}")
+
+        # TODO: load the other models here if necessary (e.g., BBRegressor, Classifier); nothing is loaded at this point yet
+
    def load_python_models(self):
        print("DEBUG: ComparisonRunner.load_python_models() ENTERED") # DEBUG PRINT
        """Initialize Python models"""
@ -301,6 +329,93 @@ class ComparisonRunner:
            print("  Skipping layer1.0 parameter comparison: ResNet model or its layer1 not found/empty.")
        print("--- END CURRENTLY USED layer1.0 PARAMS COMPARISON ---\n") # Corrected to \n

+        # --- START WEIGHT COMPARISON FOR layer1.1 and layer1.2 ---
+        for block_idx_in_layer1 in [1, 2]: # For layer1.1 and layer1.2
+            print(f"\n--- COMPARING CURRENTLY USED layer1.{block_idx_in_layer1} PARAMS (Python vs C++) ---")
+            layer1_block_prefix = f"layer1.{block_idx_in_layer1}."
+            # Components within a standard bottleneck block (no downsample for these)
+            block_components = {
+                "conv1": ["weight"],
+                "bn1": ["weight", "bias", "running_mean", "running_var", "num_batches_tracked"],
+                "conv2": ["weight"],
+                "bn2": ["weight", "bias", "running_mean", "running_var", "num_batches_tracked"],
+                "conv3": ["weight"],
+                "bn3": ["weight", "bias", "running_mean", "running_var", "num_batches_tracked"],
+            }
+
+            if self.models.get('ResNet') and hasattr(self.models['ResNet'], 'layer1') and len(self.models['ResNet'].layer1) > block_idx_in_layer1:
+                py_layer1_block_module = self.models['ResNet'].layer1[block_idx_in_layer1]
+                
+                for comp_name, param_list in block_components.items():
+                    py_comp_module = py_layer1_block_module
+                    try:
+                        # No nested modules like 'downsample' for these blocks
+                        py_comp_module = getattr(py_comp_module, comp_name)
+                    except AttributeError:
+                        print(f"  Python ResNet model's layer1.{block_idx_in_layer1} does not have component {comp_name}. Skipping.")
+                        continue
+
+                    for p_name in param_list:
+                        py_param_tensor_name = f"{layer1_block_prefix}{comp_name}.{p_name}"
+                        # C++ saves files like layer1_0_bn1_weight.pt or layer1_1_bn1_weight.pt
+                        cpp_param_filename = f"{layer1_block_prefix.replace('.', '_')}{comp_name.replace('.', '_')}_{p_name}.pt"
+                        
+                        py_param_tensor = None
+                        if hasattr(py_comp_module, p_name):
+                            param_tensor_val = getattr(py_comp_module, p_name)
+                            if param_tensor_val is not None:
+                                py_param_tensor = param_tensor_val.detach().cpu()
+                                print(f"  Python ResNet {py_param_tensor_name} shape: {py_param_tensor.shape}")
+                            else:
+                                print(f"  Python ResNet {py_param_tensor_name} is None.")
+                        elif p_name == "num_batches_tracked" and isinstance(py_comp_module, torch.nn.BatchNorm2d):
+                            # Fallback only: num_batches_tracked is a registered buffer, and nn.Module also exposes buffers via attribute access, so the hasattr() branch above normally handles it
+                            if py_comp_module.num_batches_tracked is not None:
+                                py_param_tensor = py_comp_module.num_batches_tracked.detach().cpu()
+                                print(f"  Python ResNet {py_param_tensor_name} (from buffer) shape: {py_param_tensor.shape}")
+                            else:
+                                print(f"  Python ResNet {py_param_tensor_name} (from buffer) is None.")
+                        else:
+                            print(f"  Python ResNet module {comp_name} does not have param/buffer {p_name}.")
+
+                        cpp_param_path = os.path.join(self.root_dir, "exported_weights/backbone_regenerated", cpp_param_filename)
+                        cpp_param_tensor = None
+                        if os.path.exists(cpp_param_path):
+                            try:
+                                cpp_param_tensor = torch.load(cpp_param_path, map_location='cpu', weights_only=False)
+                            except Exception as e:
+                                print(f"    Error loading C++ {cpp_param_filename} from {cpp_param_path}: {e}")
+                        else:
+                            print(f"    Warning: C++ {cpp_param_filename} file not found: {cpp_param_path}")
+                        
+                        print(f"  Comparison for {py_param_tensor_name} vs {cpp_param_filename}:")
+                        if py_param_tensor is not None and cpp_param_tensor is not None:
+                            if isinstance(py_param_tensor, torch.Tensor) and isinstance(cpp_param_tensor, torch.Tensor):
+                                # Ensure tensors are float for allclose if one is int (e.g. num_batches_tracked)
+                                py_param_tensor_float = py_param_tensor.float()
+                                cpp_param_tensor_float = cpp_param_tensor.float()
+                                all_close = torch.allclose(py_param_tensor_float, cpp_param_tensor_float)
+                                print(f"    torch.allclose: {all_close}")
+                                if not all_close:
+                                    abs_diff = torch.abs(py_param_tensor_float - cpp_param_tensor_float)
+                                    mae = torch.mean(abs_diff).item()
+                                    max_abs_err = torch.max(abs_diff).item()
+                                    print(f"      MAE (Weight/Buffer): {mae:.4e}")
+                                    print(f"      Max Abs Err (Weight/Buffer): {max_abs_err:.4e}")
+                                    # Also print L2 norms for context
+                                    l2_py = torch.linalg.norm(py_param_tensor_float.flatten()).item()
+                                    l2_cpp = torch.linalg.norm(cpp_param_tensor_float.flatten()).item()
+                                    print(f"      L2 Norm Python: {l2_py:.4e}")
+                                    print(f"      L2 Norm C++: {l2_cpp:.4e}")
+                            else:
+                                print(f"    Skipping comparison due to type mismatch after loading for {py_param_tensor_name}.")
+                        else:
+                            print(f"    Skipping comparison because one or both tensors could not be obtained for {py_param_tensor_name}.")
+            else:
+                print(f"  Skipping layer1.{block_idx_in_layer1} parameter comparison: ResNet model or its layer1 not found/long enough.")
+            print(f"--- END CURRENTLY USED layer1.{block_idx_in_layer1} PARAMS COMPARISON ---\n")
+        # --- END WEIGHT COMPARISON FOR layer1.1 and layer1.2 ---
+
         # --- END TEMPORARY WEIGHT COMPARISON --- # This marker is now after the layer1.0, layer1.1 and layer1.2 checks

        print("\n--- Types at END of load_python_models: ---")
@ -524,14 +639,32 @@ class ComparisonRunner:
        if 'Classifier' in self.models: print(f"  self.models['Classifier'] type: {type(self.models['Classifier'])}")
        if 'BBRegressor' in self.models: print(f"  self.models['BBRegressor'] type: {type(self.models['BBRegressor'])}")

-        print("\nComparing ResNet outputs...")
        py_input_common_dir = os.path.join(self.root_dir, 'test', 'input_samples', 'common')
        cpp_output_resnet_dir = os.path.join(self.cpp_output_dir, 'resnet')
+        # Lazily define self.py_resnet_output_dir (and create the directory) if it has not already been set up elsewhere, e.g. in __init__
+        if not hasattr(self, 'py_resnet_output_dir') or not self.py_resnet_output_dir:
+            self.py_resnet_output_dir = Path(self.python_output_dir) / 'resnet'
+            self.py_resnet_output_dir.mkdir(parents=True, exist_ok=True)

-        # Convert to Path objects for exists() check, though os.path.exists also works with strings
+        # Define Path objects for directory checks
        py_input_common_dir_path = Path(py_input_common_dir)
        cpp_output_resnet_dir_path = Path(cpp_output_resnet_dir)

+        comparison_configs = [
+            ("ResNet Conv1 Output (Pre-BN)", "_conv1_output_py.pt", "_conv1_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
+            ("ResNet Conv1", "_conv1_output.pt", "_conv1_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir), # Assumes Py also saved conv1 output if it was meant to be same as C++ pre-bn
+            ("ResNet BN1", "_bn1_output.pt", "_bn1_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
+            ("ResNet ReLU1", "_relu1_output.pt", "_relu1_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
+            ("ResNet MaxPool", "_maxpool_output.pt", "_maxpool_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
+            ("ResNet Layer1.0 Block Output", "_layer1_0_block_output.pt", "_layer1_0_block_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
+            ("ResNet Layer1.0 Shortcut Output", "_layer1_0_shortcut_output.pt", "_layer1_0_shortcut_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
+            ("ResNet Layer1", "_layer1_output.pt", "_layer1_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
+            ("ResNet Layer2", "_layer2_output.pt", "_layer2_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
+            ("ResNet Layer3", "_layer3_output.pt", "_layer3_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
+            ("ResNet Layer4", "_layer4_output.pt", "_layer4_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir),
+            ("ResNet Features", "_features_output.pt", "_features_output.pt", self.py_resnet_output_dir, cpp_output_resnet_dir) 
+        ]
+
        if not py_input_common_dir_path.exists() or not cpp_output_resnet_dir_path.exists():
            print(f"ResNet input ({py_input_common_dir_path}) or C++ ResNet output dir ({cpp_output_resnet_dir_path}) not found. Skipping ResNet comparison.")
            # Populate NaN for all expected ResNet comparisons if dirs are missing
@ -584,14 +717,45 @@ class ComparisonRunner:
                    with torch.no_grad():
                        py_model_resnet = self.models.get('ResNet')
                        if py_model_resnet:
-                            # Original GPU path for all Python ResNet layers
-                            current_features = preprocessed_py_image_for_conv1 # Start with preprocessed image
+                            current_features = preprocessed_py_image_for_conv1

                            py_conv1_out = py_model_resnet.conv1(current_features)
-                            py_bn1_out = py_model_resnet.bn1(py_conv1_out.clone()) 
-                            py_relu1_out = py_model_resnet.relu(py_bn1_out.clone()) 
-                            py_maxpool_out = py_model_resnet.maxpool(py_relu1_out.clone()) 
-                            x_for_py_layer1_input = py_maxpool_out # This is the input to layer1 block
+                            # Lazily create self.py_resnet_output_dir (as a Path) when it is missing or empty
+                            if not hasattr(self, 'py_resnet_output_dir') or not self.py_resnet_output_dir:
+                                self.py_resnet_output_dir = Path(self.python_output_dir) / 'resnet'
+                                self.py_resnet_output_dir.mkdir(parents=True, exist_ok=True)
+                            py_conv1_out_path = self.py_resnet_output_dir / f'sample_{i}_conv1_output_py.pt'
+                            torch.save(py_conv1_out.cpu(), str(py_conv1_out_path))
+
+                            # --- BN1 (Python): applied directly to the conv1 output; no CPU transfer happens here ---
+                            py_bn1_out = py_model_resnet.bn1(py_conv1_out)  # Original line
+
+                            py_relu1_out = py_model_resnet.relu(py_bn1_out) 
+                            py_maxpool_out = py_model_resnet.maxpool(py_relu1_out) 
+                            x_for_py_layer1_input = py_maxpool_out
+
+                            # Output of the first bottleneck block in layer1
+                            py_layer1_0_block_out_tensor = None # Initialize to avoid ref before assignment if try fails
+                            if hasattr(py_model_resnet, 'layer1') and len(py_model_resnet.layer1) > 0:
+                                try:
+                                    py_layer1_0_block_out_tensor = py_model_resnet.layer1[0](x_for_py_layer1_input) # REMOVED .clone() for consistency with best Layer1.0 result
+                                    # The save directory is derived locally instead of relying on cpp_resnet_sample_dir,
+                                    # which may not be defined at this point. Elsewhere the C++ ResNet directory is built as
+                                    #     cpp_resnet_dir = Path(self.cpp_output_dir) / 'resnet'
+                                    # and no sample-specific subdirectories are used for this output.
+                                    # The string form used later, os.path.join(self.cpp_output_dir, 'resnet'), does not
+                                    # support the `/` operator, so a Path is built here, matching the later
+                                    #     cpp_output_resnet_dir_path = Path(self.cpp_output_dir) / 'resnet'
+                                    # NOTE(review): this saves the Python tensor under the C++ output directory - confirm intended.
+
+                                    current_cpp_resnet_dir = Path(self.cpp_output_dir) / 'resnet' # Define it based on existing patterns
+                                    current_cpp_resnet_dir.mkdir(parents=True, exist_ok=True) # Ensure directory exists
+
+                                    py_layer1_0_block_save_path = current_cpp_resnet_dir / f'sample_{i}_layer1_0_block_output.pt'
+                                    torch.save(py_layer1_0_block_out_tensor.cpu(), str(py_layer1_0_block_save_path))
+                                    # print(f"DEBUG: Saved Python layer1[0] block output for sample {i} to {py_layer1_0_block_save_path}")
+                                except Exception as e_block:
+                                    print(f"ERROR: Failed to get/save Python layer1[0] block output for sample {i}: {e_block}")

                            # Shortcut for layer1.0 (if exists)
                            if hasattr(py_model_resnet, 'layer1') and len(py_model_resnet.layer1) > 0 and \
@ -646,6 +810,7 @@ class ComparisonRunner:
            cpp_conv1_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_conv1_output.pt') # ADDED
            cpp_bn1_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_bn1_output.pt') # ADDED
            cpp_relu1_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_relu1_output.pt') # ADDED
+            cpp_layer1_0_block_output_path = os.path.join(cpp_output_resnet_dir, f'sample_{i}_layer1_0_block_output.pt') # ADDED

            cpp_layer1_out = self.load_cpp_tensor(cpp_layer1_path, self.device)
            cpp_layer2_out = self.load_cpp_tensor(cpp_layer2_path, self.device)
@ -657,17 +822,33 @@ class ComparisonRunner:
            cpp_conv1_out = self.load_cpp_tensor(cpp_conv1_path, self.device) # ADDED
            cpp_bn1_out = self.load_cpp_tensor(cpp_bn1_path, self.device) # ADDED
            cpp_relu1_out = self.load_cpp_tensor(cpp_relu1_path, self.device) # ADDED
+            cpp_layer1_0_block_output_tensor = self.load_cpp_tensor(cpp_layer1_0_block_output_path, self.device) # ADDED
+
+            # Load the Python pre-BN conv1 output that was saved earlier
+            py_conv1_out_pre_bn_tensor = None
+            # Ensure self.py_resnet_output_dir is defined (it should be if the save operation worked)
+            if hasattr(self, 'py_resnet_output_dir') and self.py_resnet_output_dir:
+                py_conv1_out_pre_bn_path = self.py_resnet_output_dir / f'sample_{i}_conv1_output_py.pt'
+                if py_conv1_out_pre_bn_path.exists():
+                    try:
+                        py_conv1_out_pre_bn_tensor = torch.load(str(py_conv1_out_pre_bn_path), map_location=self.device)
+                    except Exception as e_load_py_conv1:
+                        print(f"Error loading Python conv1_output_py (pre-BN) for sample {i}: {e_load_py_conv1}")
+            else:
+                print(f"Warning: self.py_resnet_output_dir not defined, cannot load py_conv1_output_py.pt for sample {i}")

-            self._compare_tensor_data(py_conv1_out, cpp_conv1_out, "ResNet Conv1", i, current_errors) # REVERTED to py_conv1_out
-            self._compare_tensor_data(py_bn1_out, cpp_bn1_out, "ResNet BN1", i, current_errors) 
-            self._compare_tensor_data(py_relu1_out, cpp_relu1_out, "ResNet ReLU1", i, current_errors) 
+            # Comparisons
+            self._compare_tensor_data(py_conv1_out_pre_bn_tensor, cpp_conv1_out, "ResNet Conv1 Output (Pre-BN)", i, current_errors)
+            self._compare_tensor_data(py_conv1_out, cpp_conv1_out, "ResNet Conv1", i, current_errors)
+            self._compare_tensor_data(py_bn1_out, cpp_bn1_out, "ResNet BN1", i, current_errors)
+            self._compare_tensor_data(py_relu1_out, cpp_relu1_out, "ResNet ReLU1", i, current_errors)
            self._compare_tensor_data(py_maxpool_out, cpp_maxpool_out, "ResNet MaxPool", i, current_errors)
            self._compare_tensor_data(py_layer1_out, cpp_layer1_out, "ResNet Layer1", i, current_errors)
-            self._compare_tensor_data(py_layer1_0_shortcut_out, cpp_layer1_0_shortcut_out, "ResNet Layer1.0 Shortcut", i, current_errors)
            self._compare_tensor_data(py_layer2_out, cpp_layer2_out, "ResNet Layer2", i, current_errors)
            self._compare_tensor_data(py_layer3_out, cpp_layer3_out, "ResNet Layer3", i, current_errors)
            self._compare_tensor_data(py_layer4_out, cpp_layer4_out, "ResNet Layer4", i, current_errors)
            self._compare_tensor_data(py_features_out, cpp_features_out, "ResNet Features", i, current_errors)
+            self._compare_tensor_data(py_layer1_0_shortcut_out, cpp_layer1_0_shortcut_out, "ResNet Layer1.0 Shortcut", i, current_errors)

            if current_errors: self.all_comparison_stats[f"ResNet_Sample_{i}"] = current_errors

--- a/test/output_py/bb_regressor/sample_0_debug_c3_t_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_0_debug_c3_t_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_0_debug_c4_t_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_0_debug_c4_t_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_0_debug_roi2_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_0_debug_roi2_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_0_fc3_rt_out_py.pt
+++ b/test/output_py/bb_regressor/sample_0_fc3_rt_out_py.pt
--- a/test/output_py/bb_regressor/sample_0_fc4_rt_out_py.pt
+++ b/test/output_py/bb_regressor/sample_0_fc4_rt_out_py.pt
--- a/test/output_py/bb_regressor/sample_0_feat_prod0_py.pt
+++ b/test/output_py/bb_regressor/sample_0_feat_prod0_py.pt
--- a/test/output_py/bb_regressor/sample_0_feat_prod1_py.pt
+++ b/test/output_py/bb_regressor/sample_0_feat_prod1_py.pt
--- a/test/output_py/bb_regressor/sample_0_iou_feat0_py.pt
+++ b/test/output_py/bb_regressor/sample_0_iou_feat0_py.pt
--- a/test/output_py/bb_regressor/sample_0_iou_feat1_py.pt
+++ b/test/output_py/bb_regressor/sample_0_iou_feat1_py.pt
--- a/test/output_py/bb_regressor/sample_0_iou_feat_final_py.pt
+++ b/test/output_py/bb_regressor/sample_0_iou_feat_final_py.pt
--- a/test/output_py/bb_regressor/sample_0_iou_scores_py.pt
+++ b/test/output_py/bb_regressor/sample_0_iou_scores_py.pt
--- a/test/output_py/bb_regressor/sample_0_mod0_py.pt
+++ b/test/output_py/bb_regressor/sample_0_mod0_py.pt
--- a/test/output_py/bb_regressor/sample_0_mod1_py.pt
+++ b/test/output_py/bb_regressor/sample_0_mod1_py.pt
--- a/test/output_py/bb_regressor/sample_0_mod_vec0_py.pt
+++ b/test/output_py/bb_regressor/sample_0_mod_vec0_py.pt
--- a/test/output_py/bb_regressor/sample_0_mod_vec1_py.pt
+++ b/test/output_py/bb_regressor/sample_0_mod_vec1_py.pt
--- a/test/output_py/bb_regressor/sample_0_pooled_feat0_py.pt
+++ b/test/output_py/bb_regressor/sample_0_pooled_feat0_py.pt
--- a/test/output_py/bb_regressor/sample_0_pooled_feat1_py.pt
+++ b/test/output_py/bb_regressor/sample_0_pooled_feat1_py.pt
--- a/test/output_py/bb_regressor/sample_10_debug_c3_t_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_10_debug_c3_t_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_10_debug_c4_t_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_10_debug_c4_t_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_10_debug_roi2_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_10_debug_roi2_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_10_fc3_rt_out_py.pt
+++ b/test/output_py/bb_regressor/sample_10_fc3_rt_out_py.pt
--- a/test/output_py/bb_regressor/sample_10_fc4_rt_out_py.pt
+++ b/test/output_py/bb_regressor/sample_10_fc4_rt_out_py.pt
--- a/test/output_py/bb_regressor/sample_10_feat_prod0_py.pt
+++ b/test/output_py/bb_regressor/sample_10_feat_prod0_py.pt
--- a/test/output_py/bb_regressor/sample_10_feat_prod1_py.pt
+++ b/test/output_py/bb_regressor/sample_10_feat_prod1_py.pt
--- a/test/output_py/bb_regressor/sample_10_iou_feat0_py.pt
+++ b/test/output_py/bb_regressor/sample_10_iou_feat0_py.pt
--- a/test/output_py/bb_regressor/sample_10_iou_feat1_py.pt
+++ b/test/output_py/bb_regressor/sample_10_iou_feat1_py.pt
--- a/test/output_py/bb_regressor/sample_10_iou_feat_final_py.pt
+++ b/test/output_py/bb_regressor/sample_10_iou_feat_final_py.pt
--- a/test/output_py/bb_regressor/sample_10_iou_scores_py.pt
+++ b/test/output_py/bb_regressor/sample_10_iou_scores_py.pt
--- a/test/output_py/bb_regressor/sample_10_mod0_py.pt
+++ b/test/output_py/bb_regressor/sample_10_mod0_py.pt
--- a/test/output_py/bb_regressor/sample_10_mod1_py.pt
+++ b/test/output_py/bb_regressor/sample_10_mod1_py.pt
--- a/test/output_py/bb_regressor/sample_10_pooled_feat0_py.pt
+++ b/test/output_py/bb_regressor/sample_10_pooled_feat0_py.pt
--- a/test/output_py/bb_regressor/sample_10_pooled_feat1_py.pt
+++ b/test/output_py/bb_regressor/sample_10_pooled_feat1_py.pt
--- a/test/output_py/bb_regressor/sample_11_debug_c3_t_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_11_debug_c3_t_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_11_debug_c4_t_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_11_debug_c4_t_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_11_debug_roi2_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_11_debug_roi2_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_11_fc3_rt_out_py.pt
+++ b/test/output_py/bb_regressor/sample_11_fc3_rt_out_py.pt
--- a/test/output_py/bb_regressor/sample_11_fc4_rt_out_py.pt
+++ b/test/output_py/bb_regressor/sample_11_fc4_rt_out_py.pt
--- a/test/output_py/bb_regressor/sample_11_feat_prod0_py.pt
+++ b/test/output_py/bb_regressor/sample_11_feat_prod0_py.pt
--- a/test/output_py/bb_regressor/sample_11_feat_prod1_py.pt
+++ b/test/output_py/bb_regressor/sample_11_feat_prod1_py.pt
--- a/test/output_py/bb_regressor/sample_11_iou_feat0_py.pt
+++ b/test/output_py/bb_regressor/sample_11_iou_feat0_py.pt
--- a/test/output_py/bb_regressor/sample_11_iou_feat1_py.pt
+++ b/test/output_py/bb_regressor/sample_11_iou_feat1_py.pt
--- a/test/output_py/bb_regressor/sample_11_iou_feat_final_py.pt
+++ b/test/output_py/bb_regressor/sample_11_iou_feat_final_py.pt
--- a/test/output_py/bb_regressor/sample_11_iou_scores_py.pt
+++ b/test/output_py/bb_regressor/sample_11_iou_scores_py.pt
--- a/test/output_py/bb_regressor/sample_11_mod0_py.pt
+++ b/test/output_py/bb_regressor/sample_11_mod0_py.pt
--- a/test/output_py/bb_regressor/sample_11_mod1_py.pt
+++ b/test/output_py/bb_regressor/sample_11_mod1_py.pt
--- a/test/output_py/bb_regressor/sample_11_pooled_feat0_py.pt
+++ b/test/output_py/bb_regressor/sample_11_pooled_feat0_py.pt
--- a/test/output_py/bb_regressor/sample_11_pooled_feat1_py.pt
+++ b/test/output_py/bb_regressor/sample_11_pooled_feat1_py.pt
--- a/test/output_py/bb_regressor/sample_12_debug_c3_t_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_12_debug_c3_t_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_12_debug_c4_t_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_12_debug_c4_t_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_12_debug_roi2_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_12_debug_roi2_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_12_fc3_rt_out_py.pt
+++ b/test/output_py/bb_regressor/sample_12_fc3_rt_out_py.pt
--- a/test/output_py/bb_regressor/sample_12_fc4_rt_out_py.pt
+++ b/test/output_py/bb_regressor/sample_12_fc4_rt_out_py.pt
--- a/test/output_py/bb_regressor/sample_12_feat_prod0_py.pt
+++ b/test/output_py/bb_regressor/sample_12_feat_prod0_py.pt
--- a/test/output_py/bb_regressor/sample_12_feat_prod1_py.pt
+++ b/test/output_py/bb_regressor/sample_12_feat_prod1_py.pt
--- a/test/output_py/bb_regressor/sample_12_iou_feat0_py.pt
+++ b/test/output_py/bb_regressor/sample_12_iou_feat0_py.pt
--- a/test/output_py/bb_regressor/sample_12_iou_feat1_py.pt
+++ b/test/output_py/bb_regressor/sample_12_iou_feat1_py.pt
--- a/test/output_py/bb_regressor/sample_12_iou_feat_final_py.pt
+++ b/test/output_py/bb_regressor/sample_12_iou_feat_final_py.pt
--- a/test/output_py/bb_regressor/sample_12_iou_scores_py.pt
+++ b/test/output_py/bb_regressor/sample_12_iou_scores_py.pt
--- a/test/output_py/bb_regressor/sample_12_mod0_py.pt
+++ b/test/output_py/bb_regressor/sample_12_mod0_py.pt
--- a/test/output_py/bb_regressor/sample_12_mod1_py.pt
+++ b/test/output_py/bb_regressor/sample_12_mod1_py.pt
--- a/test/output_py/bb_regressor/sample_12_pooled_feat0_py.pt
+++ b/test/output_py/bb_regressor/sample_12_pooled_feat0_py.pt
--- a/test/output_py/bb_regressor/sample_12_pooled_feat1_py.pt
+++ b/test/output_py/bb_regressor/sample_12_pooled_feat1_py.pt
--- a/test/output_py/bb_regressor/sample_13_debug_c3_t_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_13_debug_c3_t_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_13_debug_c4_t_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_13_debug_c4_t_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_13_debug_roi2_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_13_debug_roi2_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_13_fc3_rt_out_py.pt
+++ b/test/output_py/bb_regressor/sample_13_fc3_rt_out_py.pt
--- a/test/output_py/bb_regressor/sample_13_fc4_rt_out_py.pt
+++ b/test/output_py/bb_regressor/sample_13_fc4_rt_out_py.pt
--- a/test/output_py/bb_regressor/sample_13_feat_prod0_py.pt
+++ b/test/output_py/bb_regressor/sample_13_feat_prod0_py.pt
--- a/test/output_py/bb_regressor/sample_13_feat_prod1_py.pt
+++ b/test/output_py/bb_regressor/sample_13_feat_prod1_py.pt
--- a/test/output_py/bb_regressor/sample_13_iou_feat0_py.pt
+++ b/test/output_py/bb_regressor/sample_13_iou_feat0_py.pt
--- a/test/output_py/bb_regressor/sample_13_iou_feat1_py.pt
+++ b/test/output_py/bb_regressor/sample_13_iou_feat1_py.pt
--- a/test/output_py/bb_regressor/sample_13_iou_feat_final_py.pt
+++ b/test/output_py/bb_regressor/sample_13_iou_feat_final_py.pt
--- a/test/output_py/bb_regressor/sample_13_iou_scores_py.pt
+++ b/test/output_py/bb_regressor/sample_13_iou_scores_py.pt
--- a/test/output_py/bb_regressor/sample_13_mod0_py.pt
+++ b/test/output_py/bb_regressor/sample_13_mod0_py.pt
--- a/test/output_py/bb_regressor/sample_13_mod1_py.pt
+++ b/test/output_py/bb_regressor/sample_13_mod1_py.pt
--- a/test/output_py/bb_regressor/sample_13_pooled_feat0_py.pt
+++ b/test/output_py/bb_regressor/sample_13_pooled_feat0_py.pt
--- a/test/output_py/bb_regressor/sample_13_pooled_feat1_py.pt
+++ b/test/output_py/bb_regressor/sample_13_pooled_feat1_py.pt
--- a/test/output_py/bb_regressor/sample_14_debug_c3_t_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_14_debug_c3_t_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_14_debug_c4_t_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_14_debug_c4_t_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_14_debug_roi2_prepool_py.pt
+++ b/test/output_py/bb_regressor/sample_14_debug_roi2_prepool_py.pt
--- a/test/output_py/bb_regressor/sample_14_fc3_rt_out_py.pt
+++ b/test/output_py/bb_regressor/sample_14_fc3_rt_out_py.pt
--- a/test/output_py/bb_regressor/sample_14_fc4_rt_out_py.pt
+++ b/test/output_py/bb_regressor/sample_14_fc4_rt_out_py.pt
--- a/test/output_py/bb_regressor/sample_14_feat_prod0_py.pt
+++ b/test/output_py/bb_regressor/sample_14_feat_prod0_py.pt
--- a/test/output_py/bb_regressor/sample_14_feat_prod1_py.pt
+++ b/test/output_py/bb_regressor/sample_14_feat_prod1_py.pt