diff --git a/cimp/bb_regressor/bb_regressor.cpp b/cimp/bb_regressor/bb_regressor.cpp
index e711946..74a5e2b 100644
--- a/cimp/bb_regressor/bb_regressor.cpp
+++ b/cimp/bb_regressor/bb_regressor.cpp
@@ -8,10 +8,13 @@
 // Add CUDA includes for required CUDA implementation
 #include <cuda_runtime.h>
 #include <ATen/cuda/CUDAContext.h>
+#include <sys/stat.h>
+#include <sys/types.h>
 
 // Use the PrRoIPooling implementation
 #include "prroi_pooling_gpu.h"
 #include "prroi_pooling_gpu_impl.cuh"
+#include "utils.h"
 
 // PrRoIPool2D implementation (requires CUDA)
 PrRoIPool2D::PrRoIPool2D(int pooled_height, int pooled_width, float spatial_scale) 
@@ -507,7 +510,8 @@ void BBRegressor::to(torch::Device device) {
 }
 
 // Get IoU features from backbone features
-std::vector<torch::Tensor> BBRegressor::get_iou_feat(std::vector<torch::Tensor> feat2_input) {
+std::vector<torch::Tensor> BBRegressor::get_iou_feat(std::vector<torch::Tensor> feat2_input, int sample_idx) {
+    std::cout << "[DEBUG] Entered get_iou_feat with sample_idx=" << sample_idx << std::endl;
     torch::Tensor feat3_t_original = feat2_input[0];
     torch::Tensor feat4_t_original = feat2_input[1];
     
@@ -527,11 +531,62 @@ std::vector<torch::Tensor> BBRegressor::get_iou_feat(std::vector<torch::Tensor>
     
     torch::NoGradGuard no_grad;
     
-    torch::Tensor c3_t_1 = conv3_1t->forward(feat3_t);
-    torch::Tensor c3_t = conv3_2t->forward(c3_t_1);
-    torch::Tensor c4_t_1 = conv4_1t->forward(feat4_t);
-    torch::Tensor c4_t = conv4_2t->forward(c4_t_1);
-    
+    // Ensure debug directory exists for sample 0
+    if (sample_idx == 0) {
+        const char* debug_dir = "test/output/bb_regressor";
+        struct stat st = {0};
+        if (stat(debug_dir, &st) == -1) {
+            mkdir(debug_dir, 0777);
+        }
+    }
+    // conv3_1t
+    auto c3_1t_conv = conv3_1t[0]->as<torch::nn::Conv2d>()->forward(feat3_t);
+    auto c3_1t_bn = conv3_1t[1]->as<torch::nn::BatchNorm2d>()->forward(c3_1t_conv);
+    auto c3_1t_relu = conv3_1t[2]->as<torch::nn::ReLU>()->forward(c3_1t_bn);
+    if (sample_idx == 0) {
+        std::cout << "[DEBUG] About to save debug tensors for sample_idx == 0" << std::endl;
+        save_tensor_to_file(c3_1t_bn.cpu(), "test/output/bb_regressor/sample_0_debug_conv3_1t_bn.pt");
+        save_tensor_to_file(c3_1t_relu.cpu(), "test/output/bb_regressor/sample_0_debug_conv3_1t_relu.pt");
+        std::cout << "conv3_1t_bn: dtype=" << c3_1t_bn.dtype() << ", device=" << c3_1t_bn.device() << ", shape=" << c3_1t_bn.sizes() << std::endl;
+        std::cout << "conv3_1t_relu: dtype=" << c3_1t_relu.dtype() << ", device=" << c3_1t_relu.device() << ", shape=" << c3_1t_relu.sizes() << std::endl;
+    }
+    auto c3_t_1 = c3_1t_relu;
+    // conv3_2t
+    auto c3_2t_conv = conv3_2t[0]->as<torch::nn::Conv2d>()->forward(c3_t_1);
+    auto c3_2t_bn = conv3_2t[1]->as<torch::nn::BatchNorm2d>()->forward(c3_2t_conv);
+    auto c3_2t_relu = conv3_2t[2]->as<torch::nn::ReLU>()->forward(c3_2t_bn);
+    if (sample_idx == 0) {
+        std::cout << "[DEBUG] About to save debug tensors for conv3_2t, sample_idx == 0" << std::endl;
+        save_tensor_to_file(c3_2t_bn.cpu(), "test/output/bb_regressor/sample_0_debug_conv3_2t_bn.pt");
+        save_tensor_to_file(c3_2t_relu.cpu(), "test/output/bb_regressor/sample_0_debug_conv3_2t_relu.pt");
+        std::cout << "conv3_2t_bn: dtype=" << c3_2t_bn.dtype() << ", device=" << c3_2t_bn.device() << ", shape=" << c3_2t_bn.sizes() << std::endl;
+        std::cout << "conv3_2t_relu: dtype=" << c3_2t_relu.dtype() << ", device=" << c3_2t_relu.device() << ", shape=" << c3_2t_relu.sizes() << std::endl;
+    }
+    auto c3_t = c3_2t_relu;
+    // conv4_1t
+    auto c4_1t_conv = conv4_1t[0]->as<torch::nn::Conv2d>()->forward(feat4_t);
+    auto c4_1t_bn = conv4_1t[1]->as<torch::nn::BatchNorm2d>()->forward(c4_1t_conv);
+    auto c4_1t_relu = conv4_1t[2]->as<torch::nn::ReLU>()->forward(c4_1t_bn);
+    if (sample_idx == 0) {
+        std::cout << "[DEBUG] About to save debug tensors for conv4_1t, sample_idx == 0" << std::endl;
+        save_tensor_to_file(c4_1t_bn.cpu(), "test/output/bb_regressor/sample_0_debug_conv4_1t_bn.pt");
+        save_tensor_to_file(c4_1t_relu.cpu(), "test/output/bb_regressor/sample_0_debug_conv4_1t_relu.pt");
+        std::cout << "conv4_1t_bn: dtype=" << c4_1t_bn.dtype() << ", device=" << c4_1t_bn.device() << ", shape=" << c4_1t_bn.sizes() << std::endl;
+        std::cout << "conv4_1t_relu: dtype=" << c4_1t_relu.dtype() << ", device=" << c4_1t_relu.device() << ", shape=" << c4_1t_relu.sizes() << std::endl;
+    }
+    auto c4_t_1 = c4_1t_relu;
+    // conv4_2t
+    auto c4_2t_conv = conv4_2t[0]->as<torch::nn::Conv2d>()->forward(c4_t_1);
+    auto c4_2t_bn = conv4_2t[1]->as<torch::nn::BatchNorm2d>()->forward(c4_2t_conv);
+    auto c4_2t_relu = conv4_2t[2]->as<torch::nn::ReLU>()->forward(c4_2t_bn);
+    if (sample_idx == 0) {
+        std::cout << "[DEBUG] About to save debug tensors for conv4_2t, sample_idx == 0" << std::endl;
+        save_tensor_to_file(c4_2t_bn.cpu(), "test/output/bb_regressor/sample_0_debug_conv4_2t_bn.pt");
+        save_tensor_to_file(c4_2t_relu.cpu(), "test/output/bb_regressor/sample_0_debug_conv4_2t_relu.pt");
+        std::cout << "conv4_2t_bn: dtype=" << c4_2t_bn.dtype() << ", device=" << c4_2t_bn.device() << ", shape=" << c4_2t_bn.sizes() << std::endl;
+        std::cout << "conv4_2t_relu: dtype=" << c4_2t_relu.dtype() << ", device=" << c4_2t_relu.device() << ", shape=" << c4_2t_relu.sizes() << std::endl;
+    }
+    auto c4_t = c4_2t_relu;
     return {c3_t.contiguous(), c4_t.contiguous()}; // Ensure output is contiguous and float32
 }
 
diff --git a/ltr/models/bbreg/atom_iou_net.py b/ltr/models/bbreg/atom_iou_net.py
index 61412c7..26cf781 100644
--- a/ltr/models/bbreg/atom_iou_net.py
+++ b/ltr/models/bbreg/atom_iou_net.py
@@ -2,6 +2,7 @@ import torch.nn as nn
 import torch
 from ltr.models.layers.blocks import LinearBlock
 from ltr.external.PreciseRoIPooling.pytorch.prroi_pool import PrRoIPool2D
+import os
 torch.cuda.empty_cache()
 
 
@@ -143,11 +144,43 @@ class AtomIoUNet(nn.Module):
 
         return fc34_3_r, fc34_4_r
 
-    def get_iou_feat(self, feat2):
-        """Get IoU prediction features from a 4 or 5 dimensional backbone input."""
+    def get_iou_feat(self, feat2, sample_idx=None):
+        """Get IoU prediction features from a 4 or 5 dimensional backbone input.
+
+        args:
+            feat2: The two backbone feature tensors (unpacked as feat3, feat4).
+                5-dimensional tensors are reshaped to 4 dimensions, keeping the
+                last three dimensions.
+            sample_idx: Optional sample index. When it equals 0, the outputs this
+                code labels "bn" and "relu" for each of the four blocks are saved
+                under 'test/output_py/bb_regressor/' and their dtype, device and
+                shape are printed. Any other value, including the default None,
+                disables the debug output.
+
+        returns:
+            list: [c3_t, c4_t], the final outputs of conv3_2t and conv4_2t.
+        """
         feat2 = [f.reshape(-1, *f.shape[-3:]) if f.dim()==5 else f for f in feat2]
         feat3_t, feat4_t = feat2
-        c3_t = self.conv3_2t(self.conv3_1t(feat3_t))
-        c4_t = self.conv4_2t(self.conv4_1t(feat4_t))
-
-        return c3_t, c4_t
+        debug = sample_idx == 0
+        debug_dir = 'test/output_py/bb_regressor/'
+        if debug:
+            os.makedirs(debug_dir, exist_ok=True)
+
+        def run_block(block, x, name):
+            # Apply the block's three sub-modules one at a time so that the
+            # intermediate outputs are available for dumping.
+            bn = block[1](block[0](x))
+            relu = block[2](bn)
+            if debug:
+                stages = (('bn', bn), ('relu', relu))
+                # Save both tensors first, then print, as two separate passes.
+                for tag, out in stages:
+                    torch.save(out.cpu(), os.path.join(debug_dir, f'sample_0_debug_{name}_{tag}_py.pt'))
+                for tag, out in stages:
+                    print(f"{name}_{tag}: dtype={out.dtype}, device={out.device}, shape={tuple(out.shape)}")
+            return relu
+
+        c3_t = run_block(self.conv3_2t, run_block(self.conv3_1t, feat3_t, 'conv3_1t'), 'conv3_2t')
+        c4_t = run_block(self.conv4_2t, run_block(self.conv4_1t, feat4_t, 'conv4_1t'), 'conv4_2t')
+        return [c3_t, c4_t]
diff --git a/test/test_models.cpp b/test/test_models.cpp
index d6d100f..b8ae426 100644
--- a/test/test_models.cpp
+++ b/test/test_models.cpp
@@ -9,11 +9,14 @@
 #include <ATen/Context.h> // Required for globalContext
 // #include <opencv2/opencv.hpp> // REMOVED
 #include <algorithm> // For std::find
+#include <unistd.h> // For getcwd
+#include <sys/stat.h> // For stat
 
 // Project headers
 #include "../cimp/resnet/resnet.h"
 #include "../cimp/classifier/classifier.h"
 #include "../cimp/bb_regressor/bb_regressor.h"
+#include "../cimp/bb_regressor/utils.h"
 
 namespace fs = std::filesystem;
 
@@ -112,6 +115,13 @@ void save_tensor_as_jit_module(const torch::Tensor& tensor, const std::string& f
 }
 
 int main(int argc, char* argv[]) {
+    // Print current working directory
+    char cwd[1024];
+    if (getcwd(cwd, sizeof(cwd)) != NULL) {
+        std::cout << "[DEBUG] Current working directory: " << cwd << std::endl;
+    } else {
+        std::cerr << "[DEBUG] Error getting current working directory." << std::endl;
+    }
     if (argc < 4) {
         // Original usage message for consistency with how run_tests.sh calls it
         std::cerr << "Usage: " << argv[0] << " <base_input_dir_path> <base_output_dir_path> <num_samples> [model_to_test]" << std::endl;
@@ -160,7 +170,9 @@ int main(int argc, char* argv[]) {
 
     // --- Initialize Models (once) ---
     std::cout << "--- Initializing Models ---" << std::endl;
-    std::string resnet_weights_path = "exported_weights/backbone_regenerated";
+    // Set ResNet weights path to absolute directory as per user instruction
+    std::string resnet_weights_path = "/media/mht/ADATA/repos/cpp_tracker/backbone_pure_tensors/";
+    std::cout << "[DEBUG] Using absolute ResNet weights path: " << resnet_weights_path << std::endl;
     std::string bb_reg_weights_path = "exported_weights/bb_regressor";
     std::string classifier_weights_path = "exported_weights/classifier";
     std::optional<cimp::resnet::ResNet> resnet_model_opt;
@@ -169,6 +181,22 @@ int main(int argc, char* argv[]) {
     std::optional<BBRegressor> bb_regressor_model_opt_wrapped;
     bool models_initialized_ok = true;
 
+    std::cout << "ResNet weights path: " << resnet_weights_path << std::endl;
+    struct stat st_resnet = {0};
+    if (stat(resnet_weights_path.c_str(), &st_resnet) != 0 || !S_ISDIR(st_resnet.st_mode)) {
+        std::cerr << "[ERROR] ResNet weights directory does not exist: " << resnet_weights_path << std::endl;
+    }
+    std::cout << "BBRegressor weights path: " << bb_reg_weights_path << std::endl;
+    struct stat st_bbreg = {0};
+    if (stat(bb_reg_weights_path.c_str(), &st_bbreg) != 0 || !S_ISDIR(st_bbreg.st_mode)) {
+        std::cerr << "[ERROR] BBRegressor weights directory does not exist: " << bb_reg_weights_path << std::endl;
+    }
+    std::cout << "Classifier weights path: " << classifier_weights_path << std::endl;
+    struct stat st_clf = {0};
+    if (stat(classifier_weights_path.c_str(), &st_clf) != 0 || !S_ISDIR(st_clf.st_mode)) {
+        std::cerr << "[ERROR] Classifier weights directory does not exist: " << classifier_weights_path << std::endl;
+    }
+
     try {
         std::vector<std::string> output_layers_resnet = {
             "conv1_output", "bn1_output", "relu1_output", "maxpool_output",
@@ -342,7 +370,7 @@ int main(int argc, char* argv[]) {
                     resnet_outputs["layer3"].clone()  
                 };
                 
-                std::vector<torch::Tensor> iou_feats = (*bb_regressor_model_opt_wrapped).get_iou_feat(backbone_feats_for_bb);
+                std::vector<torch::Tensor> iou_feats = (*bb_regressor_model_opt_wrapped).get_iou_feat(backbone_feats_for_bb, i);
                 if (iou_feats.size() >= 1) save_tensor_to_file(iou_feats[0], (bb_reg_out_dir / (sample_suffix + "_iou_feat0.pt")).string());
                 if (iou_feats.size() >= 2) save_tensor_to_file(iou_feats[1], (bb_reg_out_dir / (sample_suffix + "_iou_feat1.pt")).string());