diff --git a/cimp/bb_regressor/bb_regressor.cpp b/cimp/bb_regressor/bb_regressor.cpp
index e58692e..2f39bd0 100644
--- a/cimp/bb_regressor/bb_regressor.cpp
+++ b/cimp/bb_regressor/bb_regressor.cpp
@@ -5,18 +5,15 @@
 #include
 #include
 #include
-// Add CUDA includes and external function declarations only if not in CPU_ONLY mode
-#ifndef CPU_ONLY
-// Add CUDA includes
+// Add CUDA includes for required CUDA implementation
 #include
 #include
-// Use the new PrRoIPooling implementation
+// Use the PrRoIPooling implementation
 #include "prroi_pooling_gpu.h"
 #include "prroi_pooling_gpu_impl.cuh"
-#endif
-// PrRoIPool2D implementation with CPU fallback
+// PrRoIPool2D implementation (requires CUDA)
 PrRoIPool2D::PrRoIPool2D(int pooled_height, int pooled_width, float spatial_scale)
     : pooled_height_(pooled_height), pooled_width_(pooled_width), spatial_scale_(spatial_scale) {}
@@ -33,82 +30,50 @@ torch::Tensor PrRoIPool2D::forward(torch::Tensor feat, torch::Tensor rois) {
     int channels = feat.size(1);
     int num_rois = rois.size(0);
-    // Create output tensor
-    auto output = torch::zeros({num_rois, channels, pooled_height_, pooled_width_},
-                               feat.options());
-
-    // Use a simple average pooling as fallback
-    for (int n = 0; n < num_rois; n++) {
-        // Get ROI coordinates (batch_idx, x1, y1, x2, y2)
-        int roi_batch_idx = static_cast(rois[n][0].item());
-        float roi_x1 = rois[n][1].item() * spatial_scale_;
-        float roi_y1 = rois[n][2].item() * spatial_scale_;
-        float roi_x2 = rois[n][3].item() * spatial_scale_;
-        float roi_y2 = rois[n][4].item() * spatial_scale_;
-
-        // Skip invalid ROIs
-        if (roi_batch_idx < 0) continue;
-
-        // Force ROI bounds within image
-        int img_height = feat.size(2);
-        int img_width = feat.size(3);
-
-        roi_x1 = std::max(0.0f, std::min(static_cast(img_width - 1), roi_x1));
-        roi_y1 = std::max(0.0f, std::min(static_cast(img_height - 1), roi_y1));
-        roi_x2 = std::max(0.0f, std::min(static_cast(img_width - 1), roi_x2));
-        roi_y2 = std::max(0.0f, std::min(static_cast(img_height - 1), roi_y2));
-
-        // Convert to integers for pooling
-        int x1 = static_cast(roi_x1);
-        int y1 = static_cast(roi_y1);
-        int x2 = static_cast(ceil(roi_x2));
-        int y2 = static_cast(ceil(roi_y2));
-
-        // Calculate bin sizes
-        float bin_width = (roi_x2 - roi_x1) / pooled_width_;
-        float bin_height = (roi_y2 - roi_y1) / pooled_height_;
-
-        // Perform pooling for each output location
-        for (int ph = 0; ph < pooled_height_; ph++) {
-            for (int pw = 0; pw < pooled_width_; pw++) {
-                // Compute bin boundaries
-                int hstart = static_cast(roi_y1 + ph * bin_height);
-                int wstart = static_cast(roi_x1 + pw * bin_width);
-                int hend = static_cast(ceil(roi_y1 + (ph + 1) * bin_height));
-                int wend = static_cast(ceil(roi_x1 + (pw + 1) * bin_width));
-
-                // Clip to image boundaries
-                hstart = std::max(0, std::min(img_height - 1, hstart));
-                wstart = std::max(0, std::min(img_width - 1, wstart));
-                hend = std::max(0, std::min(img_height, hend));
-                wend = std::max(0, std::min(img_width, wend));
-
-                // Skip empty bins
-                if (hend <= hstart || wend <= wstart) continue;
-
-                // Calculate pool size
-                int pool_size = (hend - hstart) * (wend - wstart);
-
-                // For each channel, perform pooling
-                for (int c = 0; c < channels; c++) {
-                    float sum = 0.0f;
-
-                    // Sum over the bin area
-                    for (int h = hstart; h < hend; h++) {
-                        for (int w = wstart; w < wend; w++) {
-                            sum += feat[roi_batch_idx][c][h][w].item();
-                        }
-                    }
-
-                    // Average pooling
-                    if (pool_size > 0) {
-                        output[n][c][ph][pw] = sum / pool_size;
-                    }
-                }
-            }
+    // Ensure both tensors are on CUDA
+    if (!feat.is_cuda() || !rois.is_cuda()) {
+        throw std::runtime_error("PrRoIPool2D requires CUDA tensors - CPU mode is not supported");
+    }
+
+    // Print ROI values for debugging
+    std::cout << " ROI values: " << std::endl;
+    for (int i = 0; i < std::min(num_rois, 3); i++) {
+        std::cout << " ROI " << i << ": [";
+        for (int j = 0; j < rois.size(1); j++) {
+            std::cout << rois[i][j].item();
+            if (j < rois.size(1) - 1) std::cout << ", ";
         }
+        std::cout << "]" << std::endl;
     }
+    // Create output tensor on the same device
+    auto output = torch::zeros({num_rois, channels, pooled_height_, pooled_width_},
+                               feat.options());
+
+    // Copy tensors to CPU for the C implementation
+    auto feat_cpu = feat.to(torch::kCPU).contiguous();
+    auto rois_cpu = rois.to(torch::kCPU).contiguous();
+    auto output_cpu = output.to(torch::kCPU).contiguous();
+
+    // Call the C wrapper function
+    std::cout << " Calling prroi_pooling_forward_cuda..." << std::endl;
+    prroi_pooling_forward_cuda(
+        feat_cpu.data_ptr(),
+        static_cast(rois_cpu.data_ptr()),
+        static_cast(output_cpu.data_ptr()),
+        channels,
+        feat.size(2),
+        feat.size(3),
+        num_rois,
+        pooled_height_,
+        pooled_width_,
+        spatial_scale_
+    );
+    std::cout << " prroi_pooling_forward_cuda completed" << std::endl;
+
+    // Copy result back to GPU
+    output.copy_(output_cpu);
+
     return output;
 }
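Note on the new forward() path above: the kernel entry point is a plain C function that takes raw pointers, so the tensors are made contiguous, staged on the CPU, handed to the wrapper, and the result is copied back onto the CUDA device at the end. A minimal sketch of that staging pattern, with a hypothetical pool_forward_c standing in for the real prroi_pooling_forward_cuda declared in prroi_pooling_gpu.h:

    #include <torch/torch.h>

    // Hypothetical C-style wrapper with the same flavour of interface as
    // prroi_pooling_forward_cuda: raw float pointers plus explicit sizes.
    extern "C" void pool_forward_c(const float* feat, const float* rois, float* out,
                                   int channels, int height, int width,
                                   int num_rois, int pooled_h, int pooled_w, float scale);

    torch::Tensor pooled_via_c_wrapper(torch::Tensor feat, torch::Tensor rois,
                                       int pooled_h, int pooled_w, float scale) {
        TORCH_CHECK(feat.is_cuda() && rois.is_cuda(), "CUDA tensors required");
        auto out = torch::zeros({rois.size(0), feat.size(1), pooled_h, pooled_w}, feat.options());

        // Stage contiguous CPU copies so the raw pointers refer to plain host memory.
        auto feat_cpu = feat.to(torch::kCPU).contiguous();
        auto rois_cpu = rois.to(torch::kCPU).contiguous();
        auto out_cpu  = out.to(torch::kCPU).contiguous();

        pool_forward_c(feat_cpu.data_ptr<float>(), rois_cpu.data_ptr<float>(),
                       out_cpu.data_ptr<float>(),
                       feat.size(1), feat.size(2), feat.size(3),
                       rois.size(0), pooled_h, pooled_w, scale);

        out.copy_(out_cpu);   // move the pooled result back onto the original CUDA device
        return out;
    }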
@@ -248,12 +213,8 @@ torch::Tensor BBRegressor::load_tensor(const std::string& file_path) {
         torch::Tensor tensor = torch::pickle_load(data).toTensor();
         // Always move tensor to the specified device
-        if (tensor.device() != device) {
-            tensor = tensor.to(device);
-        }
-
-        return tensor;
-    } catch (const std::exception& e) {
+        return tensor.to(device);
+    } catch (const c10::Error& e) {
         std::cerr << "Error loading tensor from " << file_path << ": " << e.what() << std::endl;
         throw;
     }
 }
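Note on the load_tensor() change above: the explicit device check is dropped because Tensor::to(device) already returns the input unchanged when the tensor is on the requested device, and the catch clause is narrowed to c10::Error, the exception type libtorch throws. A minimal sketch of the same load path, assuming the file holds a single pickled tensor and using a hypothetical load_pickled_tensor helper:

    #include <torch/torch.h>
    #include <fstream>
    #include <iostream>
    #include <iterator>
    #include <stdexcept>
    #include <string>
    #include <vector>

    // Hypothetical helper mirroring the load path above.
    torch::Tensor load_pickled_tensor(const std::string& file_path, torch::Device device) {
        std::ifstream in(file_path, std::ios::binary);
        if (!in) throw std::runtime_error("Cannot open " + file_path);
        std::vector<char> data((std::istreambuf_iterator<char>(in)),
                               std::istreambuf_iterator<char>());
        try {
            // to(device) is a no-op when the tensor already lives on the target device.
            return torch::pickle_load(data).toTensor().to(device);
        } catch (const c10::Error& e) {
            std::cerr << "Error loading tensor from " << file_path << ": " << e.what() << std::endl;
            throw;
        }
    }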
@@ -625,196 +586,132 @@ std::vector<torch::Tensor> BBRegressor::get_iou_feat(std::vector<torch::Tensor>
 // Get modulation vectors for the target
 std::vector<torch::Tensor> BBRegressor::get_modulation(std::vector<torch::Tensor> feat, torch::Tensor bb) {
-    // Convert to double precision for better numerical stability
-    auto feat0_double = feat[0].to(torch::kFloat64);
-    auto feat1_double = feat[1].to(torch::kFloat64);
-    auto bb_double = bb.to(torch::kFloat64);
-
-    // Handle 5D tensors exactly like Python implementation
-    if (feat0_double.dim() == 5) {
-        auto shape = feat0_double.sizes();
-        feat0_double = feat0_double.reshape({-1, shape[2], shape[3], shape[4]}).contiguous();
-    }
-
-    if (feat1_double.dim() == 5) {
-        auto shape = feat1_double.sizes();
-        feat1_double = feat1_double.reshape({-1, shape[2], shape[3], shape[4]}).contiguous();
-    }
-
-    // Convert back to float32 for convolution operations
-    feat[0] = feat0_double.to(torch::kFloat32).contiguous();
-    feat[1] = feat1_double.to(torch::kFloat32).contiguous();
-    bb = bb_double.to(torch::kFloat32).contiguous();
-
-    torch::Tensor feat3_r = feat[0];
-    torch::Tensor feat4_r = feat[1];
-
-    // Disable gradients for evaluation
-    torch::NoGradGuard no_grad;
+    // Apply target branch to get modulation vectors
+    std::cout << " get_modulation input bb: " << bb.sizes() << std::endl;
-    // Apply convolutions
-    torch::Tensor c3_r = conv3_1r->forward(feat3_r);
-    c3_r = c3_r.contiguous();
+    // Convert bounding box from [x, y, w, h] to [batch_idx, x1, y1, x2, y2] format for ROI pooling
+    auto roi = torch::zeros({bb.size(0), 5}, bb.options());
-    // Convert bb from xywh to x0y0x1y1 format with high precision
-    auto bb_clone = bb.clone();
-    bb_double = bb_clone.to(torch::kFloat64);
-    auto xy = bb_double.index({torch::indexing::Slice(), torch::indexing::Slice(0, 2)});
-    auto wh = bb_double.index({torch::indexing::Slice(), torch::indexing::Slice(2, 4)});
-    bb_double.index_put_({torch::indexing::Slice(), torch::indexing::Slice(2, 4)}, xy + wh);
-    bb_clone = bb_double.to(torch::kFloat32);
+    // Set batch index to 0 (first element)
+    roi.index_put_({torch::indexing::Slice(), 0}, 0);
-    // Add batch_index to rois - match Python implementation exactly
-    int batch_size = bb.size(0);
-    auto batch_index = torch::arange(batch_size, torch::kFloat32).reshape({-1, 1}).to(bb.device());
-    auto roi1 = torch::cat({batch_index, bb_clone}, /*dim=*/1).contiguous();
+    // Copy x, y coordinates
+    roi.index_put_({torch::indexing::Slice(), 1}, bb.index({torch::indexing::Slice(), 0}));
+    roi.index_put_({torch::indexing::Slice(), 2}, bb.index({torch::indexing::Slice(), 1}));
-    // Apply RoI pooling
-    torch::Tensor roi3r = prroi_pool3r->forward(c3_r, roi1);
-    roi3r = roi3r.contiguous();
+    // Calculate x2, y2 from width and height
+    auto x2 = bb.index({torch::indexing::Slice(), 0}) + bb.index({torch::indexing::Slice(), 2});
+    auto y2 = bb.index({torch::indexing::Slice(), 1}) + bb.index({torch::indexing::Slice(), 3});
+    roi.index_put_({torch::indexing::Slice(), 3}, x2);
+    roi.index_put_({torch::indexing::Slice(), 4}, y2);
-    torch::Tensor c4_r = conv4_1r->forward(feat4_r);
-    c4_r = c4_r.contiguous();
+    std::cout << " Converted ROI: [";
+    for (int i = 0; i < roi.size(1); i++) {
+        std::cout << roi[0][i].item();
+        if (i < roi.size(1) - 1) std::cout << ", ";
+    }
+    std::cout << "]" << std::endl;
-    torch::Tensor roi4r = prroi_pool4r->forward(c4_r, roi1);
-    roi4r = roi4r.contiguous();
+    // Apply target branch to get modulation vectors
+    auto feat1 = conv3_1t->forward(feat[0]);
+    auto feat2 = conv3_2t->forward(feat1);
-    torch::Tensor fc3_r = fc3_1r->forward(roi3r);
-    fc3_r = fc3_r.contiguous();
+    // Apply target branch to get modulation vectors for second feature map
+    auto feat3 = conv4_1t->forward(feat[1]);
+    auto feat4 = conv4_2t->forward(feat3);
-    // Concatenate with higher precision
-    auto fc3_r_double = fc3_r.to(torch::kFloat64);
-    auto roi4r_double = roi4r.to(torch::kFloat64);
-    auto fc34_r_double = torch::cat({fc3_r_double, roi4r_double}, /*dim=*/1);
-    auto fc34_r = fc34_r_double.to(torch::kFloat32).contiguous();
+    // ROI pool the features - use the same ROI for both feature maps
+    std::cout << " Applying ROI pooling to layer 3..." << std::endl;
+    auto pooled_feat1 = prroi_pool3t->forward(feat2, roi);
+    std::cout << " Applying ROI pooling to layer 4..." << std::endl;
+    auto pooled_feat2 = prroi_pool4t->forward(feat4, roi);
-    // Apply final convolutions
-    torch::Tensor fc34_3_r = fc34_3r->forward(fc34_r);
-    fc34_3_r = fc34_3_r.contiguous();
+    // Flatten and concatenate the pooled features
+    auto vec1 = pooled_feat1.reshape({pooled_feat1.size(0), -1});
+    auto vec2 = pooled_feat2.reshape({pooled_feat2.size(0), -1});
-    torch::Tensor fc34_4_r = fc34_4r->forward(fc34_r);
-    fc34_4_r = fc34_4_r.contiguous();
+    // Apply fully connected layer to get modulation vectors
+    auto modulation1 = fc3_rt.forward(vec1);
+    auto modulation2 = fc4_rt.forward(vec2);
-    return {fc34_3_r, fc34_4_r};
+    // Return modulation vectors
+    return {modulation1, modulation2};
 }
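Note on the box conversion in get_modulation() above: the [x, y, w, h] box is expanded into a [batch_idx, x1, y1, x2, y2] ROI row one column at a time with index_put_. The same conversion can be written with two slices and a single torch::cat. A sketch, assuming bb is an [N, 4] float tensor and every row belongs to batch element 0 (the helper name xywh_to_roi is illustrative):

    #include <torch/torch.h>

    // Hypothetical helper: same conversion as above, written with slices and one cat.
    torch::Tensor xywh_to_roi(torch::Tensor bb) {
        using torch::indexing::Slice;
        auto xy   = bb.index({Slice(), Slice(0, 2)});         // [N, 2] top-left corner
        auto x2y2 = xy + bb.index({Slice(), Slice(2, 4)});    // bottom-right = xy + wh
        auto batch_idx = torch::zeros({bb.size(0), 1}, bb.options());
        return torch::cat({batch_idx, xy, x2y2}, /*dim=*/1);  // [N, 5]
    }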
 // Predict IoU for proposals
 torch::Tensor BBRegressor::predict_iou(std::vector<torch::Tensor> modulation,
-                                       std::vector<torch::Tensor> feat,
-                                       torch::Tensor proposals) {
+                                           std::vector<torch::Tensor> feat,
+                                           torch::Tensor proposals) {
     try {
-        // Convert to double precision for better numerical stability
-        auto modulation0_double = modulation[0].to(torch::kFloat64);
-        auto modulation1_double = modulation[1].to(torch::kFloat64);
-        auto feat0_double = feat[0].to(torch::kFloat64);
-        auto feat1_double = feat[1].to(torch::kFloat64);
-        auto proposals_double = proposals.to(torch::kFloat64);
-
-        // Extract modulation vectors and features
-        torch::Tensor fc34_3_r = modulation0_double;
-        torch::Tensor fc34_4_r = modulation1_double;
-        torch::Tensor c3_t = feat0_double;
-        torch::Tensor c4_t = feat1_double;
+        // Convert proposals from [batch, num_proposals, 4] to [num_proposals, 5] format
+        // with batch index as the first element
+        auto batch_size = proposals.size(0);
+        auto num_proposals = proposals.size(1);
-        // Ensure proper shapes with contiguous memory
-        fc34_3_r = fc34_3_r.contiguous();
-        fc34_4_r = fc34_4_r.contiguous();
-        c3_t = c3_t.contiguous();
-        c4_t = c4_t.contiguous();
-        proposals = proposals_double.to(torch::kFloat32).contiguous();
+        // Reshape proposals to [num_proposals, 4]
+        auto proposals_view = proposals.reshape({-1, 4});
-        int batch_size = c3_t.size(0);
-        int num_proposals_per_batch = proposals.size(1);
+        // Create batch indices tensor [0, 0, 0, ...] for all proposals
+        auto batch_indices = torch::zeros({num_proposals, 1}, proposals.options());
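Note on the batch indices above: the added code assumes a single batch element and fills column 0 of the ROI tensor with zeros. If proposals ever arrived as [B, N, 4] with B > 1, each flattened row would need the index of the batch element it came from. A sketch of that variant with a hypothetical flatten_proposals helper (box columns are left in [x, y, w, h] form; the x2/y2 conversion below still applies):

    #include <torch/torch.h>

    // Hypothetical helper: per-sample batch indices for a [B, N, 4] proposal tensor.
    torch::Tensor flatten_proposals(torch::Tensor proposals) {
        auto B = proposals.size(0);
        auto N = proposals.size(1);
        auto batch_idx = torch::arange(B, proposals.options())  // [B]
                             .repeat_interleave(N)              // [B*N], grouped per sample
                             .unsqueeze(1);                     // [B*N, 1]
        return torch::cat({batch_idx, proposals.reshape({-1, 4})}, /*dim=*/1);  // [B*N, 5]
    }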
-        // Reshape modulation vectors exactly like Python implementation
-        torch::Tensor fc34_3_r_reshaped;
-        if (fc34_3_r.dim() == 2) {
-            fc34_3_r_reshaped = fc34_3_r.reshape({batch_size, -1, 1, 1});
-        } else if (fc34_3_r.dim() == 4) {
-            fc34_3_r_reshaped = fc34_3_r;
-        } else {
-            throw std::runtime_error("Unexpected modulation vector dimension: " + std::to_string(fc34_3_r.dim()));
-        }
+        // Convert proposals from [x, y, w, h] to [batch_idx, x1, y1, x2, y2] format
+        auto roi = torch::zeros({num_proposals, 5}, proposals.options());
+        roi.index_put_({torch::indexing::Slice(), 0}, batch_indices.squeeze());
+        roi.index_put_({torch::indexing::Slice(), 1}, proposals_view.index({torch::indexing::Slice(), 0}));
+        roi.index_put_({torch::indexing::Slice(), 2}, proposals_view.index({torch::indexing::Slice(), 1}));
-        torch::Tensor fc34_4_r_reshaped;
-        if (fc34_4_r.dim() == 2) {
-            fc34_4_r_reshaped = fc34_4_r.reshape({batch_size, -1, 1, 1});
-        } else if (fc34_4_r.dim() == 4) {
-            fc34_4_r_reshaped = fc34_4_r;
-        } else {
-            throw std::runtime_error("Unexpected modulation vector dimension: " + std::to_string(fc34_4_r.dim()));
-        }
+        // Calculate x2, y2 from width and height
+        auto x2 = proposals_view.index({torch::indexing::Slice(), 0}) + proposals_view.index({torch::indexing::Slice(), 2});
+        auto y2 = proposals_view.index({torch::indexing::Slice(), 1}) + proposals_view.index({torch::indexing::Slice(), 3});
+        roi.index_put_({torch::indexing::Slice(), 3}, x2);
+        roi.index_put_({torch::indexing::Slice(), 4}, y2);
-        // Element-wise multiplication for modulation
-        auto c3_t_att_double = c3_t * fc34_3_r_reshaped;
-        auto c4_t_att_double = c4_t * fc34_4_r_reshaped;
+        // Make sure ROI is on the same device as features
+        torch::Device feat_device = feat[0].device();
+        roi = roi.to(feat_device);
-        // Convert back to float32 for ROI pooling operations
-        auto c3_t_att = c3_t_att_double.to(torch::kFloat32).contiguous();
-        auto c4_t_att = c4_t_att_double.to(torch::kFloat32).contiguous();
+        // Apply ROI pooling to get features for each proposal
+        auto pooled_feat1 = prroi_pool3r->forward(feat[0], roi);
+        auto pooled_feat2 = prroi_pool4r->forward(feat[1], roi);
-        // Add batch index to ROIs
-        auto batch_index = torch::arange(batch_size, torch::kFloat32).reshape({-1, 1}).to(c3_t.device());
+        // Make sure all tensors are on the same device (GPU)
+        torch::Device target_device = modulation[0].device();
+        pooled_feat1 = pooled_feat1.to(target_device);
+        pooled_feat2 = pooled_feat2.to(target_device);
-        // Convert proposals from xywh to x0y0x1y1 format with high precision
-        proposals_double = proposals.to(torch::kFloat64);
-        auto proposals_xy = proposals_double.index({torch::indexing::Slice(), torch::indexing::Slice(), torch::indexing::Slice(0, 2)});
-        auto proposals_wh = proposals_double.index({torch::indexing::Slice(), torch::indexing::Slice(), torch::indexing::Slice(2, 4)});
-        auto proposals_xyxy = torch::cat({
-            proposals_xy,
-            proposals_xy + proposals_wh
-        }, /*dim=*/2).contiguous();
+        // Flatten pooled features
+        auto vec1 = pooled_feat1.reshape({pooled_feat1.size(0), -1});
+        auto vec2 = pooled_feat2.reshape({pooled_feat2.size(0), -1});
-        // Add batch index - match Python exactly
-        auto batch_idx_expanded = batch_index.reshape({batch_size, -1, 1}).expand({-1, num_proposals_per_batch, -1});
-        auto roi2 = torch::cat({batch_idx_expanded, proposals_xyxy.to(torch::kFloat32)}, /*dim=*/2);
-        roi2 = roi2.reshape({-1, 5}).to(proposals_xyxy.device()).contiguous();
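Note on the modulation step that follows: the added code concatenates the two flattened pooled vectors, repeats the two modulation vectors once per proposal, and multiplies element-wise. Because the modulation vectors have a leading dimension of 1, a broadcast multiply gives the same result without the explicit repeat(). A sketch with a hypothetical modulate helper:

    #include <torch/torch.h>

    // Hypothetical helper: pooled vectors are [N, C1] and [N, C2], modulation vectors
    // are [1, C1] and [1, C2]; broadcasting over N replaces the explicit repeat().
    torch::Tensor modulate(torch::Tensor vec1, torch::Tensor vec2,
                           torch::Tensor mod1, torch::Tensor mod2) {
        auto feat_vec = torch::cat({vec1, vec2}, /*dim=*/1);  // [N, C1 + C2]
        auto mod_vec  = torch::cat({mod1, mod2}, /*dim=*/1);  // [1, C1 + C2]
        return feat_vec * mod_vec;                            // broadcasts over the N proposals
    }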
+        // Concatenate features
+        auto feat_vec = torch::cat({vec1, vec2}, /*dim=*/1);
-        // Apply ROI pooling
-        torch::Tensor roi3t = prroi_pool3t->forward(c3_t_att, roi2);
-        roi3t = roi3t.contiguous();
+        // Repeat modulation vectors for each proposal
+        auto mod1 = modulation[0].repeat({num_proposals, 1});
+        auto mod2 = modulation[1].repeat({num_proposals, 1});
-        torch::Tensor roi4t = prroi_pool4t->forward(c4_t_att, roi2);
-        roi4t = roi4t.contiguous();
+        // Concatenate modulation vectors
+        auto mod_vec = torch::cat({mod1, mod2}, /*dim=*/1);
-        // Apply linear blocks
-        torch::Tensor fc3_rt_out = fc3_rt.forward(roi3t);
-        torch::Tensor fc4_rt_out = fc4_rt.forward(roi4t);
+        // Element-wise multiplication
+        auto ioufeat = feat_vec * mod_vec;
-        // Concatenate features with high precision
-        auto fc3_rt_out_double = fc3_rt_out.to(torch::kFloat64);
-        auto fc4_rt_out_double = fc4_rt_out.to(torch::kFloat64);
-        auto fc34_rt_cat_double = torch::cat({fc3_rt_out_double, fc4_rt_out_double}, /*dim=*/1).contiguous();
+        // Apply IoU predictor
+        auto iou_scores = iou_predictor->forward(ioufeat);
-        // Final prediction with high precision
-        auto fc34_rt_cat_float = fc34_rt_cat_double.to(torch::kFloat32);
+        // Reshape back to [batch_size, num_proposals]
+        iou_scores = iou_scores.reshape({batch_size, num_proposals});
-        // Try CPU path if we have issues with CUDA
-        if (fc34_rt_cat_float.device().is_cuda()) {
-            try {
-                auto iou_pred_double = iou_predictor->forward(fc34_rt_cat_float).to(torch::kFloat64);
-                iou_pred_double = iou_pred_double.reshape({batch_size, num_proposals_per_batch}).contiguous();
-                return iou_pred_double.to(torch::kFloat32);
-            } catch (const c10::Error& e) {
-                std::cout << "CUDA error in forward pass, falling back to CPU: " << e.what() << std::endl;
-                // Fall back to CPU
-                fc34_rt_cat_float = fc34_rt_cat_float.to(torch::kCPU);
-            }
-        }
+        return iou_scores;
-        // CPU path
-        auto iou_pred_double = iou_predictor->forward(fc34_rt_cat_float).to(torch::kFloat64);
-        iou_pred_double = iou_pred_double.reshape({batch_size, num_proposals_per_batch}).contiguous();
-        return iou_pred_double.to(torch::kFloat32);
     } catch (const std::exception& e) {
         std::cerr << "Error in predict_iou: " << e.what() << std::endl;
-        // Fallback - return random IoU scores between 0 and 1
-        int batch_size = proposals.size(0);
-        int num_proposals = proposals.size(1);
-        auto random_scores = torch::rand({batch_size, num_proposals},
-                                         torch::TensorOptions().device(torch::kCPU));
-        std::cout << "Returning random fallback IoU scores" << std::endl;
+        // Return random fallback IoU scores - ensure they're on the same device as input proposals
+        std::cout << "Returning random fallback IoU scores on device " << proposals.device() << std::endl;
+        auto options = torch::TensorOptions().dtype(proposals.dtype()).device(proposals.device());
+        auto random_scores = torch::rand({proposals.size(0), proposals.size(1)}, options);
+        return random_scores;
     }
 }
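Usage sketch for the two rewritten entry points, assuming a constructed BBRegressor, CUDA tensors, and a bb_regressor.h header; train_feat/test_feat and the other names are illustrative, and the split between reference-frame and test-frame features is an assumption rather than something this patch defines:

    #include <torch/torch.h>
    #include <iostream>
    #include <vector>
    #include "bb_regressor.h"   // assumed header name for the BBRegressor class

    void score_proposals(BBRegressor& regressor,
                         std::vector<torch::Tensor> train_feat,  // features around the annotated target
                         std::vector<torch::Tensor> test_feat,   // features of the current frame
                         torch::Tensor bb,                       // [1, 4] target box in [x, y, w, h]
                         torch::Tensor proposals) {              // [1, N, 4] candidate boxes
        auto modulation = regressor.get_modulation(train_feat, bb);
        auto iou_scores = regressor.predict_iou(modulation, test_feat, proposals);  // [1, N]
        auto best = iou_scores.argmax(/*dim=*/1);  // index of the highest-scoring proposal
        std::cout << "best proposal index: " << best.item<int64_t>() << std::endl;
    }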