From 3bfb5658df07cb41fea78e0b42d2d59b5612529b Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Fri, 3 Apr 2026 18:27:42 -0600
Subject: [PATCH 1/9] added xfmBox to match xfmVector and xfmPoint etc

---
 cuBQL/math/affine.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
diff --git a/cuBQL/math/affine.h b/cuBQL/math/affine.h
index 798d037..d4e0d4c 100644
--- a/cuBQL/math/affine.h
+++ b/cuBQL/math/affine.h
@@ -181,6 +181,25 @@ namespace cuBQL {
                                               const typename AffineSpaceT<L>::vector_t& n)
   { return xfmNormal(m.l,n); }
 
+  template<typename T>
+  box_t<T,3> xfmBox(const AffineSpaceT<LinearSpace3<vec_t<T,3>>> &xfm,
+                    const box_t<T,3> &box)
+  {
+    box_t<T,3> out;
+    if (in.empty()) return out;
+    auto l = in.lower;
+    auto u = in.upper;
+    out.extend(xfmPoint(xfm,vec_t<T,3>(l.x,l.y,l.z)));
+    out.extend(xfmPoint(xfm,vec_t<T,3>(l.x,l.y,u.z)));
+    out.extend(xfmPoint(xfm,vec_t<T,3>(l.x,u.y,l.z)));
+    out.extend(xfmPoint(xfm,vec_t<T,3>(l.x,u.y,u.z)));
+    out.extend(xfmPoint(xfm,vec_t<T,3>(u.x,l.y,l.z)));
+    out.extend(xfmPoint(xfm,vec_t<T,3>(u.x,l.y,u.z)));
+    out.extend(xfmPoint(xfm,vec_t<T,3>(u.x,u.y,l.z)));
+    out.extend(xfmPoint(xfm,vec_t<T,3>(u.x,u.y,u.z)));
+    return out;
+  }
+
 
   ////////////////////////////////////////////////////////////////////////////////
   /// Comparison Operators

From 0a9a1af55b91ac9660a4677199de4d73d91e894d Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Fri, 3 Apr 2026 18:31:01 -0600
Subject: [PATCH 2/9] typo fix

---
 cuBQL/math/affine.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuBQL/math/affine.h b/cuBQL/math/affine.h
index d4e0d4c..8c73416 100644
--- a/cuBQL/math/affine.h
+++ b/cuBQL/math/affine.h
@@ -183,7 +183,7 @@ namespace cuBQL {
 
   template<typename T>
   box_t<T,3> xfmBox(const AffineSpaceT<LinearSpace3<vec_t<T,3>>> &xfm,
-                    const box_t<T,3> &box)
+                    const box_t<T,3> &in)
   {
     box_t<T,3> out;
     if (in.empty()) return out;

From 33b17349318f45dec120f17695921bb231fbdad9 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Fri, 12 Jun 2026 18:04:44 -0600
Subject: [PATCH 3/9] working on hip

---
 cuBQL/builder/cpu.h                           |   1 +
 cuBQL/builder/cpu/spatialMedian.h             |  41 ++
 cuBQL/builder/cuda.h                          |   2 +
 cuBQL/builder/cuda/builder_common.h           | 145 ++++---
 cuBQL/builder/cuda/gpu_builder.h              |   2 +
 cuBQL/builder/cuda/sah_builder.h              |  16 +-
 cuBQL/builder/cuda/sm_builder.h               | 400 +++++++++++++++---
 cuBQL/math/box.h                              |  19 +-
 cuBQL/math/common.h                           |   4 +-
 cuBQL/math/constants.h                        |   2 +-
 cuBQL/math/math.h                             |  14 +-
 cuBQL/math/vec.h                              |  31 +-
 .../math/rayTriangleIntersections.h           |   2 +-
 cuBQL/traversal/rayQueries.h                  |  37 +-
 14 files changed, 545 insertions(+), 171 deletions(-)

diff --git a/cuBQL/builder/cpu.h b/cuBQL/builder/cpu.h
index bdf447d..9c78acd 100644
--- a/cuBQL/builder/cpu.h
+++ b/cuBQL/builder/cpu.h
@@ -66,6 +66,7 @@ namespace cuBQL {
                   uint32_t              numPrims,
                   BuildConfig           buildConfig)
   {
+    PING;
     /*! right now, only have a slow spatial median builder */
     cpu::spatialMedian(bvh,boxes,numPrims,buildConfig);
   }
diff --git a/cuBQL/builder/cpu/spatialMedian.h b/cuBQL/builder/cpu/spatialMedian.h
index b72d61e..6f128b8 100644
--- a/cuBQL/builder/cpu/spatialMedian.h
+++ b/cuBQL/builder/cpu/spatialMedian.h
@@ -112,11 +112,31 @@ namespace cuBQL {
           node.bounds = box_t<T,D>()
             .including(bvh.nodes[node.admin.offset+0].bounds)
             .including(bvh.nodes[node.admin.offset+1].bounds);
+          if (node.bounds.empty()) {
+            PING;
+            PRINT(bvh.numNodes);
+            PRINT(node.admin.offset);
+            PRINT(node.admin.count);
+            PRINT(bvh.nodes[node.admin.offset+0].bounds);
+            PRINT(bvh.nodes[node.admin.offset+1].bounds);
+
+            box3f a = box_t<T,D>()
+            .including(bvh.nodes[node.admin.offset+0].bounds);
+            box3f b = box_t<T,D>()
+            .including(bvh.nodes[node.admin.offset+0].bounds)
+            .including(bvh.nodes[node.admin.offset+1].bounds);
+            box3f c = box_t<T,D>();
+            PRINT(a);
+            PRINT(b);
+            PRINT(c);
+          }
         } else {
           node.bounds.clear();
           for (int i=0;i<node.admin.count;i++)
             node.bounds.extend(boxes[bvh.primIDs[node.admin.offset+i]]);
         }
+        if (node.bounds.empty())
+          { PING; PRINT(nodeID); PRINT(node.admin.count); }
       }
                          
       template<typename T, int D>
@@ -127,11 +147,30 @@ namespace cuBQL {
       {
         using box_t = ::cuBQL::box_t<T,D>;
         std::vector<int> primIDs;
+        PING; PRINT(numPrims);
         for (int i=0;i<numPrims;i++) {
           box_t box = boxes[i];
+          if (numPrims == 1) PRINT(box);
           if (box.empty()) continue;
           primIDs.push_back(i);
         }
+
+
+        if (primIDs.empty()) {
+          bvh.nodes = new typename BinaryBVH<T,D>::Node[1];
+          bvh.nodes[0].bounds = box3f();
+          bvh.nodes[0].admin.offset = 0;
+          bvh.nodes[0].admin.count = 1;
+          bvh.primIDs = new uint32_t[numPrims];
+          for (int i=0;i<numPrims;i++)
+            bvh.primIDs[i] = i;
+          bvh.numPrims = numPrims;
+          bvh.numNodes = 1;
+          PING;
+          return;
+        }
+        
+        PRINT(primIDs.size());
         std::vector<int>  altPrimIDs(primIDs.size());
         std::vector<Topo> topo(1);
         
@@ -151,6 +190,8 @@ namespace cuBQL {
         }
         topo.clear();
         refit(0,bvh,boxes);
+        PING;
+        PRINT(bvh.nodes[0].bounds);
       }
     } // spatialMedian_impl
     
diff --git a/cuBQL/builder/cuda.h b/cuBQL/builder/cuda.h
index f15dad8..eca25ef 100644
--- a/cuBQL/builder/cuda.h
+++ b/cuBQL/builder/cuda.h
@@ -45,6 +45,7 @@ namespace cuBQL {
     DeviceMemoryResource()
     {}
     void malloc(void** ptr, size_t size, cudaStream_t s) override {
+      CUBQL_CUDA_CALL(StreamSynchronize(s));
       CUBQL_CUDA_CALL(Malloc(ptr, size));
     }
     void free(void* ptr, cudaStream_t s) override
@@ -81,6 +82,7 @@ namespace cuBQL {
       } );
     }
     void malloc(void** ptr, size_t size, cudaStream_t s) override {
+      PING;
 #ifndef NDEBUG
       if (s_numDevices > 1 && s == 0)
         std::cerr << "@cuBQL: warning; async memory allocator used with default stream."
diff --git a/cuBQL/builder/cuda/builder_common.h b/cuBQL/builder/cuda/builder_common.h
index e71b901..49eac52 100644
--- a/cuBQL/builder/cuda/builder_common.h
+++ b/cuBQL/builder/cuda/builder_common.h
@@ -23,7 +23,7 @@ namespace cub {
 namespace cuBQL {
   namespace gpuBuilder_impl {
 
-    inline __device__ void atomic_min(int32_t *v, int32_t vv)
+    inline __device__ void atomic_min(volatile int32_t *v, int32_t vv)
     { atomicMin((int *)v,(int)vv); }
     inline __device__ void atomic_min(int64_t *v, int64_t vv)
     { atomicMin((long long *)v,(long long)vv); }
@@ -32,7 +32,7 @@ namespace cuBQL {
     inline __device__ void atomic_min(uint64_t *v, uint64_t vv)
     { atomicMin((unsigned long long *)v,(unsigned long long)vv); }
 
-    inline __device__ void atomic_max(int32_t *v, int32_t vv)
+    inline __device__ void atomic_max(volatile int32_t *v, int32_t vv)
     { atomicMax((int *)v,(int)vv); }
     inline __device__ void atomic_max(int64_t *v, int64_t vv)
     { atomicMax((long long *)v,(long long)vv); }
@@ -81,72 +81,86 @@ namespace cuBQL {
 
     template<typename T> struct int_type_of;
     template<> struct int_type_of<float> { typedef int32_t type; };
-    template<> struct int_type_of<double> { typedef int64_t type; };
-    template<> struct int_type_of<int32_t> { typedef int32_t type; };
-    template<> struct int_type_of<int64_t> { typedef int64_t type; };
-    template<> struct int_type_of<uint32_t> { typedef uint32_t type; };
-    template<> struct int_type_of<uint64_t> { typedef uint64_t type; };
+    // template<> struct int_type_of<double> { typedef int64_t type; };
+    // template<> struct int_type_of<int32_t> { typedef int32_t type; };
+    // template<> struct int_type_of<int64_t> { typedef int64_t type; };
+    // template<> struct int_type_of<uint32_t> { typedef uint32_t type; };
+    // template<> struct int_type_of<uint64_t> { typedef uint64_t type; };
 
-    template<typename T> inline __device__
-    typename int_type_of<T>::type encode(T v);
+    // template<typename T> inline __device__
+    // typename int_type_of<T>::type encode(T v);
     
-    template<> inline __device__
-    int32_t encode(float f)
+    // template<>
+    inline __device__
+    int32_t encode(const float &f)
     {
-      const int32_t sign = 0x80000000;
-      int32_t bits = __float_as_int(f);
+      // int *ptr = (int *)&f;
+      
+      const int32_t sign = 0x80000000U;
+      // int32_t bits = __float_as_int(f);
+      int32_t bits;// = *ptr;//(const int &)f;
+      memcpy(&bits,&f,4);
       if (bits & sign) bits ^= 0x7fffffff;
       return bits;
     }
-    template<> inline __device__
-    int64_t encode(double f)
-    {
-      const int64_t sign = 0x8000000000000000LL;
-      int64_t bits = __double_as_longlong(f);
-      if (bits & sign) bits ^= 0x7fffffffffffffffLL;
-      return bits;
-    }
-    template<> inline __device__
-    int32_t encode(int32_t bits)
-    {
-      return bits;
-    }
-    template<> inline __device__
-    int64_t encode(int64_t bits)
-    {
-      return bits;
-    }
+    // template<> inline __device__
+    // int64_t encode(double f)
+    // {
+    //   const int64_t sign = 0x8000000000000000LL;
+    //   int64_t bits = __double_as_longlong(f);
+    //   if (bits & sign) bits ^= 0x7fffffffffffffffLL;
+    //   return bits;
+    // }
+    // template<> inline __device__
+    // int32_t encode(int32_t bits)
+    // {
+    //   return bits;
+    // }
+    // template<> inline __device__
+    // int64_t encode(int64_t bits)
+    // {
+    //   return bits;
+    // }
 
     template<typename T> inline __device__
     T decode(int32_t v);
-    template<typename T> inline __device__
-    T decode(uint32_t v);
-    template<typename T> inline __device__
-    T decode(int64_t v);
-    template<typename T> inline __device__
-    T decode(uint64_t v);
+    // template<typename T> inline __device__
+    // T decode(uint32_t v);
+    // template<typename T> inline __device__
+    // T decode(int64_t v);
+    // template<typename T> inline __device__
+    // T decode(uint64_t v);
 
     template<> inline __device__
     float decode<float>(int32_t bits)
     {
       const int32_t sign = 0x80000000;
       if (bits & sign) bits ^= 0x7fffffff;
-      return __int_as_float(bits);
+// #ifdef __HIPCC__
+//       float ff = ((const float &)bits);
+//       return ff;
+// #endif
+      float f;
+      memcpy(&f,&bits,sizeof(4));
+      return f;
+      // float *f = (float *)&bits;
+      // return *f;
+      // return __int_as_float(bits);
     }
-    template<> inline __device__
-    int32_t decode<int32_t>(int32_t bits)
-    { return bits; }
-    template<> inline __device__
-    int64_t decode<int64_t>(int64_t bits)
-    { return bits; }
+    // template<> inline __device__
+    // int32_t decode<int32_t>(int32_t bits)
+    // { return bits; }
+    // template<> inline __device__
+    // int64_t decode<int64_t>(int64_t bits)
+    // { return bits; }
 
-    template<> inline __device__
-    double decode<double>(int64_t bits)
-    {
-      const int64_t sign = 0x8000000000000000LL;
-      if (bits & sign) bits ^= 0x7fffffffffffffffLL;
-      return __longlong_as_double(bits);
-    }
+    // template<> inline __device__
+    // double decode<double>(int64_t bits)
+    // {
+    //   const int64_t sign = 0x8000000000000000LL;
+    //   if (bits & sign) bits ^= 0x7fffffffffffffffLL;
+    //   return __longlong_as_double(bits);
+    // }
     
     template<typename box_t>
     struct CUBQL_ALIGN(8) AtomicBox {
@@ -219,29 +233,37 @@ namespace cuBQL {
       using scalar_t = typename box_t::scalar_t;
 #pragma unroll
       for (int d=0;d<box_t::numDims;d++) {
+#if 1
+        lower[d] = encode(+1e6f);//encode(+FLT_MAX);
+        upper[d] = encode(-1e6f);//encode(-FLT_MAX);
+#else
         lower[d] = encode(empty_box_lower<scalar_t>());//encode(+FLT_MAX);
         upper[d] = encode(empty_box_upper<scalar_t>());//encode(-FLT_MAX);
+#endif
       }
     }
 
     template<typename box_t> inline __device__
-    void atomic_grow(AtomicBox<box_t> &abox, const typename box_t::vec_t &other)
+    void atomic_grow(AtomicBox<box_t> &abox,
+                     typename box_t::vec_t other)
     {
       using scalar_t = typename AtomicBox<box_t>::scalar_t;
+      auto in_y = abox.upper[1];
 #pragma unroll
       for (int d=0;d<box_t::numDims;d++) {
-        const typename int_type_of<scalar_t>::type enc
-          = //AtomicBox<box_t>::
-          encode(other[d]);//get(other,d));
+        // const typename int_type_of<scalar_t>::type enc
+        const int32_t enc
+          = encode(other[d]);
         if (enc < abox.lower[d])
-          atomic_min(&abox.lower[d],enc);
+          atomic_min((volatile int32_t*)abox.lower+d,enc);
         if (enc > abox.upper[d])
-          atomic_max(&abox.upper[d],enc);
+          atomic_max((volatile int32_t*)abox.upper+d,enc);
       }
     } 
     
     template<typename box_t>
-    inline __device__ void atomic_grow(AtomicBox<box_t> &abox, const box_t &other)
+    inline __device__
+    void atomic_grow(AtomicBox<box_t> &abox, box_t other)
     {
       using scalar_t = typename AtomicBox<box_t>::scalar_t;
 #pragma unroll
@@ -252,13 +274,14 @@ namespace cuBQL {
         const typename int_type_of<scalar_t>::type 
           enc_upper = //AtomicBox<box_t>::
           encode(other.get_upper(d));
-        if (enc_lower < abox.lower[d]) atomic_min(&abox.lower[d],enc_lower);
-        if (enc_upper > abox.upper[d]) atomic_max(&abox.upper[d],enc_upper);
+        if (enc_lower < abox.lower[d]) atomic_min(abox.lower+d,enc_lower);
+        if (enc_upper > abox.upper[d]) atomic_max(abox.upper+d,enc_upper);
       }
     }
 
     template<typename box_t>
-    inline __device__ void atomic_grow(AtomicBox<box_t> &abox, const AtomicBox<box_t> &other)
+    inline __device__ void atomic_grow(AtomicBox<box_t> &abox,
+                                       const AtomicBox<box_t> &other)
     {
       using scalar_t = typename AtomicBox<box_t>::scalar_t;
 #pragma unroll
diff --git a/cuBQL/builder/cuda/gpu_builder.h b/cuBQL/builder/cuda/gpu_builder.h
index 18365f7..9b396dc 100644
--- a/cuBQL/builder/cuda/gpu_builder.h
+++ b/cuBQL/builder/cuda/gpu_builder.h
@@ -52,7 +52,9 @@ namespace cuBQL {
         buildConfig.makeLeafThreshold = 1;
       gpuBuilder_impl::build(bvh,boxes,numBoxes,buildConfig,s,memResource);
     }
+    PING;
     cuBQL::cuda::refit(bvh,boxes,s,memResource);
+    PING;
   }
 
   namespace cuda {
diff --git a/cuBQL/builder/cuda/sah_builder.h b/cuBQL/builder/cuda/sah_builder.h
index 423d6fb..4ba5277 100644
--- a/cuBQL/builder/cuda/sah_builder.h
+++ b/cuBQL/builder/cuda/sah_builder.h
@@ -554,14 +554,14 @@ namespace cuBQL {
                     GpuMemoryResource& memResource)
     { real3_sahBuilder<float>(bvh,boxes,_numPrims,buildConfig,s,memResource); }
     
-    template<>
-    inline void sahBuilder(BinaryBVH<double,3>  &bvh,
-                    const box_t<double,3> *boxes,
-                    uint32_t     _numPrims,
-                    BuildConfig  buildConfig,
-                    cudaStream_t s,
-                    GpuMemoryResource& memResource)
-    { real3_sahBuilder<double>(bvh,boxes,_numPrims,buildConfig,s,memResource); }
+    // template<>
+    // inline void sahBuilder(BinaryBVH<double,3>  &bvh,
+    //                 const box_t<double,3> *boxes,
+    //                 uint32_t     _numPrims,
+    //                 BuildConfig  buildConfig,
+    //                 cudaStream_t s,
+    //                 GpuMemoryResource& memResource)
+    // { real3_sahBuilder<double>(bvh,boxes,_numPrims,buildConfig,s,memResource); }
   } // ::cuBQL::sahBuilder_impl
 
   namespace cuda {
diff --git a/cuBQL/builder/cuda/sm_builder.h b/cuBQL/builder/cuda/sm_builder.h
index 59395cd..c3cc42c 100644
--- a/cuBQL/builder/cuda/sm_builder.h
+++ b/cuBQL/builder/cuda/sm_builder.h
@@ -25,13 +25,14 @@ namespace cuBQL {
     };
 
     template<typename T, int D>
-    struct CUBQL_ALIGN(16) TempNode {
+    struct// CUBQL_ALIGN(16)
+      TempNode {
       using box_t = cuBQL::box_t<T,D>;
-      union {
+      // union {
         struct {
-          AtomicBox<box_t> centBounds;
           uint32_t         count;
           uint32_t         unused;
+          AtomicBox<box_t> centBounds;
         } openBranch;
         struct {
           uint32_t offset;
@@ -42,9 +43,8 @@ namespace cuBQL {
         struct {
           uint32_t offset;
           uint32_t count;
-          uint32_t unused[2];
         } doneNode;
-      };
+      // };
     };
     
     template<typename T, int D>
@@ -53,11 +53,15 @@ namespace cuBQL {
                    NodeState       *nodeStates,
                    TempNode<T,D> *nodes)
     {
+      if (threadIdx.x > 0) return;
+      
       buildState->numNodes = 2;
       
       nodeStates[0]             = OPEN_BRANCH;
       nodes[0].openBranch.count = 0;
       nodes[0].openBranch.centBounds.set_empty();
+      if (nodes[0].openBranch.centBounds.upper[0] > 0)
+        nodes[0].openBranch.centBounds.upper[0] = 8888;
 
       nodeStates[1]            = DONE_NODE;
       nodes[1].doneNode.offset = 0;
@@ -82,7 +86,9 @@ namespace cuBQL {
         me.done   = false;
         // this could be made faster by block-reducing ...
         atomicAdd(&nodes[0].openBranch.count,1);
-        atomic_grow(nodes[0].openBranch.centBounds,box.center());//centerOf(box));
+        auto ctr = box.center();
+        atomic_grow(nodes[0].openBranch.centBounds,ctr);//centerOf(box));
+        // atomic_grow(nodes[0].openBranch.centBounds,centerOf(box));
       } else {
         me.nodeID = (uint32_t)-1;
         me.done   = true;
@@ -97,7 +103,7 @@ namespace cuBQL {
                       uint32_t       numNodes,
                       BuildConfig    buildConfig)
     {
-#if 1
+#if 0
       __shared__ int l_newNodeOfs;
       if (threadIdx.x == 0)
         l_newNodeOfs = 0;
@@ -208,13 +214,14 @@ namespace cuBQL {
       
       auto in = nodes[nodeID].openBranch;
       if (in.count <= buildConfig.makeLeafThreshold) {
-        auto &done  = nodes[nodeID].doneNode;
+        auto done  = nodes[nodeID].doneNode;
         done.count  = in.count;
         // set this to max-value, so the prims can later do atomicMin
         // with their position ion the leaf list; this value is
         // greater than any prim position.
         done.offset = (uint32_t)-1;
         nodeState   = DONE_NODE;
+        nodes[nodeID].doneNode = done;
       } else {
         float widestWidth = 0.f;
         int   widestDim   = -1;
@@ -235,7 +242,8 @@ namespace cuBQL {
           widestCtr = ctr;
         }
       
-        auto &open = nodes[nodeID].openNode;
+        // auto &open = nodes[nodeID].openNode;
+        auto open = nodes[nodeID].openNode;
         if (widestDim >= 0) {
           open.pos = widestCtr;
         }
@@ -249,12 +257,15 @@ namespace cuBQL {
 #pragma unroll
         for (int side=0;side<2;side++) {
           const int childID = open.offset+side;
-          auto &child = nodes[childID].openBranch;
-          child.centBounds.set_empty();
-          child.count         = 0;
+          TempNode<T,D> child;
+          // auto &child = nodes[childID].openBranch;
+          child.openBranch.centBounds.set_empty();
+          child.openBranch.count         = 0;
           nodeStates[childID] = OPEN_BRANCH;
+          nodes[childID] = child;
         }
         nodeState = OPEN_NODE;
+        nodes[nodeID].openNode = open;
       }
 #endif
     }
@@ -420,6 +431,8 @@ namespace cuBQL {
                GpuMemoryResource &memResource)
     {
       assert(sizeof(PrimState) == sizeof(uint64_t));
+      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
+      CUBQL_CUDA_SYNC_CHECK();
       
       // ==================================================================
       // do build on temp nodes
@@ -432,13 +445,168 @@ namespace cuBQL {
       _ALLOC(nodeStates,2*numPrims,s,memResource);
       _ALLOC(primStates,numPrims,s,memResource);
       _ALLOC(buildState,1,s,memResource);
-      initState<<<1,1,0,s>>>(buildState,
-                             nodeStates,
-                             tempNodes);
-      initPrims<<<divRoundUp(numPrims,1024),1024,0,s>>>
+
+#if 1
+      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
+      CUBQL_CUDA_SYNC_CHECK();
+        {
+          int numNodes = 1;
+          std::cout << "*********** INPUT BOUNDS ***********" << std::endl;
+          std::vector<box3f> h_bbs(numPrims);
+          CUBQL_CUDA_CALL(Memcpy(h_bbs.data(),boxes,
+                                 numPrims*sizeof(*boxes),cudaMemcpyDefault));
+          for (int i=0;i<numPrims;i++) {
+            auto &bb= h_bbs[i];
+            if (bb.upper.y > 1000000)
+              PRINT(bb);
+          }
+        }
+#endif
+      
+      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
+      CUBQL_CUDA_SYNC_CHECK();
+        initState<<<1,128,0,s>>>(buildState,
+                                 nodeStates,
+                                 tempNodes);
+        
+
+      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
+      CUBQL_CUDA_SYNC_CHECK();
+#if 0
+      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
+      CUBQL_CUDA_SYNC_CHECK();
+        {
+          int numNodes = 1;
+          std::cout << "*********** AFTER INITSTATE ***********" << std::endl;
+          std::vector<NodeState> h_nodeStates(numNodes);
+          std::vector<TempNode<T,D>> h_tempNodes(numNodes);
+          CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates,
+                                 numNodes*sizeof(*nodeStates),cudaMemcpyDefault));
+CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes,
+                       numNodes*sizeof(*tempNodes),cudaMemcpyDefault));
+          for (int i=0;i<numNodes;i++) {
+            int state = nodeStates[i];
+            // printf("nodestae %i = %i\n",i,state);
+            auto &n = tempNodes[i];
+            switch(state) {
+            case OPEN_NODE:
+              break;
+            case OPEN_BRANCH: {
+              auto _decode = [](int32_t bits) -> float {
+                const int32_t sign = 0x80000000;
+                if (bits & sign) bits ^= 0x7fffffff;
+                float f;
+                memcpy(&f,&bits,4);
+                return f;
+              };
+              auto _decode2 = [](int32_t bits) -> float {
+                const int32_t sign = 0x80000000;
+                if (bits & sign) bits ^= 0x7fffffff;
+                return (const float &)bits;
+              };
+              auto _encode = [](float f) -> int32_t {
+                int *ptr = (int *)&f;
+                const int32_t sign = 0x80000000U;
+                // int32_t bits = __float_as_int(f);
+                int32_t bits = *ptr;//(const int &)f;
+                if (bits & sign) bits ^= 0x7fffffff;
+                return bits;
+              };
+              auto &bb = n.openBranch.centBounds;
+              float tf = +1e8f;
+              auto etf = _encode(tf);
+              auto detf = _decode(etf);
+              auto detf2 = _decode2(etf);
+              printf(" in f %f enc %i/0x%x dec %f / %f\n",
+                     tf,etf,etf,detf,detf2);
+              printf(" [%i] OPENBRANCH cnt %i bb %i %i %i: %i %i %i -> %f %f %f : %f %f %f\n",
+                     i,
+                     n.openBranch.count,
+                     (bb.lower[0]),
+                     (bb.lower[1]),
+                     (bb.lower[2]),
+                     (bb.upper[0]),
+                     (bb.upper[1]),
+                     (bb.upper[2]),
+                     _decode(bb.lower[0]),
+                     _decode(bb.lower[1]),
+                     _decode(bb.lower[2]),
+                     _decode(bb.upper[0]),
+                     _decode(bb.upper[1]),
+                     _decode(bb.upper[2]));
+            } break;
+            case DONE_NODE:
+              break;
+            }
+          }
+        }
+#endif
+
+      
+      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
+      CUBQL_CUDA_SYNC_CHECK();
+      initPrims<<<divRoundUp(numPrims,128),128,0,s>>>
         (tempNodes,
          primStates,boxes,numPrims);
+      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
+      CUBQL_CUDA_SYNC_CHECK();
+
 
+#if 0
+      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
+      CUBQL_CUDA_SYNC_CHECK();
+        {
+          int numNodes = 1;
+          std::cout << "*********** INITIAL BOUNDS ***********" << std::endl;
+          std::vector<NodeState> h_nodeStates(numNodes);
+          std::vector<TempNode<T,D>> h_tempNodes(numNodes);
+          CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates,
+                                 numNodes*sizeof(*nodeStates),cudaMemcpyDefault));
+CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes,
+                       numNodes*sizeof(*tempNodes),cudaMemcpyDefault));
+          for (int i=0;i<numNodes;i++) {
+            int state = nodeStates[i];
+            // printf("nodestae %i = %i\n",i,state);
+            auto &n = tempNodes[i];
+            switch(state) {
+            case OPEN_NODE:
+              break;
+            case OPEN_BRANCH: {
+              auto decode = [](int32_t bits) {
+                const int32_t sign = 0x80000000;
+                if (bits & sign) bits ^= 0x7fffffff;
+                float f;
+                memcpy(&f,&bits,4);
+                return f;
+                // const int32_t sign = 0x80000000;
+                // if (bits & sign) bits ^= 0x7fffffff;
+                // return (const float &)bits;
+              };
+              auto &bb = n.openBranch.centBounds;
+              printf(" [%i] OPENBRANCH cnt %i bb %i %i %i: %i %i %i -> %f %f %f : %f %f %f\n",
+                     i,
+                     n.openBranch.count,
+                     (bb.lower[0]),
+                     (bb.lower[1]),
+                     (bb.lower[2]),
+                     (bb.upper[0]),
+                     (bb.upper[1]),
+                     (bb.upper[2]),
+                     decode(bb.lower[0]),
+                     decode(bb.lower[1]),
+                     decode(bb.lower[2]),
+                     decode(bb.upper[0]),
+                     decode(bb.upper[1]),
+                     decode(bb.upper[2]));
+            } break;
+            case DONE_NODE:
+              break;
+            }
+          }
+        }
+#endif
+
+      
       int numDone = 0;
       int numNodes;
 
@@ -447,60 +615,168 @@ namespace cuBQL {
       CUBQL_CUDA_CALL(EventCreate(&stateDownloadedEvent));
       
       
-#if CUBQL_PROFILE
-      int pass = 0;
-      static Profile t_writeNodes;
-      static Profile t_writePrims;
-      static Profile t_sortPrims;
-      static Profile t_nodePass[100];
-      static Profile t_primPass[100];
-      if (t_writeNodes.name == "") {
-        t_writeNodes.setName("writeNodes");
-        t_writePrims.setName("writePrims");
-        t_sortPrims.setName("sortPrims");
-        for (int i=0;i<100;i++) {
-          t_nodePass[i].setName("nodePass",i);
-          t_primPass[i].setName("primPass",i);
-        }
-      }
-#endif
+// #if CUBQL_PROFILE
+//       int pass = 0;
+//       static Profile t_writeNodes;
+//       static Profile t_writePrims;
+//       static Profile t_sortPrims;
+//       static Profile t_nodePass[100];
+//       static Profile t_primPass[100];
+//       if (t_writeNodes.name == "") {
+//         t_writeNodes.setName("writeNodes");
+//         t_writePrims.setName("writePrims");
+//         t_sortPrims.setName("sortPrims");
+//         for (int i=0;i<100;i++) {
+//           t_nodePass[i].setName("nodePass",i);
+//           t_primPass[i].setName("primPass",i);
+//         }
+//       }
+// #endif
+      PING;
       while (true) {
+        PING;
+        PRINT(numDone);
+        int pre_numNodes = numNodes;
         CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes,
                                     sizeof(numNodes),cudaMemcpyDeviceToHost,s));
-        if (numNodes == numDone)
-          break;
         CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s));
         CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent));
-#if CUBQL_PROFILE
-        t_nodePass[pass].sync_start();
+        PRINT(numNodes);
+        CUBQL_CUDA_SYNC_CHECK();
+        CUBQL_CUDA_SYNC_CHECK_STREAM(s);
+        if (numNodes == numDone)
+          break;
+        PRINT(numNodes-numDone);
+        bool dbg = (numNodes-numDone) == 8098;
+// #if CUBQL_PROFILE
+//         t_nodePass[pass].sync_start();
+// #endif
+        CUBQL_CUDA_SYNC_CHECK();
+        CUBQL_CUDA_SYNC_CHECK_STREAM(s);
+
+#if 0
+        {
+          std::cout << "*********** PRE SELECT ***********" << std::endl;
+          std::vector<NodeState> h_nodeStates(numNodes);
+          std::vector<TempNode<T,D>> h_tempNodes(numNodes);
+          CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates,
+                                 numNodes*sizeof(*nodeStates),cudaMemcpyDefault));
+          CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes,
+                                 numNodes*sizeof(*tempNodes),cudaMemcpyDefault));
+          for (int i=0;i<numNodes;i++) {
+            int state = nodeStates[i];
+            // printf("nodestae %i = %i\n",i,state);
+            auto &n = tempNodes[i];
+            switch(state) {
+            case OPEN_NODE:
+              break;
+            case OPEN_BRANCH: {
+              auto decode = [](int32_t bits) {
+                const int32_t sign = 0x80000000;
+                if (bits & sign) bits ^= 0x7fffffff;
+                float f;
+                memcpy(&f,&bits,4);
+                return f;
+              };
+              auto &bb = n.openBranch.centBounds;
+              printf(" [%i] OPENBRANCH cnt %i bb %f %f %f : %f %f %f\n",
+                     i,
+                     n.openBranch.count,
+                     decode(bb.lower[0]),
+                     decode(bb.lower[1]),
+                     decode(bb.lower[2]),
+                     decode(bb.upper[0]),
+                     decode(bb.upper[1]),
+                     decode(bb.upper[2]));
+            } break;
+            case DONE_NODE:
+              break;
+            }
+          }
+        }
 #endif
+          
         selectSplits<<<divRoundUp(numNodes,1024),1024,0,s>>>
           (buildState,
            nodeStates,tempNodes,numNodes,
            buildConfig);
-#if CUBQL_PROFILE
-        t_nodePass[pass].sync_stop();
-        t_primPass[pass].sync_start();
+        CUBQL_CUDA_SYNC_CHECK();
+        CUBQL_CUDA_SYNC_CHECK_STREAM(s);
+
+#if 0
+        {
+        std::vector<NodeState> h_nodeStates(numNodes);
+        std::vector<TempNode<T,D>> h_tempNodes(numNodes);
+CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates,
+                       numNodes*sizeof(*nodeStates),cudaMemcpyDefault));
+CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes,
+                       numNodes*sizeof(*tempNodes),cudaMemcpyDefault));
+        for (int i=0;i<numNodes;i++) {
+          int state = nodeStates[i];
+          // printf("nodestae %i = %i\n",i,state);
+          auto &n = tempNodes[i];
+          switch(state) {
+          case OPEN_NODE:
+            printf(" [%i] OPENNODE offset %i dim %i pos %f\n",
+                   i,
+                   n.openNode.offset,
+                   n.openNode.dim,
+                   n.openNode.pos);
+            break;
+          case OPEN_BRANCH:
+            break;
+          case DONE_NODE:
+            break;
+          }
+        //   union {
+        // struct {
+        //   AtomicBox<box_t> centBounds;
+        //   uint32_t         count;
+        //   uint32_t         unused;
+        // } openBranch;
+        // struct {
+        //   uint32_t offset;
+        //   int      dim;
+        //   uint32_t tieBreaker;
+        //   float    pos;
+        // } openNode;
+        // struct {
+        //   uint32_t offset;
+        //   uint32_t count;
+        //   uint32_t unused[2];
+        // } doneNode;
+        }
+        }
 #endif
+        
+// #if CUBQL_PROFILE
+//         t_nodePass[pass].sync_stop();
+//         t_primPass[pass].sync_start();
+// #endif
+        
         numDone = numNodes;
 
 // #if 1
-        if (sizeof(T)*D <= sizeof(float3)) {
+        if (0 && sizeof(T)*D <= sizeof(float3)) {
           updatePrims_shm<<<divRoundUp(numPrims,512),512,0,s>>>
             (nodeStates,tempNodes,
              primStates,boxes,numPrims,numDone);
         } else 
 // #else
-        updatePrims<<<divRoundUp(numPrims,1024),1024,0,s>>>
+        updatePrims<<<divRoundUp(numPrims,128),128,0,s>>>
           (nodeStates,tempNodes,
            primStates,boxes,numPrims);
 // #endif
+
+        CUBQL_CUDA_SYNC_CHECK();
+        CUBQL_CUDA_SYNC_CHECK_STREAM(s);
         
-#if CUBQL_PROFILE
-        t_primPass[pass].sync_stop();
-        ++ pass;
-#endif
+// #if CUBQL_PROFILE
+//         t_primPass[pass].sync_stop();
+//         ++ pass;
+// #endif
       }
+      PING;
       CUBQL_CUDA_CALL(EventDestroy(stateDownloadedEvent));
       // ==================================================================
       // sort {item,nodeID} list
@@ -510,9 +786,10 @@ namespace cuBQL {
       uint8_t   *d_temp_storage     = NULL;
       size_t     temp_storage_bytes = 0;
       PrimState *sortedPrimStates   = 0;
-#if CUBQL_PROFILE
-      t_sortPrims.sync_start();
-#endif
+// #if CUBQL_PROFILE
+//       t_sortPrims.sync_start();
+// #endif
+      PING;
       _ALLOC(sortedPrimStates,numPrims,s,memResource);
       auto rc =
         cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes,
@@ -527,23 +804,25 @@ namespace cuBQL {
                                        numPrims,32,64,s);
       rc = rc;
       _FREE(d_temp_storage,s,memResource);
-#if CUBQL_PROFILE
-      t_sortPrims.sync_stop();
-      t_writePrims.sync_start();
-#endif
+// #if CUBQL_PROFILE
+//       t_sortPrims.sync_stop();
+//       t_writePrims.sync_start();
+// #endif
       // ==================================================================
       // allocate and write BVH item list, and write offsets of leaf nodes
       // ==================================================================
 
+      PING;
       bvh.numPrims = numPrims;
       _ALLOC(bvh.primIDs,numPrims,s,memResource);
       writePrimsAndLeafOffsets<<<divRoundUp(numPrims,1024),1024,0,s>>>
         (tempNodes,bvh.primIDs,sortedPrimStates,numPrims);
-#if CUBQL_PROFILE
-      t_writePrims.sync_stop();
-      t_writeNodes.sync_start();
-#endif
+// #if CUBQL_PROFILE
+//       t_writePrims.sync_stop();
+//       t_writeNodes.sync_start();
+// #endif
 
+      PING;
       // ==================================================================
       // allocate and write final nodes
       // ==================================================================
@@ -551,14 +830,15 @@ namespace cuBQL {
       _ALLOC(bvh.nodes,numNodes,s,memResource);
       writeNodes<<<divRoundUp(numNodes,1024),1024,0,s>>>
         (bvh.nodes,tempNodes,numNodes);
-#if CUBQL_PROFILE
-      t_writeNodes.sync_stop();
-#endif
+// #if CUBQL_PROFILE
+//       t_writeNodes.sync_stop();
+// #endif
       _FREE(sortedPrimStates,s,memResource);
       _FREE(tempNodes,s,memResource);
       _FREE(nodeStates,s,memResource);
       _FREE(primStates,s,memResource);
       _FREE(buildState,s,memResource);
+      PING;
     }
 
   } // ::cuBQL::gpuBuilder_impl
diff --git a/cuBQL/math/box.h b/cuBQL/math/box.h
index 07736c4..a26f49f 100644
--- a/cuBQL/math/box.h
+++ b/cuBQL/math/box.h
@@ -79,7 +79,8 @@ namespace cuBQL {
     /*! create a box from two points. note this will NOT make sure
       that a<b; if this is an empty box it will just create an empty
       box! */
-    inline __cubql_both box_t(const vec_t_data<T,D> &a, const vec_t_data<T,D> &b)
+    inline __cubql_both box_t(const vec_t_data<T,D> &a,
+                              const vec_t_data<T,D> &b)
     {
       lower = vec_t(a);
       upper = vec_t(b);
@@ -88,11 +89,23 @@ namespace cuBQL {
     /*! returns a box that bounds both 'this' and another point 'v';
       this does not get modified */
     inline __cubql_both box_t including(const vec_t &v) const
-    { return box_t{min(lower,v),max(upper,v)}; }
+    {
+      // return box_t(min(lower,v),max(upper,v));
+      box_t b;
+      b.lower = min(this->lower,v);
+      b.upper = max(this->upper,v);
+      return b;
+    }
     /*! returns a box that bounds both 'this' and another box 'b';
       this does not get modified */
     inline __cubql_both box_t including(const box_t &b) const
-    { return box_t{min(lower,b.lower),max(upper,b.upper)}; }
+    {
+      box_t bb;
+      bb.lower = min(this->lower,b.lower);
+      bb.upper = max(this->upper,b.upper);
+      return bb;
+      // return box_t(min(lower,b.lower),max(upper,b.upper));
+    }
     
     inline __cubql_both box_t &grow(const vec_t &v)
     { lower = min(lower,v); upper = max(upper,v); return *this; }
diff --git a/cuBQL/math/common.h b/cuBQL/math/common.h
index a3063dd..00a1b68 100644
--- a/cuBQL/math/common.h
+++ b/cuBQL/math/common.h
@@ -336,8 +336,8 @@ namespace cuBQL {
 
 #define CUBQL_CUDA_SYNC_CHECK()                                 \
   {                                                             \
-    cudaDeviceSynchronize();                                    \
-    cudaError_t rc = cudaGetLastError();                        \
+    cudaError_t rc = cudaDeviceSynchronize();                   \
+    rc = cudaGetLastError();                                    \
     if (rc != cudaSuccess) {                                    \
       fprintf(stderr, "error (%s: line %d): %s\n",              \
               __FILE__, __LINE__, cudaGetErrorString(rc));      \
diff --git a/cuBQL/math/constants.h b/cuBQL/math/constants.h
index 4b5c3c4..488ad7f 100644
--- a/cuBQL/math/constants.h
+++ b/cuBQL/math/constants.h
@@ -6,7 +6,7 @@
 #include <limits>
 #include <limits.h>
 #if defined(__CUDACC__) && !defined(CUDART_INF_F)
-#include <math_constants.h>
+// #include <math_constants.h>
 #endif
 
 #ifndef M_PI
diff --git a/cuBQL/math/math.h b/cuBQL/math/math.h
index 6943b75..86855fb 100644
--- a/cuBQL/math/math.h
+++ b/cuBQL/math/math.h
@@ -4,14 +4,15 @@
 #pragma once
 
 #include "cuBQL/math/common.h"
-#ifdef __CUDACC__
-#include <cuda/std/limits>
-#endif
+// #ifdef __CUDACC__
+// #include <cuda/std/limits>
+// #endif
 #include <limits>
 
 namespace cuBQL {
   
-#ifdef __CUDACC__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+// #if defined(__CUDACC__) || defined(__HIPCC__)
   // make sure we use the built-in cuda functoins that use floats, not
   // the c-stdlib ones that use doubles.
   using ::min;
@@ -21,14 +22,15 @@ namespace cuBQL {
   using std::max;
 #endif
 
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+// #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
 # define CUBQL_INF CUDART_INF_F
 // # define CUBQL_INF ::cuda::std::numeric_limits<float>::infinity()
 #else
 # define CUBQL_INF std::numeric_limits<float>::infinity()
 #endif
 
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
 #else
     inline __cubql_both float __int_as_float(int i) { return (const float &)i; }
     inline __cubql_both int __float_as_int(float f) { return (const int &)f; }
diff --git a/cuBQL/math/vec.h b/cuBQL/math/vec.h
index 9f7a9b4..3048657 100644
--- a/cuBQL/math/vec.h
+++ b/cuBQL/math/vec.h
@@ -7,7 +7,7 @@
 #include <type_traits>
 #include <limits>
 #include "constants.h"
-#ifdef __CUDACC__
+#if defined(__CUDACC__) && !defined(CUDART_INF)
 # include <cuda.h>
 #endif
 
@@ -19,10 +19,10 @@
 
 namespace cuBQL {
 
-#ifndef __CUDACC__
-  using std::min;
-  using std::max;
-#endif
+// #ifndef __CUDA_ARCH__
+//   using std::min;
+//   using std::max;
+// #endif
 
 #ifndef CUBQL_SUPPORT_CUDA_VECTOR_TYPES
 #define CUBQL_SUPPORT_CUDA_VECTOR_TYPES 0
@@ -46,7 +46,8 @@ namespace cuBQL {
     equivalent, let's also create a 'invalid_t' to be used by
     default */
 
-#ifndef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#else
   struct float2 { float x, y; };
   struct float3 { float x, y, z; };
   struct CUBQL_ALIGN(16) float4 { float x, y, z, w; };
@@ -492,18 +493,21 @@ namespace cuBQL {
   { return vec_t<uint64_t,4>( v.x >> b, v.y >> b, v.z >> b, v.w >> b ); }
   
   inline __cubql_both double abs(double d) {
-#ifdef __CUDA_ARCH__
+// #ifdef __CUDA_ARCH__
     return ::abs(d);
-#else
-    return std::abs(d);
-#endif
+// #else
+//     return std::abs(d);
+// #endif
   }
   inline __cubql_both float abs(float d) {
-#ifdef __CUDA_ARCH__
-    return ::abs(d);
+#if defined(__CUDA_ARCH__) || defined(__CUDACC__)
+    return abs(d);
 #else
     return std::abs(d);
 #endif
+// #else
+//     return std::abs(d);
+// #endif
   }
   // inline __cubql_both double abs(double d) { return absf(d); }
   
@@ -711,7 +715,8 @@ namespace cuBQL {
 
   template<typename T, int N>
   inline __cubql_both
-  vec_t<T,N> normalize(vec_t<T,N> v) { return v * (T(1)/sqrt(dot(v,v))); }
+  vec_t<T,N> normalize(vec_t<T,N> v) { return v * (T(1)/::sqrt(dot(v,v))); }
+  // vec_t<T,N> normalize(vec_t<T,N> v) { return v * (T(1)/sqrt(dot(v,v))); }
 
 
   // ------------------------------------------------------------------
diff --git a/cuBQL/queries/triangleData/math/rayTriangleIntersections.h b/cuBQL/queries/triangleData/math/rayTriangleIntersections.h
index bc20150..44a7683 100644
--- a/cuBQL/queries/triangleData/math/rayTriangleIntersections.h
+++ b/cuBQL/queries/triangleData/math/rayTriangleIntersections.h
@@ -40,7 +40,7 @@ namespace cuBQL {
 #else
     auto dbg = _dbg;
 #endif
-    if (dbg) /* nothing, just for 'maybe_unused' */; 
+    if (dbg) {/* nothing, just for 'maybe_unused' */};
     
     using vec3 = vec_t<T,3>;
     const vec3 v0(tri.a);
diff --git a/cuBQL/traversal/rayQueries.h b/cuBQL/traversal/rayQueries.h
index 838c6aa..78a8011 100644
--- a/cuBQL/traversal/rayQueries.h
+++ b/cuBQL/traversal/rayQueries.h
@@ -716,11 +716,11 @@ namespace cuBQL {
               ray_t<T> &ray,
               bool _dbg)
   {
-#ifdef NDEBUG
+// #ifdef NDEBUG
     const bool dbg = false;
-#else
-    bool dbg = _dbg;
-#endif
+// #else
+//     bool dbg = _dbg;
+// #endif
     using bvh_t = BinaryBVH<T,3>;
     using node_t = typename bvh_t::node_t;
     using scalar_t = typename bvh_t::scalar_t;
@@ -746,7 +746,7 @@ namespace cuBQL {
     if (ray.direction.y == (scalar_t)0) ray.direction.y = scalar_t(1e-20);
     if (ray.direction.z == (scalar_t)0) ray.direction.z = scalar_t(1e-20);
     vec3_t rcp_dir = rcp(ray.direction);
-      
+
     // ------------------------------------------------------------------
     // traverse until there's nothing left to traverse:
     // ------------------------------------------------------------------
@@ -761,6 +761,7 @@ namespace cuBQL {
       while (true) {
         // if (dbg) printf("node %i.%i\n",(int)node.offset,(int)node.count);
         if (node.count != 0) {
+          // return;//xx
           // it's a boy! - seriously: this is not a inner node; so
           // we're either at a final leaf, or at an instance node
           if (blasStackBase != nullptr)
@@ -771,8 +772,8 @@ namespace cuBQL {
           tlasSavedNodePtr = bvh.nodes;
           tlasSavedPrimIDs = bvh.primIDs;
 #ifndef NDEBUG
-          if (node.count != 1)
-            printf("TWO-LEVEL BVH MUST BE BUILT WITH 1 PRIM PER LEAF!\n");
+          // if (node.count != 1)
+          //   printf("TWO-LEVEL BVH MUST BE BUILT WITH 1 PRIM PER LEAF!\n");
 #endif
           // if (dbg)
           //   printf("inner-leaf primIDs %p ofs %i count %i\n",
@@ -809,6 +810,7 @@ namespace cuBQL {
           // if (dbg) printf("new node %i.%i\n",(int)node.offset,(int)node.count);
         }          
 
+        
         uint32_t n0Idx = (uint32_t)node.offset+0;
         uint32_t n1Idx = (uint32_t)node.offset+1;
         node_t n0 = bvh.nodes[n0Idx];
@@ -817,12 +819,14 @@ namespace cuBQL {
         bool o0 = rayIntersectsBox(node_t0,ray,rcp_dir,n0.bounds,dbg);
         bool o1 = rayIntersectsBox(node_t1,ray,rcp_dir,n1.bounds,dbg);
 
-        if (dbg) {
-          dout << " node L " << n0.bounds << "\n";
-          dout << " node R " << n1.bounds << "\n";
-          dout << "children L hit " << int(o0) << " dist " << node_t0
-               << " R hit " << int(o1) << " dist " << node_t1 << "\n";
-        }
+        // return; //xx
+
+        // if (dbg) {
+        //   dout << " node L " << n0.bounds << "\n";
+        //   dout << " node R " << n1.bounds << "\n";
+        //   dout << "children L hit " << int(o0) << " dist " << node_t0
+        //        << " R hit " << int(o1) << " dist " << node_t1 << "\n";
+        // }
         if (o0) {
           if (o1) {
             if ((stackPtr-traversalStack) >= STACK_DEPTH) {
@@ -849,15 +853,16 @@ namespace cuBQL {
           }
         }
       }
- 
+
+      // return;//xx
+      
       if (node.count != 0 && blasStackBase != nullptr) {
         // we're at a valid leaf: call the lambda and see if that gave
         // us a new, closer cull radius
         // if (dbg)
         //   printf("trav leaf-leaf primIDs %p offset %i count %i\n",
         //          bvh.primIDs,(int)node.offset,(int)node.count);
-        ray.tMax
-          = processLeaf(bvh.primIDs,(int)node.offset,(int)node.count);
+        ray.tMax  = processLeaf(bvh.primIDs,(int)node.offset,(int)node.count);
       }
       // ------------------------------------------------------------------
       // pop next un-traversed node from stack, discarding any nodes

From eceb7326384ef7f049455d314480dbd8b5a5622c Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Sun, 14 Jun 2026 13:29:55 -0600
Subject: [PATCH 4/9] cleanups

---
 cuBQL/builder/cpu.h                 |   1 -
 cuBQL/builder/cpu/spatialMedian.h   |  28 +--
 cuBQL/builder/cuda.h                |   6 +-
 cuBQL/builder/cuda/builder_common.h | 122 +++++-----
 cuBQL/builder/cuda/gpu_builder.h    |   2 -
 cuBQL/builder/cuda/sm_builder.h     | 335 +---------------------------
 6 files changed, 66 insertions(+), 428 deletions(-)

diff --git a/cuBQL/builder/cpu.h b/cuBQL/builder/cpu.h
index 9c78acd..bdf447d 100644
--- a/cuBQL/builder/cpu.h
+++ b/cuBQL/builder/cpu.h
@@ -66,7 +66,6 @@ namespace cuBQL {
                   uint32_t              numPrims,
                   BuildConfig           buildConfig)
   {
-    PING;
     /*! right now, only have a slow spatial median builder */
     cpu::spatialMedian(bvh,boxes,numPrims,buildConfig);
   }
diff --git a/cuBQL/builder/cpu/spatialMedian.h b/cuBQL/builder/cpu/spatialMedian.h
index 6f128b8..ace5e24 100644
--- a/cuBQL/builder/cpu/spatialMedian.h
+++ b/cuBQL/builder/cpu/spatialMedian.h
@@ -112,31 +112,11 @@ namespace cuBQL {
           node.bounds = box_t<T,D>()
             .including(bvh.nodes[node.admin.offset+0].bounds)
             .including(bvh.nodes[node.admin.offset+1].bounds);
-          if (node.bounds.empty()) {
-            PING;
-            PRINT(bvh.numNodes);
-            PRINT(node.admin.offset);
-            PRINT(node.admin.count);
-            PRINT(bvh.nodes[node.admin.offset+0].bounds);
-            PRINT(bvh.nodes[node.admin.offset+1].bounds);
-
-            box3f a = box_t<T,D>()
-            .including(bvh.nodes[node.admin.offset+0].bounds);
-            box3f b = box_t<T,D>()
-            .including(bvh.nodes[node.admin.offset+0].bounds)
-            .including(bvh.nodes[node.admin.offset+1].bounds);
-            box3f c = box_t<T,D>();
-            PRINT(a);
-            PRINT(b);
-            PRINT(c);
-          }
         } else {
           node.bounds.clear();
           for (int i=0;i<node.admin.count;i++)
             node.bounds.extend(boxes[bvh.primIDs[node.admin.offset+i]]);
         }
-        if (node.bounds.empty())
-          { PING; PRINT(nodeID); PRINT(node.admin.count); }
       }
                          
       template<typename T, int D>
@@ -147,16 +127,14 @@ namespace cuBQL {
       {
         using box_t = ::cuBQL::box_t<T,D>;
         std::vector<int> primIDs;
-        PING; PRINT(numPrims);
         for (int i=0;i<numPrims;i++) {
           box_t box = boxes[i];
-          if (numPrims == 1) PRINT(box);
           if (box.empty()) continue;
           primIDs.push_back(i);
         }
 
-
         if (primIDs.empty()) {
+          // if we had no valid input prims whatsoever
           bvh.nodes = new typename BinaryBVH<T,D>::Node[1];
           bvh.nodes[0].bounds = box3f();
           bvh.nodes[0].admin.offset = 0;
@@ -166,11 +144,9 @@ namespace cuBQL {
             bvh.primIDs[i] = i;
           bvh.numPrims = numPrims;
           bvh.numNodes = 1;
-          PING;
           return;
         }
         
-        PRINT(primIDs.size());
         std::vector<int>  altPrimIDs(primIDs.size());
         std::vector<Topo> topo(1);
         
@@ -190,8 +166,6 @@ namespace cuBQL {
         }
         topo.clear();
         refit(0,bvh,boxes);
-        PING;
-        PRINT(bvh.nodes[0].bounds);
       }
     } // spatialMedian_impl
     
diff --git a/cuBQL/builder/cuda.h b/cuBQL/builder/cuda.h
index eca25ef..061091b 100644
--- a/cuBQL/builder/cuda.h
+++ b/cuBQL/builder/cuda.h
@@ -77,12 +77,14 @@ namespace cuBQL {
           cudaMemPool_t mempool;
           CUBQL_CUDA_CALL(DeviceGetDefaultMemPool(&mempool, iDevice));
           uint64_t threshold = UINT64_MAX;
-          CUBQL_CUDA_CALL(MemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold));
+          CUBQL_CUDA_CALL
+            (MemPoolSetAttribute(mempool,
+                                 cudaMemPoolAttrReleaseThreshold,
+                                 &threshold));
         }
       } );
     }
     void malloc(void** ptr, size_t size, cudaStream_t s) override {
-      PING;
 #ifndef NDEBUG
       if (s_numDevices > 1 && s == 0)
         std::cerr << "@cuBQL: warning; async memory allocator used with default stream."
diff --git a/cuBQL/builder/cuda/builder_common.h b/cuBQL/builder/cuda/builder_common.h
index 49eac52..3e080c7 100644
--- a/cuBQL/builder/cuda/builder_common.h
+++ b/cuBQL/builder/cuda/builder_common.h
@@ -81,86 +81,75 @@ namespace cuBQL {
 
     template<typename T> struct int_type_of;
     template<> struct int_type_of<float> { typedef int32_t type; };
-    // template<> struct int_type_of<double> { typedef int64_t type; };
-    // template<> struct int_type_of<int32_t> { typedef int32_t type; };
-    // template<> struct int_type_of<int64_t> { typedef int64_t type; };
-    // template<> struct int_type_of<uint32_t> { typedef uint32_t type; };
-    // template<> struct int_type_of<uint64_t> { typedef uint64_t type; };
+    template<> struct int_type_of<double> { typedef int64_t type; };
+    template<> struct int_type_of<int32_t> { typedef int32_t type; };
+    template<> struct int_type_of<int64_t> { typedef int64_t type; };
+    template<> struct int_type_of<uint32_t> { typedef uint32_t type; };
+    template<> struct int_type_of<uint64_t> { typedef uint64_t type; };
 
-    // template<typename T> inline __device__
-    // typename int_type_of<T>::type encode(T v);
+    template<typename T> inline __device__
+    typename int_type_of<T>::type encode(T v);
     
-    // template<>
-    inline __device__
-    int32_t encode(const float &f)
+    template<> inline __device__
+    int32_t encode(float f)
     {
-      // int *ptr = (int *)&f;
-      
       const int32_t sign = 0x80000000U;
-      // int32_t bits = __float_as_int(f);
-      int32_t bits;// = *ptr;//(const int &)f;
-      memcpy(&bits,&f,4);
+      int32_t bits = __float_as_int(f);
       if (bits & sign) bits ^= 0x7fffffff;
       return bits;
     }
-    // template<> inline __device__
-    // int64_t encode(double f)
-    // {
-    //   const int64_t sign = 0x8000000000000000LL;
-    //   int64_t bits = __double_as_longlong(f);
-    //   if (bits & sign) bits ^= 0x7fffffffffffffffLL;
-    //   return bits;
-    // }
-    // template<> inline __device__
-    // int32_t encode(int32_t bits)
-    // {
-    //   return bits;
-    // }
-    // template<> inline __device__
-    // int64_t encode(int64_t bits)
-    // {
-    //   return bits;
-    // }
+    
+    template<> inline __device__
+    int64_t encode(double f)
+    {
+      const int64_t sign = 0x8000000000000000LL;
+      int64_t bits = __double_as_longlong(f);
+      if (bits & sign) bits ^= 0x7fffffffffffffffLL;
+      return bits;
+    }
+    template<> inline __device__
+    int32_t encode(int32_t bits)
+    {
+      return bits;
+    }
+    template<> inline __device__
+    int64_t encode(int64_t bits)
+    {
+      return bits;
+    }
 
     template<typename T> inline __device__
     T decode(int32_t v);
-    // template<typename T> inline __device__
-    // T decode(uint32_t v);
-    // template<typename T> inline __device__
-    // T decode(int64_t v);
-    // template<typename T> inline __device__
-    // T decode(uint64_t v);
+    template<typename T> inline __device__
+    T decode(uint32_t v);
+    template<typename T> inline __device__
+    T decode(int64_t v);
+    template<typename T> inline __device__
+    T decode(uint64_t v);
 
     template<> inline __device__
     float decode<float>(int32_t bits)
     {
       const int32_t sign = 0x80000000;
       if (bits & sign) bits ^= 0x7fffffff;
-// #ifdef __HIPCC__
-//       float ff = ((const float &)bits);
-//       return ff;
-// #endif
-      float f;
-      memcpy(&f,&bits,sizeof(4));
-      return f;
-      // float *f = (float *)&bits;
-      // return *f;
-      // return __int_as_float(bits);
+      return __int_as_float(bits);
     }
-    // template<> inline __device__
-    // int32_t decode<int32_t>(int32_t bits)
-    // { return bits; }
-    // template<> inline __device__
-    // int64_t decode<int64_t>(int64_t bits)
-    // { return bits; }
+    
+    template<> inline __device__
+    int32_t decode<int32_t>(int32_t bits)
+    { return bits; }
+    
+    template<> inline __device__
+    int64_t decode<int64_t>(int64_t bits)
+    { return bits; }
 
-    // template<> inline __device__
-    // double decode<double>(int64_t bits)
-    // {
-    //   const int64_t sign = 0x8000000000000000LL;
-    //   if (bits & sign) bits ^= 0x7fffffffffffffffLL;
-    //   return __longlong_as_double(bits);
-    // }
+    template<> inline __device__
+    double decode<double>(int64_t bits)
+    {
+      const int64_t sign = 0x8000000000000000LL;
+      if (bits & sign) bits ^= 0x7fffffffffffffffLL;
+      return __longlong_as_double(bits);
+    }
     
     template<typename box_t>
     struct CUBQL_ALIGN(8) AtomicBox {
@@ -233,13 +222,8 @@ namespace cuBQL {
       using scalar_t = typename box_t::scalar_t;
 #pragma unroll
       for (int d=0;d<box_t::numDims;d++) {
-#if 1
-        lower[d] = encode(+1e6f);//encode(+FLT_MAX);
-        upper[d] = encode(-1e6f);//encode(-FLT_MAX);
-#else
         lower[d] = encode(empty_box_lower<scalar_t>());//encode(+FLT_MAX);
         upper[d] = encode(empty_box_upper<scalar_t>());//encode(-FLT_MAX);
-#endif
       }
     }
 
@@ -248,11 +232,9 @@ namespace cuBQL {
                      typename box_t::vec_t other)
     {
       using scalar_t = typename AtomicBox<box_t>::scalar_t;
-      auto in_y = abox.upper[1];
 #pragma unroll
       for (int d=0;d<box_t::numDims;d++) {
-        // const typename int_type_of<scalar_t>::type enc
-        const int32_t enc
+        const typename int_type_of<scalar_t>::type enc
           = encode(other[d]);
         if (enc < abox.lower[d])
           atomic_min((volatile int32_t*)abox.lower+d,enc);
diff --git a/cuBQL/builder/cuda/gpu_builder.h b/cuBQL/builder/cuda/gpu_builder.h
index 9b396dc..18365f7 100644
--- a/cuBQL/builder/cuda/gpu_builder.h
+++ b/cuBQL/builder/cuda/gpu_builder.h
@@ -52,9 +52,7 @@ namespace cuBQL {
         buildConfig.makeLeafThreshold = 1;
       gpuBuilder_impl::build(bvh,boxes,numBoxes,buildConfig,s,memResource);
     }
-    PING;
     cuBQL::cuda::refit(bvh,boxes,s,memResource);
-    PING;
   }
 
   namespace cuda {
diff --git a/cuBQL/builder/cuda/sm_builder.h b/cuBQL/builder/cuda/sm_builder.h
index c3cc42c..30ef82c 100644
--- a/cuBQL/builder/cuda/sm_builder.h
+++ b/cuBQL/builder/cuda/sm_builder.h
@@ -25,10 +25,10 @@ namespace cuBQL {
     };
 
     template<typename T, int D>
-    struct// CUBQL_ALIGN(16)
+    struct CUBQL_ALIGN(16)
       TempNode {
       using box_t = cuBQL::box_t<T,D>;
-      // union {
+      union {
         struct {
           uint32_t         count;
           uint32_t         unused;
@@ -44,7 +44,7 @@ namespace cuBQL {
           uint32_t offset;
           uint32_t count;
         } doneNode;
-      // };
+      };
     };
     
     template<typename T, int D>
@@ -53,15 +53,11 @@ namespace cuBQL {
                    NodeState       *nodeStates,
                    TempNode<T,D> *nodes)
     {
-      if (threadIdx.x > 0) return;
-      
       buildState->numNodes = 2;
       
       nodeStates[0]             = OPEN_BRANCH;
       nodes[0].openBranch.count = 0;
       nodes[0].openBranch.centBounds.set_empty();
-      if (nodes[0].openBranch.centBounds.upper[0] > 0)
-        nodes[0].openBranch.centBounds.upper[0] = 8888;
 
       nodeStates[1]            = DONE_NODE;
       nodes[1].doneNode.offset = 0;
@@ -87,8 +83,7 @@ namespace cuBQL {
         // this could be made faster by block-reducing ...
         atomicAdd(&nodes[0].openBranch.count,1);
         auto ctr = box.center();
-        atomic_grow(nodes[0].openBranch.centBounds,ctr);//centerOf(box));
-        // atomic_grow(nodes[0].openBranch.centBounds,centerOf(box));
+        atomic_grow(nodes[0].openBranch.centBounds,ctr);
       } else {
         me.nodeID = (uint32_t)-1;
         me.done   = true;
@@ -103,7 +98,7 @@ namespace cuBQL {
                       uint32_t       numNodes,
                       BuildConfig    buildConfig)
     {
-#if 0
+#if 1
       __shared__ int l_newNodeOfs;
       if (threadIdx.x == 0)
         l_newNodeOfs = 0;
@@ -431,9 +426,6 @@ namespace cuBQL {
                GpuMemoryResource &memResource)
     {
       assert(sizeof(PrimState) == sizeof(uint64_t));
-      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
-      CUBQL_CUDA_SYNC_CHECK();
-      
       // ==================================================================
       // do build on temp nodes
       // ==================================================================
@@ -446,337 +438,46 @@ namespace cuBQL {
       _ALLOC(primStates,numPrims,s,memResource);
       _ALLOC(buildState,1,s,memResource);
 
-#if 1
-      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
-      CUBQL_CUDA_SYNC_CHECK();
-        {
-          int numNodes = 1;
-          std::cout << "*********** INPUT BOUNDS ***********" << std::endl;
-          std::vector<box3f> h_bbs(numPrims);
-          CUBQL_CUDA_CALL(Memcpy(h_bbs.data(),boxes,
-                                 numPrims*sizeof(*boxes),cudaMemcpyDefault));
-          for (int i=0;i<numPrims;i++) {
-            auto &bb= h_bbs[i];
-            if (bb.upper.y > 1000000)
-              PRINT(bb);
-          }
-        }
-#endif
+      initState<<<1,128,0,s>>>(buildState,
+                               nodeStates,
+                               tempNodes);
       
-      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
-      CUBQL_CUDA_SYNC_CHECK();
-        initState<<<1,128,0,s>>>(buildState,
-                                 nodeStates,
-                                 tempNodes);
-        
-
-      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
-      CUBQL_CUDA_SYNC_CHECK();
-#if 0
-      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
-      CUBQL_CUDA_SYNC_CHECK();
-        {
-          int numNodes = 1;
-          std::cout << "*********** AFTER INITSTATE ***********" << std::endl;
-          std::vector<NodeState> h_nodeStates(numNodes);
-          std::vector<TempNode<T,D>> h_tempNodes(numNodes);
-          CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates,
-                                 numNodes*sizeof(*nodeStates),cudaMemcpyDefault));
-CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes,
-                       numNodes*sizeof(*tempNodes),cudaMemcpyDefault));
-          for (int i=0;i<numNodes;i++) {
-            int state = nodeStates[i];
-            // printf("nodestae %i = %i\n",i,state);
-            auto &n = tempNodes[i];
-            switch(state) {
-            case OPEN_NODE:
-              break;
-            case OPEN_BRANCH: {
-              auto _decode = [](int32_t bits) -> float {
-                const int32_t sign = 0x80000000;
-                if (bits & sign) bits ^= 0x7fffffff;
-                float f;
-                memcpy(&f,&bits,4);
-                return f;
-              };
-              auto _decode2 = [](int32_t bits) -> float {
-                const int32_t sign = 0x80000000;
-                if (bits & sign) bits ^= 0x7fffffff;
-                return (const float &)bits;
-              };
-              auto _encode = [](float f) -> int32_t {
-                int *ptr = (int *)&f;
-                const int32_t sign = 0x80000000U;
-                // int32_t bits = __float_as_int(f);
-                int32_t bits = *ptr;//(const int &)f;
-                if (bits & sign) bits ^= 0x7fffffff;
-                return bits;
-              };
-              auto &bb = n.openBranch.centBounds;
-              float tf = +1e8f;
-              auto etf = _encode(tf);
-              auto detf = _decode(etf);
-              auto detf2 = _decode2(etf);
-              printf(" in f %f enc %i/0x%x dec %f / %f\n",
-                     tf,etf,etf,detf,detf2);
-              printf(" [%i] OPENBRANCH cnt %i bb %i %i %i: %i %i %i -> %f %f %f : %f %f %f\n",
-                     i,
-                     n.openBranch.count,
-                     (bb.lower[0]),
-                     (bb.lower[1]),
-                     (bb.lower[2]),
-                     (bb.upper[0]),
-                     (bb.upper[1]),
-                     (bb.upper[2]),
-                     _decode(bb.lower[0]),
-                     _decode(bb.lower[1]),
-                     _decode(bb.lower[2]),
-                     _decode(bb.upper[0]),
-                     _decode(bb.upper[1]),
-                     _decode(bb.upper[2]));
-            } break;
-            case DONE_NODE:
-              break;
-            }
-          }
-        }
-#endif
-
-      
-      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
-      CUBQL_CUDA_SYNC_CHECK();
       initPrims<<<divRoundUp(numPrims,128),128,0,s>>>
         (tempNodes,
          primStates,boxes,numPrims);
-      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
-      CUBQL_CUDA_SYNC_CHECK();
-
-
-#if 0
-      CUBQL_CUDA_SYNC_CHECK_STREAM(s);
-      CUBQL_CUDA_SYNC_CHECK();
-        {
-          int numNodes = 1;
-          std::cout << "*********** INITIAL BOUNDS ***********" << std::endl;
-          std::vector<NodeState> h_nodeStates(numNodes);
-          std::vector<TempNode<T,D>> h_tempNodes(numNodes);
-          CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates,
-                                 numNodes*sizeof(*nodeStates),cudaMemcpyDefault));
-CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes,
-                       numNodes*sizeof(*tempNodes),cudaMemcpyDefault));
-          for (int i=0;i<numNodes;i++) {
-            int state = nodeStates[i];
-            // printf("nodestae %i = %i\n",i,state);
-            auto &n = tempNodes[i];
-            switch(state) {
-            case OPEN_NODE:
-              break;
-            case OPEN_BRANCH: {
-              auto decode = [](int32_t bits) {
-                const int32_t sign = 0x80000000;
-                if (bits & sign) bits ^= 0x7fffffff;
-                float f;
-                memcpy(&f,&bits,4);
-                return f;
-                // const int32_t sign = 0x80000000;
-                // if (bits & sign) bits ^= 0x7fffffff;
-                // return (const float &)bits;
-              };
-              auto &bb = n.openBranch.centBounds;
-              printf(" [%i] OPENBRANCH cnt %i bb %i %i %i: %i %i %i -> %f %f %f : %f %f %f\n",
-                     i,
-                     n.openBranch.count,
-                     (bb.lower[0]),
-                     (bb.lower[1]),
-                     (bb.lower[2]),
-                     (bb.upper[0]),
-                     (bb.upper[1]),
-                     (bb.upper[2]),
-                     decode(bb.lower[0]),
-                     decode(bb.lower[1]),
-                     decode(bb.lower[2]),
-                     decode(bb.upper[0]),
-                     decode(bb.upper[1]),
-                     decode(bb.upper[2]));
-            } break;
-            case DONE_NODE:
-              break;
-            }
-          }
-        }
-#endif
 
-      
       int numDone = 0;
       int numNodes;
 
       // ------------------------------------------------------------------      
       cudaEvent_t stateDownloadedEvent;
       CUBQL_CUDA_CALL(EventCreate(&stateDownloadedEvent));
-      
-      
-// #if CUBQL_PROFILE
-//       int pass = 0;
-//       static Profile t_writeNodes;
-//       static Profile t_writePrims;
-//       static Profile t_sortPrims;
-//       static Profile t_nodePass[100];
-//       static Profile t_primPass[100];
-//       if (t_writeNodes.name == "") {
-//         t_writeNodes.setName("writeNodes");
-//         t_writePrims.setName("writePrims");
-//         t_sortPrims.setName("sortPrims");
-//         for (int i=0;i<100;i++) {
-//           t_nodePass[i].setName("nodePass",i);
-//           t_primPass[i].setName("primPass",i);
-//         }
-//       }
-// #endif
-      PING;
       while (true) {
-        PING;
-        PRINT(numDone);
         int pre_numNodes = numNodes;
         CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes,
                                     sizeof(numNodes),cudaMemcpyDeviceToHost,s));
         CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s));
         CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent));
-        PRINT(numNodes);
-        CUBQL_CUDA_SYNC_CHECK();
         CUBQL_CUDA_SYNC_CHECK_STREAM(s);
         if (numNodes == numDone)
           break;
-        PRINT(numNodes-numDone);
-        bool dbg = (numNodes-numDone) == 8098;
-// #if CUBQL_PROFILE
-//         t_nodePass[pass].sync_start();
-// #endif
-        CUBQL_CUDA_SYNC_CHECK();
-        CUBQL_CUDA_SYNC_CHECK_STREAM(s);
 
-#if 0
-        {
-          std::cout << "*********** PRE SELECT ***********" << std::endl;
-          std::vector<NodeState> h_nodeStates(numNodes);
-          std::vector<TempNode<T,D>> h_tempNodes(numNodes);
-          CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates,
-                                 numNodes*sizeof(*nodeStates),cudaMemcpyDefault));
-          CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes,
-                                 numNodes*sizeof(*tempNodes),cudaMemcpyDefault));
-          for (int i=0;i<numNodes;i++) {
-            int state = nodeStates[i];
-            // printf("nodestae %i = %i\n",i,state);
-            auto &n = tempNodes[i];
-            switch(state) {
-            case OPEN_NODE:
-              break;
-            case OPEN_BRANCH: {
-              auto decode = [](int32_t bits) {
-                const int32_t sign = 0x80000000;
-                if (bits & sign) bits ^= 0x7fffffff;
-                float f;
-                memcpy(&f,&bits,4);
-                return f;
-              };
-              auto &bb = n.openBranch.centBounds;
-              printf(" [%i] OPENBRANCH cnt %i bb %f %f %f : %f %f %f\n",
-                     i,
-                     n.openBranch.count,
-                     decode(bb.lower[0]),
-                     decode(bb.lower[1]),
-                     decode(bb.lower[2]),
-                     decode(bb.upper[0]),
-                     decode(bb.upper[1]),
-                     decode(bb.upper[2]));
-            } break;
-            case DONE_NODE:
-              break;
-            }
-          }
-        }
-#endif
-          
         selectSplits<<<divRoundUp(numNodes,1024),1024,0,s>>>
           (buildState,
            nodeStates,tempNodes,numNodes,
            buildConfig);
-        CUBQL_CUDA_SYNC_CHECK();
-        CUBQL_CUDA_SYNC_CHECK_STREAM(s);
 
-#if 0
-        {
-        std::vector<NodeState> h_nodeStates(numNodes);
-        std::vector<TempNode<T,D>> h_tempNodes(numNodes);
-CUBQL_CUDA_CALL(Memcpy(h_nodeStates.data(),nodeStates,
-                       numNodes*sizeof(*nodeStates),cudaMemcpyDefault));
-CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes,
-                       numNodes*sizeof(*tempNodes),cudaMemcpyDefault));
-        for (int i=0;i<numNodes;i++) {
-          int state = nodeStates[i];
-          // printf("nodestae %i = %i\n",i,state);
-          auto &n = tempNodes[i];
-          switch(state) {
-          case OPEN_NODE:
-            printf(" [%i] OPENNODE offset %i dim %i pos %f\n",
-                   i,
-                   n.openNode.offset,
-                   n.openNode.dim,
-                   n.openNode.pos);
-            break;
-          case OPEN_BRANCH:
-            break;
-          case DONE_NODE:
-            break;
-          }
-        //   union {
-        // struct {
-        //   AtomicBox<box_t> centBounds;
-        //   uint32_t         count;
-        //   uint32_t         unused;
-        // } openBranch;
-        // struct {
-        //   uint32_t offset;
-        //   int      dim;
-        //   uint32_t tieBreaker;
-        //   float    pos;
-        // } openNode;
-        // struct {
-        //   uint32_t offset;
-        //   uint32_t count;
-        //   uint32_t unused[2];
-        // } doneNode;
-        }
-        }
-#endif
-        
-// #if CUBQL_PROFILE
-//         t_nodePass[pass].sync_stop();
-//         t_primPass[pass].sync_start();
-// #endif
-        
         numDone = numNodes;
 
-// #if 1
-        if (0 && sizeof(T)*D <= sizeof(float3)) {
+        if (1 && sizeof(T)*D <= sizeof(float3)) {
           updatePrims_shm<<<divRoundUp(numPrims,512),512,0,s>>>
             (nodeStates,tempNodes,
              primStates,boxes,numPrims,numDone);
         } else 
-// #else
         updatePrims<<<divRoundUp(numPrims,128),128,0,s>>>
           (nodeStates,tempNodes,
            primStates,boxes,numPrims);
-// #endif
-
-        CUBQL_CUDA_SYNC_CHECK();
-        CUBQL_CUDA_SYNC_CHECK_STREAM(s);
-        
-// #if CUBQL_PROFILE
-//         t_primPass[pass].sync_stop();
-//         ++ pass;
-// #endif
       }
-      PING;
       CUBQL_CUDA_CALL(EventDestroy(stateDownloadedEvent));
       // ==================================================================
       // sort {item,nodeID} list
@@ -786,10 +487,6 @@ CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes,
       uint8_t   *d_temp_storage     = NULL;
       size_t     temp_storage_bytes = 0;
       PrimState *sortedPrimStates   = 0;
-// #if CUBQL_PROFILE
-//       t_sortPrims.sync_start();
-// #endif
-      PING;
       _ALLOC(sortedPrimStates,numPrims,s,memResource);
       auto rc =
         cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes,
@@ -804,25 +501,15 @@ CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes,
                                        numPrims,32,64,s);
       rc = rc;
       _FREE(d_temp_storage,s,memResource);
-// #if CUBQL_PROFILE
-//       t_sortPrims.sync_stop();
-//       t_writePrims.sync_start();
-// #endif
       // ==================================================================
       // allocate and write BVH item list, and write offsets of leaf nodes
       // ==================================================================
 
-      PING;
       bvh.numPrims = numPrims;
       _ALLOC(bvh.primIDs,numPrims,s,memResource);
       writePrimsAndLeafOffsets<<<divRoundUp(numPrims,1024),1024,0,s>>>
         (tempNodes,bvh.primIDs,sortedPrimStates,numPrims);
-// #if CUBQL_PROFILE
-//       t_writePrims.sync_stop();
-//       t_writeNodes.sync_start();
-// #endif
 
-      PING;
       // ==================================================================
       // allocate and write final nodes
       // ==================================================================
@@ -830,15 +517,11 @@ CUBQL_CUDA_CALL(Memcpy(h_tempNodes.data(),tempNodes,
       _ALLOC(bvh.nodes,numNodes,s,memResource);
       writeNodes<<<divRoundUp(numNodes,1024),1024,0,s>>>
         (bvh.nodes,tempNodes,numNodes);
-// #if CUBQL_PROFILE
-//       t_writeNodes.sync_stop();
-// #endif
       _FREE(sortedPrimStates,s,memResource);
       _FREE(tempNodes,s,memResource);
       _FREE(nodeStates,s,memResource);
       _FREE(primStates,s,memResource);
       _FREE(buildState,s,memResource);
-      PING;
     }
 
   } // ::cuBQL::gpuBuilder_impl

From fd5efa2858e221c551a4f870bc9c33bb2584e777 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Sun, 14 Jun 2026 13:32:04 -0600
Subject: [PATCH 5/9] fixed misplaced 'volatile's

---
 cuBQL/builder/cuda/builder_common.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cuBQL/builder/cuda/builder_common.h b/cuBQL/builder/cuda/builder_common.h
index 3e080c7..63a9a33 100644
--- a/cuBQL/builder/cuda/builder_common.h
+++ b/cuBQL/builder/cuda/builder_common.h
@@ -23,7 +23,7 @@ namespace cub {
 namespace cuBQL {
   namespace gpuBuilder_impl {
 
-    inline __device__ void atomic_min(volatile int32_t *v, int32_t vv)
+    inline __device__ void atomic_min(int32_t *v, int32_t vv)
     { atomicMin((int *)v,(int)vv); }
     inline __device__ void atomic_min(int64_t *v, int64_t vv)
     { atomicMin((long long *)v,(long long)vv); }
@@ -32,7 +32,7 @@ namespace cuBQL {
     inline __device__ void atomic_min(uint64_t *v, uint64_t vv)
     { atomicMin((unsigned long long *)v,(unsigned long long)vv); }
 
-    inline __device__ void atomic_max(volatile int32_t *v, int32_t vv)
+    inline __device__ void atomic_max(int32_t *v, int32_t vv)
     { atomicMax((int *)v,(int)vv); }
     inline __device__ void atomic_max(int64_t *v, int64_t vv)
     { atomicMax((long long *)v,(long long)vv); }
@@ -93,7 +93,7 @@ namespace cuBQL {
     template<> inline __device__
     int32_t encode(float f)
     {
-      const int32_t sign = 0x80000000U;
+      const int32_t sign = 0x80000000;
       int32_t bits = __float_as_int(f);
       if (bits & sign) bits ^= 0x7fffffff;
       return bits;
@@ -237,9 +237,9 @@ namespace cuBQL {
         const typename int_type_of<scalar_t>::type enc
           = encode(other[d]);
         if (enc < abox.lower[d])
-          atomic_min((volatile int32_t*)abox.lower+d,enc);
+          atomic_min(abox.lower+d,enc);
         if (enc > abox.upper[d])
-          atomic_max((volatile int32_t*)abox.upper+d,enc);
+          atomic_max(abox.upper+d,enc);
       }
     } 
     

From ad099c4dbb87cb397ca0b7ee20a0c264569b880b Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Sun, 14 Jun 2026 13:38:27 -0600
Subject: [PATCH 6/9] more cleanups

---
 cuBQL/builder/cuda/sah_builder.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/cuBQL/builder/cuda/sah_builder.h b/cuBQL/builder/cuda/sah_builder.h
index 4ba5277..423d6fb 100644
--- a/cuBQL/builder/cuda/sah_builder.h
+++ b/cuBQL/builder/cuda/sah_builder.h
@@ -554,14 +554,14 @@ namespace cuBQL {
                     GpuMemoryResource& memResource)
     { real3_sahBuilder<float>(bvh,boxes,_numPrims,buildConfig,s,memResource); }
     
-    // template<>
-    // inline void sahBuilder(BinaryBVH<double,3>  &bvh,
-    //                 const box_t<double,3> *boxes,
-    //                 uint32_t     _numPrims,
-    //                 BuildConfig  buildConfig,
-    //                 cudaStream_t s,
-    //                 GpuMemoryResource& memResource)
-    // { real3_sahBuilder<double>(bvh,boxes,_numPrims,buildConfig,s,memResource); }
+    template<>
+    inline void sahBuilder(BinaryBVH<double,3>  &bvh,
+                    const box_t<double,3> *boxes,
+                    uint32_t     _numPrims,
+                    BuildConfig  buildConfig,
+                    cudaStream_t s,
+                    GpuMemoryResource& memResource)
+    { real3_sahBuilder<double>(bvh,boxes,_numPrims,buildConfig,s,memResource); }
   } // ::cuBQL::sahBuilder_impl
 
   namespace cuda {

From cff482ae4b18c737505ed1725a934b363e7549c0 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Sun, 14 Jun 2026 13:40:11 -0600
Subject: [PATCH 7/9] Updated windows github runner config

---
 .github/workflows/Windows.yml | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/Windows.yml b/.github/workflows/Windows.yml
index 17bdf08..c03ae67 100644
--- a/.github/workflows/Windows.yml
+++ b/.github/workflows/Windows.yml
@@ -23,26 +23,16 @@ jobs:
             visual-studio: "Visual Studio 17 2022"
             shell: "powershell"
             config: Release
-          - os: windows-2022
-            cuda: "13.0.0"
-            visual-studio: "Visual Studio 17 2022"
-            shell: "powershell"
-            config: Release
-          - os: windows-2025
-            cuda: "12.6.2"
-            visual-studio: "Visual Studio 17 2022"
-            shell: "powershell"
-            config: Release
           - os: windows-2025
-            cuda: "13.0.0"
-            visual-studio: "Visual Studio 17 2022"
+            cuda: "13.2.0"
+            visual-studio: "Visual Studio 18 2026"
             shell: "powershell"
             config: Release
           - os: windows-2025
-            cuda: "13.0.2"
-            visual-studio: "Visual Studio 17 2022"
+            cuda: "13.2.0"
+            visual-studio: "Visual Studio 18 2026"
             shell: "powershell"
-            config: Release
+            config: Debug
 
     env:
       cmake_generator: "${{ matrix.visual-studio }}"

From 81e2751d7fce2b7c235a28f5ad5c0bcee3d02361 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Sun, 14 Jun 2026 14:01:20 -0600
Subject: [PATCH 8/9] fixed missing CUDART_INF_F include

---
 cuBQL/builder/cuda/sm_builder.h | 1 -
 cuBQL/math/constants.h          | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/cuBQL/builder/cuda/sm_builder.h b/cuBQL/builder/cuda/sm_builder.h
index 30ef82c..cf68ada 100644
--- a/cuBQL/builder/cuda/sm_builder.h
+++ b/cuBQL/builder/cuda/sm_builder.h
@@ -453,7 +453,6 @@ namespace cuBQL {
       cudaEvent_t stateDownloadedEvent;
       CUBQL_CUDA_CALL(EventCreate(&stateDownloadedEvent));
       while (true) {
-        int pre_numNodes = numNodes;
         CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes,
                                     sizeof(numNodes),cudaMemcpyDeviceToHost,s));
         CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s));
diff --git a/cuBQL/math/constants.h b/cuBQL/math/constants.h
index 488ad7f..4b5c3c4 100644
--- a/cuBQL/math/constants.h
+++ b/cuBQL/math/constants.h
@@ -6,7 +6,7 @@
 #include <limits>
 #include <limits.h>
 #if defined(__CUDACC__) && !defined(CUDART_INF_F)
-// #include <math_constants.h>
+#include <math_constants.h>
 #endif
 
 #ifndef M_PI

From 59382763f2bd8ed9a12e148a859a8128276d14b7 Mon Sep 17 00:00:00 2001
From: Ingo Wald <ingowald@gmail.com>
Date: Sun, 14 Jun 2026 14:04:42 -0600
Subject: [PATCH 9/9] added /Zc:preprocessor for windows/vs2026 - won't compile
 without that

---
 cuBQL/CMakeLists.txt | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cuBQL/CMakeLists.txt b/cuBQL/CMakeLists.txt
index 3a22714..5f2c7d8 100644
--- a/cuBQL/CMakeLists.txt
+++ b/cuBQL/CMakeLists.txt
@@ -6,6 +6,15 @@
 # in one of his/her source files
 add_library(cuBQL INTERFACE)
 
+if (WIN32)
+  target_compile_options(cuBQL INTERFACE
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler="/Zc:preprocessor">
+  )
+#  target_compile_options(${PTX_TARGET} PRIVATE
+#    $<$<COMPILE_LANGUAGE:CUDA>:-std=c++17>
+#  )
+endif()
+
 target_sources(cuBQL INTERFACE
   # main public "interface" to this library
   bvh.h