diff --git a/.github/workflows/Windows.yml b/.github/workflows/Windows.yml
index 17bdf08..c03ae67 100644
--- a/.github/workflows/Windows.yml
+++ b/.github/workflows/Windows.yml
@@ -23,26 +23,16 @@ jobs:
             visual-studio: "Visual Studio 17 2022"
             shell: "powershell"
             config: Release
-          - os: windows-2022
-            cuda: "13.0.0"
-            visual-studio: "Visual Studio 17 2022"
-            shell: "powershell"
-            config: Release
-          - os: windows-2025
-            cuda: "12.6.2"
-            visual-studio: "Visual Studio 17 2022"
-            shell: "powershell"
-            config: Release
           - os: windows-2025
-            cuda: "13.0.0"
-            visual-studio: "Visual Studio 17 2022"
+            cuda: "13.2.0"
+            visual-studio: "Visual Studio 18 2026"
             shell: "powershell"
             config: Release
           - os: windows-2025
-            cuda: "13.0.2"
-            visual-studio: "Visual Studio 17 2022"
+            cuda: "13.2.0"
+            visual-studio: "Visual Studio 18 2026"
             shell: "powershell"
-            config: Release
+            config: Debug
 
     env:
       cmake_generator: "${{ matrix.visual-studio }}"
diff --git a/cuBQL/CMakeLists.txt b/cuBQL/CMakeLists.txt
index 3a22714..5f2c7d8 100644
--- a/cuBQL/CMakeLists.txt
+++ b/cuBQL/CMakeLists.txt
@@ -6,6 +6,15 @@
 # in one of his/her source files
 add_library(cuBQL INTERFACE)
 
+if (WIN32)
+  target_compile_options(cuBQL INTERFACE
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler="/Zc:preprocessor">
+  )
+#  target_compile_options(${PTX_TARGET} PRIVATE
+#    $<$<COMPILE_LANGUAGE:CUDA>:-std=c++17>
+#  )
+endif()
+
 target_sources(cuBQL INTERFACE
   # main public "interface" to this library
   bvh.h
diff --git a/cuBQL/builder/cpu/spatialMedian.h b/cuBQL/builder/cpu/spatialMedian.h
index b72d61e..ace5e24 100644
--- a/cuBQL/builder/cpu/spatialMedian.h
+++ b/cuBQL/builder/cpu/spatialMedian.h
@@ -132,6 +132,21 @@ namespace cuBQL {
           if (box.empty()) continue;
           primIDs.push_back(i);
         }
+
+        if (primIDs.empty()) {
+          // if we had no valid input prims whatsoever
+          bvh.nodes = new typename BinaryBVH<T,D>::Node[1];
+          bvh.nodes[0].bounds = box3f();
+          bvh.nodes[0].admin.offset = 0;
+          bvh.nodes[0].admin.count = 1;
+          bvh.primIDs = new uint32_t[numPrims];
+          for (int i=0;i<numPrims;i++)
+            bvh.primIDs[i] = i;
+          bvh.numPrims = numPrims;
+          bvh.numNodes = 1;
+          return;
+        }
+        
         std::vector<int>  altPrimIDs(primIDs.size());
         std::vector<Topo> topo(1);
         
diff --git a/cuBQL/builder/cuda.h b/cuBQL/builder/cuda.h
index f15dad8..061091b 100644
--- a/cuBQL/builder/cuda.h
+++ b/cuBQL/builder/cuda.h
@@ -45,6 +45,7 @@ namespace cuBQL {
     DeviceMemoryResource()
     {}
     void malloc(void** ptr, size_t size, cudaStream_t s) override {
+      CUBQL_CUDA_CALL(StreamSynchronize(s));
       CUBQL_CUDA_CALL(Malloc(ptr, size));
     }
     void free(void* ptr, cudaStream_t s) override
@@ -76,7 +77,10 @@ namespace cuBQL {
           cudaMemPool_t mempool;
           CUBQL_CUDA_CALL(DeviceGetDefaultMemPool(&mempool, iDevice));
           uint64_t threshold = UINT64_MAX;
-          CUBQL_CUDA_CALL(MemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold));
+          CUBQL_CUDA_CALL
+            (MemPoolSetAttribute(mempool,
+                                 cudaMemPoolAttrReleaseThreshold,
+                                 &threshold));
         }
       } );
     }
diff --git a/cuBQL/builder/cuda/builder_common.h b/cuBQL/builder/cuda/builder_common.h
index e71b901..63a9a33 100644
--- a/cuBQL/builder/cuda/builder_common.h
+++ b/cuBQL/builder/cuda/builder_common.h
@@ -98,6 +98,7 @@ namespace cuBQL {
       if (bits & sign) bits ^= 0x7fffffff;
       return bits;
     }
+    
     template<> inline __device__
     int64_t encode(double f)
     {
@@ -133,9 +134,11 @@ namespace cuBQL {
       if (bits & sign) bits ^= 0x7fffffff;
       return __int_as_float(bits);
     }
+    
     template<> inline __device__
     int32_t decode<int32_t>(int32_t bits)
     { return bits; }
+    
     template<> inline __device__
     int64_t decode<int64_t>(int64_t bits)
     { return bits; }
@@ -225,23 +228,24 @@ namespace cuBQL {
     }
 
     template<typename box_t> inline __device__
-    void atomic_grow(AtomicBox<box_t> &abox, const typename box_t::vec_t &other)
+    void atomic_grow(AtomicBox<box_t> &abox,
+                     typename box_t::vec_t other)
     {
       using scalar_t = typename AtomicBox<box_t>::scalar_t;
 #pragma unroll
       for (int d=0;d<box_t::numDims;d++) {
         const typename int_type_of<scalar_t>::type enc
-          = //AtomicBox<box_t>::
-          encode(other[d]);//get(other,d));
+          = encode(other[d]);
         if (enc < abox.lower[d])
-          atomic_min(&abox.lower[d],enc);
+          atomic_min(abox.lower+d,enc);
         if (enc > abox.upper[d])
-          atomic_max(&abox.upper[d],enc);
+          atomic_max(abox.upper+d,enc);
       }
     } 
     
     template<typename box_t>
-    inline __device__ void atomic_grow(AtomicBox<box_t> &abox, const box_t &other)
+    inline __device__
+    void atomic_grow(AtomicBox<box_t> &abox, box_t other)
     {
       using scalar_t = typename AtomicBox<box_t>::scalar_t;
 #pragma unroll
@@ -252,13 +256,14 @@ namespace cuBQL {
         const typename int_type_of<scalar_t>::type 
           enc_upper = //AtomicBox<box_t>::
           encode(other.get_upper(d));
-        if (enc_lower < abox.lower[d]) atomic_min(&abox.lower[d],enc_lower);
-        if (enc_upper > abox.upper[d]) atomic_max(&abox.upper[d],enc_upper);
+        if (enc_lower < abox.lower[d]) atomic_min(abox.lower+d,enc_lower);
+        if (enc_upper > abox.upper[d]) atomic_max(abox.upper+d,enc_upper);
       }
     }
 
     template<typename box_t>
-    inline __device__ void atomic_grow(AtomicBox<box_t> &abox, const AtomicBox<box_t> &other)
+    inline __device__ void atomic_grow(AtomicBox<box_t> &abox,
+                                       const AtomicBox<box_t> &other)
     {
       using scalar_t = typename AtomicBox<box_t>::scalar_t;
 #pragma unroll
diff --git a/cuBQL/builder/cuda/sm_builder.h b/cuBQL/builder/cuda/sm_builder.h
index 59395cd..cf68ada 100644
--- a/cuBQL/builder/cuda/sm_builder.h
+++ b/cuBQL/builder/cuda/sm_builder.h
@@ -25,13 +25,14 @@ namespace cuBQL {
     };
 
     template<typename T, int D>
-    struct CUBQL_ALIGN(16) TempNode {
+    struct CUBQL_ALIGN(16)
+      TempNode {
       using box_t = cuBQL::box_t<T,D>;
       union {
         struct {
-          AtomicBox<box_t> centBounds;
           uint32_t         count;
           uint32_t         unused;
+          AtomicBox<box_t> centBounds;
         } openBranch;
         struct {
           uint32_t offset;
@@ -42,7 +43,6 @@ namespace cuBQL {
         struct {
           uint32_t offset;
           uint32_t count;
-          uint32_t unused[2];
         } doneNode;
       };
     };
@@ -82,7 +82,8 @@ namespace cuBQL {
         me.done   = false;
         // this could be made faster by block-reducing ...
         atomicAdd(&nodes[0].openBranch.count,1);
-        atomic_grow(nodes[0].openBranch.centBounds,box.center());//centerOf(box));
+        auto ctr = box.center();
+        atomic_grow(nodes[0].openBranch.centBounds,ctr);
       } else {
         me.nodeID = (uint32_t)-1;
         me.done   = true;
@@ -208,13 +209,14 @@ namespace cuBQL {
       
       auto in = nodes[nodeID].openBranch;
       if (in.count <= buildConfig.makeLeafThreshold) {
-        auto &done  = nodes[nodeID].doneNode;
+        auto done  = nodes[nodeID].doneNode;
         done.count  = in.count;
         // set this to max-value, so the prims can later do atomicMin
         // with their position ion the leaf list; this value is
         // greater than any prim position.
         done.offset = (uint32_t)-1;
         nodeState   = DONE_NODE;
+        nodes[nodeID].doneNode = done;
       } else {
         float widestWidth = 0.f;
         int   widestDim   = -1;
@@ -235,7 +237,8 @@ namespace cuBQL {
           widestCtr = ctr;
         }
       
-        auto &open = nodes[nodeID].openNode;
+        // auto &open = nodes[nodeID].openNode;
+        auto open = nodes[nodeID].openNode;
         if (widestDim >= 0) {
           open.pos = widestCtr;
         }
@@ -249,12 +252,15 @@ namespace cuBQL {
 #pragma unroll
         for (int side=0;side<2;side++) {
           const int childID = open.offset+side;
-          auto &child = nodes[childID].openBranch;
-          child.centBounds.set_empty();
-          child.count         = 0;
+          TempNode<T,D> child;
+          // auto &child = nodes[childID].openBranch;
+          child.openBranch.centBounds.set_empty();
+          child.openBranch.count         = 0;
           nodeStates[childID] = OPEN_BRANCH;
+          nodes[childID] = child;
         }
         nodeState = OPEN_NODE;
+        nodes[nodeID].openNode = open;
       }
 #endif
     }
@@ -420,7 +426,6 @@ namespace cuBQL {
                GpuMemoryResource &memResource)
     {
       assert(sizeof(PrimState) == sizeof(uint64_t));
-      
       // ==================================================================
       // do build on temp nodes
       // ==================================================================
@@ -432,10 +437,12 @@ namespace cuBQL {
       _ALLOC(nodeStates,2*numPrims,s,memResource);
       _ALLOC(primStates,numPrims,s,memResource);
       _ALLOC(buildState,1,s,memResource);
-      initState<<<1,1,0,s>>>(buildState,
-                             nodeStates,
-                             tempNodes);
-      initPrims<<<divRoundUp(numPrims,1024),1024,0,s>>>
+
+      initState<<<1,128,0,s>>>(buildState,
+                               nodeStates,
+                               tempNodes);
+      
+      initPrims<<<divRoundUp(numPrims,128),128,0,s>>>
         (tempNodes,
          primStates,boxes,numPrims);
 
@@ -445,61 +452,30 @@ namespace cuBQL {
       // ------------------------------------------------------------------      
       cudaEvent_t stateDownloadedEvent;
       CUBQL_CUDA_CALL(EventCreate(&stateDownloadedEvent));
-      
-      
-#if CUBQL_PROFILE
-      int pass = 0;
-      static Profile t_writeNodes;
-      static Profile t_writePrims;
-      static Profile t_sortPrims;
-      static Profile t_nodePass[100];
-      static Profile t_primPass[100];
-      if (t_writeNodes.name == "") {
-        t_writeNodes.setName("writeNodes");
-        t_writePrims.setName("writePrims");
-        t_sortPrims.setName("sortPrims");
-        for (int i=0;i<100;i++) {
-          t_nodePass[i].setName("nodePass",i);
-          t_primPass[i].setName("primPass",i);
-        }
-      }
-#endif
       while (true) {
         CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes,
                                     sizeof(numNodes),cudaMemcpyDeviceToHost,s));
-        if (numNodes == numDone)
-          break;
         CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s));
         CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent));
-#if CUBQL_PROFILE
-        t_nodePass[pass].sync_start();
-#endif
+        CUBQL_CUDA_SYNC_CHECK_STREAM(s);
+        if (numNodes == numDone)
+          break;
+
         selectSplits<<<divRoundUp(numNodes,1024),1024,0,s>>>
           (buildState,
            nodeStates,tempNodes,numNodes,
            buildConfig);
-#if CUBQL_PROFILE
-        t_nodePass[pass].sync_stop();
-        t_primPass[pass].sync_start();
-#endif
+
         numDone = numNodes;
 
-// #if 1
-        if (sizeof(T)*D <= sizeof(float3)) {
+        if (1 && sizeof(T)*D <= sizeof(float3)) {
           updatePrims_shm<<<divRoundUp(numPrims,512),512,0,s>>>
             (nodeStates,tempNodes,
              primStates,boxes,numPrims,numDone);
         } else 
-// #else
-        updatePrims<<<divRoundUp(numPrims,1024),1024,0,s>>>
+        updatePrims<<<divRoundUp(numPrims,128),128,0,s>>>
           (nodeStates,tempNodes,
            primStates,boxes,numPrims);
-// #endif
-        
-#if CUBQL_PROFILE
-        t_primPass[pass].sync_stop();
-        ++ pass;
-#endif
       }
       CUBQL_CUDA_CALL(EventDestroy(stateDownloadedEvent));
       // ==================================================================
@@ -510,9 +486,6 @@ namespace cuBQL {
       uint8_t   *d_temp_storage     = NULL;
       size_t     temp_storage_bytes = 0;
       PrimState *sortedPrimStates   = 0;
-#if CUBQL_PROFILE
-      t_sortPrims.sync_start();
-#endif
       _ALLOC(sortedPrimStates,numPrims,s,memResource);
       auto rc =
         cub::DeviceRadixSort::SortKeys((void*&)d_temp_storage, temp_storage_bytes,
@@ -527,10 +500,6 @@ namespace cuBQL {
                                        numPrims,32,64,s);
       rc = rc;
       _FREE(d_temp_storage,s,memResource);
-#if CUBQL_PROFILE
-      t_sortPrims.sync_stop();
-      t_writePrims.sync_start();
-#endif
       // ==================================================================
       // allocate and write BVH item list, and write offsets of leaf nodes
       // ==================================================================
@@ -539,10 +508,6 @@ namespace cuBQL {
       _ALLOC(bvh.primIDs,numPrims,s,memResource);
       writePrimsAndLeafOffsets<<<divRoundUp(numPrims,1024),1024,0,s>>>
         (tempNodes,bvh.primIDs,sortedPrimStates,numPrims);
-#if CUBQL_PROFILE
-      t_writePrims.sync_stop();
-      t_writeNodes.sync_start();
-#endif
 
       // ==================================================================
       // allocate and write final nodes
@@ -551,9 +516,6 @@ namespace cuBQL {
       _ALLOC(bvh.nodes,numNodes,s,memResource);
       writeNodes<<<divRoundUp(numNodes,1024),1024,0,s>>>
         (bvh.nodes,tempNodes,numNodes);
-#if CUBQL_PROFILE
-      t_writeNodes.sync_stop();
-#endif
       _FREE(sortedPrimStates,s,memResource);
       _FREE(tempNodes,s,memResource);
       _FREE(nodeStates,s,memResource);
diff --git a/cuBQL/math/affine.h b/cuBQL/math/affine.h
index 798d037..8c73416 100644
--- a/cuBQL/math/affine.h
+++ b/cuBQL/math/affine.h
@@ -181,6 +181,25 @@ namespace cuBQL {
                                               const typename AffineSpaceT<L>::vector_t& n)
   { return xfmNormal(m.l,n); }
 
+  template<typename T>
+  box_t<T,3> xfmBox(const AffineSpaceT<LinearSpace3<vec_t<T,3>>> &xfm,
+                    const box_t<T,3> &in)
+  {
+    box_t<T,3> out;
+    if (in.empty()) return out;
+    auto l = in.lower;
+    auto u = in.upper;
+    out.extend(xfmPoint(xfm,vec_t<T,3>(l.x,l.y,l.z)));
+    out.extend(xfmPoint(xfm,vec_t<T,3>(l.x,l.y,u.z)));
+    out.extend(xfmPoint(xfm,vec_t<T,3>(l.x,u.y,l.z)));
+    out.extend(xfmPoint(xfm,vec_t<T,3>(l.x,u.y,u.z)));
+    out.extend(xfmPoint(xfm,vec_t<T,3>(u.x,l.y,l.z)));
+    out.extend(xfmPoint(xfm,vec_t<T,3>(u.x,l.y,u.z)));
+    out.extend(xfmPoint(xfm,vec_t<T,3>(u.x,u.y,l.z)));
+    out.extend(xfmPoint(xfm,vec_t<T,3>(u.x,u.y,u.z)));
+    return out;
+  }
+
 
   ////////////////////////////////////////////////////////////////////////////////
   /// Comparison Operators
diff --git a/cuBQL/math/box.h b/cuBQL/math/box.h
index 07736c4..a26f49f 100644
--- a/cuBQL/math/box.h
+++ b/cuBQL/math/box.h
@@ -79,7 +79,8 @@ namespace cuBQL {
     /*! create a box from two points. note this will NOT make sure
       that a<b; if this is an empty box it will just create an empty
       box! */
-    inline __cubql_both box_t(const vec_t_data<T,D> &a, const vec_t_data<T,D> &b)
+    inline __cubql_both box_t(const vec_t_data<T,D> &a,
+                              const vec_t_data<T,D> &b)
     {
       lower = vec_t(a);
       upper = vec_t(b);
@@ -88,11 +89,23 @@ namespace cuBQL {
     /*! returns a box that bounds both 'this' and another point 'v';
       this does not get modified */
     inline __cubql_both box_t including(const vec_t &v) const
-    { return box_t{min(lower,v),max(upper,v)}; }
+    {
+      // return box_t(min(lower,v),max(upper,v));
+      box_t b;
+      b.lower = min(this->lower,v);
+      b.upper = max(this->upper,v);
+      return b;
+    }
     /*! returns a box that bounds both 'this' and another box 'b';
       this does not get modified */
     inline __cubql_both box_t including(const box_t &b) const
-    { return box_t{min(lower,b.lower),max(upper,b.upper)}; }
+    {
+      box_t bb;
+      bb.lower = min(this->lower,b.lower);
+      bb.upper = max(this->upper,b.upper);
+      return bb;
+      // return box_t(min(lower,b.lower),max(upper,b.upper));
+    }
     
     inline __cubql_both box_t &grow(const vec_t &v)
     { lower = min(lower,v); upper = max(upper,v); return *this; }
diff --git a/cuBQL/math/common.h b/cuBQL/math/common.h
index a3063dd..00a1b68 100644
--- a/cuBQL/math/common.h
+++ b/cuBQL/math/common.h
@@ -336,8 +336,8 @@ namespace cuBQL {
 
 #define CUBQL_CUDA_SYNC_CHECK()                                 \
   {                                                             \
-    cudaDeviceSynchronize();                                    \
-    cudaError_t rc = cudaGetLastError();                        \
+    cudaError_t rc = cudaDeviceSynchronize();                   \
+    rc = cudaGetLastError();                                    \
     if (rc != cudaSuccess) {                                    \
       fprintf(stderr, "error (%s: line %d): %s\n",              \
               __FILE__, __LINE__, cudaGetErrorString(rc));      \
diff --git a/cuBQL/math/math.h b/cuBQL/math/math.h
index 6943b75..86855fb 100644
--- a/cuBQL/math/math.h
+++ b/cuBQL/math/math.h
@@ -4,14 +4,15 @@
 #pragma once
 
 #include "cuBQL/math/common.h"
-#ifdef __CUDACC__
-#include <cuda/std/limits>
-#endif
+// #ifdef __CUDACC__
+// #include <cuda/std/limits>
+// #endif
 #include <limits>
 
 namespace cuBQL {
   
-#ifdef __CUDACC__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+// #if defined(__CUDACC__) || defined(__HIPCC__)
   // make sure we use the built-in cuda functoins that use floats, not
   // the c-stdlib ones that use doubles.
   using ::min;
@@ -21,14 +22,15 @@ namespace cuBQL {
   using std::max;
 #endif
 
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+// #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
 # define CUBQL_INF CUDART_INF_F
 // # define CUBQL_INF ::cuda::std::numeric_limits<float>::infinity()
 #else
 # define CUBQL_INF std::numeric_limits<float>::infinity()
 #endif
 
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
 #else
     inline __cubql_both float __int_as_float(int i) { return (const float &)i; }
     inline __cubql_both int __float_as_int(float f) { return (const int &)f; }
diff --git a/cuBQL/math/vec.h b/cuBQL/math/vec.h
index 9f7a9b4..3048657 100644
--- a/cuBQL/math/vec.h
+++ b/cuBQL/math/vec.h
@@ -7,7 +7,7 @@
 #include <type_traits>
 #include <limits>
 #include "constants.h"
-#ifdef __CUDACC__
+#if defined(__CUDACC__) && !defined(CUDART_INF)
 # include <cuda.h>
 #endif
 
@@ -19,10 +19,10 @@
 
 namespace cuBQL {
 
-#ifndef __CUDACC__
-  using std::min;
-  using std::max;
-#endif
+// #ifndef __CUDA_ARCH__
+//   using std::min;
+//   using std::max;
+// #endif
 
 #ifndef CUBQL_SUPPORT_CUDA_VECTOR_TYPES
 #define CUBQL_SUPPORT_CUDA_VECTOR_TYPES 0
@@ -46,7 +46,8 @@ namespace cuBQL {
     equivalent, let's also create a 'invalid_t' to be used by
     default */
 
-#ifndef __CUDACC__
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#else
   struct float2 { float x, y; };
   struct float3 { float x, y, z; };
   struct CUBQL_ALIGN(16) float4 { float x, y, z, w; };
@@ -492,18 +493,21 @@ namespace cuBQL {
   { return vec_t<uint64_t,4>( v.x >> b, v.y >> b, v.z >> b, v.w >> b ); }
   
   inline __cubql_both double abs(double d) {
-#ifdef __CUDA_ARCH__
+// #ifdef __CUDA_ARCH__
     return ::abs(d);
-#else
-    return std::abs(d);
-#endif
+// #else
+//     return std::abs(d);
+// #endif
   }
   inline __cubql_both float abs(float d) {
-#ifdef __CUDA_ARCH__
-    return ::abs(d);
+#if defined(__CUDA_ARCH__) || defined(__CUDACC__)
+    return abs(d);
 #else
     return std::abs(d);
 #endif
+// #else
+//     return std::abs(d);
+// #endif
   }
   // inline __cubql_both double abs(double d) { return absf(d); }
   
@@ -711,7 +715,8 @@ namespace cuBQL {
 
   template<typename T, int N>
   inline __cubql_both
-  vec_t<T,N> normalize(vec_t<T,N> v) { return v * (T(1)/sqrt(dot(v,v))); }
+  vec_t<T,N> normalize(vec_t<T,N> v) { return v * (T(1)/::sqrt(dot(v,v))); }
+  // vec_t<T,N> normalize(vec_t<T,N> v) { return v * (T(1)/sqrt(dot(v,v))); }
 
 
   // ------------------------------------------------------------------
diff --git a/cuBQL/queries/triangleData/math/rayTriangleIntersections.h b/cuBQL/queries/triangleData/math/rayTriangleIntersections.h
index bc20150..44a7683 100644
--- a/cuBQL/queries/triangleData/math/rayTriangleIntersections.h
+++ b/cuBQL/queries/triangleData/math/rayTriangleIntersections.h
@@ -40,7 +40,7 @@ namespace cuBQL {
 #else
     auto dbg = _dbg;
 #endif
-    if (dbg) /* nothing, just for 'maybe_unused' */; 
+    if (dbg) {/* nothing, just for 'maybe_unused' */};
     
     using vec3 = vec_t<T,3>;
     const vec3 v0(tri.a);
diff --git a/cuBQL/traversal/rayQueries.h b/cuBQL/traversal/rayQueries.h
index 2baa33e..1a983f5 100644
--- a/cuBQL/traversal/rayQueries.h
+++ b/cuBQL/traversal/rayQueries.h
@@ -705,11 +705,11 @@ namespace cuBQL {
               ray_t<T> &ray,
               bool _dbg)
   {
-#ifdef NDEBUG
+// #ifdef NDEBUG
     const bool dbg = false;
-#else
-    bool dbg = _dbg;
-#endif
+// #else
+//     bool dbg = _dbg;
+// #endif
     using bvh_t = BinaryBVH<T,3>;
     using node_t = typename bvh_t::node_t;
     using scalar_t = typename bvh_t::scalar_t;
@@ -735,7 +735,7 @@ namespace cuBQL {
     if (ray.direction.y == (scalar_t)0) ray.direction.y = scalar_t(1e-20);
     if (ray.direction.z == (scalar_t)0) ray.direction.z = scalar_t(1e-20);
     vec3_t rcp_dir = rcp(ray.direction);
-      
+
     // ------------------------------------------------------------------
     // traverse until there's nothing left to traverse:
     // ------------------------------------------------------------------
@@ -750,6 +750,7 @@ namespace cuBQL {
       while (true) {
         // if (dbg) printf("node %i.%i\n",(int)node.offset,(int)node.count);
         if (node.count != 0) {
+          // return;//xx
           // it's a boy! - seriously: this is not a inner node; so
           // we're either at a final leaf, or at an instance node
           if (blasStackBase != nullptr)
@@ -760,8 +761,8 @@ namespace cuBQL {
           tlasSavedNodePtr = bvh.nodes;
           tlasSavedPrimIDs = bvh.primIDs;
 #ifndef NDEBUG
-          if (node.count != 1)
-            printf("TWO-LEVEL BVH MUST BE BUILT WITH 1 PRIM PER LEAF!\n");
+          // if (node.count != 1)
+          //   printf("TWO-LEVEL BVH MUST BE BUILT WITH 1 PRIM PER LEAF!\n");
 #endif
           // if (dbg)
           //   printf("inner-leaf primIDs %p ofs %i count %i\n",
@@ -798,6 +799,7 @@ namespace cuBQL {
           // if (dbg) printf("new node %i.%i\n",(int)node.offset,(int)node.count);
         }          
 
+        
         uint32_t n0Idx = (uint32_t)node.offset+0;
         uint32_t n1Idx = (uint32_t)node.offset+1;
         node_t n0 = bvh.nodes[n0Idx];
@@ -806,12 +808,14 @@ namespace cuBQL {
         bool o0 = rayIntersectsBox(node_t0,ray,rcp_dir,n0.bounds,dbg);
         bool o1 = rayIntersectsBox(node_t1,ray,rcp_dir,n1.bounds,dbg);
 
-        if (dbg) {
-          dout << " node L " << n0.bounds << "\n";
-          dout << " node R " << n1.bounds << "\n";
-          dout << "children L hit " << int(o0) << " dist " << node_t0
-               << " R hit " << int(o1) << " dist " << node_t1 << "\n";
-        }
+        // return; //xx
+
+        // if (dbg) {
+        //   dout << " node L " << n0.bounds << "\n";
+        //   dout << " node R " << n1.bounds << "\n";
+        //   dout << "children L hit " << int(o0) << " dist " << node_t0
+        //        << " R hit " << int(o1) << " dist " << node_t1 << "\n";
+        // }
         if (o0) {
           if (o1) {
             if ((stackPtr-traversalStack) >= STACK_DEPTH) {
@@ -838,15 +842,16 @@ namespace cuBQL {
           }
         }
       }
- 
+
+      // return;//xx
+      
       if (node.count != 0 && blasStackBase != nullptr) {
         // we're at a valid leaf: call the lambda and see if that gave
         // us a new, closer cull radius
         // if (dbg)
         //   printf("trav leaf-leaf primIDs %p offset %i count %i\n",
         //          bvh.primIDs,(int)node.offset,(int)node.count);
-        ray.tMax
-          = processLeaf(bvh.primIDs,(int)node.offset,(int)node.count);
+        ray.tMax  = processLeaf(bvh.primIDs,(int)node.offset,(int)node.count);
       }
       // ------------------------------------------------------------------
       // pop next un-traversed node from stack, discarding any nodes