diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
index 2fefb3a24..a1d2d1b66 100644
--- a/ext/TensorKitCUDAExt/cutensormap.jl
+++ b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -6,6 +6,9 @@ const AdjointCuTensorMap{T, S, N₁, N₂} = AdjointTensorMap{T, S, N₁, N₂,
 function CuTensorMap(t::TensorMap{T, S, N₁, N₂, A}) where {T, S, N₁, N₂, A}
     return CuTensorMap{T, S, N₁, N₂}(CuArray{T}(t.data), space(t))
 end
+function TensorMap{T, S, N₁, N₂, DA}(t::TensorMap{T, S, N₁, N₂, HA}) where {T, S, N₁, N₂, DA <: CuArray{T}, HA <: Array{T}}
+    return CuTensorMap{T, S, N₁, N₂}(CuArray{T}(t.data), space(t))
+end
 
 # project_symmetric! doesn't yet work for GPU types, so do this on the host, then copy
 function TensorKit.project_symmetric_and_check(::Type{T}, ::Type{A}, data::AbstractArray, V::TensorMapSpace; tol = sqrt(eps(real(float(eltype(data)))))) where {T, A <: CuVector{T}}
@@ -101,18 +104,6 @@ function TensorKit.scalar(t::CuTensorMap{T, S, 0, 0}) where {T, S}
     return isempty(inds) ? zero(scalartype(t)) : @allowscalar @inbounds t.data[only(inds)]
 end
 
-function Base.convert(
-        TT::Type{CuTensorMap{T, S, N₁, N₂}},
-        t::AbstractTensorMap{<:Any, S, N₁, N₂}
-    ) where {T, S, N₁, N₂}
-    if typeof(t) === TT
-        return t
-    else
-        tnew = TT(undef, space(t))
-        return copy!(tnew, t)
-    end
-end
-
 function LinearAlgebra.isposdef(t::CuTensorMap)
     domain(t) == codomain(t) ||
         throw(SpaceMismatch("`isposdef` requires domain and codomain to be the same"))
@@ -138,10 +129,9 @@ function Base.promote_rule(
     return CuTensorMap{T, S, N₁, N₂}
 end
 
-TensorKit.promote_storage_rule(::Type{CuArray{T, N}}, ::Type{<:CuArray{T, N}}) where {T, N} =
+TensorKit.promote_storage_rule(::Type{<:CuArray{T, N}}, ::Type{<:CuArray{T, N}}) where {T, N} =
     CuArray{T, N, CUDA.default_memory}
 
-
 # CuTensorMap exponentation:
 function TensorKit.exp!(t::CuTensorMap)
     domain(t) == codomain(t) ||
diff --git a/ext/TensorKitCUDAExt/truncation.jl b/ext/TensorKitCUDAExt/truncation.jl
index 019ded97b..42d623730 100644
--- a/ext/TensorKitCUDAExt/truncation.jl
+++ b/ext/TensorKitCUDAExt/truncation.jl
@@ -10,7 +10,7 @@ function MatrixAlgebraKit.findtruncated(
         fill!(v, dim(c))
     end
 
-    perm = sortperm(parent(values); strategy.by, strategy.rev)
+    perm = isempty(parent(values)) ? Int64[] : sortperm(parent(values); strategy.by, strategy.rev)
     cumulative_dim = cumsum(Base.permute!(parent(dims), perm))
 
     result = similar(values, Bool)
@@ -36,7 +36,7 @@ function MatrixAlgebraKit.findtruncated(
         end
     end
 
-    perm = sortperm(parent(values); by = abs, rev = false)
+    perm = isempty(parent(values)) ? Int64[] : sortperm(parent(values); by = abs, rev = false)
     cumulative_err = cumsum(Base.permute!(parent(ϵᵖ), perm))
 
     result = similar(values, Bool)
diff --git a/src/tensors/abstracttensor.jl b/src/tensors/abstracttensor.jl
index 628845526..56c3da839 100644
--- a/src/tensors/abstracttensor.jl
+++ b/src/tensors/abstracttensor.jl
@@ -53,9 +53,7 @@ storagetype(t) = storagetype(typeof(t))
 function storagetype(::Type{T}) where {T <: AbstractTensorMap}
     if T isa Union
         # attempt to be slightly more specific by promoting unions
-        Ma = storagetype(T.a)
-        Mb = storagetype(T.b)
-        return promote_storagetype(Ma, Mb)
+        return promote_storagetype(T.a, T.b)
     else
         # fallback definition by using scalartype
         return similarstoragetype(scalartype(T))
@@ -103,8 +101,15 @@ similarstoragetype(X::Type, ::Type{T}) where {T <: Number} =
 
 # implement on tensors
 similarstoragetype(::Type{TT}) where {TT <: AbstractTensorMap} = similarstoragetype(storagetype(TT))
-similarstoragetype(::Type{TT}, ::Type{T}) where {TT <: AbstractTensorMap, T <: Number} =
-    similarstoragetype(storagetype(TT), T)
+function similarstoragetype(::Type{TT}, ::Type{T}) where {TT <: AbstractTensorMap, T <: Number}
+    return similarstoragetype(storagetype(TT), T)
+end
+function similarstoragetype(::Type{<:AbstractTensorMap{T, S, N₁, N₂}}, ::Type{TA}) where {T <: Number, TA <: DenseVector, S, N₁, N₂}
+    return similarstoragetype(TA, T)
+end
+function similarstoragetype(t::AbstractTensorMap{T, S, N₁, N₂}, ::Type{TA}) where {T <: Number, TA <: DenseVector, S, N₁, N₂}
+    return similarstoragetype(typeof(t), TA)
+end
 
 # implement on arrays
 similarstoragetype(::Type{A}) where {A <: DenseVector{<:Number}} = A
diff --git a/src/tensors/adjoint.jl b/src/tensors/adjoint.jl
index dfc1a4471..ca484e77b 100644
--- a/src/tensors/adjoint.jl
+++ b/src/tensors/adjoint.jl
@@ -50,8 +50,6 @@ Base.@propagate_inbounds function subblock(t::AdjointTensorMap, (f₁, f₂)::Tu
     return permutedims(conj(data), (domainind(tp)..., codomainind(tp)...))
 end
 
-to_cpu(t::AdjointTensorMap) = adjoint(to_cpu(adjoint(t)))
-
 # Show
 #------
 function Base.showarg(io::IO, t::AdjointTensorMap, toplevel::Bool)
diff --git a/src/tensors/braidingtensor.jl b/src/tensors/braidingtensor.jl
index d28b2e1df..f12123384 100644
--- a/src/tensors/braidingtensor.jl
+++ b/src/tensors/braidingtensor.jl
@@ -190,7 +190,6 @@ has_shared_permute(t::BraidingTensor, ::Index2Tuple) = false
 function add_transform!(
         tdst::AbstractTensorMap,
         tsrc::BraidingTensor, (p₁, p₂)::Index2Tuple,
-        fusiontreetransform,
         α::Number, β::Number, backend::AbstractBackend...
     )
     return add_transform!(
diff --git a/src/tensors/diagonal.jl b/src/tensors/diagonal.jl
index b2ac4134b..e73ad2787 100644
--- a/src/tensors/diagonal.jl
+++ b/src/tensors/diagonal.jl
@@ -280,7 +280,7 @@ end
 # ----------------
 function TO.tensoradd_type(TC, A::DiagonalTensorMap, ::Index2Tuple{1, 1}, ::Bool)
     M = similarstoragetype(A, TC)
-    return DiagonalTensorMap{TC, spacetype(A), M}
+    return DiagonalTensorMap{scalartype(M), spacetype(A), M}
 end
 
 function TO.tensorcontract_type(
diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index 3108abb17..f7e2f9ecf 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -17,6 +17,8 @@ for (operation, manipulation) in (
         $promote_op(::Type{T}, ::Type{I}) where {T <: Number, I <: Sector} =
             sectorscalartype(I) <: Integer ? T :
             sectorscalartype(I) <: Real ? float(T) : complex(T)
+        $promote_op(::Type{TA}, ::Type{I}) where {TA <: DenseVector, I <: Sector} =
+            similarstoragetype(TA, $promote_op(eltype(TA), I))
         # TODO: currently the manipulations all use sectorscalartype, change to:
         # $manipulation_scalartype(I) <: Integer ? T :
         # $manipulation_scalartype(I) <: Real ? float(T) : complex(T)
diff --git a/src/tensors/tensor.jl b/src/tensors/tensor.jl
index 342c83186..531c58638 100644
--- a/src/tensors/tensor.jl
+++ b/src/tensors/tensor.jl
@@ -21,7 +21,6 @@ struct TensorMap{T, S <: IndexSpace, N₁, N₂, A <: DenseVector{T}} <: Abstrac
         end
         return TensorMap{T, S, N₁, N₂, A}(data, space)
     end
-
     # constructors from data
     function TensorMap{T, S, N₁, N₂, A}(
             data::A, space::TensorMapSpace{S, N₁, N₂}
@@ -34,6 +33,9 @@ struct TensorMap{T, S <: IndexSpace, N₁, N₂, A <: DenseVector{T}} <: Abstrac
         return new{T, S, N₁, N₂, A}(data, space)
     end
 end
+# constructors from another TensorMap -- no-op
+TensorMap{T, S, N₁, N₂, A}(t::TensorMap{T, S, N₁, N₂, A}) where {T, S <: IndexSpace, N₁, N₂, A <: DenseVector{T}} = t
+TensorMap{T, S, N₁, N₂, A}(t::TensorMap{T, S, N₁, N₂}) where {T, S <: IndexSpace, N₁, N₂, A <: DenseVector{T}} = TensorMap(A(t.data), space(t))
 
 """
     Tensor{T, S, N, A<:DenseVector{T}} = TensorMap{T, S, N, 0, A}
@@ -407,11 +409,6 @@ for randf in (:rand, :randn, :randexp, :randisometry)
     end
 end
 
-# Moving arbitrary TensorMaps to CPU
-#-----------------------------
-to_cpu(t::TensorMapWithStorage{T, Vector{T}}) where {T} = t # no op
-to_cpu(t::TensorMap) = convert(TensorMapWithStorage{scalartype(t), similarstoragetype(scalartype(t))}, t)
-
 # Efficient copy constructors
 #-----------------------------
 Base.copy(t::TensorMap) = typeof(t)(copy(t.data), t.space)
diff --git a/src/tensors/tensoroperations.jl b/src/tensors/tensoroperations.jl
index 3fc79cf0c..f42f7d94f 100644
--- a/src/tensors/tensoroperations.jl
+++ b/src/tensors/tensoroperations.jl
@@ -379,7 +379,7 @@ function blas_contract!(
     bstyle = BraidingStyle(sectortype(C))
     bstyle isa SymmetricBraiding ||
         throw(SectorMismatch("only tensors with symmetric braiding rules can be contracted; try `@planar` instead"))
-    TC = scalartype(C)
+    TC = storagetype(C) # without this, Anew below has wrong storagetype
 
     # check which tensors have to be permuted/copied
     copyA = !(TO.isblascontractable(A, pA) && scalartype(A) === TC)
diff --git a/test/amd/tensors.jl b/test/amd/tensors.jl
index 0c6898405..2ac522daa 100644
--- a/test/amd/tensors.jl
+++ b/test/amd/tensors.jl
@@ -97,7 +97,7 @@ for V in spacelist
             for T in (Int, Float32, ComplexF64)
                 t = @constinferred AMDGPU.rand(T, W)
                 d = convert(Dict, t)
-                @test TensorKit.to_cpu(t) == convert(TensorMap, d)
+                @test adapt(Array, t) == convert(TensorMap, d)
             end
         end
         symmetricbraiding && @timedtestset "Basic linear algebra" begin
@@ -189,10 +189,10 @@ for V in spacelist
                     t = AMDGPU.rand(T, W)
                     t2 = @constinferred AMDGPU.rand!(similar(t))
                     α = rand(T)
-                    @test norm(t, 2) ≈ norm(TensorKit.to_cpu(t), 2)
-                    @test dot(t2, t) ≈ dot(TensorKit.to_cpu(t2), TensorKit.to_cpu(t))
-                    @test TensorKit.to_cpu(α * t) ≈ α * TensorKit.to_cpu(t)
-                    @test TensorKit.to_cpu(t + t) ≈ 2 * TensorKit.to_cpu(t)
+                    @test norm(t, 2) ≈ norm(adapt(Array, t), 2)
+                    @test dot(t2, t) ≈ dot(adapt(Array, t2), adapt(Array, t))
+                    @test adapt(Array, α * t) ≈ α * adapt(Array, t)
+                    @test adapt(Array, t + t) ≈ 2 * adapt(Array, t)
                 end
             end
             @timedtestset "Real and imaginary parts" begin
@@ -202,17 +202,17 @@ for V in spacelist
 
                     tr = @constinferred real(t)
                     @test scalartype(tr) <: Real
-                    @test real(TensorKit.to_cpu(t)) == TensorKit.to_cpu(tr)
+                    @test real(adapt(Array, t)) == adapt(Array, tr)
                     @test storagetype(tr) == ROCVector{real(T), AMDGPU.Mem.HIPBuffer}
 
                     ti = @constinferred imag(t)
                     @test scalartype(ti) <: Real
-                    @test imag(TensorKit.to_cpu(t)) == TensorKit.to_cpu(ti)
+                    @test imag(adapt(Array, t)) == adapt(Array, ti)
                     @test storagetype(ti) == ROCVector{real(T), AMDGPU.Mem.HIPBuffer}
 
                     tc = @inferred complex(t)
                     @test scalartype(tc) <: Complex
-                    @test complex(TensorKit.to_cpu(t)) == TensorKit.to_cpu(tc)
+                    @test complex(adapt(Array, t)) == adapt(Array, tc)
                     @test storagetype(tc) == ROCVector{complex(T), AMDGPU.Mem.HIPBuffer}
 
                     tc2 = @inferred complex(tr, ti)
@@ -275,13 +275,13 @@ for V in spacelist
                     p1 = ntuple(n -> p[n], k)
                     p2 = ntuple(n -> p[k + n], 5 - k)
                     dt2 = AMDGPU.@allowscalar permute(t, (p1, p2))
-                    ht2 = permute(TensorKit.to_cpu(t), (p1, p2))
-                    @test ht2 == TensorKit.to_cpu(dt2)
+                    ht2 = permute(adapt(Array, t), (p1, p2))
+                    @test ht2 == adapt(Array, dt2)
                 end
 
                 dt3 = AMDGPU.@allowscalar repartition(t, k)
-                ht3 = repartition(TensorKit.to_cpu(t), k)
-                @test ht3 == TensorKit.to_cpu(dt3)
+                ht3 = repartition(adapt(Array, t), k)
+                @test ht3 == adapt(Array, dt3)
             end
         end
         symmetricbraiding && @timedtestset "Full trace: test self-consistency" begin
@@ -339,10 +339,10 @@ for V in spacelist
                 @tensor dHrA12[a, s1, s2, c] := drhoL[a, a'] * conj(dA1[a', t1, b]) *
                     dA2[b, t2, c'] * drhoR[c', c] *
                     dH[s1, s2, t1, t2]
-                @tensor hHrA12[a, s1, s2, c] := TensorKit.to_cpu(drhoL)[a, a'] * conj(TensorKit.to_cpu(dA1)[a', t1, b]) *
-                    TensorKit.to_cpu(dA2)[b, t2, c'] * TensorKit.to_cpu(drhoR)[c', c] *
-                    TensorKit.to_cpu(dH)[s1, s2, t1, t2]
-                @test TensorKit.to_cpu(dHrA12) ≈ hHrA12
+                @tensor hHrA12[a, s1, s2, c] := adapt(Array, drhoL)[a, a'] * conj(adapt(Array, dA1)[a', t1, b]) *
+                    adapt(Array, dA2)[b, t2, c'] * adapt(Array, drhoR)[c', c] *
+                    adapt(Array, dH)[s1, s2, t1, t2]
+                @test adapt(Array, dHrA12) ≈ hHrA12
             end
         end=# # doesn't yet work because of AdjointTensor
         BraidingStyle(I) isa HasBraiding && @timedtestset "Index flipping: test flipping inverse" begin
@@ -422,31 +422,31 @@ for V in spacelist
                 t1 = AMDGPU.rand(T, W1, W1)
                 t2 = AMDGPU.rand(T, W2, W2)
                 t = AMDGPU.rand(T, W1, W2)
-                ht1 = TensorKit.to_cpu(t1)
-                ht2 = TensorKit.to_cpu(t2)
-                ht = TensorKit.to_cpu(t)
-                @test TensorKit.to_cpu(t1 * t) ≈ ht1 * ht
-                @test TensorKit.to_cpu(t1' * t) ≈ ht1' * ht
-                @test TensorKit.to_cpu(t2 * t') ≈ ht2 * ht'
-                @test TensorKit.to_cpu(t2' * t') ≈ ht2' * ht'
+                ht1 = adapt(Array, t1)
+                ht2 = adapt(Array, t2)
+                ht = adapt(Array, t)
+                @test adapt(Array, t1 * t) ≈ ht1 * ht
+                @test adapt(Array, t1' * t) ≈ ht1' * ht
+                @test adapt(Array, t2 * t') ≈ ht2 * ht'
+                @test adapt(Array, t2' * t') ≈ ht2' * ht'
 
                 #=AMDGPU.@allowscalar begin
-                    @test TensorKit.to_cpu(inv(t1)) ≈ inv(ht1)
-                    @test TensorKit.to_cpu(pinv(t)) ≈ pinv(ht)
+                    @test adapt(Array, inv(t1)) ≈ inv(ht1)
+                    @test adapt(Array, pinv(t)) ≈ pinv(ht)
 
                     if T == Float32 || T == ComplexF32
                         continue
                     end
 
-                    @test TensorKit.to_cpu(t1 \ t) ≈ ht1 \ ht
-                    @test TensorKit.to_cpu(t1' \ t) ≈ ht1' \ ht
-                    @test TensorKit.to_cpu(t2 \ t') ≈ ht2 \ ht'
-                    @test TensorKit.to_cpu(t2' \ t') ≈ ht2' \ ht'
+                    @test adapt(Array, t1 \ t) ≈ ht1 \ ht
+                    @test adapt(Array, t1' \ t) ≈ ht1' \ ht
+                    @test adapt(Array, t2 \ t') ≈ ht2 \ ht'
+                    @test adapt(Array, t2' \ t') ≈ ht2' \ ht'
 
-                    @test TensorKit.to_cpu(t2 / t) ≈ ht2 / ht
-                    @test TensorKit.to_cpu(t2' / t) ≈ ht2' / ht
-                    @test TensorKit.to_cpu(t1 / t') ≈ ht1 / ht'
-                    @test TensorKit.to_cpu(t1' / t') ≈ ht1' / ht'
+                    @test adapt(Array, t2 / t) ≈ ht2 / ht
+                    @test adapt(Array, t2' / t) ≈ ht2' / ht
+                    @test adapt(Array, t1 / t') ≈ ht1 / ht'
+                    @test adapt(Array, t1' / t') ≈ ht1' / ht'
                 end=#
             end
         end
@@ -456,11 +456,11 @@ for V in spacelist
                 #=t = project_hermitian!(AMDGPU.randn(T, W, W))
                 s = dim(W)
                 @test (@constinferred sqrt(t))^2 ≈ t
-                @test TensorKit.to_cpu(sqrt(t)) ≈ sqrt(TensorKit.to_cpu(t))
+                @test adapt(Array, sqrt(t)) ≈ sqrt(adapt(Array, t))
                 expt = @constinferred exp(t)
-                @test TensorKit.to_cpu(expt) ≈ exp(TensorKit.to_cpu(t))
+                @test adapt(Array, expt) ≈ exp(adapt(Array, t))
                 @test exp(@constinferred log(project_hermitian!(expt))) ≈ expt
-                @test TensorKit.to_cpu(log(project_hermitian!(expt))) ≈ log(TensorKit.to_cpu(expt))
+                @test adapt(Array, log(project_hermitian!(expt))) ≈ log(adapt(Array, expt))
 
                 @test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈
                       id(storagetype(t), W)
diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl
index 738440bef..973118cac 100644
--- a/test/cuda/tensors.jl
+++ b/test/cuda/tensors.jl
@@ -14,6 +14,7 @@ for V in spacelist
     println("---------------------------------------")
     println("CUDA Tensors with symmetry: $Istr")
     println("---------------------------------------")
+    hasbraiding = BraidingStyle(I) isa HasBraiding
     symmetricbraiding = BraidingStyle(I) isa SymmetricBraiding
     @timedtestset "Tensors with symmetry: $Istr" verbose = true begin
         V1, V2, V3, V4, V5 = V
@@ -115,7 +116,7 @@ for V in spacelist
             for T in (Int, Float32, ComplexF64)
                 t = @constinferred cuRAND.rand(T, W)
                 d = convert(Dict, t)
-                @test convert(Dict, TensorKit.to_cpu(t)) == d
+                @test convert(Dict, adapt(Array, t)) == d
             end
         end
         symmetricbraiding && @timedtestset "Basic linear algebra" begin
@@ -207,10 +208,10 @@ for V in spacelist
                     t = cuRAND.rand(T, W)
                     t2 = @constinferred cuRAND.rand!(similar(t))
                     α = rand(T)
-                    @test norm(t, 2) ≈ norm(TensorKit.to_cpu(t), 2)
-                    @test dot(t2, t) ≈ dot(TensorKit.to_cpu(t2), TensorKit.to_cpu(t))
-                    @test TensorKit.to_cpu(α * t) ≈ α * TensorKit.to_cpu(t)
-                    @test TensorKit.to_cpu(t + t) ≈ 2 * TensorKit.to_cpu(t)
+                    @test norm(t, 2) ≈ norm(adapt(Array, t), 2)
+                    @test dot(t2, t) ≈ dot(adapt(Array, t2), adapt(Array, t))
+                    @test adapt(Vector{T}, (α * t)) ≈ α * adapt(Vector{T}, t)
+                    @test adapt(Vector{T}, (t + t)) ≈ 2 * adapt(Vector{T}, t)
                 end
             end
             @timedtestset "Real and imaginary parts" begin
@@ -220,17 +221,17 @@ for V in spacelist
 
                     tr = @constinferred real(t)
                     @test scalartype(tr) <: Real
-                    @test real(TensorKit.to_cpu(t)) == TensorKit.to_cpu(tr)
+                    @test real(adapt(Array, t)) == adapt(Array, tr)
                     @test storagetype(tr) == CuVector{real(T), CUDA.DeviceMemory}
 
                     ti = @constinferred imag(t)
                     @test scalartype(ti) <: Real
-                    @test imag(TensorKit.to_cpu(t)) == TensorKit.to_cpu(ti)
+                    @test imag(adapt(Array, t)) == adapt(Array, ti)
                     @test storagetype(ti) == CuVector{real(T), CUDA.DeviceMemory}
 
                     tc = @inferred complex(t)
                     @test scalartype(tc) <: Complex
-                    @test complex(TensorKit.to_cpu(t)) == TensorKit.to_cpu(tc)
+                    @test complex(adapt(Array, t)) == adapt(Array, tc)
                     @test storagetype(tc) == CuVector{complex(T), CUDA.DeviceMemory}
 
                     tc2 = @inferred complex(tr, ti)
@@ -243,7 +244,7 @@ for V in spacelist
             W = V1 ⊗ V2
             t = @constinferred cuRAND.randn(W ← W)
             @test typeof(convert(typeof(t), t')) == typeof(t)
-            @test typeof(TensorKit.to_cpu(t')) == typeof(TensorKit.to_cpu(t)')
+            @test typeof(adapt(Array, t')) == typeof(adapt(Array, t)')
             tc = complex(t)
             @test convert(typeof(tc), t) == tc
             @test typeof(convert(typeof(tc), t)) == typeof(tc)
@@ -263,16 +264,22 @@ for V in spacelist
         symmetricbraiding && @timedtestset "Permutations: test via inner product invariance" begin
             W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
             t = cuRAND.rand(ComplexF64, W)
+            ht = adapt(Vector{ComplexF64}, t)
             t′ = cuRAND.randn!(similar(t))
+            ht′ = adapt(Vector{ComplexF64}, t′)
+            dot_htt′ = dot(ht′, ht)
+            dot_tt′ = dot(t′, t)
+            @test dot_tt′ ≈ dot_htt′
+            norm_t = norm(t)
             for k in 0:5
                 for p in permutations(1:5)
                     p1 = ntuple(n -> p[n], k)
                     p2 = ntuple(n -> p[k + n], 5 - k)
                     t2 = @constinferred permute(t, (p1, p2))
-                    t2 = permute(t, (p1, p2))
-                    @test norm(t2) ≈ norm(t)
                     t2′ = permute(t′, (p1, p2))
-                    @test dot(t2′, t2) ≈ dot(t′, t) ≈ dot(transpose(t2′), transpose(t2))
+                    @test norm(t2) ≈ norm_t
+                    @test dot(t2′, t2) ≈ dot_tt′
+                    @test dot(transpose(t2′), transpose(t2)) ≈ dot_tt′
                 end
                 t3 = @constinferred repartition(t, $k)
                 t3 = repartition(t, k)
@@ -290,32 +297,29 @@ for V in spacelist
                     p1 = ntuple(n -> p[n], k)
                     p2 = ntuple(n -> p[k + n], 5 - k)
                     dt2 = permute(t, (p1, p2))
-                    ht2 = permute(TensorKit.to_cpu(t), (p1, p2))
-                    @test ht2 ≈ TensorKit.to_cpu(dt2)
+                    ht2 = permute(adapt(Array, t), (p1, p2))
+                    @test ht2 ≈ adapt(Array, dt2)
                 end
-
-                dt3 = CUDA.@allowscalar repartition(t, k)
-                ht3 = repartition(TensorKit.to_cpu(t), k)
-                @test ht3 ≈ TensorKit.to_cpu(dt3)
+                dt3 = repartition(t, k)
+                ht3 = repartition(adapt(Array, t), k)
+                @test ht3 ≈ adapt(Array, dt3)
             end
         end
         symmetricbraiding && @timedtestset "Full trace: test self-consistency" begin
             t = cuRAND.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1')
-            CUDA.@allowscalar begin
-                t2 = permute(t, ((1, 2), (4, 3)))
-                s = @constinferred tr(t2)
-                @test conj(s) ≈ tr(t2')
-                if !isdual(V1)
-                    t2 = twist!(t2, 1)
-                end
-                if isdual(V2)
-                    t2 = twist!(t2, 2)
-                end
-                ss = tr(t2)
-                @tensor s2 = t[a, b, b, a]
-                @tensor t3[a, b] := t[a, c, c, b]
-                @tensor s3 = t3[a, a]
+            t2 = permute(t, ((1, 2), (4, 3)))
+            s = @constinferred tr(t2)
+            @test conj(s) ≈ tr(t2')
+            if !isdual(V1)
+                t2 = twist!(t2, 1)
+            end
+            if isdual(V2)
+                t2 = twist!(t2, 2)
             end
+            ss = tr(t2)
+            @tensor s2 = t[a, b, b, a]
+            @tensor t3[a, b] := t[a, c, c, b]
+            @tensor s3 = t3[a, a]
             @test ss ≈ s2
             @test ss ≈ s3
         end
@@ -328,20 +332,16 @@ for V in spacelist
         end
         symmetricbraiding && @timedtestset "Trace: test via conversion" begin
             t = cuRAND.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
-            CUDA.@allowscalar begin
-                @tensor t2[a, b] := t[c, d, b, d, c, a]
-                @tensor t3[a, b] := ad(t)[c, d, b, d, c, a]
-            end
+            @tensor t2[a, b] := t[c, d, b, d, c, a]
+            @tensor t3[a, b] := ad(t)[c, d, b, d, c, a]
             @test t3 ≈ ad(t2)
         end
         symmetricbraiding && @timedtestset "Trace and contraction" begin
             t1 = cuRAND.rand(ComplexF64, V1 ⊗ V2 ⊗ V3)
             t2 = cuRAND.rand(ComplexF64, V2' ⊗ V4 ⊗ V1')
-            CUDA.@allowscalar begin
-                t3 = t1 ⊗ t2
-                @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x]
-                @tensor tb[a, b] := t3[x, y, a, y, b, x]
-            end
+            t3 = t1 ⊗ t2
+            @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x]
+            @tensor tb[a, b] := t3[x, y, a, y, b, x]
             @test ta ≈ tb
         end
         if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
@@ -354,50 +354,44 @@ for V in spacelist
                 @tensor dHrA12[a, s1, s2, c] := drhoL[a, a'] * conj(dA1[a', t1, b]) *
                     dA2[b, t2, c'] * drhoR[c', c] *
                     dH[s1, s2, t1, t2]
-                @tensor hHrA12[a, s1, s2, c] := TensorKit.to_cpu(drhoL)[a, a'] * conj(TensorKit.to_cpu(dA1)[a', t1, b]) *
-                    TensorKit.to_cpu(dA2)[b, t2, c'] * TensorKit.to_cpu(drhoR)[c', c] *
-                    TensorKit.to_cpu(dH)[s1, s2, t1, t2]
-                @test TensorKit.to_cpu(dHrA12) ≈ hHrA12
+                @tensor hHrA12[a, s1, s2, c] := adapt(Array, drhoL)[a, a'] * conj(adapt(Array, dA1)[a', t1, b]) *
+                    adapt(Array, dA2)[b, t2, c'] * adapt(Array, drhoR)[c', c] *
+                    adapt(Array, dH)[s1, s2, t1, t2]
+                @test adapt(Array, dHrA12) ≈ hHrA12
             end
         end
-        BraidingStyle(I) isa HasBraiding && @timedtestset "Index flipping: test flipping inverse" begin
+        hasbraiding && @timedtestset "Index flipping: test flipping inverse" begin
             t = cuRAND.rand(ComplexF64, V1 ⊗ V2 ⊗ V3 ← (V4 ⊗ V5)')
             for i in 1:5
-                CUDA.@allowscalar begin
-                    @test t ≈ flip(flip(t, i), i; inv = true)
-                    @test t ≈ flip(flip(t, i; inv = true), i)
-                end
+                @test t ≈ flip(flip(t, i), i; inv = true)
+                @test t ≈ flip(flip(t, i; inv = true), i)
             end
         end
-        #=@timedtestset "Index flipping: test via explicit flip" begin
+        symmetricbraiding && @timedtestset "Index flipping: test via explicit flip" begin
             t = cuRAND.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1)
-            F1 = unitary(flip(V1), V1)
+            F1 = adapt(CuArray{ComplexF64}, unitary(flip(V1), V1))
 
-            CUDA.@allowscalar begin
-                @tensor tf[a, b; c, d] := F1[a, a'] * t[a', b; c, d]
-                @test flip(t, 1) ≈ tf
-                @tensor tf[a, b; c, d] := conj(F1[b, b']) * t[a, b'; c, d]
-                @test twist!(flip(t, 2), 2) ≈ tf
-                @tensor tf[a, b; c, d] := F1[c, c'] * t[a, b; c', d]
-                @test flip(t, 3) ≈ tf
-                @tensor tf[a, b; c, d] := conj(F1[d, d']) * t[a, b; c, d']
-                @test twist!(flip(t, 4), 4) ≈ tf
-            end
+            @tensor tf[a, b; c, d] := F1[a, a'] * t[a', b; c, d]
+            @test flip(t, 1) ≈ tf
+            @tensor tf[a, b; c, d] := conj(F1[b, b']) * t[a, b'; c, d]
+            @test twist!(flip(t, 2), 2) ≈ tf
+            @tensor tf[a, b; c, d] := F1[c, c'] * t[a, b; c', d]
+            @test flip(t, 3) ≈ tf
+            @tensor tf[a, b; c, d] := conj(F1[d, d']) * t[a, b; c, d']
+            @test twist!(flip(t, 4), 4) ≈ tf
         end
-        @timedtestset "Index flipping: test via contraction" begin
+        symmetricbraiding && @timedtestset "Index flipping: test via contraction" begin
             t1 = cuRAND.rand(ComplexF64, V1 ⊗ V2 ⊗ V3 ← V4)
             t2 = cuRAND.rand(ComplexF64, V2' ⊗ V5 ← V4' ⊗ V1)
-            CUDA.@allowscalar begin
-                @tensor ta[a, b] := t1[x, y, a, z] * t2[y, b, z, x]
-                @tensor tb[a, b] := flip(t1, 1)[x, y, a, z] * flip(t2, 4)[y, b, z, x]
-                @test ta ≈ tb
-                @tensor tb[a, b] := flip(t1, (2, 4))[x, y, a, z] * flip(t2, (1, 3))[y, b, z, x]
-                @test ta ≈ tb
-                @tensor tb[a, b] := flip(t1, (1, 2, 4))[x, y, a, z] * flip(t2, (1, 3, 4))[y, b, z, x]
-                @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x]
-                @test flip(ta, (1, 2)) ≈ tb
-            end
-        end=# # TODO
+            @tensor ta[a, b] := t1[x, y, a, z] * t2[y, b, z, x]
+            @tensor tb[a, b] := flip(t1, 1)[x, y, a, z] * flip(t2, 4)[y, b, z, x]
+            @test ta ≈ tb
+            @tensor tb[a, b] := flip(t1, (2, 4))[x, y, a, z] * flip(t2, (1, 3))[y, b, z, x]
+            @test ta ≈ tb
+            @tensor tb[a, b] := flip(t1, (1, 2, 4))[x, y, a, z] * flip(t2, (1, 3, 4))[y, b, z, x]
+            @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x]
+            @test flip(ta, (1, 2)) ≈ tb
+        end
         @timedtestset "Multiplication of isometries: test properties" begin
             W1 = V1 ⊗ V2 ⊗ V3
             W2 = (V4 ⊗ V5)'
@@ -435,30 +429,30 @@ for V in spacelist
                 t1 = cuRAND.rand(T, W1, W1)
                 t2 = cuRAND.rand(T, W2, W2)
                 t = cuRAND.rand(T, W1, W2)
-                ht1 = TensorKit.to_cpu(t1)
-                ht2 = TensorKit.to_cpu(t2)
-                ht = TensorKit.to_cpu(t)
-                @test TensorKit.to_cpu(t1 * t) ≈ ht1 * ht
-                @test TensorKit.to_cpu(t1' * t) ≈ ht1' * ht
-                @test TensorKit.to_cpu(t2 * t') ≈ ht2 * ht'
-                @test TensorKit.to_cpu(t2' * t') ≈ ht2' * ht'
+                ht1 = adapt(Array, t1)
+                ht2 = adapt(Array, t2)
+                ht = adapt(Array, t)
+                @test adapt(Array, t1 * t) ≈ ht1 * ht
+                @test adapt(Array, t1' * t) ≈ ht1' * ht
+                @test adapt(Array, t2 * t') ≈ ht2 * ht'
+                @test adapt(Array, t2' * t') ≈ ht2' * ht'
 
-                @test TensorKit.to_cpu(inv(t1)) ≈ inv(ht1)
-                @test TensorKit.to_cpu(pinv(t)) ≈ pinv(ht)
+                @test adapt(Array, inv(t1)) ≈ inv(ht1)
+                @test adapt(Array, pinv(t)) ≈ pinv(ht)
 
                 if T == Float32 || T == ComplexF32
                     continue
                 end
 
-                @test TensorKit.to_cpu(t1 \ t) ≈ ht1 \ ht
-                @test TensorKit.to_cpu(t1' \ t) ≈ ht1' \ ht
-                @test TensorKit.to_cpu(t2 \ t') ≈ ht2 \ ht'
-                @test TensorKit.to_cpu(t2' \ t') ≈ ht2' \ ht'
+                @test adapt(Array, t1 \ t) ≈ ht1 \ ht
+                @test adapt(Array, t1' \ t) ≈ ht1' \ ht
+                @test adapt(Array, t2 \ t') ≈ ht2 \ ht'
+                @test adapt(Array, t2' \ t') ≈ ht2' \ ht'
 
-                @test TensorKit.to_cpu(t2 / t) ≈ ht2 / ht
-                @test TensorKit.to_cpu(t2' / t) ≈ ht2' / ht
-                @test TensorKit.to_cpu(t1 / t') ≈ ht1 / ht'
-                @test TensorKit.to_cpu(t1' / t') ≈ ht1' / ht'
+                @test adapt(Array, t2 / t) ≈ ht2 / ht
+                @test adapt(Array, t2' / t) ≈ ht2' / ht
+                @test adapt(Array, t1 / t') ≈ ht1 / ht'
+                @test adapt(Array, t1' / t') ≈ ht1' / ht'
             end
         end
         symmetricbraiding && @timedtestset "Tensor functions" begin
@@ -467,14 +461,14 @@ for V in spacelist
                 t = project_hermitian!(cuRAND.randn(T, W, W))
                 s = dim(W)
                 #@test (@constinferred sqrt(t))^2 ≈ t
-                #@test TensorKit.to_cpu(sqrt(t)) ≈ sqrt(TensorKit.to_cpu(t))
+                #@test adapt(Array, sqrt(t)) ≈ sqrt(adapt(Array, t))
 
                 expt = @constinferred exp(t)
-                @test TensorKit.to_cpu(expt) ≈ exp(TensorKit.to_cpu(t))
+                @test adapt(Array, expt) ≈ exp(adapt(Array, t))
 
                 # log doesn't work on CUDA yet (scalar indexing)
                 #@test exp(@constinferred log(project_hermitian!(expt))) ≈ expt
-                #@test TensorKit.to_cpu(log(project_hermitian!(expt))) ≈ log(TensorKit.to_cpu(expt))
+                #@test adapt(Array, log(project_hermitian!(expt))) ≈ log(adapt(Array, expt))
 
                 #=@test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈
                           id(storagetype(t), W)
@@ -551,10 +545,8 @@ for V in spacelist
                 t1 = cuRAND.rand(T, V1, V5')
                 t2 = cuRAND.rand(T, V2 ⊗ V3, V4')
                 t = @constinferred (t1 ⊗ t2)
-                CUDA.@allowscalar begin
-                    @tensor t′[1 2 3; 4 5] := t1[1; 4] * t2[2 3; 5]
-                end
-                @test t ≈ t′ # This should really not be broken
+                @tensor t′[1 2 3; 4 5] := t1[1; 4] * t2[2 3; 5]
+                @test t ≈ t′
             end
         end
     end
@@ -567,17 +559,17 @@ end
         V1, V2, V3, V4, V5 = Vlist1
         W1, W2, W3, W4, W5 = Vlist2
         for T in (Float32, ComplexF64)
-            t1 = rand(T, V2 ⊗ V3, (V4 ⊗ V5)')
-            t2 = rand(T, W2, (W3 ⊗ W4)')
+            t1 = CUDA.rand(T, V2 ⊗ V3, (V4 ⊗ V5)')
+            t2 = CUDA.rand(T, W2, (W3 ⊗ W4)')
             t = @constinferred (t1 ⊠ t2)
             d1 = dim(codomain(t1))
             d2 = dim(codomain(t2))
             d3 = dim(domain(t1))
             d4 = dim(domain(t2))
-            At = convert(Array, t)
+            At = convert(Array, adapt(Vector{T}, t))
             @test reshape(At, (d1, d2, d3, d4)) ≈
-                reshape(convert(Array, t1), (d1, 1, d3, 1)) .*
-                reshape(convert(Array, t2), (1, d2, 1, d4))
+                reshape(convert(Array, adapt(Vector{T}, t1)), (d1, 1, d3, 1)) .*
+                reshape(convert(Array, adapt(Vector{T}, t2)), (1, d2, 1, d4))
         end
     end
 end