diff --git a/Project.toml b/Project.toml
index 9afe3dc..3596254 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,7 @@ version = "0.15.2"
 
 [deps]
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
+oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
 
 [weakdeps]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
@@ -22,6 +23,7 @@ CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, ~3.13, 4,
 MPI = "0.20"
 Polyester = "0.7"
 julia = "1.9"
+oneAPI = "1.6.1"
 
 [extras]
 CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
diff --git a/ext/ImplicitGlobalGrid_ONEAPIExt.jl b/ext/ImplicitGlobalGrid_ONEAPIExt.jl
new file mode 100644
index 0000000..7eb09d6
--- /dev/null
+++ b/ext/ImplicitGlobalGrid_ONEAPIExt.jl
@@ -0,0 +1,5 @@
+module ImplicitGlobalGrid_ONEAPIExt   # NOTE: the extension module must carry the same name as its file (ImplicitGlobalGrid_ONEAPIExt.jl).
+    include(joinpath(@__DIR__, "..", "src", "ONEAPIExt", "shared.jl"))
+    include(joinpath(@__DIR__, "..", "src", "ONEAPIExt", "select_device.jl"))
+    include(joinpath(@__DIR__, "..", "src", "ONEAPIExt", "update_halo.jl"))
+end
diff --git a/src/ImplicitGlobalGrid.jl b/src/ImplicitGlobalGrid.jl
index f86e18b..d2979ac 100644
--- a/src/ImplicitGlobalGrid.jl
+++ b/src/ImplicitGlobalGrid.jl
@@ -48,6 +48,7 @@ include("shared.jl")
 include("defaults_shared.jl")
 include(joinpath("AMDGPUExt", "defaults.jl"))
 include(joinpath("CUDAExt", "defaults.jl"))
+include(joinpath("ONEAPIExt", "defaults.jl"))
 include(joinpath("PolyesterExt", "memcopy_polyester_default.jl"))
 
 ## Alphabetical include of files
diff --git a/src/ONEAPIExt/defaults.jl b/src/ONEAPIExt/defaults.jl
new file mode 100644
index 0000000..cbb3560
--- /dev/null
+++ b/src/ONEAPIExt/defaults.jl
@@ -0,0 +1,22 @@
+# shared.jl
+
+is_onearray(A::GGArray) = false   # Fallback; the oneAPI extension adds the method returning `true` for `oneArray`.
+
+
+# select_device.jl
+
+function nb_oneapidevices end   # (the methods of the functions declared in this file are added by the oneAPI extension)
+function oneapidevice! end
+
+
+# update_halo.jl
+
+function free_update_halo_onebuffers end   # NOTE: must carry the name that ONEAPIExt/update_halo.jl extends (`ImplicitGlobalGrid.free_update_halo_onebuffers`).
+function init_onebufs_arrays end
+function init_onebufs end
+function reinterpret_onebufs end
+function reallocate_undersized_onebufs end
+function reregister_onebufs end
+function get_onesendbufs_raw end
+function get_onerecvbufs_raw end
+function allocate_onestreams end
diff --git a/src/ONEAPIExt/select_device.jl b/src/ONEAPIExt/select_device.jl
new file mode 100644
index 0000000..e381b2c
--- /dev/null
+++ b/src/ONEAPIExt/select_device.jl
@@ -0,0 +1,2 @@
+ImplicitGlobalGrid.nb_oneapidevices()       = length(oneAPI.devices())   # Number of devices listed by `oneAPI.devices()`.
+ImplicitGlobalGrid.oneapidevice!(device_id) = oneAPI.device!(device_id)   # Forward the device selection to oneAPI. NOTE(review): `select_device` passes `me_l` (and `me_l+1` only if AMDGPU is enabled); presumably `oneAPI.device!(::Integer)` indexes `oneAPI.devices()` 1-based - confirm.
diff --git a/src/ONEAPIExt/shared.jl b/src/ONEAPIExt/shared.jl
new file mode 100644
index 0000000..47c12a2
--- /dev/null
+++ b/src/ONEAPIExt/shared.jl
@@ -0,0 +1,45 @@
+import ImplicitGlobalGrid
+import ImplicitGlobalGrid: GGArray, GGField, GGNumber, halosize, ol, oneapiaware_MPI, sendranges, recvranges, sendbuf_flat, recvbuf_flat, write_d2x!, read_x2d!, write_d2h_async!, read_h2d_async!, register, is_cuarray
+import ImplicitGlobalGrid: NNEIGHBORS_PER_DIM, GG_ALLOC_GRANULARITY
+using oneAPI
+
+
+##------
+## TYPES
+
+const oneField{T,N} = GGField{T,N,oneArray{T,N}}
+
+
+##---------------------------
+## HANDLING OF ONEAPI SUPPORT
+
+ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_ONEAPIExt}) = true                    # Signals that this extension is loaded.
+ImplicitGlobalGrid.is_functional(::Val{:oneAPI})                   = oneAPI.functional()     # NOTE(review): the `Val` key must be the same one that `set_oneapi_functional` in src/shared.jl queries.
+
+
+##-------------
+## SYNTAX SUGAR
+
+ImplicitGlobalGrid.is_onearray(A::oneArray) = true   #NOTE: this function is only to be used when multiple dispatch on the type of the array seems an overkill (in particular when only something needs to be done for the GPU case, but nothing for the CPU case) and as long as performance does not suffer.
+
+
+##--------------------------------------------------------------------------------
+## FUNCTIONS FOR WRAPPING ARRAYS AND FIELDS AND DEFINE ARRAY PROPERTY BASE METHODS
+
+ImplicitGlobalGrid.wrap_field(A::oneArray, hw::Tuple) = oneField{eltype(A), ndims(A)}((A, hw))   # Wrap the array `A` together with the tuple `hw` into a `oneField`.
+
+Base.size(A::oneField)          = Base.size(A.A)            # The array property methods below forward to the wrapped array `A.A`.
+Base.size(A::oneField, args...) = Base.size(A.A, args...)
+Base.length(A::oneField)        = Base.length(A.A)
+Base.ndims(A::oneField)         = Base.ndims(A.A)
+Base.eltype(A::oneField)        = Base.eltype(A.A)
+
+
+##---------------
+## oneAPI functions
+
+function ImplicitGlobalGrid.register(::Type{<:oneArray},buf::Array{T}) where T <: GGNumber   # Register the host array `buf`; return a `oneArray` wrapping the registered memory (same size as `buf`) and the registration handle.
+    rbuf = oneAPI.Mem.register(oneAPI.Mem.Host, pointer(buf), sizeof(buf), oneAPI.Mem.HOSTREGISTER_DEVICEMAP);   # NOTE(review): `Mem.register`, `Mem.Host` and `HOSTREGISTER_DEVICEMAP` mirror the CUDA.jl API - confirm that oneAPI provides them.
+    rbuf_d = convert(onePtr{T}, rbuf);   # NOTE(review): `onePtr` is not defined in this file - confirm that oneAPI exports it.
+    return unsafe_wrap(oneArray, rbuf_d, size(buf)), rbuf;
+end
diff --git a/src/ONEAPIExt/update_halo.jl b/src/ONEAPIExt/update_halo.jl
new file mode 100644
index 0000000..27774c7
--- /dev/null
+++ b/src/ONEAPIExt/update_halo.jl
@@ -0,0 +1,260 @@
+##---------------------------------------
+## FUNCTIONS RELATED TO BUFFER ALLOCATION
+
+# NOTE: CUDA, AMDGPU and oneAPI buffers live and are dealt with independently, enabling the usage of GPUs of different vendors at the same time.
+
+ImplicitGlobalGrid.free_update_halo_onebuffers(args...) = free_update_halo_onebuffers(args...)   # The methods below extend the functions of ImplicitGlobalGrid by forwarding to the same-named functions of this extension (defined in the `let` block further down).
+ImplicitGlobalGrid.init_onebufs_arrays(args...) = init_onebufs_arrays(args...)
+ImplicitGlobalGrid.init_onebufs(args...) = init_onebufs(args...)
+ImplicitGlobalGrid.reinterpret_onebufs(args...) = reinterpret_onebufs(args...)
+ImplicitGlobalGrid.reallocate_undersized_onebufs(args...) = reallocate_undersized_onebufs(args...)
+ImplicitGlobalGrid.reregister_onebufs(args...) = reregister_onebufs(args...)
+ImplicitGlobalGrid.get_onesendbufs_raw(args...) = get_onesendbufs_raw(args...)
+ImplicitGlobalGrid.get_onerecvbufs_raw(args...) = get_onerecvbufs_raw(args...)
+ImplicitGlobalGrid.gpusendbuf(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where {T <: GGNumber} = gpusendbuf(n,dim,i,A)   # The gpu*buf methods are only added for fields of type `oneField`.
+ImplicitGlobalGrid.gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where {T <: GGNumber} = gpurecvbuf(n,dim,i,A)
+ImplicitGlobalGrid.gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where {T <: GGNumber} = gpusendbuf_flat(n,dim,i,A)
+ImplicitGlobalGrid.gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where {T <: GGNumber} = gpurecvbuf_flat(n,dim,i,A)
+
+let   # The raw oneAPI buffers below are private to this block; they are only reachable through the functions declared `global`.
+    global free_update_halo_onebuffers, init_onebufs_arrays, init_onebufs, reinterpret_onebufs, reregister_onebufs, reallocate_undersized_onebufs
+    global gpusendbuf, gpurecvbuf, gpusendbuf_flat, gpurecvbuf_flat
+    onesendbufs_raw = nothing     # Device send buffers, indexed [i][n] (field, neighbor); `nothing` until `init_onebufs_arrays` is called.
+    onerecvbufs_raw = nothing     # Device receive buffers, indexed [i][n].
+    onesendbufs_raw_h = nothing   # Handles of the registered host send buffers, indexed [i][n] (see `reregister_onebufs`).
+    onerecvbufs_raw_h = nothing   # Handles of the registered host receive buffers, indexed [i][n].
+
+    function free_update_halo_onebuffers()   # Free all device buffers, unregister all registered host buffers and reset the buffer arrays to `nothing`.
+        free_onebufs(onesendbufs_raw)
+        free_onebufs(onerecvbufs_raw)
+        unregister_onebufs(onesendbufs_raw_h)
+        unregister_onebufs(onerecvbufs_raw_h)
+        reset_one_buffers()
+    end
+
+    function free_onebufs(bufs)   # Free every `oneArray` found in `bufs` and replace it by an empty array; does nothing if `bufs` is `nothing`.
+        if (bufs !== nothing)
+            for i = 1:length(bufs)
+                for n = 1:length(bufs[i])
+                    if ImplicitGlobalGrid.is_onearray(bufs[i][n]) oneAPI.unsafe_free!(bufs[i][n]); bufs[i][n] = []; end   # NOTE: `is_onearray` is not imported into this extension; it is therefore called with the module prefix.
+                end
+            end
+        end
+    end
+
+    function unregister_onebufs(bufs)   # Unregister every host buffer handle found in `bufs` and replace it by an empty array; does nothing if `bufs` is `nothing`.
+        if (bufs !== nothing)
+            for i = 1:length(bufs)
+                for n = 1:length(bufs[i])
+                    if (isa(bufs[i][n],oneAPI.Mem.HostBuffer)) oneAPI.Mem.unregister(bufs[i][n]); bufs[i][n] = []; end
+                end
+            end
+        end
+    end
+
+    function reset_one_buffers()   # Set the four buffer arrays back to `nothing` (their state before `init_onebufs_arrays`).
+        onesendbufs_raw = nothing
+        onerecvbufs_raw = nothing
+        onesendbufs_raw_h = nothing
+        onerecvbufs_raw_h = nothing
+    end
+
+
+    # (oneAPI functions)
+
+    function init_onebufs_arrays()   # Replace the four buffer arrays by empty arrays (one entry per field is added later by `init_onebufs`).
+        onesendbufs_raw = Array{Array{Any,1},1}();
+        onerecvbufs_raw = Array{Array{Any,1},1}();
+        onesendbufs_raw_h = Array{Array{Any,1},1}();
+        onerecvbufs_raw_h = Array{Array{Any,1},1}();
+    end
+
+    function init_onebufs(T::DataType, fields::GGField...)   # Make sure that there is one entry per field in each buffer array; new entries hold two empty buffers.
+        while (length(onesendbufs_raw) < length(fields)) push!(onesendbufs_raw, [oneArray{T}(undef,0), oneArray{T}(undef,0)]); end
+        while (length(onerecvbufs_raw) < length(fields)) push!(onerecvbufs_raw, [oneArray{T}(undef,0), oneArray{T}(undef,0)]); end
+        while (length(onesendbufs_raw_h) < length(fields)) push!(onesendbufs_raw_h, [[], []]); end
+        while (length(onerecvbufs_raw_h) < length(fields)) push!(onerecvbufs_raw_h, [[], []]); end
+    end
+
+    function reinterpret_onebufs(T::DataType, i::Integer, n::Integer)   # Reinterpret the buffers [i][n] as buffers of element type `T` if they have a different element type.
+        if (eltype(onesendbufs_raw[i][n]) != T) onesendbufs_raw[i][n] = reinterpret(T, onesendbufs_raw[i][n]); end
+        if (eltype(onerecvbufs_raw[i][n]) != T) onerecvbufs_raw[i][n] = reinterpret(T, onerecvbufs_raw[i][n]); end
+    end
+
+    function reallocate_undersized_onebufs(T::DataType, i::Integer, max_halo_elems::Integer)   # Reallocate the buffers of field `i` if the first send buffer holds less than `max_halo_elems` elements.
+        if (!isnothing(onesendbufs_raw) && length(onesendbufs_raw[i][1]) < max_halo_elems)
+            for n = 1:NNEIGHBORS_PER_DIM
+                reallocate_onebufs(T, i, n, max_halo_elems); GC.gc(); # Too small buffers had been replaced with larger ones; free the unused memory immediately.
+            end
+        end
+    end
+
+    function reallocate_onebufs(T::DataType, i::Integer, n::Integer, max_halo_elems::Integer)   # Allocate new zero-initialized buffers [i][n]; their length is `max_halo_elems` rounded up to a multiple of GG_ALLOC_GRANULARITY.
+        onesendbufs_raw[i][n] = oneAPI.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY); # Ensure that the amount of allocated memory is a multiple of 4*sizeof(T) (sizeof(Float64)/sizeof(Float16) = 4). So, we can always correctly reinterpret the raw buffers even if next time sizeof(T) is greater.
+        onerecvbufs_raw[i][n] = oneAPI.zeros(T, Int(ceil(max_halo_elems/GG_ALLOC_GRANULARITY))*GG_ALLOC_GRANULARITY);
+    end
+
+    function reregister_onebufs(T::DataType, i::Integer, n::Integer, sendbufs_raw, recvbufs_raw)   # Unregister the host buffers [i][n] registered so far (if any) and register `sendbufs_raw[i][n]` and `recvbufs_raw[i][n]` instead.
+        if (isa(onesendbufs_raw_h[i][n],oneAPI.Mem.HostBuffer)) oneAPI.Mem.unregister(onesendbufs_raw_h[i][n]); onesendbufs_raw_h[i][n] = []; end # It is always initialized registered... if (onesendbufs_raw_h[i][n].bytesize > 32*sizeof(T))
+        if (isa(onerecvbufs_raw_h[i][n],oneAPI.Mem.HostBuffer)) oneAPI.Mem.unregister(onerecvbufs_raw_h[i][n]); onerecvbufs_raw_h[i][n] = []; end # It is always initialized registered... if (onerecvbufs_raw_h[i][n].bytesize > 32*sizeof(T))
+        onesendbufs_raw[i][n], onesendbufs_raw_h[i][n] = register(oneArray,sendbufs_raw[i][n]);
+        onerecvbufs_raw[i][n], onerecvbufs_raw_h[i][n] = register(oneArray,recvbufs_raw[i][n]);
+    end
+
+
+    # (oneAPI functions)
+
+    function gpusendbuf_flat(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where T <: GGNumber   # View on the first prod(halosize(dim,A)) elements of the raw send buffer [i][n].
+        return view(onesendbufs_raw[i][n]::oneVector{T},1:prod(halosize(dim,A)));
+    end
+
+    function gpurecvbuf_flat(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where T <: GGNumber   # View on the first prod(halosize(dim,A)) elements of the raw receive buffer [i][n].
+        return view(onerecvbufs_raw[i][n]::oneVector{T},1:prod(halosize(dim,A)));   # NOTE: the buffer is a `oneVector` (as in `gpusendbuf_flat`); `CuVector` is a CUDA type that is not available in this extension.
+    end
+
+
+    # (GPU functions)
+
+    #TODO: see if remove T here and in other cases for CuArray, ROCArray or Array (but then it does not verify that CuArray/ROCArray is of type GGNumber) or if I should instead change GGArray to GGArrayUnion and create: GGArray = Array{T} where T <: GGNumber  and  GGCuArray = CuArray{T} where T <: GGNumber; This is however more difficult to read and understand for others.
+    function gpusendbuf(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where T <: GGNumber   # The flat send buffer reshaped to halosize(dim,A).
+        return reshape(gpusendbuf_flat(n,dim,i,A), halosize(dim,A));
+    end
+
+    function gpurecvbuf(n::Integer, dim::Integer, i::Integer, A::oneField{T}) where T <: GGNumber   # The flat receive buffer reshaped to halosize(dim,A).
+        return reshape(gpurecvbuf_flat(n,dim,i,A), halosize(dim,A));
+    end
+
+
+    # Make sendbufs_raw and recvbufs_raw accessible for unit testing.
+    global get_onesendbufs_raw, get_onerecvbufs_raw
+    get_onesendbufs_raw()  = deepcopy(onesendbufs_raw)   # (a deep copy is returned, not the buffers themselves)
+    get_onerecvbufs_raw()  = deepcopy(onerecvbufs_raw)
+end
+
+
+##----------------------------------------------
+## FUNCTIONS TO WRITE AND READ SEND/RECV BUFFERS
+
+function ImplicitGlobalGrid.allocate_onestreams(fields::GGField...)   # Allocate the streams for writing the send buffers and those for reading the receive buffers of `fields`.
+    allocate_onestreams_iwrite(fields...);
+    allocate_onestreams_iread(fields...);
+end
+
+ImplicitGlobalGrid.iwrite_sendbufs!(n::Integer, dim::Integer, F::oneField{T}, i::Integer) where {T <: GGNumber} = iwrite_sendbufs!(n,dim,F,i)   # The methods below extend the functions of ImplicitGlobalGrid for `oneField` by forwarding to the same-named functions of this extension (defined in the `let` blocks further down).
+ImplicitGlobalGrid.iread_recvbufs!(n::Integer, dim::Integer, F::oneField{T}, i::Integer) where {T <: GGNumber} = iread_recvbufs!(n,dim,F,i)
+ImplicitGlobalGrid.wait_iwrite(n::Integer, A::oneField{T}, i::Integer) where {T <: GGNumber} = wait_iwrite(n,A,i)
+ImplicitGlobalGrid.wait_iread(n::Integer, A::oneField{T}, i::Integer) where {T <: GGNumber} = wait_iread(n,A,i)
+
+let   # The streams used for writing the send buffers are private to this block.
+    global iwrite_sendbufs!, allocate_onestreams_iwrite, wait_iwrite
+
+    onestreams = Array{oneStream}(undef, NNEIGHBORS_PER_DIM, 0)   # Streams indexed [n,i] (neighbor, field); created without any column. NOTE(review): `oneStream` is not defined in this file - confirm that oneAPI provides it.
+
+    wait_iwrite(n::Integer, A::oneField{T}, i::Integer) where T <: GGNumber = oneAPI.synchronize(onestreams[n,i]; blocking=true);   # Synchronize the stream of neighbor `n` and field `i`.
+
+    function allocate_onestreams_iwrite(fields::GGField...)   # Append stream columns until there is one column per field.
+        if length(fields) > size(onestreams,2)  # Note: for simplicity, we create a stream for every field even if it is not a oneField
+            onestreams = [onestreams [oneStream(; flags=ONEAPI.STREAM_NON_BLOCKING, priority=oneAPI.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(onestreams,2))]];  # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. NOTE(review): `ONEAPI` (upper case) is not a name imported by this extension; `STREAM_NON_BLOCKING` and `priority_range` mirror the CUDA.jl API - confirm the oneAPI equivalents.
+        end
+    end
+
+    function iwrite_sendbufs!(n::Integer, dim::Integer, F::oneField{T}, i::Integer) where T <: GGNumber   # Start writing the send buffer of neighbor `n` in dimension `dim` for field `F` (field index `i`) on stream [n,i].
+        A, halowidths = F;
+        if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth...
+            if dim == 1 || oneapiaware_MPI(dim) # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
+                ranges = sendranges(n, dim, F);
+                nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
+                halosize = [r[end] - r[1] + 1 for r in ranges];   # Number of elements to copy per dimension (shadows the imported function `halosize` within this function).
+                nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+                @cuda blocks=nblocks threads=nthreads stream=onestreams[n,i] write_d2x!(gpusendbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim);   # NOTE(review): `@cuda` is the launch macro of CUDA.jl, which this extension does not import; presumably the launch macro of oneAPI is intended - confirm macro and keyword names.
+            else
+                write_d2h_async!(sendbuf_flat(n,dim,i,F), A, sendranges(n,dim,F), onestreams[n,i]);   # Otherwise copy from the device array to the host send buffer.
+            end
+        end
+    end
+end
+
+let   # The streams used for reading the receive buffers are private to this block.
+    global iread_recvbufs!, allocate_onestreams_iread, wait_iread
+
+    onestreams = Array{oneStream}(undef, NNEIGHBORS_PER_DIM, 0)   # Streams indexed [n,i] (neighbor, field); created without any column. NOTE(review): `oneStream` is not defined in this file - confirm that oneAPI provides it.
+
+    wait_iread(n::Integer, A::oneField{T}, i::Integer) where T <: GGNumber = oneAPI.synchronize(onestreams[n,i]; blocking=true);   # Synchronize the stream of neighbor `n` and field `i`.
+
+    function allocate_onestreams_iread(fields::GGField...)   # Append stream columns until there is one column per field.
+        if length(fields) > size(onestreams,2)  # Note: for simplicity, we create a stream for every field even if it is not a oneField
+            onestreams = [onestreams [oneStream(; flags=ONEAPI.STREAM_NON_BLOCKING, priority=oneAPI.priority_range()[end]) for n=1:NNEIGHBORS_PER_DIM, i=1:(length(fields)-size(onestreams,2))]];  # Create (additional) maximum priority nonblocking streams to enable overlap with computation kernels. NOTE(review): `ONEAPI` (upper case) is not a name imported by this extension; `STREAM_NON_BLOCKING` and `priority_range` mirror the CUDA.jl API - confirm the oneAPI equivalents.
+        end
+    end
+
+    function iread_recvbufs!(n::Integer, dim::Integer, F::oneField{T}, i::Integer) where T <: GGNumber   # Start reading the receive buffer of neighbor `n` in dimension `dim` into field `F` (field index `i`) on stream [n,i].
+        A, halowidths = F;
+        if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth...
+            if dim == 1 || oneapiaware_MPI(dim)  # Use a custom copy kernel for the first dimension to obtain a good copy performance (the CUDA 3-D memcopy does not perform well for this extremely strided case).
+                ranges = recvranges(n, dim, F);
+                nthreads = (dim==1) ? (1, 32, 1) : (32, 1, 1);
+                halosize = [r[end] - r[1] + 1 for r in ranges];   # Number of elements to copy per dimension (shadows the imported function `halosize` within this function).
+                nblocks  = Tuple(ceil.(Int, halosize./nthreads));
+                @cuda blocks=nblocks threads=nthreads stream=onestreams[n,i] read_x2d!(gpurecvbuf(n,dim,i,F), A, ranges[1], ranges[2], ranges[3], dim);   # NOTE(review): `@cuda` is the launch macro of CUDA.jl, which this extension does not import; presumably the launch macro of oneAPI is intended - confirm macro and keyword names.
+            else
+                read_h2d_async!(recvbuf_flat(n,dim,i,F), A, recvranges(n,dim,F), onestreams[n,i]);   # Otherwise copy from the host receive buffer to the device array.
+            end
+        end
+    end
+end
+
+
+# (oneAPI functions)
+
+# Write to the send buffer on the host or device from the array on the device (d2x). Kernel function: every thread computes its indices (ix,iy,iz) in `A` from the launch indices, shifted to the start of the send ranges, and copies at most one element. NOTE(review): `blockIdx`, `blockDim` and `threadIdx` are the names of the CUDA.jl intrinsics - confirm that oneAPI provides them.
+function ImplicitGlobalGrid.write_d2x!(gpusendbuf::oneArray{T}, A::oneArray{T}, sendrangex::UnitRange{Int64}, sendrangey::UnitRange{Int64}, sendrangez::UnitRange{Int64},  dim::Integer) where T <: GGNumber
+    ix = (oneAPI.blockIdx().x-1) * oneAPI.blockDim().x + oneAPI.threadIdx().x + sendrangex[1] - 1
+    iy = (oneAPI.blockIdx().y-1) * oneAPI.blockDim().y + oneAPI.threadIdx().y + sendrangey[1] - 1
+    iz = (oneAPI.blockIdx().z-1) * oneAPI.blockDim().z + oneAPI.threadIdx().z + sendrangez[1] - 1
+    if !(ix in sendrangex && iy in sendrangey && iz in sendrangez) return nothing; end   # Threads outside of the send ranges have nothing to copy.
+    gpusendbuf[ix-(sendrangex[1]-1),iy-(sendrangey[1]-1),iz-(sendrangez[1]-1)] = A[ix,iy,iz];   # The buffer is indexed relative to the start of the send ranges.
+    return nothing
+end
+
+# Read from the receive buffer on the host or device and store on the array on the device (x2d). Kernel function: every thread computes its indices (ix,iy,iz) in `A` from the launch indices, shifted to the start of the receive ranges, and copies at most one element. NOTE(review): `blockIdx`, `blockDim` and `threadIdx` are the names of the CUDA.jl intrinsics - confirm that oneAPI provides them.
+function ImplicitGlobalGrid.read_x2d!(gpurecvbuf::oneArray{T}, A::oneArray{T}, recvrangex::UnitRange{Int64}, recvrangey::UnitRange{Int64}, recvrangez::UnitRange{Int64}, dim::Integer) where T <: GGNumber
+    ix = (oneAPI.blockIdx().x-1) * oneAPI.blockDim().x + oneAPI.threadIdx().x + recvrangex[1] - 1
+    iy = (oneAPI.blockIdx().y-1) * oneAPI.blockDim().y + oneAPI.threadIdx().y + recvrangey[1] - 1
+    iz = (oneAPI.blockIdx().z-1) * oneAPI.blockDim().z + oneAPI.threadIdx().z + recvrangez[1] - 1
+    if !(ix in recvrangex && iy in recvrangey && iz in recvrangez) return nothing; end   # Threads outside of the receive ranges have nothing to copy.
+    A[ix,iy,iz] = gpurecvbuf[ix-(recvrangex[1]-1),iy-(recvrangey[1]-1),iz-(recvrangez[1]-1)];   # The buffer is indexed relative to the start of the receive ranges.
+    return nothing
+end
+
+# Write to the send buffer on the host from the array on the device (d2h): copy the block of `A` given by `sendranges` into `sendbuf` with a 3-D copy issued on `onestream`. NOTE(review): `Mem.unsafe_copy3d!`, `Mem.Host` and `Mem.Device` mirror the CUDA.jl API - confirm that oneAPI provides them.
+function ImplicitGlobalGrid.write_d2h_async!(sendbuf::AbstractArray{T}, A::oneArray{T}, sendranges::Array{UnitRange{T2},1}, onestream::oneStream) where T <: GGNumber where T2 <: Integer
+    oneAPI.Mem.unsafe_copy3d!(
+        pointer(sendbuf), oneAPI.Mem.Host, pointer(A), oneAPI.Mem.Device,
+        length(sendranges[1]), length(sendranges[2]), length(sendranges[3]);   # Extent of the copied block.
+        srcPos=(sendranges[1][1], sendranges[2][1], sendranges[3][1]),          # First element of the block in `A`.
+        srcPitch=sizeof(T)*size(A,1), srcHeight=size(A,2),                      # Pitch and height taken from the size of `A`.
+        dstPitch=sizeof(T)*length(sendranges[1]), dstHeight=length(sendranges[2]),   # Pitch and height taken from the extent of the block.
+        async=true, stream=onestream
+    )
+end
+
+# Read from the receive buffer on the host and store on the array on the device (h2d): copy `recvbuf` into the block of `A` given by `recvranges` with a 3-D copy issued on `onestream`. NOTE(review): `Mem.unsafe_copy3d!`, `Mem.Host` and `Mem.Device` mirror the CUDA.jl API - confirm that oneAPI provides them.
+function ImplicitGlobalGrid.read_h2d_async!(recvbuf::AbstractArray{T}, A::oneArray{T}, recvranges::Array{UnitRange{T2},1}, onestream::oneStream) where T <: GGNumber where T2 <: Integer
+    oneAPI.Mem.unsafe_copy3d!(
+        pointer(A), oneAPI.Mem.Device, pointer(recvbuf), oneAPI.Mem.Host,
+        length(recvranges[1]), length(recvranges[2]), length(recvranges[3]);   # Extent of the copied block.
+        dstPos=(recvranges[1][1], recvranges[2][1], recvranges[3][1]),          # First element of the block in `A`.
+        srcPitch=sizeof(T)*length(recvranges[1]), srcHeight=length(recvranges[2]),   # Pitch and height taken from the extent of the block.
+        dstPitch=sizeof(T)*size(A,1), dstHeight=size(A,2),                      # Pitch and height taken from the size of `A`.
+        async=true, stream=onestream
+    )
+end
+
+
+##------------------------------
+## FUNCTIONS TO SEND/RECV FIELDS
+
+function ImplicitGlobalGrid.gpumemcopy!(dst::oneArray{T}, src::oneArray{T}) where T <: GGNumber   # Copy `src` to `dst` with `copyto!` (method for two `oneArray`s of the same element type).
+    @inbounds oneAPI.copyto!(dst, src)
+end
+
diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl
index dfcf347..35f684d 100644
--- a/src/init_global_grid.jl
+++ b/src/init_global_grid.jl
@@ -44,6 +44,8 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0
     set_cuda_functional()
     set_amdgpu_loaded()
     set_amdgpu_functional()
+    set_oneapi_loaded()
+    set_oneapi_functional()
     nxyz              = [nx, ny, nz];
     dims              = [dimx, dimy, dimz];
     periods           = [periodx, periody, periodz];
@@ -51,12 +53,15 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0
     halowidths        = [halowidths...];
     cuda_enabled      = false
     amdgpu_enabled    = false
+    oneapi_enabled    = false
     cudaaware_MPI     = [false, false, false]
     amdgpuaware_MPI   = [false, false, false]
+    oneapiaware_MPI   = [false, false, false]
     use_polyester     = [false, false, false]
     if haskey(ENV, "IGG_LOOPVECTORIZATION") error("Environment variable IGG_LOOPVECTORIZATION is not supported anymore. Use IGG_USE_POLYESTER instead.") end
     if haskey(ENV, "IGG_CUDAAWARE_MPI") cudaaware_MPI .= (parse(Int64, ENV["IGG_CUDAAWARE_MPI"]) > 0); end
     if haskey(ENV, "IGG_ROCMAWARE_MPI") amdgpuaware_MPI .= (parse(Int64, ENV["IGG_ROCMAWARE_MPI"]) > 0); end
+    if haskey(ENV, "IGG_ONEAPIAWARE_MPI") oneapiaware_MPI .= (parse(Int64, ENV["IGG_ONEAPIAWARE_MPI"]) > 0); end
     if haskey(ENV, "IGG_USE_POLYESTER") use_polyester .= (parse(Int64, ENV["IGG_USE_POLYESTER"]) > 0); end
     if none(cudaaware_MPI)
         if haskey(ENV, "IGG_CUDAAWARE_MPI_DIMX") cudaaware_MPI[1] = (parse(Int64, ENV["IGG_CUDAAWARE_MPI_DIMX"]) > 0); end
@@ -73,11 +78,17 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0
         if haskey(ENV, "IGG_USE_POLYESTER_DIMY") use_polyester[2] = (parse(Int64, ENV["IGG_USE_POLYESTER_DIMY"]) > 0); end
         if haskey(ENV, "IGG_USE_POLYESTER_DIMZ") use_polyester[3] = (parse(Int64, ENV["IGG_USE_POLYESTER_DIMZ"]) > 0); end
     end
-    if !(device_type in [DEVICE_TYPE_NONE, DEVICE_TYPE_AUTO, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU]) error("Argument `device_type`: invalid value obtained ($device_type). Valid values are: $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU, $DEVICE_TYPE_NONE, $DEVICE_TYPE_AUTO") end
-    if ((device_type == DEVICE_TYPE_AUTO) && cuda_loaded() && cuda_functional() && amdgpu_loaded() && amdgpu_functional()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU extensions are loaded and functional. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end
+    if none(oneapiaware_MPI)
+        if haskey(ENV, "IGG_ONEAPIAWARE_MPI_DIMX") oneapiaware_MPI[1] = (parse(Int64, ENV["IGG_ONEAPIAWARE_MPI_DIMX"]) > 0); end
+        if haskey(ENV, "IGG_ONEAPIAWARE_MPI_DIMY") oneapiaware_MPI[2] = (parse(Int64, ENV["IGG_ONEAPIAWARE_MPI_DIMY"]) > 0); end
+        if haskey(ENV, "IGG_ONEAPIAWARE_MPI_DIMZ") oneapiaware_MPI[3] = (parse(Int64, ENV["IGG_ONEAPIAWARE_MPI_DIMZ"]) > 0); end
+    end
+    if !(device_type in [DEVICE_TYPE_NONE, DEVICE_TYPE_AUTO, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU, DEVICE_TYPE_ONEAPI]) error("Argument `device_type`: invalid value obtained ($device_type). Valid values are: $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU, $DEVICE_TYPE_ONEAPI, $DEVICE_TYPE_NONE, $DEVICE_TYPE_AUTO") end
+    if ((device_type == DEVICE_TYPE_AUTO) && count([cuda_loaded() && cuda_functional(), amdgpu_loaded() && amdgpu_functional(), oneapi_loaded() && oneapi_functional()]) > 1) error("Automatic detection of the device type to be used not possible: more than one of the CUDA, AMDGPU and oneAPI extensions are loaded and functional. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU or $DEVICE_TYPE_ONEAPI.") end # NOTE: automatic detection is ambiguous as soon as two (not only all three) extensions are loaded and functional.
     if (device_type != DEVICE_TYPE_NONE)
         if (device_type in [DEVICE_TYPE_CUDA,   DEVICE_TYPE_AUTO]) cuda_enabled   = cuda_loaded() && cuda_functional()  end # NOTE: cuda could be enabled/disabled depending on some additional criteria.
         if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_loaded() && amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria.
+        if (device_type in [DEVICE_TYPE_ONEAPI,   DEVICE_TYPE_AUTO]) oneapi_enabled   = oneapi_loaded() && oneapi_functional()  end # NOTE: oneapi could be enabled/disabled depending on some additional criteria.
     end
     if (any(nxyz .< 1)) error("Invalid arguments: nx, ny, and nz cannot be less than 1."); end
     if (any(dims .< 0)) error("Invalid arguments: dimx, dimy, and dimz cannot be negative."); end
@@ -105,13 +116,14 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0
         neighbors[:,i] .= MPI.Cart_shift(comm_cart, i-1, disp);
     end
     nxyz_g = dims.*(nxyz.-overlaps) .+ overlaps.*(periods.==0); # E.g. for dimension x with ol=2 and periodx=0: dimx*(nx-2)+2
-    set_global_grid(GlobalGrid(nxyz_g, nxyz, dims, overlaps, halowidths, nprocs, me, coords, neighbors, periods, disp, reorder, comm_cart, cuda_enabled, amdgpu_enabled, cudaaware_MPI, amdgpuaware_MPI, use_polyester, quiet));
+    set_global_grid(GlobalGrid(nxyz_g, nxyz, dims, overlaps, halowidths, nprocs, me, coords, neighbors, periods, disp, reorder, comm_cart, cuda_enabled, amdgpu_enabled, oneapi_enabled, cudaaware_MPI, amdgpuaware_MPI, oneapiaware_MPI, use_polyester, quiet));
     cuda_support_string   = (cuda_enabled && all(cudaaware_MPI))     ? "CUDA-aware"   : (cuda_enabled && any(cudaaware_MPI))     ? "CUDA(-aware)"   : (cuda_enabled)   ? "CUDA"   : "";
     amdgpu_support_string = (amdgpu_enabled && all(amdgpuaware_MPI)) ? "AMDGPU-aware" : (amdgpu_enabled && any(amdgpuaware_MPI)) ? "AMDGPU(-aware)" : (amdgpu_enabled) ? "AMDGPU" : "";
-    gpu_support_string    = join(filter(!isempty, [cuda_support_string, amdgpu_support_string]), ", ");
+    oneapi_support_string   = (oneapi_enabled && all(oneapiaware_MPI))     ? "ONEAPI-aware"   : (oneapi_enabled && any(oneapiaware_MPI))     ? "ONEAPI(-aware)"   : (oneapi_enabled)   ? "ONEAPI"   : "";
+    gpu_support_string    = join(filter(!isempty, [cuda_support_string, amdgpu_support_string, oneapi_support_string]), ", ");
     support_string        = isempty(gpu_support_string) ? "none" : gpu_support_string;
     if (!quiet && me==0) println("Global grid: $(nxyz_g[1])x$(nxyz_g[2])x$(nxyz_g[3]) (nprocs: $nprocs, dims: $(dims[1])x$(dims[2])x$(dims[3]); device support: $support_string)"); end
-    if ((cuda_enabled || amdgpu_enabled) && select_device) _select_device() end
+    if ((cuda_enabled || amdgpu_enabled || oneapi_enabled) && select_device) _select_device() end
     init_timing_functions();
     return me, dims, nprocs, coords, comm_cart; # The typical use case requires only these variables; the remaining can be obtained calling get_global_grid() if needed.
 end
diff --git a/src/select_device.jl b/src/select_device.jl
index 5df62cf..9b41e90 100644
--- a/src/select_device.jl
+++ b/src/select_device.jl
@@ -14,14 +14,17 @@ See also: [`init_global_grid`](@ref)
 """
 function select_device()
     check_initialized()
-    if (cuda_enabled() && amdgpu_enabled()) error("Cannot select a device because both CUDA and AMDGPU are enabled (meaning that both modules were imported before ImplicitGlobalGrid).") end
-    if cuda_enabled() || amdgpu_enabled()
+    if (count([cuda_enabled(), amdgpu_enabled(), oneapi_enabled()]) > 1) error("Cannot select a device because more than one of CUDA, AMDGPU and oneAPI is enabled (meaning that several of these modules were imported before ImplicitGlobalGrid).") end # NOTE: the device selection is ambiguous as soon as two (not only all three) device types are enabled.
+    if cuda_enabled() || amdgpu_enabled() || oneapi_enabled()
         if cuda_enabled()
             @assert cuda_functional()
             nb_devices = nb_cudevices()
         elseif amdgpu_enabled()
             @assert amdgpu_functional()
             nb_devices = nb_rocdevices()
+        elseif oneapi_enabled()
+            @assert oneapi_functional()
+            nb_devices = nb_oneapidevices()
         end
         comm_l = MPI.Comm_split_type(comm(), MPI.COMM_TYPE_SHARED, me())
         if (MPI.Comm_size(comm_l) > nb_devices) error("More processes have been launched per node than there are GPUs available."); end
@@ -29,6 +32,7 @@ function select_device()
         device_id = amdgpu_enabled() ? me_l+1 : me_l
         if     cuda_enabled()   cudevice!(device_id)
         elseif amdgpu_enabled() rocdevice!(device_id)
+        elseif oneapi_enabled() oneapidevice!(device_id)
         end
         return device_id
     else
diff --git a/src/shared.jl b/src/shared.jl
index 7bb108a..6d3f8b4 100644
--- a/src/shared.jl
+++ b/src/shared.jl
@@ -6,19 +6,25 @@ using Base.Threads
 ## HANDLING OF CUDA AND AMDGPU SUPPORT
 
 let
-    global cuda_loaded, cuda_functional, amdgpu_loaded, amdgpu_functional, set_cuda_loaded, set_cuda_functional, set_amdgpu_loaded, set_amdgpu_functional
+    global cuda_loaded, cuda_functional, amdgpu_loaded, amdgpu_functional, oneapi_loaded, oneapi_functional, set_cuda_loaded, set_cuda_functional, set_amdgpu_loaded, set_amdgpu_functional, set_oneapi_loaded, set_oneapi_functional
     _cuda_loaded::Bool        = false
     _cuda_functional::Bool    = false
     _amdgpu_loaded::Bool      = false
     _amdgpu_functional::Bool  = false
+    _oneapi_loaded::Bool      = false
+    _oneapi_functional::Bool  = false
     cuda_loaded()::Bool       = _cuda_loaded
     cuda_functional()::Bool   = _cuda_functional
     amdgpu_loaded()::Bool     = _amdgpu_loaded
     amdgpu_functional()::Bool = _amdgpu_functional
+    oneapi_loaded()::Bool     = _oneapi_loaded
+    oneapi_functional()::Bool = _oneapi_functional
     set_cuda_loaded()         = (_cuda_loaded = is_loaded(Val(:ImplicitGlobalGrid_CUDAExt)))
     set_cuda_functional()     = (_cuda_functional = is_functional(Val(:CUDA)))
     set_amdgpu_loaded()       = (_amdgpu_loaded = is_loaded(Val(:ImplicitGlobalGrid_AMDGPUExt)))
     set_amdgpu_functional()   = (_amdgpu_functional = is_functional(Val(:AMDGPU)))
+    set_oneapi_loaded()       = (_oneapi_loaded = is_loaded(Val(:ImplicitGlobalGrid_ONEAPIExt)))
+    set_oneapi_functional()   = (_oneapi_functional = is_functional(Val(:oneAPI)))   # NOTE: the key is the package name `oneAPI`, matching the method `is_functional(::Val{:oneAPI})` defined by the oneAPI extension.
 end
 
 
@@ -33,7 +39,8 @@ const DEVICE_TYPE_NONE = "none"
 const DEVICE_TYPE_AUTO = "auto"
 const DEVICE_TYPE_CUDA = "CUDA"
 const DEVICE_TYPE_AMDGPU = "AMDGPU"
-const SUPPORTED_DEVICE_TYPES = [DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU]
+const DEVICE_TYPE_ONEAPI = "ONEAPI"
+const SUPPORTED_DEVICE_TYPES = [DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU, DEVICE_TYPE_ONEAPI]
 
 
 ##------
@@ -64,8 +71,10 @@ struct GlobalGrid
     comm::MPI.Comm
     cuda_enabled::Bool
     amdgpu_enabled::Bool
+    oneapi_enabled::Bool
     cudaaware_MPI::Vector{Bool}
     amdgpuaware_MPI::Vector{Bool}
+    oneapiaware_MPI::Vector{Bool}   # NOTE: the field name must be the one read by the accessors `oneapiaware_MPI()` and `oneapiaware_MPI(dim)`.
     use_polyester::Vector{Bool}
     quiet::Bool
 end
@@ -104,19 +113,24 @@ neighbors(dim::Integer)                = global_grid().neighbors[:,dim]
 neighbor(n::Integer, dim::Integer)     = global_grid().neighbors[n,dim]
 cuda_enabled()                         = global_grid().cuda_enabled
 amdgpu_enabled()                       = global_grid().amdgpu_enabled
+oneapi_enabled()                       = global_grid().oneapi_enabled
 cudaaware_MPI()                        = global_grid().cudaaware_MPI
 cudaaware_MPI(dim::Integer)            = global_grid().cudaaware_MPI[dim]
 amdgpuaware_MPI()                      = global_grid().amdgpuaware_MPI
 amdgpuaware_MPI(dim::Integer)          = global_grid().amdgpuaware_MPI[dim]
+oneapiaware_MPI()                      = global_grid().oneapiaware_MPI
+oneapiaware_MPI(dim::Integer)          = global_grid().oneapiaware_MPI[dim]
 use_polyester()                        = global_grid().use_polyester
 use_polyester(dim::Integer)            = global_grid().use_polyester[dim]
 has_neighbor(n::Integer, dim::Integer) = neighbor(n, dim) != MPI.PROC_NULL
 any_array(fields::GGField...)          = any([is_array(A.A) for A in fields])
 any_cuarray(fields::GGField...)        = any([is_cuarray(A.A) for A in fields])
 any_rocarray(fields::GGField...)       = any([is_rocarray(A.A) for A in fields])
+any_onearray(fields::GGField...)       = any([is_onearray(A.A) for A in fields])
 all_arrays(fields::GGField...)         = all([is_array(A.A) for A in fields])
 all_cuarrays(fields::GGField...)       = all([is_cuarray(A.A) for A in fields])
 all_rocarrays(fields::GGField...)      = all([is_rocarray(A.A) for A in fields])
+all_onearrays(fields::GGField...)      = all([is_onearray(A.A) for A in fields])
 is_array(A::GGArray)                   = typeof(A) <: Array
 
 
diff --git a/src/update_halo.jl b/src/update_halo.jl
index e917506..d7c1eaa 100644
--- a/src/update_halo.jl
+++ b/src/update_halo.jl
@@ -35,7 +35,7 @@ function update_halo!(A::Union{GGArray, GGField, GGFieldConvertible}...; dims=(N
 end
 #
 function _update_halo!(fields::GGField...; dims=dims)
-    if (!cuda_enabled() && !amdgpu_enabled() && !all_arrays(fields...)) error("not all arrays are CPU arrays, but no GPU extension is loaded.") end #NOTE: in the following, it is only required to check for `cuda_enabled()`/`amdgpu_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)` or the corresponding for AMDGPU. # NOTE: the case where only one of the two extensions are loaded, but an array dad would be for the other extension is passed is very unlikely and therefore not explicitly checked here (but could be added later).
+    if (!cuda_enabled() && !amdgpu_enabled() && !oneapi_enabled() && !all_arrays(fields...)) error("not all arrays are CPU arrays, but no GPU extension is loaded.") end #NOTE: in the following, it is only required to check for `cuda_enabled()`/`amdgpu_enabled()`/`oneapi_enabled()` when the context does not imply `any_cuarray(fields...)` or `is_cuarray(A)` or the corresponding for AMDGPU or oneAPI. # NOTE: the case where only some of the GPU extensions are loaded, but an array that would be for another (not loaded) extension is passed is very unlikely and therefore not explicitly checked here (but could be added later).
     allocate_bufs(fields...);
     if any_array(fields...) allocate_tasks(fields...); end
     if any_cuarray(fields...) allocate_custreams(fields...); end
@@ -103,6 +103,7 @@ let
         free_update_halo_cpubuffers()
         if (cuda_enabled() && none(cudaaware_MPI()))     free_update_halo_cubuffers() end
         if (amdgpu_enabled() && none(amdgpuaware_MPI())) free_update_halo_rocbuffers() end
+        if (oneapi_enabled() && none(oneapiaware_MPI())) free_update_halo_intelbuffers() end # oneAPI counterpart of `free_update_halo_rocbuffers` (stub declared in ONEAPIExt/defaults.jl).
         GC.gc() #TODO: see how to modify this!
     end
 
@@ -122,21 +123,25 @@ let
             init_bufs_arrays();
             if cuda_enabled() init_cubufs_arrays(); end
             if amdgpu_enabled() init_rocbufs_arrays(); end
+            if oneapi_enabled() init_onebufs_arrays(); end
         end
         init_bufs(T, fields...);
         if cuda_enabled() init_cubufs(T, fields...); end
         if amdgpu_enabled() init_rocbufs(T, fields...); end
+        if oneapi_enabled() init_onebufs(T, fields...); end
         for i = 1:length(fields)
             A, halowidths = fields[i];
             for n = 1:NNEIGHBORS_PER_DIM # Ensure that the buffers are interpreted to contain elements of the same type as the array.
                 reinterpret_bufs(T, i, n);
                 if cuda_enabled() reinterpret_cubufs(T, i, n); end
                 if amdgpu_enabled() reinterpret_rocbufs(T, i, n); end
+                if oneapi_enabled() reinterpret_onebufs(T, i, n); end
             end
             max_halo_elems = maximum((size(A,1)*size(A,2)*halowidths[3], size(A,1)*size(A,3)*halowidths[2], size(A,2)*size(A,3)*halowidths[1]));
             reallocate_undersized_hostbufs(T, i, max_halo_elems, A);
             if (is_cuarray(A) && any(cudaaware_MPI())) reallocate_undersized_cubufs(T, i, max_halo_elems) end
             if (is_rocarray(A) && any(amdgpuaware_MPI())) reallocate_undersized_rocbufs(T, i, max_halo_elems) end
+            if (is_onearray(A) && any(oneapiaware_MPI())) reallocate_undersized_onebufs(T, i, max_halo_elems) end
         end
     end
 
@@ -164,6 +169,7 @@ let
                 reallocate_bufs(T, i, n, max_halo_elems);
                 if (is_cuarray(A) && none(cudaaware_MPI())) reregister_cubufs(T, i, n, sendbufs_raw, recvbufs_raw); end  # Host memory is page-locked (and mapped to device memory) to ensure optimal access performance (from kernel or with 3-D memcopy).
                 if (is_rocarray(A) && none(amdgpuaware_MPI())) reregister_rocbufs(T, i, n, sendbufs_raw, recvbufs_raw); end  # ...
+                if (is_onearray(A) && none(oneapiaware_MPI())) reregister_onebufs(T, i, n, sendbufs_raw, recvbufs_raw); end  # ...
             end
             GC.gc(); # Too small buffers had been replaced with larger ones; free the now unused memory.
         end
@@ -337,7 +343,7 @@ function irecv_halo!(n::Integer, dim::Integer, F::GGField, i::Integer; tag::Inte
     req = MPI.REQUEST_NULL;
     A, halowidths = F;
     if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth...
-        if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A))
+        if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) || (oneapiaware_MPI(dim) && is_onearray(A))
             req = MPI.Irecv!(gpurecvbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm());
         else
             req = MPI.Irecv!(recvbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm());
@@ -350,7 +356,7 @@ function isend_halo(n::Integer, dim::Integer, F::GGField, i::Integer; tag::Integ
     req = MPI.REQUEST_NULL;
     A, halowidths = F;
     if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth...
-        if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A))
+        if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) || (oneapiaware_MPI(dim) && is_onearray(A))
             req = MPI.Isend(gpusendbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm());
         else
             req = MPI.Isend(sendbuf_flat(n,dim,i,F), neighbor(n,dim), tag, comm());
@@ -362,7 +368,7 @@ end
 function sendrecv_halo_local(n::Integer, dim::Integer, F::GGField, i::Integer)
     A, halowidths = F;
     if ol(dim,A) >= 2*halowidths[dim] # There is only a halo and thus a halo update if the overlap is at least 2 times the halowidth...
-        if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A))
+        if (cudaaware_MPI(dim) && is_cuarray(A)) || (amdgpuaware_MPI(dim) && is_rocarray(A)) || (oneapiaware_MPI(dim) && is_onearray(A))
             if n == 1
                 gpumemcopy!(gpurecvbuf_flat(2,dim,i,F), gpusendbuf_flat(1,dim,i,F));
             elseif n == 2