diff --git a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h index 4d7b81e6a2e..4fc274f2445 100644 --- a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h +++ b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h @@ -27,7 +27,7 @@ namespace oneapi::dpl::experimental::kt::gpu::esimd::__impl { //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ template , _ExecutionPolicy&& __exec, _For oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator>(); auto __buf = __keep(__first, __last); + auto __view = __buf.all_view(); + auto __future_obj = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf.all_view()); + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, decltype(__view)>{ + __f, static_cast(__n)}, + __n, __view); return __future_obj; } @@ -65,9 +69,14 @@ __pattern_walk2_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__acc_mode2, _ForwardIterator2>(); auto __buf2 = __keep2(__first2, __first2 + __n); + auto __view1 = __buf1.all_view(); + auto __view2 = __buf2.all_view(); + auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf1.all_view(), __buf2.all_view()); + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2)>{ + __f, static_cast(__n)}, + __n, __view1, __view2); return __future.__make_future(__first2 + __n); } @@ -91,10 +100,15 @@ __pattern_walk3_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator3>(); auto __buf3 = __keep3(__first3, __first3 + __n); - auto __future = - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, - __buf1.all_view(), __buf2.all_view(), __buf3.all_view()); + auto __view1 = __buf1.all_view(); + auto __view2 = __buf2.all_view(); + auto __view3 = __buf3.all_view(); + + auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2), + decltype(__view3)>{__f, static_cast(__n)}, + __n, __view1, __view2, __view3); return __future.__make_future(__first3 + __n); } diff --git a/include/oneapi/dpl/internal/binary_search_impl.h b/include/oneapi/dpl/internal/binary_search_impl.h index ef01be4b161..2727bc6b480 100644 --- a/include/oneapi/dpl/internal/binary_search_impl.h +++ b/include/oneapi/dpl/internal/binary_search_impl.h @@ -37,13 +37,19 @@ enum class search_algorithm binary_search }; -template -struct custom_brick +#if _ONEDPL_BACKEND_SYCL +template +struct custom_brick : 
oneapi::dpl::unseq_backend::walk_scalar_base<_Range> { Comp comp; T size; bool use_32bit_indexing; + custom_brick(Comp comp, T size, bool use_32bit_indexing) + : comp(comp), size(size), use_32bit_indexing(use_32bit_indexing) + { + } + template void search_impl(_ItemId idx, _Acc acc) const @@ -68,17 +74,23 @@ struct custom_brick get<2>(acc[idx]) = (value != end_orig) && (get<1>(acc[idx]) == get<0>(acc[value])); } } - - template + template void - operator()(_ItemId idx, _Acc acc) const + __scalar_path_impl(_IsFull, _ItemId idx, _Acc acc) const { if (use_32bit_indexing) search_impl(idx, acc); else search_impl(idx, acc); } + template + void + operator()(_IsFull __is_full, _ItemId idx, _Acc acc) const + { + __scalar_path_impl(__is_full, idx, acc); + } }; +#endif template @@ -155,7 +167,8 @@ lower_bound_impl(__internal::__hetero_tag<_BackendTag>, Policy&& policy, InputIt const bool use_32bit_indexing = size <= std::numeric_limits::max(); __bknd::__parallel_for( _BackendTag{}, ::std::forward(policy), - custom_brick{comp, size, use_32bit_indexing}, + custom_brick{ + comp, size, use_32bit_indexing}, value_size, zip_vw) .__deferrable_wait(); return result + value_size; @@ -187,7 +200,8 @@ upper_bound_impl(__internal::__hetero_tag<_BackendTag>, Policy&& policy, InputIt const bool use_32bit_indexing = size <= std::numeric_limits::max(); __bknd::__parallel_for( _BackendTag{}, std::forward(policy), - custom_brick{comp, size, use_32bit_indexing}, + custom_brick{ + comp, size, use_32bit_indexing}, value_size, zip_vw) .__deferrable_wait(); return result + value_size; @@ -217,10 +231,11 @@ binary_search_impl(__internal::__hetero_tag<_BackendTag>, Policy&& policy, Input auto result_buf = keep_result(result, result + value_size); auto zip_vw = make_zip_view(input_buf.all_view(), value_buf.all_view(), result_buf.all_view()); const bool use_32bit_indexing = size <= std::numeric_limits::max(); - __bknd::__parallel_for(_BackendTag{}, std::forward(policy), - custom_brick{ - comp, size, use_32bit_indexing}, - value_size, zip_vw) + __bknd::__parallel_for( + _BackendTag{}, std::forward(policy), + custom_brick{ + comp, size, use_32bit_indexing}, + value_size, zip_vw) .__deferrable_wait(); return result + value_size; } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 65bf99c8777..94846618bed 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -56,8 +56,12 @@ __pattern_walk1(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator>(); auto __buf = __keep(__first, __last); + auto __view = __buf.all_view(); oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, __exec, unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf.all_view()) + _BackendTag{}, __exec, + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, decltype(__view)>{ + __f, static_cast(__n)}, + __n, __view) .__deferrable_wait(); } @@ -101,9 +105,14 @@ __pattern_walk2(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__acc_mode2, _ForwardIterator2>(); auto __buf2 = __keep2(__first2, __first2 + __n); + auto __view1 = __buf1.all_view(); + auto __view2 = __buf2.all_view(); + auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, 
::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf1.all_view(), __buf2.all_view()); + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2)>{ + __f, static_cast(__n)}, + __n, __view1, __view2); // Call no wait, wait or deferrable wait depending on _WaitMode __future.wait(_WaitMode{}); @@ -130,10 +139,28 @@ _ForwardIterator2 __pattern_swap(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _Function __f) { - return __pattern_walk2( - __tag, ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __f); + auto __n = __last1 - __first1; + if (__n == 0) + return __first2; + + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator1>(); + auto __buf1 = __keep1(__first1, __last1); + + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator2>(); + auto __buf2 = __keep2(__first2, __first2 + __n); + + auto __view1 = __buf1.all_view(); + auto __view2 = __buf2.all_view(); + + auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::__brick_swap<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2)>{ + __f, static_cast(__n)}, + __n, __view1, __view2); + __future.wait(__par_backend_hetero::__deferrable_mode{}); + return __first2 + __n; } //------------------------------------------------------------------------ @@ -160,9 +187,15 @@ __pattern_walk3(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt auto __keep3 = oneapi::dpl::__ranges::__get_sycl_range<__acc_mode3, _ForwardIterator3>(); auto __buf3 = __keep3(__first3, __first3 + __n); - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, - __buf1.all_view(), __buf2.all_view(), __buf3.all_view()) + auto __view1 = __buf1.all_view(); + auto __view2 = __buf2.all_view(); + auto __view3 = __buf3.all_view(); + + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2), + decltype(__view3)>{__f, static_cast(__n)}, + __n, __view1, __view2, __view3) .__deferrable_wait(); return __first3 + __n; @@ -1564,9 +1597,10 @@ __pattern_reverse(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator>(); auto __buf = __keep(__first, __last); oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::__reverse_functor::difference_type>{__n}, __n / 2, - __buf.all_view()) + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::__reverse_functor::difference_type, + decltype(__buf.all_view())>{__n}, + __n / 2, __buf.all_view()) .__deferrable_wait(); } @@ -1589,10 +1623,13 @@ __pattern_reverse_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Bi auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator>(); auto __buf2 = __keep2(__result, __result + __n); + auto __view1 = __buf1.all_view(); + 
auto __view2 = __buf2.all_view(); oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::__reverse_copy::difference_type>{__n}, - __n, __buf1.all_view(), __buf2.all_view()) + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::__reverse_copy::difference_type, + decltype(__view1), decltype(__view2)>{__n}, + __n, __view1, __view2) .__deferrable_wait(); return __result + __n; @@ -1626,24 +1663,27 @@ __pattern_rotate(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator auto __buf = __keep(__first, __last); auto __temp_buf = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, _Tp>(__exec, __n); + auto __view = __buf.all_view(); auto __temp_rng_w = oneapi::dpl::__ranges::all_view<_Tp, __par_backend_hetero::access_mode::write>(__temp_buf.get_buffer()); const auto __shift = __new_first - __first; oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__rotate_wrapper>(__exec), - unseq_backend::__rotate_copy::difference_type>{__n, __shift}, __n, - __buf.all_view(), __temp_rng_w); + unseq_backend::__rotate_copy::difference_type, decltype(__view), + decltype(__temp_rng_w)>{__n, __shift}, + __n, __view, __temp_rng_w); //An explicit wait isn't required here because we are working with a temporary sycl::buffer and sycl accessors and //SYCL runtime makes a dependency graph to prevent the races between two __parallel_for patterns. using _Function = __brick_move<__hetero_tag<_BackendTag>, _ExecutionPolicy>; - auto __brick = unseq_backend::walk_n<_ExecutionPolicy, _Function>{_Function{}}; - auto __temp_rng_rw = oneapi::dpl::__ranges::all_view<_Tp, __par_backend_hetero::access_mode::read_write>(__temp_buf.get_buffer()); - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __brick, + auto __brick = + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__temp_rng_rw), + decltype(__buf.all_view())>{_Function{}, static_cast(__n)}; + oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __brick, __n, __temp_rng_rw, __buf.all_view()) .__deferrable_wait(); @@ -1673,13 +1713,16 @@ __pattern_rotate_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Bid oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator>(); auto __buf2 = __keep2(__result, __result + __n); + auto __view1 = __buf1.all_view(); + auto __view2 = __buf2.all_view(); + const auto __shift = __new_first - __first; oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::__rotate_copy::difference_type>{__n, - __shift}, - __n, __buf1.all_view(), __buf2.all_view()) + unseq_backend::__rotate_copy::difference_type, + decltype(__view1), decltype(__view2)>{__n, __shift}, + __n, __view1, __view2) .__deferrable_wait(); return __result + __n; @@ -1936,19 +1979,22 @@ __pattern_shift_left(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Rang if (__n >= __mid) { using _Function = __brick_move<__hetero_tag<_BackendTag>, _ExecutionPolicy>; - auto __brick = oneapi::dpl::unseq_backend::walk_n<_ExecutionPolicy, _Function>{_Function{}}; //TODO: to consider use just "read" access mode for a source range and just "write" - for a destination range. 
auto __src = oneapi::dpl::__ranges::drop_view_simple<_Range, _DiffType>(__rng, __n); auto __dst = oneapi::dpl::__ranges::take_view_simple<_Range, _DiffType>(__rng, __size_res); + auto __brick = + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__src), decltype(__dst)>{ + _Function{}, static_cast(__size_res)}; + oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __brick, __size_res, __src, __dst) .__deferrable_wait(); } else //2. n < size/2; 'n' parallel copying { - auto __brick = unseq_backend::__brick_shift_left<_ExecutionPolicy, _DiffType>{__size, __n}; + auto __brick = unseq_backend::__brick_shift_left<_ExecutionPolicy, _DiffType, decltype(__rng)>{__size, __n}; oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__shift_left_right>( diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index da7820b91a2..d0af8d513ff 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -57,10 +57,35 @@ __pattern_walk_n(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Function auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); if (__n > 0) { - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, - ::std::forward<_Ranges>(__rngs)...) - .__deferrable_wait(); + constexpr std::size_t __num_ranges = sizeof...(_Ranges); + static_assert(__num_ranges <= 3, "__pattern_walk_n only supports up to three packed range parameters"); + if constexpr (__num_ranges == 1) + { + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, std::decay_t<_Ranges>...>{ + __f, static_cast(__n)}, + __n, ::std::forward<_Ranges>(__rngs)...) + .__deferrable_wait(); + } + else if constexpr (__num_ranges == 2) + { + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, std::decay_t<_Ranges>...>{ + __f, static_cast(__n)}, + __n, ::std::forward<_Ranges>(__rngs)...) + .__deferrable_wait(); + } + else + { + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, std::decay_t<_Ranges>...>{ + __f, static_cast(__n)}, + __n, ::std::forward<_Ranges>(__rngs)...) 
+ .__deferrable_wait(); + } } } @@ -147,20 +172,27 @@ __pattern_swap(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Rang { if (__rng1.size() <= __rng2.size()) { - oneapi::dpl::__internal::__ranges::__pattern_walk_n( - __tag, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap1_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - __f, __rng1, __rng2); - return __rng1.size(); + std::size_t __n = __rng1.size(); + auto __exec1 = oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap1_wrapper>( + std::forward<_ExecutionPolicy>(__exec)); + auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::move(__exec1), + unseq_backend::__brick_swap, std::decay_t<_Range2>>{ + __f, __n}, + __n, __rng1, __rng2); + __future.wait(__par_backend_hetero::__deferrable_mode{}); + return __n; } - - oneapi::dpl::__internal::__ranges::__pattern_walk_n( - __tag, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap2_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - __f, __rng2, __rng1); - return __rng2.size(); + std::size_t __n = __rng2.size(); + auto __exec2 = + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap2_wrapper>(std::forward<_ExecutionPolicy>(__exec)); + auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::move(__exec2), + unseq_backend::__brick_swap, std::decay_t<_Range1>>{__f, + __n}, + __n, __rng2, __rng1); + __future.wait(__par_backend_hetero::__deferrable_mode{}); + return __n; } //------------------------------------------------------------------------ @@ -625,8 +657,9 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_wrapper>( std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::walk_n<_ExecutionPolicy, _CopyBrick>{_CopyBrick{}}, __n, std::forward<_Range1>(__rng), - std::forward<_Range2>(__result)) + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _CopyBrick, std::decay_t<_Range1>, + std::decay_t<_Range2>>{_CopyBrick{}, static_cast(__n)}, + __n, std::forward<_Range1>(__rng), std::forward<_Range2>(__result)) .get(); return 1; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 96d63e33aee..3fb809f78d6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -36,6 +36,7 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" +#include "parallel_backend_sycl_for.h" #include "parallel_backend_sycl_reduce.h" #include "parallel_backend_sycl_merge.h" #include "parallel_backend_sycl_merge_sort.h" @@ -218,57 +219,11 @@ class __scan_single_wg_dynamic_kernel; template class __scan_copy_single_wg_kernel; -//------------------------------------------------------------------------ -// parallel_for - async pattern -//------------------------------------------------------------------------ - -// Use the trick with incomplete type and partial specialization to deduce the kernel name -// as the parameter pack that can be empty (for unnamed kernels) or contain exactly one -// type (for explicitly specified name by the user) -template -struct __parallel_for_submitter; - -template -struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> -{ - template - auto - operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... 
__rngs) const - { - assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); - _PRINT_INFO_IN_DEBUG_MODE(__exec); - auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) { - //get an access to data under SYCL buffer: - oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - - __cgh.parallel_for<_Name...>(sycl::range(__count), [=](sycl::item __item_id) { - auto __idx = __item_id.get_linear_id(); - __brick(__idx, __rngs...); - }); - }); - return __future(__event); - } -}; - -//General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, -//for some algorithms happens that size of processing range is n, but amount of iterations is n/2. -template -auto -__parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Fp __brick, _Index __count, - _Ranges&&... __rngs) -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _ForKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_CustomName>; - - return __parallel_for_submitter<_ForKernel>()(::std::forward<_ExecutionPolicy>(__exec), __brick, __count, - ::std::forward<_Ranges>(__rngs)...); -} - //------------------------------------------------------------------------ // parallel_transform_scan - async pattern //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_scan_submitter; @@ -2184,7 +2139,7 @@ struct __partial_merge_kernel } }; -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_partial_sort_submitter; @@ -2419,7 +2374,8 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ oneapi::dpl::__par_backend_hetero::__parallel_for( oneapi::dpl::__internal::__device_backend_tag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), __intermediate_result_end, + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n), _Range2>(__binary_op, __n), + __intermediate_result_end, oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __intermediate_result_end), std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) @@ -2465,7 +2421,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ oneapi::dpl::__internal::__device_backend_tag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end), _Range4>( __binary_op, __intermediate_result_end), __result_end, oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __result_end), diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h new file mode 100644 index 00000000000..46e36d1fafb --- /dev/null +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h @@ -0,0 +1,208 @@ +// -*- C++ -*- +//===-- parallel_backend_sycl_for.h --------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_FOR_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_FOR_H + +#include +#include +#include +#include + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" +#include "execution_sycl_defs.h" +#include "unseq_backend_sycl.h" +#include "utils_ranges_sycl.h" + +#include "sycl_traits.h" //SYCL traits specialization for some oneDPL types. + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ + +template +class __parallel_for_small_kernel; + +template +class __parallel_for_large_kernel; + +//------------------------------------------------------------------------ +// parallel_for - async pattern +//------------------------------------------------------------------------ + +// Use the trick with incomplete type and partial specialization to deduce the kernel name +// as the parameter pack that can be empty (for unnamed kernels) or contain exactly one +// type (for explicitly specified name by the user) +template +struct __parallel_for_small_submitter; + +template +struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const + { + assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); + _PRINT_INFO_IN_DEBUG_MODE(__exec); + auto __event = __exec.queue().submit([__rngs..., __brick, __count](sycl::handler& __cgh) { + //get an access to data under SYCL buffer: + oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); + + __cgh.parallel_for<_Name...>(sycl::range(__count), [=](sycl::item __item_id) { + auto __idx = __item_id.get_linear_id(); + // For small inputs, do not vectorize or perform multiple iterations per work item. Spread input evenly + // across compute units. + __brick.__scalar_path_impl(std::true_type{}, __idx, __rngs...); + }); + }); + return __future(__event); + } +}; + +template +struct __parallel_for_large_submitter; + +template +struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _RangeTypes...> +{ + // Limit the work-group size to 512 which has empirically yielded the best results across different architectures. + static constexpr std::uint16_t __max_work_group_size = 512; + + // SPIR-V compilation targets show best performance with a stride of the sub-group size. + // Other compilation targets perform best with a work-group size stride. This utility can only be called from the + // device. 
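+    // Illustrative example (the concrete sizes are hypothetical, not mandated by this patch): with
+    // __adj_elements_per_work_item == 4 and a sub-group size of 32, the SPIR-V branch below gives each sub-group a
+    // block of 32 * 4 = 128 contiguous elements and advances it by 128 elements per iteration, so neighbouring
+    // work items in a sub-group always touch adjacent 4-element chunks. The non-SPIR-V branch instead strides by the
+    // whole work group, e.g. 512 * 4 = 2048 elements per iteration for a 512-wide work group.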
+ static inline std::tuple + __stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, + std::size_t __adj_elements_per_work_item, std::size_t __work_group_size) + { + const std::size_t __work_group_id = __item.get_group().get_group_linear_id(); + if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) + { + const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group(); + const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + + const std::size_t __sub_group_start_idx = + __iters_per_work_item * __adj_elements_per_work_item * + (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); + const bool __is_full_sub_group = + __sub_group_start_idx + __iters_per_work_item * __adj_elements_per_work_item * __sub_group_size <= + __count; + const std::size_t __work_item_idx = + __sub_group_start_idx + __adj_elements_per_work_item * __sub_group_local_id; + return std::tuple(__work_item_idx, __adj_elements_per_work_item * __sub_group_size, __is_full_sub_group); + } + else + { + const std::size_t __work_group_start_idx = + __work_group_id * __work_group_size * __iters_per_work_item * __adj_elements_per_work_item; + const std::size_t __work_item_idx = + __work_group_start_idx + __item.get_local_linear_id() * __adj_elements_per_work_item; + const bool __is_full_work_group = + __work_group_start_idx + __iters_per_work_item * __work_group_size * __adj_elements_per_work_item <= + __count; + return std::tuple(__work_item_idx, __work_group_size * __adj_elements_per_work_item, __is_full_work_group); + } + } + + // Once there is enough work to launch a group on each compute unit with our chosen __iters_per_item, + // then we should start using this code path. + template + static std::size_t + __estimate_best_start_size(const _ExecutionPolicy& __exec, _Fp __brick) + { + const std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + return __work_group_size * _Fp::__preferred_iters_per_item * __max_cu; + } + + template + auto + operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const + { + assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) 
> 0); + const std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + _PRINT_INFO_IN_DEBUG_MODE(__exec); + auto __event = __exec.queue().submit([__rngs..., __brick, __work_group_size, __count](sycl::handler& __cgh) { + //get an access to data under SYCL buffer: + oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); + constexpr static std::uint16_t __iters_per_work_item = _Fp::__preferred_iters_per_item; + const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div( + __count, (__work_group_size * _Fp::__preferred_vector_size * __iters_per_work_item)); + __cgh.parallel_for<_Name...>( + sycl::nd_range(sycl::range<1>(__num_groups * __work_group_size), sycl::range<1>(__work_group_size)), + [=](sycl::nd_item __item) { + auto [__idx, __stride, __is_full] = __stride_recommender( + __item, __count, __iters_per_work_item, _Fp::__preferred_vector_size, __work_group_size); + __strided_loop<__iters_per_work_item> __execute_loop{static_cast(__count)}; + if (__is_full) + { + __execute_loop(std::true_type{}, __idx, __stride, __brick, __rngs...); + } + // If we are not full, then take this branch only if there is work to process. + else if (__idx < __count) + { + __execute_loop(std::false_type{}, __idx, __stride, __brick, __rngs...); + } + }); + }); + return __future(__event); + } +}; + +//General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, +//for some algorithms happens that size of processing range is n, but amount of iterations is n/2. +template +auto +__parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Fp __brick, _Index __count, + _Ranges&&... __rngs) +{ + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + using _ForKernelSmall = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_small_kernel<_CustomName>>; + using _ForKernelLarge = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>; + + using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>; + using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>; + // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a + // single kernel that worsen performance for small cases. If the number of iterations of the large submitter is 1, + // then only compile the basic kernel as the two versions are effectively the same. 
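+    // Worked example (device numbers are illustrative): for a 4-byte element type no vectorization is applied, so
+    // _Fp::__preferred_vector_size == 1 and _Fp::__preferred_iters_per_item == 16 / 4 == 4. On a device with
+    // 512-wide work groups and 32 compute units, __estimate_best_start_size returns 512 * 4 * 32 == 65536, so
+    // inputs below ~64K elements take the small submitter and larger inputs take the strided large submitter.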
+ if constexpr (_Fp::__preferred_iters_per_item > 1 || _Fp::__preferred_vector_size > 1) + { + if (__count >= __large_submitter::__estimate_best_start_size(__exec, __brick)) + { + + return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } + } + return __small_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); +} + +} // namespace __par_backend_hetero +} // namespace dpl +} // namespace oneapi + +#endif diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h index 7baee78b1b1..613cd07f6f2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h @@ -48,7 +48,7 @@ namespace __par_backend_hetero //General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, //for some algorithms happens that size of processing range is n, but amount of iterations is n/2. -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_for_fpga_submitter; @@ -71,7 +71,7 @@ struct __parallel_for_fpga_submitter<__internal::__optional_kernel_name<_Name... #pragma unroll(::std::decay <_ExecutionPolicy>::type::unroll_factor) for (auto __idx = 0; __idx < __count; ++__idx) { - __brick(__idx, __rngs...); + __brick.__scalar_path_impl(std::true_type{}, __idx, __rngs...); } }); }); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index cadff26a15d..9c331148f3c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -112,7 +112,7 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, const _I } } -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_merge_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index edad63d2a79..23e38268bf9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -111,7 +111,7 @@ __device_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Size _ //------------------------------------------------------------------------ // parallel_transform_reduce - async patterns -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ // Parallel_transform_reduce for a small arrays using a single work group. 
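As a standalone illustration of the dispatch introduced in parallel_backend_sycl_for.h above, the following sketch reproduces the small/large split outside of oneDPL. The queue setup, work-group size, iteration count and the assumption that the data pointer is USM device-accessible are all assumptions of the example, not values taken from the patch; the large path mirrors only the non-SPIR-V branch of __stride_recommender and omits vectorization.

    // Minimal sketch of the small/large parallel_for dispatch (illustrative only).
    #include <sycl/sycl.hpp>
    #include <cstddef>

    template <typename F>
    void
    parallel_for_sketch(sycl::queue q, F f, std::size_t n, int* data) // data: USM device-accessible
    {
        constexpr std::size_t iters_per_item = 4; // analogue of _Fp::__preferred_iters_per_item
        const std::size_t wg_size = 256;          // illustrative; the patch caps the work-group size at 512
        const std::size_t max_cu = q.get_device().get_info<sycl::info::device::max_compute_units>();

        if (n < wg_size * iters_per_item * max_cu)
        {
            // Small path: one element per work item, no striding or vectorization.
            q.parallel_for(sycl::range<1>(n), [=](sycl::item<1> it) { f(data[it.get_linear_id()]); });
        }
        else
        {
            // Large path: each work item walks iters_per_item elements with a work-group stride.
            const std::size_t num_groups = (n + wg_size * iters_per_item - 1) / (wg_size * iters_per_item);
            q.parallel_for(sycl::nd_range<1>(sycl::range<1>(num_groups * wg_size), sycl::range<1>(wg_size)),
                           [=](sycl::nd_item<1> it) {
                               std::size_t idx = it.get_group(0) * wg_size * iters_per_item + it.get_local_id(0);
                               for (std::size_t i = 0; i < iters_per_item && idx < n; ++i, idx += wg_size)
                                   f(data[idx]);
                           });
        }
        q.wait();
    }

The real implementation additionally selects the stride per compilation target and vectorizes small element types through the bricks defined in unseq_backend_sycl.h below.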
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index f4eb557170e..21970044d7d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -834,6 +834,172 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, } }; +// For use with __lazy_ctor_storage +struct __lazy_load_op +{ + template + void + operator()(_IdxType1 __idx_source, _IdxType2 __idx_dest, _SourceAcc __source_acc, _DestAcc __dest_acc) const + { + __dest_acc[__idx_dest].__setup(__source_acc[__idx_source]); + } +}; + +template +struct __vector_load +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + std::size_t __n; + template + void + operator()(/*__is_full*/ std::true_type, _IdxType __start_idx, _LoadOp __load_op, _Rngs&&... __rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size; ++__i) + __load_op(__start_idx + __i, __i, __rngs...); + } + + template + void + operator()(/*__is_full*/ std::false_type, _IdxType __start_idx, _LoadOp __load_op, _Rngs&&... __rngs) const + { + std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__n - __start_idx}); + for (std::uint8_t __i = 0; __i < __elements; ++__i) + __load_op(__start_idx + __i, __i, __rngs...); + } +}; + +// For use with __lazy_ctor_storage +template +struct __lazy_store_transform_op +{ + _TransformOp __transform; + // Unary transformations into an output buffer + template + void + operator()(_IdxType1 __idx_source, _IdxType2 __idx_dest, _SourceAcc __source_acc, _DestAcc __dest_acc) const + { + __transform(__source_acc[__idx_source].__v, __dest_acc[__idx_dest]); + } + // Binary transformations into an output buffer + template + void + operator()(_IdxType1 __idx_source, _IdxType2 __idx_dest, _Source1Acc __source1_acc, _Source2Acc __source2_acc, + _DestAcc __dest_acc) const + { + __transform(__source1_acc[__idx_source].__v, __source2_acc[__idx_source].__v, __dest_acc[__idx_dest]); + } +}; + +template +struct __vector_walk +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + std::size_t __n; + + template + void + operator()(std::true_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size; ++__i) + { + __f(__rngs[__idx + __i]...); + } + } + // For a non-full vector path, process it sequentially. This will always be the last sub or work group + // if it does not evenly divide into input + template + void + operator()(std::false_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const + { + std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__n - __idx}); + for (std::uint8_t __i = 0; __i < __elements; ++__i) + { + __f(__rngs[__idx + __i]...); + } + } +}; + +template +struct __vector_store +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + std::size_t __n; + + template + void + operator()(std::true_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs&&... __rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size; ++__i) + __store_op(__i, __start_idx + __i, __rngs...); + } + template + void + operator()(std::false_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs&&... 
__rngs) const + { + std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__n - __start_idx}); + for (std::uint8_t __i = 0; __i < __elements; ++__i) + __store_op(__i, __start_idx + __i, __rngs...); + } +}; + +template +struct __vector_reverse +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + template + void + operator()(/*__is_full*/ std::true_type, const _Idx __elements_to_process, _Array __array) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size / 2; ++__i) + std::swap(__array[__i].__v, __array[__vec_size - __i - 1].__v); + } + template + void + operator()(/*__is_full*/ std::false_type, const _Idx __elements_to_process, _Array __array) const + { + for (std::uint8_t __i = 0; __i < __elements_to_process / 2; ++__i) + std::swap(__array[__i].__v, __array[__elements_to_process - __i - 1].__v); + } +}; + +// Processes a loop with a given stride. Intended to be used with sub-group / work-group strides for good memory access patterns +// (potentially with vectorization) +template +struct __strided_loop +{ + std::size_t __n; + template + void + operator()(/*__is_full*/ std::true_type, _IdxType __idx, std::uint16_t __stride, _LoopBodyOp __loop_body_op, + _Ranges&&... __rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __num_strides; ++__i) + { + __loop_body_op(std::true_type{}, __idx, __rngs...); + __idx += __stride; + } + } + template + void + operator()(/*__is_full*/ std::false_type, _IdxType __idx, std::uint16_t __stride, _LoopBodyOp __loop_body_op, + _Ranges&&... __rngs) const + { + // Constrain the number of iterations as much as possible and then pass the knowledge that we are not a full loop to the body operation + const std::uint8_t __adjusted_iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__n - __idx, __stride); + for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i) + { + __loop_body_op(std::false_type{}, __idx, __rngs...); + __idx += __stride; + } + } +}; + } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 7d3fd829cc5..4658495e726 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -368,7 +368,16 @@ namespace oneapi::dpl::unseq_backend template struct walk_n; -template +template +struct walk1_vector_or_scalar; + +template +struct walk2_vectors_or_scalars; + +template +struct walk3_vectors_or_scalars; + +template struct walk_adjacent_difference; template class __brick_set_op; -template +template struct __brick_reduce_idx; } // namespace oneapi::dpl::unseq_backend @@ -429,9 +438,30 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backen { }; -template +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk1_vector_or_scalar, + _ExecutionPolicy, _F, _Range)> + : oneapi::dpl::__internal::__are_all_device_copyable<_F> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk2_vectors_or_scalars, + _ExecutionPolicy, _F, _Range1, _Range2)> + : oneapi::dpl::__internal::__are_all_device_copyable<_F> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk3_vectors_or_scalars, + _ExecutionPolicy, _F, _Range1, _Range2, _Range3)> + : 
oneapi::dpl::__internal::__are_all_device_copyable<_F> +{ +}; + +template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk_adjacent_difference, - _ExecutionPolicy, _F)> + _ExecutionPolicy, _F, _Range1, _Range2)> : oneapi::dpl::__internal::__are_all_device_copyable<_F> { }; @@ -543,9 +573,9 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backen { }; -template +template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::__brick_reduce_idx, _BinaryOperator, - _Size)> + _Size, _Range)> : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryOperator, _Size> { }; @@ -555,7 +585,7 @@ namespace oneapi::dpl::internal enum class search_algorithm; -template +template struct custom_brick; template @@ -575,8 +605,8 @@ class transform_if_stencil_fun; } // namespace oneapi::dpl::internal -template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::internal::custom_brick, Comp, T, func)> +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::internal::custom_brick, Comp, T, _Range, func)> : oneapi::dpl::__internal::__are_all_device_copyable { }; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index 2caa6add318..0678907b953 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -22,6 +22,7 @@ #include "../../onedpl_config.h" #include "../../utils.h" #include "sycl_defs.h" +#include "utils_ranges_sycl.h" namespace oneapi { @@ -109,6 +110,194 @@ struct walk_n } }; +// Base class which establishes tuning parameters including vectorization / scalar path decider at compile time +// for walk / for based algorithms +template +struct walk_vector_or_scalar_base +{ + private: + using _ValueTypes = std::tuple...>; + constexpr static std::uint8_t __min_type_size = oneapi::dpl::__internal::__min_nested_type_size<_ValueTypes>::value; + // Empirically determined 'bytes-in-flight' to maximize bandwidth utilization + constexpr static std::uint8_t __bytes_per_item = 16; + // Maximum size supported by compilers to generate vector instructions + constexpr static std::uint8_t __max_vector_size = 4; + + public: + constexpr static bool __can_vectorize = + (oneapi::dpl::__ranges::__is_vectorizable_range>::value && ...) && + (std::is_fundamental_v> && ...) && __min_type_size < 4; + // Vectorize for small types, so we generate 128-byte load / stores in a sub-group + constexpr static std::uint8_t __preferred_vector_size = + __can_vectorize ? oneapi::dpl::__internal::__dpl_ceiling_div(__max_vector_size, __min_type_size) : 1; + constexpr static std::uint8_t __preferred_iters_per_item = + __bytes_per_item / (__min_type_size * __preferred_vector_size); +}; + +// Path that intentionally disables vectorization for algorithms with a scattered access pattern (e.g. 
binary_search) +template +struct walk_scalar_base +{ + private: + using _ValueTypes = std::tuple...>; + constexpr static std::uint8_t __min_type_size = oneapi::dpl::__internal::__min_nested_type_size<_ValueTypes>::value; + constexpr static std::uint8_t __bytes_per_item = 16; + + public: + constexpr static bool __can_vectorize = false; + // With no vectorization, the vector size is 1 + constexpr static std::uint8_t __preferred_vector_size = 1; + // To achieve full bandwidth utilization, multiple iterations need to be processed by a work item + constexpr static std::uint8_t __preferred_iters_per_item = + __bytes_per_item / (__min_type_size * __preferred_vector_size); +}; + +template +struct walk1_vector_or_scalar : public walk_vector_or_scalar_base<_Range> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range>; + _F __f; + std::size_t __n; + + public: + walk1_vector_or_scalar(_F __f, std::size_t __n) : __f(__f), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const _ItemId __idx, _Range __rng) const + { + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}(__is_full, __idx, __f, + __rng); + } + + // _IsFull is ignored here. We assume that boundary checking has been already performed for this index. + template + void + __scalar_path_impl(_IsFull, const _ItemId __idx, _Range __rng) const + { + __f(__rng[__idx]); + } + + template + void + operator()(_IsFull __is_full, const _ItemId __idx, _Range __rng) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng); + else + __scalar_path_impl(__is_full, __idx, __rng); + } +}; + +template +struct walk2_vectors_or_scalars : public walk_vector_or_scalar_base<_Range1, _Range2> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + _F __f; + std::size_t __n; + + public: + walk2_vectors_or_scalars(_F __f, std::size_t __n) : __f(__f), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2) const + { + using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>; + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1> __rng1_vector[__base_t::__preferred_vector_size]; + // 1. Load input into a vector + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng1, __rng1_vector); + // 2. Apply functor to vector and store into global memory + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __rng1_vector, + __rng2); + // 3. Explicitly call destructor of lazy union type + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n - __idx}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1>::__get_callable_deleter(), + __rng1_vector); + } + + // _IsFull is ignored here. We assume that boundary checking has been already performed for this index. 
+ template + void + __scalar_path_impl(_IsFull, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2) const + { + + __f(__rng1[__idx], __rng2[__idx]); + } + + template + void + operator()(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); + } +}; + +template +struct walk3_vectors_or_scalars : public walk_vector_or_scalar_base<_Range1, _Range2, _Range3> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2, _Range3>; + _F __f; + std::size_t __n; + + public: + walk3_vectors_or_scalars(_F __f, std::size_t __n) : __f(__f), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2, _Range3 __rng3) const + { + using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>; + using _ValueType2 = oneapi::dpl::__internal::__value_t<_Range2>; + + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1> __rng1_vector[__base_t::__preferred_vector_size]; + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType2> __rng2_vector[__base_t::__preferred_vector_size]; + // 1. Load inputs into vectors + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng1, __rng1_vector); + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng2, __rng2_vector); + // 2. Apply binary functor to vector and store into global memory + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __rng1_vector, + __rng2_vector, __rng3); + // 3. Explicitly call destructors of lazy union type + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n - __idx}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1>::__get_callable_deleter(), + __rng1_vector); + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n - __idx}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType2>::__get_callable_deleter(), + __rng2_vector); + } + + // _IsFull is ignored here. We assume that boundary checking has been already performed for this index. + template + void + __scalar_path_impl(_IsFull, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2, _Range3 __rng3) const + { + + __f(__rng1[__idx], __rng2[__idx], __rng3[__idx]); + } + + template + void + operator()(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2, _Range3 __rng3) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2, __rng3); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2, __rng3); + } +}; + // If read accessor returns temporary value then __no_op returns lvalue reference to it. // After temporary value destroying it will be a reference on invalid object. 
// So let's don't call functor in case of __no_op @@ -129,22 +318,62 @@ struct walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op> // walk_adjacent_difference //------------------------------------------------------------------------ -template -struct walk_adjacent_difference +template +struct walk_adjacent_difference : public walk_vector_or_scalar_base<_Range1, _Range2> { + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; _F __f; + std::size_t __n; - template + public: + walk_adjacent_difference(_F __f, std::size_t __n) : __f(__f), __n(__n) {} + + template void - operator()(const _ItemId __idx, const _Acc1& _acc_src, _Acc2& _acc_dst) const + __scalar_path_impl(_IsFull, const _ItemId __idx, const _Range1 __rng1, _Range2 __rng2) const { - using ::std::get; - // just copy an element if it is the first one if (__idx == 0) - _acc_dst[__idx] = _acc_src[__idx]; + __rng2[__idx] = __rng1[__idx]; + else + __f(__rng1[__idx + (-1)], __rng1[__idx], __rng2[__idx]); + } + template + void + __vector_path_impl(_IsFull __is_full, const _ItemId __idx, const _Range1 __rng1, _Range2 __rng2) const + { + using _ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng1_vector[__base_t::__preferred_vector_size + 1]; + // 1. Establish a vector of __preferred_vector_size + 1 where a scalar load is performed on the first element + // followed by a vector load of the specified length. + if (__idx != 0) + __rng1_vector[0].__setup(__rng1[__idx - 1]); + else + __rng1_vector[0].__setup(__rng1[0]); + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng1, &__rng1_vector[1]); + // 2. Perform a vector store of __preferred_vector_size adjacent differences. + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __rng1_vector, + &__rng1_vector[1], __rng2); + // A dummy value is first written to global memory followed by an overwrite for the first index. Pulling the vector loads / stores into an if branch + // to better handle this results in performance degradation. + if (__idx == 0) + __rng2[0] = __rng1_vector[0].__v; + // 3. 
Delete temporary storage + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n - __idx}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(), + __rng1_vector); + } + template + void + operator()(_IsFull __is_full, const _ItemId __idx, const _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); else - __f(_acc_src[__idx + (-1)], _acc_src[__idx], _acc_dst[__idx]); + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); } }; @@ -924,47 +1153,208 @@ struct __brick_includes //------------------------------------------------------------------------ // reverse //------------------------------------------------------------------------ -template -struct __reverse_functor +template +struct __reverse_functor : public walk_vector_or_scalar_base<_Range> { + private: + using __base_t = walk_vector_or_scalar_base<_Range>; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range>; _Size __size; - template + + public: + __reverse_functor(_Size __size) : __size(__size) {} + + template void - operator()(const _Idx __idx, _Accessor& __acc) const + __vector_path_impl(_IsFull __is_full, const _Idx __left_start_idx, _Range __rng) const + { + std::size_t __n = __size; + std::size_t __midpoint = __size / 2; + // If our start is passed the midpoint, then immediately leave as it is guaranteed to be processed by another + // work-item. There may be some double processing (< 4 elements) between left and right vectors at the + // "crossover" point within a work item, but allowing this is to happen is likely more performant than + // additional branching for each work item (see reverse_copy). + if (__left_start_idx >= __midpoint) + return; + + // 1. Load two vectors that we want to swap: one from the left half of the buffer and one from the right + const _Idx __right_start_idx = __size - __left_start_idx - __base_t::__preferred_vector_size; + + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng_left_vector[__base_t::__preferred_vector_size]; + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng_right_vector[__base_t::__preferred_vector_size]; + + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __left_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng, __rng_left_vector); + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __right_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng, + __rng_right_vector); + // 2. Reverse vectors in registers. Note that due to indices we have chosen, there will always be a full vector of elements to load + oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t::__preferred_vector_size>{}( + std::true_type{}, __left_start_idx, __rng_left_vector); + oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t::__preferred_vector_size>{}( + std::true_type{}, __right_start_idx, __rng_right_vector); + // 3. 
Store the left-half vector to the corresponding right-half indices and vice versa + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __right_start_idx, + oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op{}, + __rng_left_vector, __rng); + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __left_start_idx, + oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op{}, + __rng_right_vector, __rng); + // 4. Call destructors of temporary storage + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(), + __rng_left_vector); + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(), + __rng_right_vector); + } + template + void + __scalar_path_impl(_IsFull, const _Idx __idx, _Range __rng) const { using ::std::swap; - swap(__acc[__idx], __acc[__size - __idx - 1]); + swap(__rng[__idx], __rng[__size - __idx - 1]); + } + template + void + operator()(_IsFull __is_full, const _Idx __idx, _Range __rng) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng); + else + __scalar_path_impl(__is_full, __idx, __rng); } }; //------------------------------------------------------------------------ // reverse_copy //------------------------------------------------------------------------ -template -struct __reverse_copy +template +struct __reverse_copy : public walk_vector_or_scalar_base<_Range1, _Range2> { + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range1>; _Size __size; - template + + public: + __reverse_copy(_Size __size) : __size(__size) {} + + template + void + __scalar_path_impl(_IsFull, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const + { + __rng2[__idx] = __rng1[__size - __idx - 1]; + } + template void - operator()(const _Idx __idx, const _AccessorSrc& __acc1, _AccessorDst& __acc2) const + __vector_path_impl(_IsFull __is_full, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const { - __acc2[__idx] = __acc1[__size - __idx - 1]; + std::size_t __n = __size; + std::size_t __remaining_elements = __idx >= __n ? 0 : __n - __idx; + std::uint8_t __elements_to_process = + std::min(static_cast(__base_t::__preferred_vector_size), __remaining_elements); + const _Idx __output_start = __size - __idx - __elements_to_process; + // 1. Load vector to reverse + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng1_vector[__base_t::__preferred_vector_size]; + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng1, __rng1_vector); + // 2, 3. 
Reverse in registers and flip the location of the vector in the output buffer + if (__elements_to_process == __base_t::__preferred_vector_size) + { + oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t::__preferred_vector_size>{}( + __is_full, __elements_to_process, __rng1_vector); + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __output_start, + oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op{}, + __rng1_vector, __rng2); + } + else + { + oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t::__preferred_vector_size>{}( + std::false_type{}, __elements_to_process, __rng1_vector); + for (std::uint8_t __i = 0; __i < __elements_to_process; ++__i) + __rng2[__output_start + __i] = __rng1_vector[__i].__v; + } + // 3. Cleanup + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__remaining_elements}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(), + __rng1_vector); + } + template + void + operator()(_IsFull __is_full, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); } }; //------------------------------------------------------------------------ // rotate_copy //------------------------------------------------------------------------ -template -struct __rotate_copy +template +struct __rotate_copy : public walk_vector_or_scalar_base<_Range1, _Range2> { + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range1>; _Size __size; _Size __shift; - template + + public: + __rotate_copy(_Size __size, _Size __shift) : __size(__size), __shift(__shift) {} + + template void - operator()(const _Idx __idx, const _AccessorSrc& __acc1, _AccessorDst& __acc2) const + __vector_path_impl(_IsFull __is_full, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const { - __acc2[__idx] = __acc1[(__shift + __idx) % __size]; + _Idx __shifted_idx = __shift + __idx; + _Idx __wrapped_idx = __shifted_idx % __size; + std::size_t __n = __size; + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng1_vector[__base_t::__preferred_vector_size]; + //1. Vectorize loads only if we know the wrap around point is beyond the current vector elements to process + if (__wrapped_idx + __base_t::__preferred_vector_size <= __size) + { + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __wrapped_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng1, __rng1_vector); + } + else + { + std::size_t __remaining_elements = __idx >= __n ? 0 : __n - __idx; + std::size_t __elements_to_process = + std::min(static_cast(__base_t::__preferred_vector_size), __remaining_elements); + for (std::uint16_t __i = 0; __i != __elements_to_process; ++__i) + __rng1_vector[__i].__setup(__rng1[(__shifted_idx + __i) % __size]); + } + // 2. Store the rotation + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, + oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op{}, + __rng1_vector, __rng2); + // 3. 
Delete temporary storage + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n - __idx}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(), + __rng1_vector); + } + template + void + __scalar_path_impl(_IsFull, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const + { + __rng2[__idx] = __rng1[(__shift + __idx) % __size]; + } + template + void + operator()(_IsFull __is_full, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); } }; @@ -1038,15 +1428,24 @@ class __brick_set_op } }; -template +// TODO: The implementation of shift left is reliant on exactly n (shift factor) +// work items being launched by the parallel_for pattern, so it cannot be vectorized +// or process multiple iterations per work items as is. For now, we must ensure that our +// small submitter is launched in the SYCL backend's __parallel_for +template struct __brick_shift_left { + using __base_t = walk_vector_or_scalar_base<_Range>; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range>; + constexpr static bool __can_vectorize = false; + constexpr static std::uint16_t __preferred_vector_size = 1; + constexpr static std::uint16_t __preferred_iters_per_item = 1; _DiffType __size; _DiffType __n; - template + template void - operator()(const _ItemId __idx, _Range&& __rng) const + __scalar_path_impl(_IsFull __is_full, const _ItemId __idx, _Range __rng) const { const _DiffType __i = __idx - __n; //loop invariant for (_DiffType __k = __n; __k < __size; __k += __n) @@ -1055,6 +1454,13 @@ struct __brick_shift_left __rng[__k + __i] = ::std::move(__rng[__k + __idx]); } } + + template + void + operator()(_IsFull __is_full, const _ItemId __idx, _Range __rng) const + { + __scalar_path_impl(__is_full, __idx, __rng); + } }; struct __brick_assign_key_position @@ -1071,8 +1477,8 @@ struct __brick_assign_key_position }; // reduce the values in a segment associated with a key -template -struct __brick_reduce_idx +template +struct __brick_reduce_idx : public walk_scalar_base<_Range> { __brick_reduce_idx(const _BinaryOperator& __b, const _Size __n_) : __binary_op(__b), __n(__n_) {} @@ -1087,23 +1493,89 @@ struct __brick_reduce_idx __res = __binary_op(__res, __values[__segment_begin]); return __res; } - - template + template void - operator()(const _ItemId __idx, const _ReduceIdx& __segment_starts, const _Values& __values, - _OutValues& __out_values) const + __scalar_path_impl(_IsFull, const _ItemId __idx, const _ReduceIdx& __segment_starts, const _Values& __values, + _OutValues& __out_values) const { using __value_type = decltype(__segment_starts[__idx]); __value_type __segment_end = (__idx == __segment_starts.size() - 1) ? __value_type(__n) : __segment_starts[__idx + 1]; __out_values[__idx] = reduce(__segment_starts[__idx], __segment_end, __values); } + template + void + operator()(_IsFull __is_full, const _ItemId __idx, const _ReduceIdx& __segment_starts, const _Values& __values, + _OutValues& __out_values) const + { + __scalar_path_impl(__is_full, __idx, __segment_starts, __values, __out_values); + } private: _BinaryOperator __binary_op; _Size __n; }; +// std::swap_ranges is unique in that both sets of provided ranges will be modified. Due to this, +// we define a separate functor from __walk2_vectors_or_scalars with a customized vectorization path. 
+template +struct __brick_swap : public walk_vector_or_scalar_base<_Range1, _Range2> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + _F __f; + std::size_t __n; + + public: + __brick_swap(_F __f, std::size_t __n) : __f(__f), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2) const + { + using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>; + using _ValueType2 = oneapi::dpl::__internal::__value_t<_Range2>; + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1> __rng1_vector[__base_t::__preferred_vector_size]; + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType2> __rng2_vector[__base_t::__preferred_vector_size]; + // 1. Load inputs into vectors + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng1, __rng1_vector); + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng2, __rng2_vector); + // 2. Swap the two ranges + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __rng2_vector, + __rng1); + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __rng1_vector, + __rng2); + // 3. Explicitly call destructor of lazy union type + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n - __idx}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1>::__get_callable_deleter(), + __rng1_vector); + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n - __idx}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType2>::__get_callable_deleter(), + __rng2_vector); + } + + template + void + __scalar_path_impl(_IsFull __is_full, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const + { + __f(__rng1[__idx], __rng2[__idx]); + } + + template + void + operator()(_IsFull __is_full, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); + } +}; + } // namespace unseq_backend } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h index 0f757c2ce3e..f126df3ab9e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h @@ -18,6 +18,9 @@ #include #include +#if _ONEDPL_CPP20_RANGES_PRESENT && _ONEDPL_CPP20_CONCEPTS_PRESENT +#include // std::ranges::contiguous_range +#endif #include "../../utils_ranges.h" #include "../../iterator_impl.h" @@ -752,6 +755,31 @@ __select_backend(const execution::fpga_policy<_Factor, _KernelName>&, _Ranges&&. } #endif +// Check the outer view type to see if we can vectorize. Any non-contiguous inputs (e.g. reverse +// views, permutation views, etc.) cannot be vectorized. If C++20 ranges are present, then we can +// use the std::ranges::contiguous_range concept. 
+template +struct __is_vectorizable_range +{ + constexpr static bool value = +#if _ONEDPL_CPP20_RANGES_PRESENT && _ONEDPL_CPP20_CONCEPTS_PRESENT + std::ranges::contiguous_range<_Rng>; +#else + false; +#endif +}; +// If the outer view is a guard view, then the input is passed directly as a pointer and we can vectorize. +template +struct __is_vectorizable_range> : std::true_type +{ +}; +// If all_view is passed, then we are processing a sycl::buffer directly which is contiguous and can +// be used. +template +struct __is_vectorizable_range> : std::true_type +{ +}; + } // namespace __ranges } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h index 6970f57f4d7..937608bfc39 100644 --- a/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h @@ -143,7 +143,9 @@ __pattern_histogram(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Rando auto __init_event = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__hist_fill_zeros_wrapper>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, decltype(__fill_func)>{__fill_func}, __num_bins, __bins); + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, decltype(__fill_func), decltype(__bins)>{ + __fill_func, static_cast(__num_bins)}, + __num_bins, __bins); if (__n > 0) { diff --git a/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h index d040e828eef..80d2aaccbac 100644 --- a/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h @@ -263,10 +263,14 @@ __pattern_adjacent_difference(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy& oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator2>(); auto __buf2 = __keep2(__d_first, __d_last); - using _Function = unseq_backend::walk_adjacent_difference<_ExecutionPolicy, decltype(__fn)>; + auto __view1 = __buf1.all_view(); + auto __view2 = __buf2.all_view(); - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, __exec, _Function{__fn}, __n, - __buf1.all_view(), __buf2.all_view()) + using _Function = unseq_backend::walk_adjacent_difference<_ExecutionPolicy, decltype(__fn), decltype(__view1), + decltype(__view2)>; + + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, __exec, _Function{__fn, static_cast(__n)}, __n, __view1, __view2) .__deferrable_wait(); } diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 8a8dfdae1bc..361c25021e6 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -25,6 +25,7 @@ #include #include #include +#include #if _ONEDPL_BACKEND_SYCL # include "hetero/dpcpp/sycl_defs.h" @@ -782,6 +783,40 @@ union __lazy_ctor_storage { __v.~_Tp(); } + static auto + __get_callable_deleter() + { + return [](__lazy_ctor_storage& __storage) { __storage.__destroy(); }; + } +}; + +// To implement __min_nested_type_size, a general utility with an internal tuple +// specialization, we need to forward declare our internal tuple first as tuple_impl.h +// already includes this header. +template +struct tuple; + +// Returns the size of the smallest type within a set of potentially nested template types. This utility +// recursively explores std::tuple and oneapi::dpl::__internal::tuple for the smallest type. 
+// For all other types, its size is used directly. +// E.g. If we consider the type: T = tuple, int, double>, +// then __min_nested_type_size::value returns sizeof(short). +template +struct __min_nested_type_size +{ + constexpr static std::size_t value = sizeof(_T); +}; + +template +struct __min_nested_type_size> +{ + constexpr static std::size_t value = std::min({__min_nested_type_size<_Ts>::value...}); +}; + +template +struct __min_nested_type_size> +{ + constexpr static std::size_t value = std::min({__min_nested_type_size<_Ts>::value...}); }; } // namespace __internal diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 322d93d4824..499269bfb1c 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -46,9 +46,11 @@ test_device_copyable() "constant_iterator_device_copyable is not device copyable"); //custom_brick - static_assert(sycl::is_device_copyable_v>, - "custom_brick is not device copyable with device copyable types"); + static_assert( + sycl::is_device_copyable_v< + oneapi::dpl::internal::custom_brick>, + "custom_brick is not device copyable with device copyable types"); //replace_if_fun static_assert( sycl::is_device_copyable_v>, @@ -76,11 +78,23 @@ test_device_copyable() static_assert(sycl::is_device_copyable_v< oneapi::dpl::unseq_backend::walk_n>, "walk_n is not device copyable with device copyable types"); + //walk1_vector_or_scalar + static_assert(sycl::is_device_copyable_v>, + "walk1_vector_or_scalar is not device copyable with device copyable types"); + //walk2_vectors_or_scalars + static_assert(sycl::is_device_copyable_v>, + "walk2_vectors_or_scalars is not device copyable with device copyable types"); + //walk3_vectors_or_scalars + static_assert(sycl::is_device_copyable_v>, + "walk3_vectors_or_scalars is not device copyable with device copyable types"); //walk_adjacent_difference - static_assert( - sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::walk_adjacent_difference>, - "walk_adjacent_difference is not device copyable with device copyable types"); + static_assert(sycl::is_device_copyable_v>, + "walk_adjacent_difference is not device copyable with device copyable types"); //transform_reduce static_assert( sycl::is_device_copyable_v< @@ -148,8 +162,8 @@ test_device_copyable() int_device_copyable, int_device_copyable, std::true_type>>, "__brick_set_op is not device copyable with device copyable types"); // __brick_reduce_idx - static_assert(sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::__brick_reduce_idx>, + static_assert(sycl::is_device_copyable_v>, "__brick_reduce_idx is not device copyable with device copyable types"); //__gen_transform_input @@ -307,10 +321,10 @@ test_non_device_copyable() static_assert(!sycl::is_device_copyable_v, "range_non_device_copyable is device copyable"); //custom_brick - static_assert( - !sycl::is_device_copyable_v>, - "custom_brick is device copyable with non device copyable types"); + static_assert(!sycl::is_device_copyable_v>, + "custom_brick is device copyable with non device copyable types"); //replace_if_fun static_assert(!sycl::is_device_copyable_v< oneapi::dpl::internal::replace_if_fun>, @@ -339,10 +353,26 @@ test_non_device_copyable() static_assert(!sycl::is_device_copyable_v< oneapi::dpl::unseq_backend::walk_n>, "walk_n is device copyable with non device copyable types"); + //walk1_vector_or_scalar + 
static_assert(!sycl::is_device_copyable_v>, + "walk1_vector_or_scalar is device copyable with non device copyable types"); + //walk2_vectors_or_scalars + static_assert( + !sycl::is_device_copyable_v< + oneapi::dpl::unseq_backend::walk2_vectors_or_scalars>, + "walk2_vectors_or_scalars is device copyable with non device copyable types"); + //walk3_vectors_or_scalars + static_assert(!sycl::is_device_copyable_v>, + "walk3_vectors_or_scalars is device copyable with non device copyable types"); //walk_adjacent_difference static_assert( !sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::walk_adjacent_difference>, + oneapi::dpl::unseq_backend::walk_adjacent_difference>, "walk_adjacent_difference is device copyable with non device copyable types"); //transform_reduce static_assert( @@ -411,8 +441,8 @@ test_non_device_copyable() int_device_copyable, int_device_copyable, std::true_type>>, "__brick_set_op is device copyable with non device copyable types"); //__brick_reduce_idx - static_assert(!sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::__brick_reduce_idx>, + static_assert(!sycl::is_device_copyable_v>, "__brick_reduce_idx is device copyable with non device copyable types"); //__gen_transform_input diff --git a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp index 13f03fdb3dd..41afe0693e1 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp @@ -69,7 +69,7 @@ template void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len); @@ -99,6 +99,7 @@ int main() { test(); + test(); test(); test(); test>(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp index 44f9dc6fe7c..cbe63db7e47 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp @@ -82,7 +82,7 @@ template void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len); Sequence data(max_len, [](::std::size_t i) { return T1(i); }); @@ -100,6 +100,7 @@ main() { // clang-3.8 fails to correctly auto vectorize the loop in some cases of different types of container's elements, // for example: std::int32_t and std::int8_t. This issue isn't detected for clang-3.9 and newer versions. + test(); test(); test(); test(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp index 9ad4cfc13bc..7dfc95b1485 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp @@ -125,8 +125,9 @@ template void test(T trash, Convert convert) { + size_t max_n = TestUtils::get_pattern_for_max_n(); // Try sequences of various lengths. - for (size_t n = 0; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + for (size_t n = 0; n <= max_n; n = n <= 16 ? 
n + 1 : size_t(3.1415 * n)) { // count is number of output elements, plus a handful // more for sake of detecting buffer overruns. @@ -166,6 +167,9 @@ main() test(-666, [](size_t j) { return std::int32_t(j); }); test(-666.0, [](size_t j) { return float64_t(j); }); + test(42, [](size_t j) { return std::uint16_t(j); }); + test(42, [](size_t j) { return std::uint8_t(j); }); + #if !TEST_DPCPP_BACKEND_PRESENT /*TODO: copy support of a class with no default constructor*/ test>(Wrapper(-666.0), [](std::int32_t j) { return Wrapper(j); }); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp index 66c8e321627..6c707b62079 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp @@ -101,10 +101,12 @@ int main() { - const ::std::size_t N = 100000; + const std::size_t N = TestUtils::get_pattern_for_max_n(); for (::std::size_t n = 0; n < N; n = n < 16 ? n + 1 : size_t(3.1415 * n)) { + test_fill_by_type(n); + test_fill_by_type(n); test_fill_by_type(n); test_fill_by_type(n); } diff --git a/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp index 3ee13107011..bcd65426526 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp @@ -82,7 +82,8 @@ template void test_generate_by_type() { - for (size_t n = 0; n <= 100000; n = n < 16 ? n + 1 : size_t(3.1415 * n)) + size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 0; n <= max_n; n = n < 16 ? n + 1 : size_t(3.1415 * n)) { Sequence in(n, [](size_t) -> T { return T(0); }); //fill by zero @@ -123,6 +124,7 @@ struct test_non_const_generate_n int main() { + test_generate_by_type(); test_generate_by_type(); test_generate_by_type(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp index 92820d2012f..91291851dcb 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp @@ -114,7 +114,7 @@ template void test(Pred pred) { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); const T1 value = T1(0); const T1 new_value = T1(666); @@ -162,6 +162,7 @@ int main() { test(oneapi::dpl::__internal::__equal_value(666)); + test([](const std::uint8_t& elem) { return elem % 3 < 2; }); test([](const std::uint16_t& elem) { return elem % 3 < 2; }); test([](const float64_t& elem) { return elem * elem - 3.5 * elem > 10; }); //test([](const copy_int& val) { return val.value / 5 > 2; }); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp index 50851e3d299..b89c3aa3f40 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp @@ -75,8 +75,9 @@ template void test(T trash, const T& old_value, const T& new_value, Predicate pred, Convert convert) { + const size_t max_n = TestUtils::get_pattern_for_max_n(); // Try sequences of various lengths. - for (size_t n = 0; n <= 100000; n = n <= 16 ? 
n + 1 : size_t(3.1415 * n)) + for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { Sequence in(n, [&](size_t k) -> T { return convert(n ^ k); }); Sequence out(n, [=](size_t) { return trash; }); @@ -123,6 +124,10 @@ main() test(-666, 42, 99, [](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 42 : -1 - std::int32_t(j); }); + test(123, 42, 99, [](const std::uint8_t& x) { return x != 42; }, + [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 42 : 255; }); + + #if !TEST_DPCPP_BACKEND_PRESENT test(Number(42, OddTag()), Number(2001, OddTag()), Number(2017, OddTag()), IsMultiple(3, OddTag()), [](std::int32_t j) { return ((j + 1) % 3 & 2) != 0 ? Number(2001, OddTag()) : Number(j, OddTag()); }); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp index 2147999ce3f..a6f4600c2a8 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp @@ -132,7 +132,7 @@ template void test() { - const std::int32_t max_len = 100000; + const std::int32_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len, [](::std::size_t i) { return T(i); }); Sequence data(max_len, [](::std::size_t i) { return T(i); }); @@ -154,6 +154,8 @@ test() int main() { + test(); + test(); test(); #if !TEST_DPCPP_BACKEND_PRESENT test>(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp index edf5f56651c..1df986deb6a 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp @@ -100,7 +100,7 @@ void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len, [](::std::size_t i) { return T1(i); }); @@ -125,6 +125,8 @@ test() int main() { + test(); + test(); test(); test(); test(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp index afd10f55f50..f23ea69e05b 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp @@ -115,7 +115,7 @@ template void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence data(max_len); Sequence actual(max_len); @@ -132,6 +132,7 @@ main() { test>(); test>(); + test(); test(); test(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp index 113c24be7c8..5deae6ea667 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp @@ -104,7 +104,7 @@ test(Predicate pred, _IteratorAdapter adap = {}) #if PSTL_USE_DEBUG && ONEDPL_USE_OPENMP_BACKEND 10000; #else - 100000; + TestUtils::get_pattern_for_max_n(); #endif for (size_t n = 0; n <= max_n; n = n <= 16 ? 
n + 1 : size_t(3.1415 * n)) { @@ -153,5 +153,7 @@ main() //test case for zip iterator test<50, std::int32_t, std::int32_t, std::int32_t>(TheOperationZip(1), _ZipIteratorAdapter{}); + test<60, std::uint16_t, std::uint16_t, std::int32_t>(TheOperation(1)); + return done(); } diff --git a/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp index 20c1403f6b9..19fe906935c 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp @@ -84,7 +84,8 @@ template <::std::size_t CallNumber, typename Tin, typename Tout, typename _Op = void test() { - for (size_t n = 0; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { Sequence in(n, [](std::int32_t k) { return k % 5 != 1 ? 3 * k - 7 : 0; }); diff --git a/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp b/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp index 526c1c45fcf..c23fd3013de 100644 --- a/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp +++ b/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp @@ -85,7 +85,8 @@ template void test() { - for (size_t n = 0; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + const size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { Sequence in_out(n, Gen()); Sequence expected(n, Gen()); diff --git a/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp b/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp index 973ca3658f8..97198b72f76 100644 --- a/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp +++ b/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp @@ -162,10 +162,11 @@ void test() { const ::std::int64_t init_val = 999; - for (size_t n = 1; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + const size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 1; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { { - Sequence<_Type> in1(n, [=](size_t k) { return (3 * k); }); + Sequence<_Type> in1(n, [=](size_t k) { return (3 * k) % std::numeric_limits<_Type>::max(); }); Sequence<_Type> in2(n, [=](size_t k) { return k % 2 == 0 ? 1 : 0; }); Sequence<_Type> out(n, [=](size_t) { return init_val; }); @@ -178,7 +179,7 @@ test() #endif } { - Sequence<_Type> in1(n, [=](size_t k) { return k; }); + Sequence<_Type> in1(n, [=](size_t k) { return k % std::numeric_limits<_Type>::max(); }); Sequence<_Type> out(n, [=](size_t) { return init_val; }); invoke_on_all_policies<2>()(test_transform_if_unary<_Type>(), in1.begin(), in1.end(), out.begin(), @@ -196,10 +197,11 @@ void test_inplace() { const ::std::int64_t init_val = 999; - for (size_t n = 1; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + const size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 1; n <= max_n; n = n <= 16 ? 
n + 1 : size_t(3.1415 * n)) { { - Sequence<_Type> in1(n, [=](size_t k) { return k; }); + Sequence<_Type> in1(n, [=](size_t k) { return k % std::numeric_limits<_Type>::max(); }); Sequence<_Type> out(n, [=](size_t) { return 0; }); invoke_on_all_policies<4>()(test_transform_if_unary_inplace<_Type>(), in1.begin(), in1.end(), out.begin(), @@ -211,11 +213,15 @@ test_inplace() int main() { - test<::std::int32_t>(); - test<::std::int64_t>(); - - test_inplace<::std::int32_t>(); - test_inplace<::std::int64_t>(); + test(); + test(); + test(); + test(); + + test_inplace(); + test_inplace(); + test_inplace(); + test_inplace(); return done(); } diff --git a/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp b/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp index a617d3205d0..08b93265541 100644 --- a/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp @@ -139,7 +139,7 @@ template void test(Pred pred) { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); const T2 value = T2(77); const T1 trash = T1(31); @@ -165,6 +165,7 @@ int main() { test([](std::uint32_t a, std::uint32_t b) { return a - b; }); + test([](std::uint16_t a, std::uint16_t b) { return a > b ? a - b : b - a; }); test([](std::int64_t a, std::int64_t b) { return a / (b + 1); }); test([](float32_t a, float32_t b) { return (a + b) / 2; }); #if !TEST_DPCPP_BACKEND_PRESENT diff --git a/test/support/utils.h b/test/support/utils.h index 666d5ca3646..bef28868b1b 100644 --- a/test/support/utils.h +++ b/test/support/utils.h @@ -1013,6 +1013,28 @@ generate_arithmetic_data(T* input, std::size_t size, std::uint32_t seed) input[j] = input[i]; } } + +// Utility that models __estimate_best_start_size in the SYCL backend parallel_for +// to ensure large enough inputs are used to test the large submitter path. +// A multiplier to the max n is added to ensure we get a few separate test inputs for +// this path. +std::size_t +get_pattern_for_max_n() +{ +#if TEST_DPCPP_BACKEND_PRESENT + sycl::queue q = TestUtils::get_test_queue(); + sycl::device d = q.get_device(); + constexpr std::size_t max_iters_per_item = 16; + constexpr std::size_t multiplier = 4; + std::size_t __max_n = multiplier * max_iters_per_item * d.get_info() * + d.get_info(); + __max_n = std::min(std::size_t{10000000}, __max_n); + return __max_n; +#else + return TestUtils::max_n; +#endif +} + } /* namespace TestUtils */ #endif // _UTILS_H
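For reference, the bricks introduced in this patch (walk_adjacent_difference, __reverse_functor, __reverse_copy, __rotate_copy, __brick_swap) all share one dispatch shape: a __scalar_path_impl, a __vector_path_impl, and an operator() that selects between them at compile time through the base class's __can_vectorize flag, with the _IsFull tag telling the vector path whether the trailing block needs a bounds check. The standalone sketch below shows only that shape; vector_or_scalar_base, brick_example, and the pointer-based contiguity check are illustrative stand-ins, not the oneDPL internals.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <type_traits>

// Stand-in for walk_vector_or_scalar_base: decides at compile time whether the range is
// contiguous enough for the vector path and how many elements each work item handles.
template <typename Range>
struct vector_or_scalar_base
{
    constexpr static bool can_vectorize = std::is_pointer_v<Range>; // simplified criterion
    constexpr static std::uint16_t preferred_vector_size = can_vectorize ? 4 : 1;
};

template <typename F, typename Range>
struct brick_example : vector_or_scalar_base<Range>
{
    using base_t = vector_or_scalar_base<Range>;
    F f;
    std::size_t n;

    // Scalar path: one element per invocation.
    template <typename IsFull>
    void scalar_path(IsFull, std::size_t idx, Range rng) const
    {
        f(rng[idx]);
    }

    // Vector path: handle preferred_vector_size contiguous elements starting at idx.
    // IsFull says (at compile time) whether the trailing, partially filled block needs a bounds check.
    template <typename IsFull>
    void vector_path(IsFull, std::size_t idx, Range rng) const
    {
        std::size_t limit = base_t::preferred_vector_size;
        if constexpr (!IsFull::value)
            limit = std::min<std::size_t>(limit, idx < n ? n - idx : 0);
        for (std::size_t i = 0; i < limit; ++i)
            f(rng[idx + i]);
    }

    // The kernel always calls operator(); only one of the two paths is instantiated
    // because the branch is resolved with if constexpr.
    template <typename IsFull>
    void operator()(IsFull is_full, std::size_t idx, Range rng) const
    {
        if constexpr (base_t::can_vectorize)
            vector_path(is_full, idx, rng);
        else
            scalar_path(is_full, idx, rng);
    }
};

int main()
{
    int data[10] = {};
    auto inc = [](int& x) { x += 1; };
    brick_example<decltype(inc), int*> b{{}, inc, 10};
    b(std::true_type{}, 0, data);  // full block
    b(std::true_type{}, 4, data);  // full block
    b(std::false_type{}, 8, data); // partial block: only 2 of 4 lanes are valid
    for (int x : data)
        std::cout << x << ' ';
    std::cout << '\n'; // prints: 1 1 1 1 1 1 1 1 1 1
}

The real bricks replace the inner loops with __vector_load / __vector_store over __lazy_ctor_storage elements, so loads, the transformation, and stores each happen a full vector at a time.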
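The __is_vectorizable_range trait added to utils_ranges_sycl.h keys off whether the outermost view is contiguous in memory. Below is a minimal standalone illustration of the same idea using only the C++20 std::ranges::contiguous_range concept; is_vectorizable_range_v is an illustrative name, and the real trait additionally special-cases oneDPL's guard_view and all_view wrappers and falls back to false when C++20 ranges are unavailable.

#include <ranges>
#include <span>
#include <vector>

// A range may use the vectorized path only if its elements are contiguous in memory.
template <typename Rng>
inline constexpr bool is_vectorizable_range_v = std::ranges::contiguous_range<Rng>;

static_assert(is_vectorizable_range_v<std::vector<int>>);                       // contiguous storage
static_assert(is_vectorizable_range_v<std::span<int>>);                         // raw-memory view
static_assert(!is_vectorizable_range_v<std::ranges::reverse_view<std::span<int>>>); // reversed view is not contiguous

int main() {}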
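Likewise, the __min_nested_type_size helper added to utils.h is a recursive minimum over sizeof. Below is a standalone sketch of the same recursion over std::tuple only; min_nested_type_size and the element types in the static_asserts are illustrative choices, and the real helper also specializes for oneDPL's internal tuple.

#include <algorithm>
#include <cstddef>
#include <tuple>

// Base case: for a non-tuple type the smallest nested size is simply sizeof(T).
template <typename T>
struct min_nested_type_size
{
    constexpr static std::size_t value = sizeof(T);
};

// Recursive case: take the minimum over all tuple elements, descending into nested tuples.
template <typename... Ts>
struct min_nested_type_size<std::tuple<Ts...>>
{
    constexpr static std::size_t value = std::min({min_nested_type_size<Ts>::value...});
};

static_assert(min_nested_type_size<double>::value == sizeof(double));
static_assert(min_nested_type_size<std::tuple<std::tuple<short, long>, int, double>>::value == sizeof(short));

int main() {}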