From 3bb1b51604e3f21916e9164a0ec6873098fac173 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Wed, 29 Nov 2023 16:39:27 -0600 Subject: [PATCH] allocate empty arrays (#9) --- coreforecast/grouped_array.py | 22 +++++++++---------- src/coreforecast.cpp | 40 +++++++++++++++++++++++++++++------ tests/test_lag_transforms.py | 2 +- 3 files changed, 46 insertions(+), 18 deletions(-) diff --git a/coreforecast/grouped_array.py b/coreforecast/grouped_array.py index f6a1f95..713c176 100644 --- a/coreforecast/grouped_array.py +++ b/coreforecast/grouped_array.py @@ -68,7 +68,7 @@ def _pyfloat_to_c(self, x: float) -> Union[ctypes.c_float, ctypes.c_double]: return out def scaler_fit(self, scaler_type: str) -> np.ndarray: - stats = np.full_like(self.data, np.nan, shape=(len(self), 2)) + stats = np.empty_like(self.data, shape=(len(self), 2)) _LIB[f"{self.prefix}_{scaler_type}ScalerStats"]( self._handle, _data_as_void_ptr(stats), @@ -76,7 +76,7 @@ def scaler_fit(self, scaler_type: str) -> np.ndarray: return stats def scaler_transform(self, stats: np.ndarray) -> np.ndarray: - out = np.full_like(self.data, np.nan) + out = np.empty_like(self.data) _LIB[f"{self.prefix}_ScalerTransform"]( self._handle, _data_as_void_ptr(stats), @@ -103,7 +103,7 @@ def take_from_groups(self, k: int) -> np.ndarray: return out def lag_transform(self, lag: int) -> np.ndarray: - out = np.full_like(self.data, np.nan) + out = np.empty_like(self.data) _LIB[f"{self.prefix}_LagTransform"]( self._handle, ctypes.c_int(lag), @@ -114,7 +114,7 @@ def lag_transform(self, lag: int) -> np.ndarray: def rolling_transform( self, stat_name: str, lag: int, window_size: int, min_samples: int ) -> np.ndarray: - out = np.full_like(self.data, np.nan) + out = np.empty_like(self.data) _LIB[f"{self.prefix}_Rolling{stat_name}Transform"]( self._handle, ctypes.c_int(lag), @@ -127,7 +127,7 @@ def rolling_transform( def rolling_quantile_transform( self, lag: int, p: float, window_size: int, min_samples: int ) -> 
np.ndarray: - out = np.full_like(self.data, np.nan) + out = np.empty_like(self.data) _LIB[f"{self.prefix}_RollingQuantileTransform"]( self._handle, ctypes.c_int(lag), @@ -173,7 +173,7 @@ def seasonal_rolling_transform( window_size: int, min_samples: int, ) -> np.ndarray: - out = np.full_like(self.data, np.nan) + out = np.empty_like(self.data) _LIB[f"{self.prefix}_SeasonalRolling{stat_name}Transform"]( self._handle, ctypes.c_int(lag), @@ -211,7 +211,7 @@ def seasonal_rolling_quantile_transform( window_size: int, min_samples: int, ) -> np.ndarray: - out = np.full_like(self.data, np.nan) + out = np.empty_like(self.data) _LIB[f"{self.prefix}_SeasonalRollingQuantileTransform"]( self._handle, ctypes.c_int(lag), @@ -249,7 +249,7 @@ def expanding_transform_with_aggs( lag: int, aggs: np.ndarray, ) -> np.ndarray: - out = np.full_like(self.data, np.nan) + out = np.empty_like(self.data) _LIB[f"{self.prefix}_Expanding{stat_name}Transform"]( self._handle, ctypes.c_int(lag), @@ -263,7 +263,7 @@ def expanding_transform( stat_name: str, lag: int, ) -> np.ndarray: - out = np.full_like(self.data, np.nan) + out = np.empty_like(self.data) _LIB[f"{self.prefix}_Expanding{stat_name}Transform"]( self._handle, ctypes.c_int(lag), @@ -272,7 +272,7 @@ def expanding_transform( return out def expanding_quantile_transform(self, lag: int, p: float) -> np.ndarray: - out = np.full_like(self.data, np.nan) + out = np.empty_like(self.data) _LIB[f"{self.prefix}_ExpandingQuantileTransform"]( self._handle, ctypes.c_int(lag), @@ -297,7 +297,7 @@ def exponentially_weighted_transform( lag: int, alpha: float, ) -> np.ndarray: - out = np.full_like(self.data, np.nan) + out = np.empty_like(self.data) _LIB[f"{self.prefix}_ExponentiallyWeighted{stat_name}Transform"]( self._handle, ctypes.c_int(lag), diff --git a/src/coreforecast.cpp b/src/coreforecast.cpp index 8556834..c255274 100644 --- a/src/coreforecast.cpp +++ b/src/coreforecast.cpp @@ -22,6 +22,16 @@ template <typename T> inline indptr_t FirstNotNaN(const T *data, 
indptr_t n) { return i; } +template <typename T> +inline indptr_t FirstNotNaN(const T *data, indptr_t n, T *out) { + indptr_t i = 0; + while (i < n && std::isnan(data[i])) { + out[i] = std::numeric_limits<T>::quiet_NaN(); + ++i; + } + return i; +} + template <typename T> inline void TakeFromGroups(const T *data, int n, T *out, int k) { if (k > n) { @@ -118,8 +128,11 @@ inline void RollingMeanTransform(const T *data, int n, T *out, int window_size, int upper_limit = std::min(window_size, n); for (int i = 0; i < upper_limit; ++i) { accum += data[i]; - if (i + 1 >= min_samples) + if (i + 1 < min_samples) { + out[i] = std::numeric_limits<T>::quiet_NaN(); + } else { out[i] = accum / (i + 1); + } } for (int i = window_size; i < n; ++i) { @@ -140,8 +153,11 @@ inline void RollingStdTransformWithStats(const T *data, int n, T *out, T *agg, prev_avg = curr_avg; curr_avg = prev_avg + (data[i] - prev_avg) / (i + 1); m2 += (data[i] - prev_avg) * (data[i] - curr_avg); - if (i + 1 >= min_samples) + if (i + 1 < min_samples) { + out[i] = std::numeric_limits<T>::quiet_NaN(); + } else { out[i] = sqrt(m2 / i); + } } for (int i = window_size; i < n; ++i) { T delta = data[i] - data[i - window_size]; @@ -176,7 +192,9 @@ inline void RollingCompTransform(Func Comp, const T *data, int n, T *out, if (Comp(data[i], pivot)) { pivot = data[i]; } - if (i + 1 >= min_samples) { + if (i + 1 < min_samples) { + out[i] = std::numeric_limits<T>::quiet_NaN(); + } else { out[i] = pivot; } } @@ -490,6 +508,13 @@ inline void ExponentiallyWeightedMeanTransform(const T *data, int n, T *out, } } +template <typename T> inline void SkipLags(T *out, int n, int lag) { + int replacements = std::min(lag, n); + for (int i = 0; i < replacements; ++i) { + out[i] = std::numeric_limits<T>::quiet_NaN(); + } +} + template <typename T> class GroupedArray { private: const T *data_; @@ -515,7 +540,8 @@ template <typename T> class GroupedArray { indptr_t start_idx = FirstNotNaN(data_ + start, n); if (start_idx + lag >= n) continue; - f(data_ + start + start_idx, n - start_idx - lag, out + n_out * i, + 
start += start_idx; + f(data_ + start, n - start_idx - lag, out + n_out * i, std::forward<Args>(args)...); } } @@ -544,7 +570,8 @@ template <typename T> class GroupedArray { indptr_t start = indptr_[i]; indptr_t end = indptr_[i + 1]; indptr_t n = end - start; - indptr_t start_idx = FirstNotNaN(data_ + start, n); + indptr_t start_idx = FirstNotNaN(data_ + start, n, out + start); + SkipLags(out + start + start_idx, n - start_idx, lag); if (start_idx + lag >= n) { continue; } @@ -562,7 +589,8 @@ template <typename T> class GroupedArray { indptr_t start = indptr_[i]; indptr_t end = indptr_[i + 1]; indptr_t n = end - start; - indptr_t start_idx = FirstNotNaN(data_ + start, n); + indptr_t start_idx = FirstNotNaN(data_ + start, n, out + start); + SkipLags(out + start + start_idx, n - start_idx, lag); if (start_idx + lag >= n) { continue; } diff --git a/tests/test_lag_transforms.py b/tests/test_lag_transforms.py index dbfb8b5..03585f1 100644 --- a/tests/test_lag_transforms.py +++ b/tests/test_lag_transforms.py @@ -102,7 +102,7 @@ def test_correctness(data, comb, dtype): if "rolling_std" in comb: rtol = 1e-2 elif "expanding_std" in comb: - rtol = 5e-5 + rtol = 1e-4 data = data.astype(dtype, copy=True) ga = GroupedArray(data, indptr) wf, cf, args = combs_map[comb]