Vladyslav Moroshan committed on
Commit
c4b87d2
·
1 Parent(s): 5af912c

Initial upload of TempoPFN model, code, and weights

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .vscode/settings.json +2 -0
  2. LICENSE +201 -0
  3. README.md +151 -3
  4. configs/example.yaml +119 -0
  5. data/dataset_properties.json +152 -0
  6. data/nan_stats.json +0 -0
  7. examples/generate_synthetic_data.py +204 -0
  8. examples/gift_eval/gift_eval_runner.py +251 -0
  9. examples/gift_eval/gift_eval_submission.ipynb +1439 -0
  10. examples/quick_start_tempo_pfn.ipynb +280 -0
  11. examples/quick_start_tempo_pfn.py +101 -0
  12. examples/utils.py +115 -0
  13. gitignore +167 -0
  14. models/checkpoint_38M.pth +3 -0
  15. pyproject.toml +62 -0
  16. requirements.txt +25 -0
  17. src/__init__.py +0 -0
  18. src/data/__init__.py +0 -0
  19. src/data/augmentations.py +1318 -0
  20. src/data/batch_composer.py +705 -0
  21. src/data/constants.py +25 -0
  22. src/data/containers.py +204 -0
  23. src/data/datasets.py +267 -0
  24. src/data/filter.py +73 -0
  25. src/data/frequency.py +538 -0
  26. src/data/loaders.py +661 -0
  27. src/data/scalers.py +360 -0
  28. src/data/time_features.py +564 -0
  29. src/data/utils.py +75 -0
  30. src/gift_eval/__init__.py +15 -0
  31. src/gift_eval/constants.py +186 -0
  32. src/gift_eval/core.py +64 -0
  33. src/gift_eval/data.py +234 -0
  34. src/gift_eval/evaluate.py +421 -0
  35. src/gift_eval/predictor.py +318 -0
  36. src/gift_eval/results.py +243 -0
  37. src/models/__init__.py +0 -0
  38. src/models/blocks.py +62 -0
  39. src/models/gated_deltaproduct/README.md +344 -0
  40. src/models/gated_deltaproduct/__init__.py +11 -0
  41. src/models/gated_deltaproduct/configuration_gated_deltaproduct.py +108 -0
  42. src/models/gated_deltaproduct/gated_deltaproduct.py +351 -0
  43. src/models/gated_deltaproduct/modeling_gated_deltaproduct.py +105 -0
  44. src/models/model.py +427 -0
  45. src/optim/lr_scheduler.py +360 -0
  46. src/plotting/__init__.py +0 -0
  47. src/plotting/gift_eval_utils.py +215 -0
  48. src/plotting/plot_timeseries.py +292 -0
  49. src/synthetic_generation/__init__.py +0 -0
  50. src/synthetic_generation/abstract_classes.py +97 -0
.vscode/settings.json ADDED
@@ -0,0 +1,2 @@
+ {
+ }
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md CHANGED
@@ -1,3 +1,151 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ library_name: tempo-pfn
+ tags:
+ - time-series-forecasting
+ - zero-shot
+ - rnn
+ - linear-rnn
+ - synthetic-data
+ - foundation-model
+ - automl
+ arxiv: 2510.25502
+ ---
+
+ # TempoPFN: Synthetic Pre-Training of Linear RNNs for Zero-Shot Time Series Forecasting
+
+ [![arXiv](https://img.shields.io/badge/arXiv-2510.25502-b31b1b.svg)](https://arxiv.org/abs/2510.25502) [![License](https://img.shields.io/badge/License-Apache_2.0-green.svg)](https://github.com/automl/TempoPFN/blob/main/LICENSE)
+
+ ---
+
+ **TempoPFN**, introduced in [TempoPFN: Synthetic Pre-Training of Linear RNNs for Zero-Shot Time Series Forecasting](https://arxiv.org/abs/2510.25502), is a univariate time series foundation model pretrained **entirely on synthetic data**. It delivers top-tier zero-shot forecasting accuracy while remaining fully reproducible and free from real-data leakage.
+
+ Built on a **Linear RNN (GatedDeltaProduct)** backbone, TempoPFN performs end-to-end forecasting without patching or windowing. Its design enables fully parallelizable training and inference while maintaining stable temporal state-tracking across long sequences. The GatedDeltaProduct architecture is based on [DeltaProduct](https://arxiv.org/html/2502.10297v3), extended with state-weaving for time series forecasting. For detailed information about the architecture and custom modifications, see [`src/models/gated_deltaproduct/README.md`](src/models/gated_deltaproduct/README.md).
+
+ This repository includes the **pretrained 38M-parameter model** (`models/checkpoint_38M.pth`), all training and inference code, and the **complete synthetic data generation pipeline** used for pretraining.
+
+ ## ✨ Why TempoPFN?
+
+ * **High Performance, No Real Data:** Achieves top-tier results on **GIFT-Eval**, **outperforming all existing synthetic-only approaches** and **surpassing the vast majority of models trained on real-world data**. This ensures full reproducibility and eliminates benchmark leakage.
+ * **Parallel and Efficient:** The linear recurrence design enables full-sequence parallelization, giving the best of both worlds: the linear-time efficiency of an RNN with the training parallelism of a Transformer.
+ * **Open and Reproducible:** Includes the full synthetic data pipeline, configurations, and scripts to reproduce training from scratch.
+ * **State-Tracking Stability:** The GatedDeltaProduct recurrence and *state-weaving* mechanism preserve temporal continuity and information flow across long horizons, improving robustness without non-linear recurrence.
+
+ ![TempoPFN Overview](https://iili.io/KlUjfcP.png)
+
+ ## ⚙️ Installation
+
+ > **Note on Model Weights:** This repository uses [Git LFS](https://git-lfs.github.com/) to store the model checkpoint (`.pth` file). You **must** have Git LFS installed to clone the repository correctly.
+ >
+ > ```bash
+ > # Install Git LFS (e.g., on Ubuntu)
+ > sudo apt-get install git-lfs
+ > git lfs install
+ > ```
+
+ 1. **Clone the repository:**
+ ```bash
+ git clone https://huggingface.co/AutoML-org/TempoPFN
+ cd TempoPFN
+ ```
+
+ 2. **Set up the environment:**
+ ```bash
+ python -m venv venv && source venv/bin/activate
+
+ # 1. Install a PyTorch build matching your CUDA version
+ # Example for CUDA 12.8:
+ pip install torch --index-url https://download.pytorch.org/whl/cu128
+
+ # 2. Install TempoPFN and all other dependencies
+ pip install -r requirements.txt
+ export PYTHONPATH=$PWD
+ ```
+
+ ## 🚀 Quick Start: Run the Demo
+
+ **Prerequisites:**
+ * You must have a **CUDA-capable GPU** with a matching PyTorch version installed.
+ * You have run `export PYTHONPATH=$PWD` from the repo's root directory (see Installation).
+
+ ### 1. Run the Quick Start Script
+
+ Run a demo forecast on a synthetic sine wave. This script automatically finds and loads the `models/checkpoint_38M.pth` file included in this repository.
+ ```bash
+ python examples/quick_start_tempo_pfn.py
+ ```
+
+ ### 2. Run with a Different Checkpoint (Optional)
+
+ If you have trained your own model, you can point the script to it:
+ ```bash
+ python examples/quick_start_tempo_pfn.py --checkpoint /path/to/your/checkpoint.pth
+ ```
+
+ ### 3. Run the Notebook Version
+ ```bash
+ jupyter notebook examples/quick_start_tempo_pfn.ipynb
+ ```
+
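+ ### 4. Programmatic Use (Sketch)
+
+ For scripted pipelines, the model can also be constructed through the GIFT-Eval predictor wrapper. A minimal sketch, mirroring the constructor call in `examples/gift_eval/gift_eval_runner.py` (the prediction length and frequency below are illustrative placeholders that are normally set per dataset):
+ ```python
+ from src.gift_eval.predictor import TimeSeriesPredictor
+
+ predictor = TimeSeriesPredictor.from_paths(
+     model_path="models/checkpoint_38M.pth",
+     config_path="configs/example.yaml",
+     ds_prediction_length=48,  # placeholder; set per dataset
+     ds_freq="H",              # placeholder; set per dataset
+     batch_size=64,
+     max_context_length=3072,
+ )
+ ```
+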
+ ### Hardware & Performance Tips
+
+ **GPU Required:** Inference requires a CUDA-capable GPU. Tested on NVIDIA A100/H100.
+
+ **First Inference May Be Slow:** Initial calls for unseen sequence lengths trigger Triton kernel compilation. Subsequent runs are cached and fast.
+
+ **Triton Caches:** To prevent slowdowns from writing caches to a network filesystem, route caches to a local directory (like `/tmp`) before running:
+ ```bash
+ LOCAL_CACHE_BASE="${TMPDIR:-/tmp}/tsf-$(date +%s)"
+ mkdir -p "${LOCAL_CACHE_BASE}/triton" "${LOCAL_CACHE_BASE}/torchinductor"
+ export TRITON_CACHE_DIR="${LOCAL_CACHE_BASE}/triton"
+ export TORCHINDUCTOR_CACHE_DIR="${LOCAL_CACHE_BASE}/torchinductor"
+
+ python examples/quick_start_tempo_pfn.py
+ ```
+
+ ## 🚂 Training
+
+ ### Single-GPU Training (for debugging)
+ ```bash
+ torchrun --standalone --nproc_per_node=1 src/training/trainer_dist.py --config ./configs/train.yaml
+ ```
+
+ ### Multi-GPU Training (Single-Node)
+
+ This example uses 8 GPUs. The training script uses PyTorch DistributedDataParallel (DDP).
+ ```bash
+ torchrun --standalone --nproc_per_node=8 src/training/trainer_dist.py --config ./configs/train.yaml
+ ```
+
+ ### Configuration
+
+ All training and model parameters (architecture, optimizers, paths) are controlled via YAML files in `configs/`.
+
+ ## 💾 Synthetic Data Generation
+
+ A core contribution of this work is our open-source synthetic data pipeline, located in `src/synthetic_generation/`. It combines diverse generators with a powerful augmentation cascade.
+
+ **Generators Used:**
+
+ * **Adapted Priors:** ForecastPFN, KernelSynth, GaussianProcess (GP), and CauKer (Structural Causal Models).
+ * **Novel Priors:** SDE (a flexible regime-switching Ornstein-Uhlenbeck process), Sawtooth, StepFunction, Anomaly, Spikes, SineWave, and Audio-Inspired generators (Stochastic Rhythms, Financial Volatility, Network Topology, Multi-Scale Fractals).
+
+ You can easily generate your own data by installing the development dependencies and instantiating a generator wrapper, as sketched below. See `examples/generate_synthetic_data.py` for a minimal script, or inspect the generator code in `src/synthetic_generation/`.
+
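+ A minimal sketch, condensed from `examples/generate_synthetic_data.py` (the wrapper and parameter names below are taken from that script):
+ ```python
+ from src.synthetic_generation.generator_params import GPGeneratorParams
+ from src.synthetic_generation.gp_prior.gp_generator_wrapper import GPGeneratorWrapper
+
+ # Instantiate a Gaussian Process prior and draw one batch of synthetic series.
+ generator = GPGeneratorWrapper(GPGeneratorParams(global_seed=2025, length=2048))
+ batch = generator.generate_batch(batch_size=8, seed=2025)  # batch.values is a NumPy array
+ ```
+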
+ ## 🤝 License
+
+ This project is licensed under the Apache 2.0 License. See the LICENSE file for details. This permissive license allows for both academic and commercial use.
+
+ ## 📚 Citation
+
+ If you find TempoPFN useful in your research, please consider citing our paper:
+ ```bibtex
+ @misc{moroshan2025tempopfn,
+   title={TempoPFN: Synthetic Pre-training of Linear RNNs for Zero-Shot Time Series Forecasting},
+   author={Vladyslav Moroshan and Julien Siems and Arber Zela and Timur Carstensen and Frank Hutter},
+   year={2025},
+   eprint={2510.25502},
+   archivePrefix={arXiv},
+   primaryClass={cs.LG}
+ }
+ ```
configs/example.yaml ADDED
@@ -0,0 +1,119 @@
+ train_data_path: null # Replace with the path to the root of the training data directory with subdirectories for each generator (e.g. gp, kernel, etc.)
+ model_path: ./models # Path where the model will be saved
+ model_name: TempoPFN
+ continue_training: false
+ checkpoint_path: null # Replace with the path to the checkpoint file
+ seed: 2025
+ wandb: true # whether to log to wandb
+ wandb_project_name: TempoPFNTraining
+ wandb_entity: university-of-freiburg-2024
+ wandb_plots: false
+
+ batch_size: 40
+ num_training_iterations: 1000000 # 1M
+ validation_batch_size: 64
+ num_validation_batches: 1
+ num_workers: 4
+ gradient_accumulation_enabled: true
+ accumulation_steps: 5 # Number of batches to accumulate before updating (effective batch size = batch_size * accumulation_steps)
+ log_interval: 2048
+ save_every: 100000
+
+ generator_proportions:
+   forecast_pfn: 1.0
+   gp: 1.0
+   kernel: 1.0
+   sawtooth: 1.0
+   sinewave: 1.0
+   step: 1.0
+   anomaly: 1.0
+   spike: 1.0
+   cauker_univariate: 1.0
+   ou_process: 3.0
+   audio_financial_volatility: 0.1
+   audio_multi_scale_fractal: 0.1
+   audio_network_topology: 0.5
+   audio_stochastic_rhythm: 0.5
+   augmented_per_sample_2048: 2.0
+   augmented_temp_batch_2048: 2.0
+
+ # Learning Rate Scheduler Configuration
+ lr_scheduler: cosine # Options: "warmup_stable_decay", "cosine_with_warmup", "cosine_with_restarts", "cosine"
+
+ # Learning Rate Parameters
+ peak_lr: 0.0002 # 2e-4 - Peak learning rate
+ min_lr_ratio: 0.01 # Minimum LR as fraction of peak LR
+
+ # WSD Scheduler Specific Parameters
+ warmup_ratio: 0.003 # 0.3% of total steps for warmup
+ stable_ratio: 0.90 # 90% of total steps at stable learning rate
+ decay_type: cosine # Type of decay: "cosine" or "linear"
+
+ # Alternative Scheduler Parameters (if using different schedulers)
+ num_cycles: 0.5 # For cosine_with_warmup: 0.5 = half cosine wave
+ num_restart_cycles: 4 # For cosine_with_restarts: number of restart cycles
+
+ # Optimizer Configuration
+ weight_decay: 0.01 # Weight decay for AdamW
+ beta1: 0.9 # Adam beta1 parameter
+ beta2: 0.98 # Adam beta2 parameter (optimized for transformers)
+ optimizer_eps: 1e-6 # Adam epsilon
+
+ # Training Stability
+ gradient_clip_val: 100.0
+ scaler: custom_robust
+
+ gift_eval:
+   evaluate_on_gift_eval: false
+   max_context_length: 3072
+   create_plots: false
+   max_plots: 5
+   dataset_storage_path: null # Replace with the path to the GIFT-Eval dataset storage directory
+
+ data_augmentation:
+   nan_augmentation: true
+   scaler_augmentation: false
+   length_shortening: true
+   nan_stats_path: ./data/nan_stats.json
+
+ augmentation_probabilities:
+   scaler_augmentation: 0.5
+
+ TimeSeriesModel:
+   # Core architecture
+   embed_size: 512
+   num_encoder_layers: 10
+
+   # Scaling and preprocessing
+   scaler: custom_robust
+   epsilon: 0.00001
+   scaler_clamp_value: null
+   handle_constants: false
+
+   # Time features
+   K_max: 25
+   time_feature_config:
+     use_enhanced_features: true
+     use_holiday_features: false
+     use_index_features: true
+     include_seasonality_info: true
+
+   drop_enc_allow: false
+   encoding_dropout: 0.0
+
+   # Encoder configuration
+   encoder_config:
+     attn_mode: chunk
+     num_heads: 4
+     expand_v: 1.0
+     use_short_conv: true
+     conv_size: 32
+     allow_neg_eigval: true
+     hidden_ratio: 1.0
+     use_gate: true
+     use_forget_gate: true
+     num_householder: 4
+     weaving: true
+
+   loss_type: 'quantile'
+   quantiles: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
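Two orientation notes on the config above. With the values shown, the effective batch size is batch_size × accumulation_steps = 40 × 5 = 200. The `quantile` loss over the nine listed levels is the standard pinball loss; a minimal PyTorch sketch of that formula (illustrative only, not the repository's implementation):

```python
import torch


def pinball_loss(pred: torch.Tensor, target: torch.Tensor, quantiles: list) -> torch.Tensor:
    """pred: (batch, time, n_quantiles); target: (batch, time)."""
    q = torch.tensor(quantiles, device=pred.device).view(1, 1, -1)
    diff = target.unsqueeze(-1) - pred  # positive where the model under-predicts
    # Under-prediction is weighted by q, over-prediction by (1 - q).
    return torch.maximum(q * diff, (q - 1) * diff).mean()
```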
data/dataset_properties.json ADDED
@@ -0,0 +1,152 @@
+ {
+   "m4_yearly": {
+     "domain": "Econ/Fin",
+     "frequency": "A",
+     "num_variates": 1
+   },
+   "m4_quarterly": {
+     "domain": "Econ/Fin",
+     "frequency": "Q",
+     "num_variates": 1
+   },
+   "m4_monthly": {
+     "domain": "Econ/Fin",
+     "frequency": "M",
+     "num_variates": 1
+   },
+   "m4_weekly": {
+     "domain": "Econ/Fin",
+     "frequency": "W",
+     "num_variates": 1
+   },
+   "m4_daily": {
+     "domain": "Econ/Fin",
+     "frequency": "D",
+     "num_variates": 1
+   },
+   "m4_hourly": {
+     "domain": "Econ/Fin",
+     "frequency": "H",
+     "num_variates": 1
+   },
+   "electricity": {
+     "domain": "Energy",
+     "frequency": "W",
+     "num_variates": 1
+   },
+   "ett1": {
+     "domain": "Energy",
+     "frequency": "W",
+     "num_variates": 7
+   },
+   "ett2": {
+     "domain": "Energy",
+     "frequency": "W",
+     "num_variates": 7
+   },
+   "solar": {
+     "domain": "Energy",
+     "frequency": "W",
+     "num_variates": 1
+   },
+   "hospital": {
+     "domain": "Healthcare",
+     "frequency": "M",
+     "num_variates": 1
+   },
+   "covid_deaths": {
+     "domain": "Healthcare",
+     "frequency": "D",
+     "num_variates": 1
+   },
+   "us_births": {
+     "domain": "Healthcare",
+     "frequency": "M",
+     "num_variates": 1
+   },
+   "saugeen": {
+     "domain": "Nature",
+     "frequency": "M",
+     "num_variates": 1
+   },
+   "temperature_rain": {
+     "domain": "Nature",
+     "frequency": "D",
+     "num_variates": 1
+   },
+   "kdd_cup_2018": {
+     "domain": "Nature",
+     "frequency": "D",
+     "num_variates": 1
+   },
+   "jena_weather": {
+     "domain": "Nature",
+     "frequency": "D",
+     "num_variates": 21
+   },
+   "car_parts": {
+     "domain": "Sales",
+     "frequency": "M",
+     "num_variates": 1
+   },
+   "restaurant": {
+     "domain": "Sales",
+     "frequency": "D",
+     "num_variates": 1
+   },
+   "hierarchical_sales": {
+     "domain": "Sales",
+     "frequency": "W-WED",
+     "num_variates": 1
+   },
+   "loop_seattle": {
+     "domain": "Transport",
+     "frequency": "D",
+     "num_variates": 1
+   },
+   "sz_taxi": {
+     "domain": "Transport",
+     "frequency": "H",
+     "num_variates": 1
+   },
+   "m_dense": {
+     "domain": "Transport",
+     "frequency": "D",
+     "num_variates": 1
+   },
+   "bitbrains_fast_storage": {
+     "domain": "Web/CloudOps",
+     "frequency": "H",
+     "num_variates": 2
+   },
+   "bitbrains_rnd": {
+     "domain": "Web/CloudOps",
+     "frequency": "H",
+     "num_variates": 2
+   },
+   "bizitobs_application": {
+     "domain": "Web/CloudOps",
+     "frequency": "10S",
+     "num_variates": 2
+   },
+   "bizitobs_service": {
+     "domain": "Web/CloudOps",
+     "frequency": "10S",
+     "num_variates": 2
+   },
+   "bizitobs_l2c": {
+     "domain": "Web/CloudOps",
+     "frequency": "H",
+     "num_variates": 7
+   },
+   "dd_benchmark_short": {
+     "domain": "Web/Observability",
+     "frequency": "Short",
+     "num_variates": 32
+   },
+   "dd_benchmark_long": {
+     "domain": "Web/Observability",
+     "frequency": "Long",
+     "num_variates": 32
+   }
+ }
data/nan_stats.json ADDED
The diff for this file is too large to render.
 
examples/generate_synthetic_data.py ADDED
@@ -0,0 +1,204 @@
+ import logging
+ import os
+ from typing import List, Optional
+
+ import torch
+
+ from src.data.containers import BatchTimeSeriesContainer
+ from src.data.utils import sample_future_length
+ from src.plotting.plot_timeseries import plot_from_container
+ from src.synthetic_generation.anomalies.anomaly_generator_wrapper import (
+     AnomalyGeneratorWrapper,
+ )
+ from src.synthetic_generation.cauker.cauker_generator_wrapper import (
+     CauKerGeneratorWrapper,
+ )
+ from src.synthetic_generation.forecast_pfn_prior.forecast_pfn_generator_wrapper import (
+     ForecastPFNGeneratorWrapper,
+ )
+ from src.synthetic_generation.generator_params import (
+     AnomalyGeneratorParams,
+     CauKerGeneratorParams,
+     FinancialVolatilityAudioParams,
+     ForecastPFNGeneratorParams,
+     GPGeneratorParams,
+     KernelGeneratorParams,
+     MultiScaleFractalAudioParams,
+     NetworkTopologyAudioParams,
+     OrnsteinUhlenbeckProcessGeneratorParams,
+     SawToothGeneratorParams,
+     SineWaveGeneratorParams,
+     SpikesGeneratorParams,
+     StepGeneratorParams,
+     StochasticRhythmAudioParams,
+ )
+ from src.synthetic_generation.gp_prior.gp_generator_wrapper import GPGeneratorWrapper
+ from src.synthetic_generation.kernel_synth.kernel_generator_wrapper import (
+     KernelGeneratorWrapper,
+ )
+ from src.synthetic_generation.ornstein_uhlenbeck_process.ou_generator_wrapper import (
+     OrnsteinUhlenbeckProcessGeneratorWrapper,
+ )
+ from src.synthetic_generation.sawtooth.sawtooth_generator_wrapper import (
+     SawToothGeneratorWrapper,
+ )
+ from src.synthetic_generation.sine_waves.sine_wave_generator_wrapper import (
+     SineWaveGeneratorWrapper,
+ )
+ from src.synthetic_generation.spikes.spikes_generator_wrapper import (
+     SpikesGeneratorWrapper,
+ )
+ from src.synthetic_generation.steps.step_generator_wrapper import StepGeneratorWrapper
+
+ PYO_AVAILABLE = True
+ try:
+     import pyo  # requires portaudio to be installed
+ except (ImportError, OSError):
+     PYO_AVAILABLE = False
+ else:
+     from src.synthetic_generation.audio_generators.financial_volatility_wrapper import (
+         FinancialVolatilityAudioWrapper,
+     )
+     from src.synthetic_generation.audio_generators.multi_scale_fractal_wrapper import (
+         MultiScaleFractalAudioWrapper,
+     )
+     from src.synthetic_generation.audio_generators.network_topology_wrapper import (
+         NetworkTopologyAudioWrapper,
+     )
+     from src.synthetic_generation.audio_generators.stochastic_rhythm_wrapper import (
+         StochasticRhythmAudioWrapper,
+     )
+
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+ )
+ logger = logging.getLogger(__name__)
+
+
+ def visualize_batch_sample(
+     generator,
+     batch_size: int = 8,
+     output_dir: str = "outputs/plots",
+     sample_idx: Optional[int] = None,
+     prefix: str = "",
+     seed: Optional[int] = None,
+ ) -> None:
+     os.makedirs(output_dir, exist_ok=True)
+     name = generator.__class__.__name__
+     logger.info(f"[{name}] Generating batch of size {batch_size}")
+
+     batch = generator.generate_batch(batch_size=batch_size, seed=seed)
+     values = torch.from_numpy(batch.values)
+     if values.ndim == 2:
+         values = values.unsqueeze(-1)
+
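+     # Sample a forecast horizon and hold out the tail of each series as the future target.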
+     future_length = sample_future_length(range="gift_eval")
+     history_values = values[:, :-future_length, :]
+     future_values = values[:, -future_length:, :]
+
+     container = BatchTimeSeriesContainer(
+         history_values=history_values,
+         future_values=future_values,
+         start=batch.start,
+         frequency=batch.frequency,
+     )
+
+     indices = [sample_idx] if sample_idx is not None else range(batch_size)
+     for i in indices:
+         filename = (
+             f"{prefix}_{name.lower().replace('generatorwrapper', '')}_sample_{i}.png"
+         )
+         output_file = os.path.join(output_dir, filename)
+         title = f"{prefix.capitalize()} {name.replace('GeneratorWrapper', '')} Synthetic Series (Sample {i})"
+         plot_from_container(
+             container, sample_idx=i, output_file=output_file, show=False, title=title
+         )
+         logger.info(f"[{name}] Saved plot to {output_file}")
+
+
+ def generator_factory(global_seed: int, total_length: int) -> List:
+     generators = [
+         KernelGeneratorWrapper(
+             KernelGeneratorParams(global_seed=global_seed, length=total_length)
+         ),
+         GPGeneratorWrapper(
+             GPGeneratorParams(global_seed=global_seed, length=total_length)
+         ),
+         ForecastPFNGeneratorWrapper(
+             ForecastPFNGeneratorParams(global_seed=global_seed, length=total_length)
+         ),
+         SineWaveGeneratorWrapper(
+             SineWaveGeneratorParams(global_seed=global_seed, length=total_length)
+         ),
+         SawToothGeneratorWrapper(
+             SawToothGeneratorParams(global_seed=global_seed, length=total_length)
+         ),
+         StepGeneratorWrapper(
+             StepGeneratorParams(global_seed=global_seed, length=total_length)
+         ),
+         AnomalyGeneratorWrapper(
+             AnomalyGeneratorParams(global_seed=global_seed, length=total_length)
+         ),
+         SpikesGeneratorWrapper(
+             SpikesGeneratorParams(global_seed=global_seed, length=total_length)
+         ),
+         CauKerGeneratorWrapper(
+             CauKerGeneratorParams(
+                 global_seed=global_seed, length=total_length, num_channels=5
+             )
+         ),
+         OrnsteinUhlenbeckProcessGeneratorWrapper(
+             OrnsteinUhlenbeckProcessGeneratorParams(
+                 global_seed=global_seed, length=total_length
+             )
+         ),
+     ]
+
+     if PYO_AVAILABLE:
+         generators.extend(
+             [
+                 StochasticRhythmAudioWrapper(
+                     StochasticRhythmAudioParams(
+                         global_seed=global_seed, length=total_length
+                     )
+                 ),
+                 FinancialVolatilityAudioWrapper(
+                     FinancialVolatilityAudioParams(
+                         global_seed=global_seed, length=total_length
+                     )
+                 ),
+                 MultiScaleFractalAudioWrapper(
+                     MultiScaleFractalAudioParams(
+                         global_seed=global_seed, length=total_length
+                     )
+                 ),
+                 NetworkTopologyAudioWrapper(
+                     NetworkTopologyAudioParams(
+                         global_seed=global_seed, length=total_length
+                     )
+                 ),
+             ]
+         )
+     else:
+         logger.warning("Audio generators skipped (pyo not available)")
+
+     return generators
+
+
+ if __name__ == "__main__":
+     batch_size = 2
+     total_length = 2048
+     output_dir = "outputs/plots"
+     global_seed = 2025
+
+     logger.info(f"Saving plots to {output_dir}")
+
+     for gen in generator_factory(global_seed, total_length):
+         prefix = "multivariate" if getattr(gen.params, "num_channels", 1) > 1 else ""
+         visualize_batch_sample(
+             gen,
+             batch_size=batch_size,
+             output_dir=output_dir,
+             prefix=prefix,
+             seed=global_seed,
+         )
examples/gift_eval/gift_eval_runner.py ADDED
@@ -0,0 +1,251 @@
+ #!/usr/bin/env python
+ """
+ GIFT-Eval Runner Script
+
+ This script evaluates the Time Series model on GIFT-Eval datasets using the `src/gift_eval` pipeline.
+
+ - Uses `src/gift_eval/data.py` for dataset handling.
+ - Uses `src/gift_eval/predictor.TimeSeriesPredictor` for inference.
+ - Loads a model from a checkpoint.
+ - Writes per-dataset CSV metrics to `output_dir` without creating plots.
+ """
+
+ import argparse
+ import logging
+ from pathlib import Path
+ from typing import List, Optional
+
+ from examples.utils import download_checkpoint_if_needed
+ from src.gift_eval.constants import ALL_DATASETS
+ from src.gift_eval.evaluate import evaluate_datasets
+ from src.gift_eval.predictor import TimeSeriesPredictor
+ from src.gift_eval.results import aggregate_results, write_results_to_disk
+
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+ )
+ logging.getLogger("matplotlib").setLevel(logging.WARNING)
+ logging.getLogger("matplotlib.font_manager").setLevel(logging.WARNING)
+ logger = logging.getLogger("gift_eval_runner")
+
+
+ def _expand_datasets_arg(datasets_arg: List[str] | str) -> List[str]:
+     """Expand dataset argument to list of dataset names."""
+     if isinstance(datasets_arg, str):
+         if datasets_arg == "all":
+             return list(ALL_DATASETS)
+         datasets_list = [datasets_arg]
+     else:
+         datasets_list = datasets_arg
+         if datasets_list and datasets_list[0] == "all":
+             return list(ALL_DATASETS)
+
+     for ds in datasets_list:
+         if ds not in ALL_DATASETS:
+             raise ValueError(f"Invalid dataset: {ds}. Use one of {ALL_DATASETS}")
+     return datasets_list
+
+
+ def run_evaluation(
+     predictor: TimeSeriesPredictor,
+     datasets_arg: List[str] | str,
+     terms_arg: List[str],
+     dataset_storage_path: str,
+     max_windows_arg: Optional[int],
+     batch_size_arg: int,
+     max_context_length_arg: Optional[int],
+     output_dir_arg: str,
+     model_name_arg: str,
+     after_each_dataset_flush: bool = True,
+ ) -> None:
+     """Run evaluation on specified datasets."""
+     datasets_to_run = _expand_datasets_arg(datasets_arg)
+     results_root = Path(output_dir_arg)
+
+     for ds_name in datasets_to_run:
+         items = evaluate_datasets(
+             predictor=predictor,
+             dataset=ds_name,
+             dataset_storage_path=dataset_storage_path,
+             terms=terms_arg,
+             max_windows=max_windows_arg,
+             batch_size=batch_size_arg,
+             max_context_length=max_context_length_arg,
+             create_plots=False,
+             max_plots_per_dataset=0,
+         )
+         write_results_to_disk(
+             items=items,
+             dataset_name=ds_name,
+             output_dir=results_root,
+             model_name=model_name_arg,
+             create_plots=False,
+         )
+         if after_each_dataset_flush:
+             logger.info("Flushed results for %s", ds_name)
+
+
+ def main():
+     """Main execution function."""
+     parser = argparse.ArgumentParser(
+         description="GIFT-Eval Runner: Evaluate TimeSeriesModel on GIFT-Eval datasets"
+     )
+
+     # Model configuration
+     parser.add_argument(
+         "--model_path",
+         type=str,
+         default=None,
+         help="Path to model checkpoint. If not provided, will download from checkpoint_url.",
+     )
+     parser.add_argument(
+         "--config_path",
+         type=str,
+         default="configs/example.yaml",
+         help="Path to model config YAML (default: configs/example.yaml)",
+     )
+     parser.add_argument(
+         "--checkpoint_url",
+         type=str,
+         default="https://www.dropbox.com/scl/fi/mqsni5lehooyaw93y3uzq/checkpoint_38M.pth?rlkey=3uyehvmtted02xkha24zgpzb6&st=seevsbkn&dl=0",
+         help="URL to download checkpoint from if model_path is not provided",
+     )
+     parser.add_argument(
+         "--download_dir",
+         type=str,
+         default="models",
+         help="Directory to download checkpoint to (default: models)",
+     )
+
+     # Dataset configuration
+     parser.add_argument(
+         "--datasets",
+         type=str,
+         nargs="+",
+         default=["all"],
+         help='List of dataset names or ["all"] (default: all)',
+     )
+     parser.add_argument(
+         "--terms",
+         type=str,
+         nargs="+",
+         default=["short", "medium", "long"],
+         help="Prediction terms to evaluate (default: short medium long)",
+     )
+     parser.add_argument(
+         "--dataset_storage_path",
+         type=str,
+         default="/work/dlclarge2/moroshav-GiftEvalPretrain/gift_eval",
+         # required=True,
+         help="Path to the root of the gift eval datasets storage directory",
+     )
+     parser.add_argument(
+         "--max_windows",
+         type=int,
+         default=20,
+         help="Maximum number of windows to use for evaluation (default: 20)",
+     )
+
+     # Inference configuration
+     parser.add_argument(
+         "--batch_size",
+         type=int,
+         default=64,
+         help="Batch size for inference (default: 64)",
+     )
+     parser.add_argument(
+         "--max_context_length",
+         type=int,
+         default=3072,
+         help="Maximum context length (default: 3072)",
+     )
+
+     # Output configuration
+     parser.add_argument(
+         "--output_dir",
+         type=str,
+         default="gift_eval_results",
+         help="Output directory for results (default: gift_eval_results)",
+     )
+     parser.add_argument(
+         "--model_name",
+         type=str,
+         default="TempoPFN",
+         help="Model name identifier for results (default: TempoPFN)",
+     )
+     parser.add_argument(
+         "--no_flush",
+         action="store_true",
+         help="Disable flushing results after each dataset",
+     )
+
+     args = parser.parse_args()
+
+     # Resolve paths
+     config_path = Path(args.config_path)
+     download_dir = Path(args.download_dir)
+     output_dir = Path(args.output_dir)
+
+     # Determine model path
+     resolved_model_path = None
+     if args.model_path:
+         resolved_model_path = args.model_path
+     elif args.checkpoint_url:
+         resolved_model_path = download_checkpoint_if_needed(
+             args.checkpoint_url, target_dir=download_dir
+         )
+
+     if not resolved_model_path:
+         raise FileNotFoundError(
+             "No model checkpoint provided. Set --model_path or --checkpoint_url."
+         )
+
+     if not config_path.exists():
+         raise FileNotFoundError(f"Config not found: {config_path}")
+
+     logger.info("Loading predictor from checkpoint: %s", resolved_model_path)
+     predictor = TimeSeriesPredictor.from_paths(
+         model_path=resolved_model_path,
+         config_path=str(config_path),
+         ds_prediction_length=1,  # placeholder; set per dataset
+         ds_freq="D",  # placeholder; set per dataset
+         batch_size=args.batch_size,
+         max_context_length=args.max_context_length,
+     )
+
+     logger.info("Starting evaluation...")
+     logger.info("  Datasets: %s", args.datasets)
+     logger.info("  Terms: %s", args.terms)
+     logger.info("  Output directory: %s", output_dir)
+
+     # Run evaluation
+     run_evaluation(
+         predictor=predictor,
+         datasets_arg=args.datasets,
+         terms_arg=args.terms,
+         dataset_storage_path=args.dataset_storage_path,
+         max_windows_arg=args.max_windows,
+         batch_size_arg=args.batch_size,
+         max_context_length_arg=args.max_context_length,
+         output_dir_arg=str(output_dir),
+         model_name_arg=args.model_name,
+         after_each_dataset_flush=not args.no_flush,
+     )
+
+     logger.info("Evaluation complete. See results under: %s", output_dir)
+
+     # Aggregate all results into a single CSV file
+     logger.info("Aggregating results from all datasets...")
+     combined_df = aggregate_results(result_root_dir=output_dir)
+
+     if combined_df is not None:
+         logger.info("Successfully created aggregated results file: %s/all_results.csv", output_dir)
+     else:
+         logger.warning("No results to aggregate. Check that evaluation completed successfully.")
+
+
+ if __name__ == "__main__":
+     main()
+
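For reference, a typical invocation of the runner above (the dataset name comes from ALL_DATASETS; the storage path is a placeholder):

```bash
python examples/gift_eval/gift_eval_runner.py \
    --datasets m4_hourly \
    --terms short \
    --dataset_storage_path /path/to/gift_eval \
    --output_dir gift_eval_results
```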
examples/gift_eval/gift_eval_submission.ipynb ADDED
@@ -0,0 +1,1439 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "id": "e8a9f0b1",
+    "metadata": {},
+    "source": [
+     "# Running TempoPFN on GIFT-Eval Benchmark\n",
+     "\n",
+     "This notebook evaluates the **TempoPFN** model on the GIFT-Eval benchmark. \n",
+     "\n",
+     "Make sure you download the gift-eval benchmark and set the `GIFT_EVAL_DATASET_STORAGE_PATH` environment variable correctly before running this notebook."
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "f1d2e3c4",
+    "metadata": {},
+    "source": [
+     "## 1. Setup and Dependencies\n",
+     "\n",
+     "First, install the required packages. \n",
+     "\n",
+     "**Note:** This notebook assumes that the core `TempoPFN` model code (e.g., `src.models.model`, `src.data.containers`) and dependencies are installed as a Python package or are otherwise available in the `PYTHONPATH`."
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "b9c8d7e6",
+    "metadata": {},
+    "source": [
+     "## 2. Imports\n",
+     "\n",
+     "Import all necessary libraries. "
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "c7d8e9f0",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import json\n",
+     "import logging\n",
+     "import os\n",
+     "import math\n",
+     "import csv\n",
+     "import glob\n",
+     "import argparse\n",
+     "import warnings\n",
+     "import yaml\n",
+     "from pathlib import Path\n",
+     "from typing import List, Optional, Dict, Tuple, Union, Iterator, Iterable, Any\n",
+     "from functools import cached_property\n",
+     "from enum import Enum\n",
+     "from dataclasses import dataclass\n",
+     "\n",
+     "import pandas as pd\n",
+     "import numpy as np\n",
+     "import torch\n",
+     "from torch.nn.parallel import DistributedDataParallel as DDP\n",
+     "from dotenv import load_dotenv\n",
+     "\n",
+     "# GluonTS and Data Handling\n",
+     "import datasets\n",
+     "import pyarrow.compute as pc\n",
+     "from gluonts.dataset import DataEntry\n",
+     "from gluonts.dataset.common import ProcessDataEntry\n",
+     "from gluonts.dataset.split import TestData, TrainingDataset, split\n",
+     "from gluonts.itertools import Map\n",
+     "from gluonts.time_feature import norm_freq_str, get_seasonality\n",
+     "from gluonts.transform import Transformation\n",
+     "from pandas.tseries.frequencies import to_offset\n",
+     "from toolz import compose\n",
+     "\n",
+     "# GluonTS Evaluation\n",
+     "from gluonts.ev.metrics import (\n",
+     "    MAE,\n",
+     "    MAPE,\n",
+     "    MASE,\n",
+     "    MSE,\n",
+     "    MSIS,\n",
+     "    ND,\n",
+     "    NRMSE,\n",
+     "    RMSE,\n",
+     "    SMAPE,\n",
+     "    MeanWeightedSumQuantileLoss,\n",
+     ")\n",
+     "from gluonts.model.evaluation import evaluate_model\n",
+     "from gluonts.model.forecast import QuantileForecast\n",
+     "from gluonts.model.predictor import Predictor\n",
+     "\n",
+     "# Plotting and Warnings\n",
+     "import matplotlib\n",
+     "import matplotlib.pyplot as plt\n",
+     "from linear_operator.utils.cholesky import NumericalWarning\n",
+     "\n",
+     "# --- TempoPFN Core Model Imports ---\n",
+     "# These are assumed to be installed or in the PYTHONPATH\n",
+     "from src.data.containers import BatchTimeSeriesContainer\n",
+     "from src.data.frequency import parse_frequency\n",
+     "from src.data.scalers import RobustScaler\n",
+     "from src.models.model import TimeSeriesModel\n",
+     "from src.utils.utils import device\n",
+     "\n",
+     "# --- Setup Logging ---\n",
+     "logging.basicConfig(level=logging.INFO, format=\"%(asctime)s - %(levelname)s - %(message)s\")\n",
+     "logging.getLogger(\"matplotlib\").setLevel(logging.WARNING)\n",
+     "logging.getLogger(\"matplotlib.font_manager\").setLevel(logging.WARNING)\n",
+     "logging.getLogger(\"PIL\").setLevel(logging.WARNING)\n",
+     "logger = logging.getLogger(\"gift_eval_runner\")\n",
+     "\n",
+     "# Filter out specific gluonts warnings\n",
+     "class WarningFilter(logging.Filter):\n",
+     "    def __init__(self, text_to_filter: str) -> None:\n",
+     "        super().__init__()\n",
+     "        self.text_to_filter = text_to_filter\n",
+     "\n",
+     "    def filter(self, record: logging.LogRecord) -> bool:\n",
+     "        return self.text_to_filter not in record.getMessage()\n",
+     "\n",
+     "gts_logger = logging.getLogger(\"gluonts.model.forecast\")\n",
+     "gts_logger.addFilter(\n",
+     "    WarningFilter(\"The mean prediction is not stored in the forecast data\")\n",
+     ")\n",
+     "\n",
+     "# Filter out numerical warnings\n",
+     "warnings.filterwarnings(\"ignore\", category=NumericalWarning)\n",
+     "warnings.filterwarnings(\"ignore\", category=FutureWarning)\n",
+     "warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n",
+     "\n",
+     "# Load environment variables (e.g., GIFT_EVAL_DATASET_STORAGE_PATH)\n",
+     "load_dotenv()"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "d6e7f8a1",
+    "metadata": {},
+    "source": [
+     "## 3. Constants and Configuration\n",
+     "\n",
+     "Define dataset lists, metrics, and other constants following GIFT-Eval standards."
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "g4h5j6k7",
+    "metadata": {},
+    "source": [
+     "### 3.1. Constants "
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "h5j6k7l8",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Environment setup\n",
+     "os.environ[\"CUBLAS_WORKSPACE_CONFIG\"] = \":4096:8\"\n",
+     "\n",
+     "# Use absolute path relative to the project root\n",
+     "_MODULE_DIR = Path.cwd().parent.parent  # Assumes notebook is in `examples/gift_eval/`\n",
+     "DATASET_PROPERTIES_PATH = _MODULE_DIR / \"data\" / \"dataset_properties.json\"\n",
+     "\n",
+     "try:\n",
+     "    with open(DATASET_PROPERTIES_PATH, \"r\") as f:\n",
+     "        DATASET_PROPERTIES = json.load(f)\n",
+     "except Exception as exc:  # pragma: no cover - logging path\n",
+     "    DATASET_PROPERTIES = {}\n",
+     "    logger.warning(\n",
+     "        \"Could not load dataset properties from %s: %s. Domain and num_variates will fall back to defaults.\",\n",
+     "        DATASET_PROPERTIES_PATH,\n",
+     "        exc,\n",
+     "    )\n",
+     "\n",
+     "# Datasets\n",
+     "SHORT_DATASETS = (\n",
+     "    \"m4_yearly\",\n",
+     "    \"m4_quarterly\",\n",
+     "    \"m4_monthly\",\n",
+     "    \"m4_weekly\",\n",
+     "    \"m4_daily\",\n",
+     "    \"m4_hourly\",\n",
+     "    \"electricity/15T\",\n",
+     "    \"electricity/H\",\n",
+     "    \"electricity/D\",\n",
+     "    \"electricity/W\",\n",
+     "    \"solar/10T\",\n",
+     "    \"solar/H\",\n",
+     "    \"solar/D\",\n",
+     "    \"solar/W\",\n",
+     "    \"hospital\",\n",
+     "    \"covid_deaths\",\n",
+     "    \"us_births/D\",\n",
+     "    \"us_births/M\",\n",
+     "    \"us_births/W\",\n",
+     "    \"saugeenday/D\",\n",
+     "    \"saugeenday/M\",\n",
+     "    \"saugeenday/W\",\n",
+     "    \"temperature_rain_with_missing\",\n",
+     "    \"kdd_cup_2018_with_missing/H\",\n",
+     "    \"kdd_cup_2018_with_missing/D\",\n",
+     "    \"car_parts_with_missing\",\n",
+     "    \"restaurant\",\n",
+     "    \"hierarchical_sales/D\",\n",
+     "    \"hierarchical_sales/W\",\n",
+     "    \"LOOP_SEATTLE/5T\",\n",
+     "    \"LOOP_SEATTLE/H\",\n",
+     "    \"LOOP_SEATTLE/D\",\n",
+     "    \"SZ_TAXI/15T\",\n",
+     "    \"SZ_TAXI/H\",\n",
+     "    \"M_DENSE/H\",\n",
+     "    \"M_DENSE/D\",\n",
+     "    \"ett1/15T\",\n",
+     "    \"ett1/H\",\n",
+     "    \"ett1/D\",\n",
+     "    \"ett1/W\",\n",
+     "    \"ett2/15T\",\n",
+     "    \"ett2/H\",\n",
+     "    \"ett2/D\",\n",
+     "    \"ett2/W\",\n",
+     "    \"jena_weather/10T\",\n",
+     "    \"jena_weather/H\",\n",
+     "    \"jena_weather/D\",\n",
+     "    \"bitbrains_fast_storage/5T\",\n",
+     "    \"bitbrains_fast_storage/H\",\n",
+     "    \"bitbrains_rnd/5T\",\n",
+     "    \"bitbrains_rnd/H\",\n",
+     "    \"bizitobs_application\",\n",
+     "    \"bizitobs_service\",\n",
+     "    \"bizitobs_l2c/5T\",\n",
+     "    \"bizitobs_l2c/H\",\n",
+     ")\n",
+     "\n",
+     "MED_LONG_DATASETS = (\n",
+     "    \"electricity/15T\",\n",
+     "    \"electricity/H\",\n",
+     "    \"solar/10T\",\n",
+     "    \"solar/H\",\n",
+     "    \"kdd_cup_2018_with_missing/H\",\n",
+     "    \"LOOP_SEATTLE/5T\",\n",
+     "    \"LOOP_SEATTLE/H\",\n",
+     "    \"SZ_TAXI/15T\",\n",
+     "    \"M_DENSE/H\",\n",
+     "    \"ett1/15T\",\n",
+     "    \"ett1/H\",\n",
+     "    \"ett2/15T\",\n",
+     "    \"ett2/H\",\n",
+     "    \"jena_weather/10T\",\n",
+     "    \"jena_weather/H\",\n",
+     "    \"bitbrains_fast_storage/5T\",\n",
+     "    \"bitbrains_rnd/5T\",\n",
+     "    \"bizitobs_application\",\n",
+     "    \"bizitobs_service\",\n",
+     "    \"bizitobs_l2c/5T\",\n",
+     "    \"bizitobs_l2c/H\",\n",
+     ")\n",
+     "\n",
+     "# Preserve insertion order\n",
+     "ALL_DATASETS = list(dict.fromkeys(SHORT_DATASETS + MED_LONG_DATASETS))\n",
+     "\n",
+     "# Evaluation terms\n",
+     "TERMS = (\"short\", \"medium\", \"long\")\n",
+     "\n",
+     "# Pretty names mapping\n",
+     "PRETTY_NAMES = {\n",
+     "    \"saugeenday\": \"saugeen\",\n",
+     "    \"temperature_rain_with_missing\": \"temperature_rain\",\n",
+     "    \"kdd_cup_2018_with_missing\": \"kdd_cup_2018\",\n",
+     "    \"car_parts_with_missing\": \"car_parts\",\n",
+     "}\n",
+     "\n",
+     "# Metrics\n",
+     "METRICS = (\n",
+     "    MSE(forecast_type=\"mean\"),\n",
+     "    MSE(forecast_type=0.5),\n",
+     "    MAE(),\n",
+     "    MASE(),\n",
+     "    MAPE(),\n",
+     "    SMAPE(),\n",
+     "    MSIS(),\n",
+     "    RMSE(),\n",
+     "    NRMSE(),\n",
+     "    ND(),\n",
+     "    MeanWeightedSumQuantileLoss(\n",
+     "        quantile_levels=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]\n",
+     "    ),\n",
+     ")\n",
+     "\n",
+     "# Standard metric names for CSV header\n",
+     "STANDARD_METRIC_NAMES = (\n",
+     "    \"MSE[mean]\",\n",
+     "    \"MSE[0.5]\",\n",
+     "    \"MAE[0.5]\",\n",
+     "    \"MASE[0.5]\",\n",
+     "    \"MAPE[0.5]\",\n",
+     "    \"sMAPE[0.5]\",\n",
+     "    \"MSIS\",\n",
+     "    \"RMSE[mean]\",\n",
+     "    \"NRMSE[mean]\",\n",
+     "    \"ND[0.5]\",\n",
+     "    \"mean_weighted_sum_quantile_loss\",\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "i7j8k9l0",
+    "metadata": {},
+    "source": [
+     "### 3.2. Core Data Structures "
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "j8k9l0m1",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "@dataclass\n",
+     "class DatasetMetadata:\n",
+     "    \"\"\"Structured description of a dataset/term combination.\"\"\"\n",
+     "\n",
+     "    full_name: str\n",
+     "    key: str\n",
+     "    freq: str\n",
+     "    term: str\n",
+     "    season_length: int\n",
+     "    target_dim: int\n",
+     "    to_univariate: bool\n",
+     "    prediction_length: int\n",
+     "    windows: int\n",
+     "\n",
+     "\n",
+     "@dataclass\n",
+     "class EvaluationItem:\n",
+     "    \"\"\"Container for evaluation results and optional figures.\"\"\"\n",
+     "\n",
+     "    dataset_metadata: DatasetMetadata\n",
+     "    metrics: Dict\n",
+     "    figures: List[Tuple[object, str]]\n",
+     "\n",
+     "\n",
+     "DatasetSelection = Union[List[str], Tuple[str, ...], str]\n",
+     "\n",
+     "\n",
+     "def expand_datasets_arg(datasets: DatasetSelection) -> List[str]:\n",
+     "    \"\"\"Normalize dataset selection strings to explicit lists.\"\"\"\n",
+     "\n",
+     "    if isinstance(datasets, str):\n",
+     "        dataset_list = [datasets]\n",
+     "    else:\n",
+     "        dataset_list = list(datasets)\n",
+     "\n",
+     "    if not dataset_list:\n",
+     "        return []\n",
+     "\n",
+     "    if dataset_list[0] == \"all\":\n",
364
+ " return list(ALL_DATASETS)\n",
365
+ "\n",
366
+ " for dataset in dataset_list:\n",
367
+ " if dataset not in ALL_DATASETS:\n",
368
+ " raise ValueError(f\"Invalid dataset: {dataset}. Use one of {ALL_DATASETS}\")\n",
369
+ "\n",
370
+ " return dataset_list"
371
+ ]
372
+ },
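+ {
+ "cell_type": "markdown",
+ "id": "y3z4a5b6",
+ "metadata": {},
+ "source": [
+ "A few illustrative calls, given the constants defined above:\n",
+ "\n",
+ "```python\n",
+ "expand_datasets_arg(\"m4_weekly\")     # -> [\"m4_weekly\"]\n",
+ "expand_datasets_arg([\"all\"])         # -> list(ALL_DATASETS)\n",
+ "expand_datasets_arg([\"not_a_name\"])  # raises ValueError\n",
+ "```"
+ ]
+ },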
373
+ {
374
+ "cell_type": "markdown",
375
+ "id": "k9l0m1n2",
376
+ "metadata": {},
377
+ "source": [
378
+ "### 3.3. GIFT-Eval Dataset Class (`data.py`)\n",
379
+ "\n",
380
+ "The `Dataset` class handles loading and preprocessing GIFT-Eval benchmark datasets. This implementation is adapted from the official GIFT-Eval repository."
381
+ ]
382
+ },
383
+ {
384
+ "cell_type": "code",
385
+ "execution_count": null,
386
+ "id": "l0m1n2o3",
387
+ "metadata": {},
388
+ "outputs": [],
389
+ "source": [
390
+ "TEST_SPLIT = 0.1\n",
391
+ "MAX_WINDOW = 20\n",
392
+ "\n",
393
+ "M4_PRED_LENGTH_MAP = {\n",
394
+ " \"A\": 6,\n",
395
+ " \"Q\": 8,\n",
396
+ " \"M\": 18,\n",
397
+ " \"W\": 13,\n",
398
+ " \"D\": 14,\n",
399
+ " \"H\": 48,\n",
400
+ " \"h\": 48,\n",
401
+ " \"Y\": 6,\n",
402
+ "}\n",
403
+ "\n",
404
+ "PRED_LENGTH_MAP = {\n",
405
+ " \"M\": 12,\n",
406
+ " \"W\": 8,\n",
407
+ " \"D\": 30,\n",
408
+ " \"H\": 48,\n",
409
+ " \"h\": 48,\n",
410
+ " \"T\": 48,\n",
411
+ " \"S\": 60,\n",
412
+ " \"s\": 60,\n",
413
+ " \"min\": 48,\n",
414
+ "}\n",
415
+ "\n",
416
+ "TFB_PRED_LENGTH_MAP = {\n",
417
+ " \"A\": 6,\n",
418
+ " \"Y\": 6,\n",
419
+ " \"H\": 48,\n",
420
+ " \"h\": 48,\n",
421
+ " \"Q\": 8,\n",
422
+ " \"D\": 14,\n",
423
+ " \"M\": 18,\n",
424
+ " \"W\": 13,\n",
425
+ " \"U\": 8,\n",
426
+ " \"T\": 8,\n",
427
+ " \"min\": 8,\n",
428
+ " \"us\": 8,\n",
429
+ "}\n",
430
+ "\n",
431
+ "\n",
432
+ "class Term(Enum):\n",
433
+ " SHORT = \"short\"\n",
434
+ " MEDIUM = \"medium\"\n",
435
+ " LONG = \"long\"\n",
436
+ "\n",
437
+ " @property\n",
438
+ " def multiplier(self) -> int:\n",
439
+ " if self == Term.SHORT:\n",
440
+ " return 1\n",
441
+ " elif self == Term.MEDIUM:\n",
442
+ " return 10\n",
443
+ " elif self == Term.LONG:\n",
444
+ " return 15\n",
445
+ "\n",
446
+ "\n",
447
+ "def itemize_start(data_entry: DataEntry) -> DataEntry:\n",
448
+ " data_entry[\"start\"] = data_entry[\"start\"].item()\n",
449
+ " return data_entry\n",
450
+ "\n",
451
+ "\n",
452
+ "class MultivariateToUnivariate(Transformation):\n",
453
+ " def __init__(self, field):\n",
454
+ " self.field = field\n",
455
+ "\n",
456
+ " def __call__(\n",
457
+ " self, data_it: Iterable[DataEntry], is_train: bool = False\n",
458
+ " ) -> Iterator:\n",
459
+ " for data_entry in data_it:\n",
460
+ " item_id = data_entry[\"item_id\"]\n",
461
+ " val_ls = list(data_entry[self.field])\n",
462
+ " for id, val in enumerate(val_ls):\n",
463
+ " univariate_entry = data_entry.copy()\n",
464
+ " univariate_entry[self.field] = val\n",
465
+ " univariate_entry[\"item_id\"] = item_id + \"_dim\" + str(id)\n",
466
+ " yield univariate_entry\n",
467
+ "\n",
468
+ "\n",
469
+ "class Dataset:\n",
470
+ " def __init__(\n",
471
+ " self,\n",
472
+ " name: str,\n",
473
+ " term: Term | str = Term.SHORT,\n",
474
+ " to_univariate: bool = False,\n",
475
+ " storage_path: str = None,\n",
476
+ " max_windows: Optional[int] = None,\n",
477
+ " ):\n",
478
+ " storage_path = Path(storage_path)\n",
479
+ " self.hf_dataset = datasets.load_from_disk(str(storage_path / name)).with_format(\n",
480
+ " \"numpy\"\n",
481
+ " )\n",
482
+ " process = ProcessDataEntry(\n",
483
+ " self.freq,\n",
484
+ " one_dim_target=self.target_dim == 1,\n",
485
+ " )\n",
486
+ "\n",
487
+ " self.gluonts_dataset = Map(compose(process, itemize_start), self.hf_dataset)\n",
488
+ " if to_univariate:\n",
489
+ " self.gluonts_dataset = MultivariateToUnivariate(\"target\").apply(\n",
490
+ " self.gluonts_dataset\n",
491
+ " )\n",
492
+ "\n",
493
+ " self.term = Term(term)\n",
494
+ " self.name = name\n",
495
+ " self.max_windows = max_windows if max_windows is not None else MAX_WINDOW\n",
496
+ "\n",
497
+ " @cached_property\n",
498
+ " def prediction_length(self) -> int:\n",
499
+ " freq = norm_freq_str(to_offset(self.freq).name)\n",
500
+ " if freq.endswith(\"E\"):\n",
501
+ " freq = freq[:-1]\n",
502
+ " pred_len = (\n",
503
+ " M4_PRED_LENGTH_MAP[freq] if \"m4\" in self.name else PRED_LENGTH_MAP[freq]\n",
504
+ " )\n",
505
+ " return self.term.multiplier * pred_len\n",
506
+ "\n",
507
+ " @cached_property\n",
508
+ " def freq(self) -> str:\n",
509
+ " return self.hf_dataset[0][\"freq\"]\n",
510
+ "\n",
511
+ " @cached_property\n",
512
+ " def target_dim(self) -> int:\n",
513
+ " return (\n",
514
+ " target.shape[0]\n",
515
+ " if len((target := self.hf_dataset[0][\"target\"]).shape) > 1\n",
516
+ " else 1\n",
517
+ " )\n",
518
+ "\n",
519
+ " @cached_property\n",
520
+ " def past_feat_dynamic_real_dim(self) -> int:\n",
521
+ " if \"past_feat_dynamic_real\" not in self.hf_dataset[0]:\n",
522
+ " return 0\n",
523
+ " elif (\n",
524
+ " len(\n",
525
+ " (\n",
526
+ " past_feat_dynamic_real := self.hf_dataset[0][\n",
527
+ " \"past_feat_dynamic_real\"\n",
528
+ " ]\n",
529
+ " ).shape\n",
530
+ " )\n",
531
+ " > 1\n",
532
+ " ):\n",
533
+ " return past_feat_dynamic_real.shape[0]\n",
534
+ " else:\n",
535
+ " return 1\n",
536
+ "\n",
537
+ " @cached_property\n",
538
+ " def windows(self) -> int:\n",
539
+ " if \"m4\" in self.name:\n",
540
+ " return 1\n",
541
+ " w = math.ceil(TEST_SPLIT * self._min_series_length / self.prediction_length)\n",
542
+ " return min(max(1, w), self.max_windows)\n",
543
+ "\n",
544
+ " @cached_property\n",
545
+ " def _min_series_length(self) -> int:\n",
546
+ " if self.hf_dataset[0][\"target\"].ndim > 1:\n",
547
+ " lengths = pc.list_value_length(\n",
548
+ " pc.list_flatten(\n",
549
+ " pc.list_slice(self.hf_dataset.data.column(\"target\"), 0, 1)\n",
550
+ " )\n",
551
+ " )\n",
552
+ " else:\n",
553
+ " lengths = pc.list_value_length(self.hf_dataset.data.column(\"target\"))\n",
554
+ " return min(lengths.to_numpy())\n",
555
+ "\n",
556
+ " @cached_property\n",
557
+ " def sum_series_length(self) -> int:\n",
558
+ " if self.hf_dataset[0][\"target\"].ndim > 1:\n",
559
+ " lengths = pc.list_value_length(\n",
560
+ " pc.list_flatten(self.hf_dataset.data.column(\"target\"))\n",
561
+ " )\n",
562
+ " else:\n",
563
+ " lengths = pc.list_value_length(self.hf_dataset.data.column(\"target\"))\n",
564
+ " return sum(lengths.to_numpy())\n",
565
+ "\n",
566
+ " @property\n",
567
+ " def training_dataset(self) -> TrainingDataset:\n",
568
+ " training_dataset, _ = split(\n",
569
+ " self.gluonts_dataset, offset=-self.prediction_length * (self.windows + 1)\n",
570
+ " )\n",
571
+ " return training_dataset\n",
572
+ "\n",
573
+ " @property\n",
574
+ " def validation_dataset(self) -> TrainingDataset:\n",
575
+ " validation_dataset, _ = split(\n",
576
+ " self.gluonts_dataset, offset=-self.prediction_length * self.windows\n",
577
+ " )\n",
578
+ " return validation_dataset\n",
579
+ "\n",
580
+ " @property\n",
581
+ " def test_data(self) -> TestData:\n",
582
+ " _, test_template = split(\n",
583
+ " self.gluonts_dataset, offset=-self.prediction_length * self.windows\n",
584
+ " )\n",
585
+ " test_data = test_template.generate_instances(\n",
586
+ " prediction_length=self.prediction_length,\n",
587
+ " windows=self.windows,\n",
588
+ " distance=self.prediction_length,\n",
589
+ " )\n",
590
+ " return test_data"
591
+ ]
592
+ },
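+ {
+ "cell_type": "markdown",
+ "id": "z4a5b6c7",
+ "metadata": {},
+ "source": [
+ "A minimal usage sketch (the dataset name is illustrative; the storage path must point to your local GIFT-Eval data):\n",
+ "\n",
+ "```python\n",
+ "ds = Dataset(\n",
+ "    name=\"m4_weekly\",\n",
+ "    term=\"short\",\n",
+ "    storage_path=os.getenv(\"GIFT_EVAL_DATASET_STORAGE_PATH\"),\n",
+ ")\n",
+ "print(ds.freq, ds.prediction_length, ds.windows, ds.target_dim)\n",
+ "```"
+ ]
+ },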
593
+ {
594
+ "cell_type": "markdown",
595
+ "id": "m1n2o3p4",
596
+ "metadata": {},
597
+ "source": [
598
+ "### 3.4. Predictor Wrapper (`predictor.py`)\n",
599
+ "\n",
600
+ "This is the model-specific `TimeSeriesPredictor` class for `TempoPFN`. It wraps the core `TimeSeriesModel` and adapts it to the `gluonts`-style `Predictor` interface, which expects a `.predict()` method."
601
+ ]
602
+ },
603
+ {
604
+ "cell_type": "code",
605
+ "execution_count": null,
606
+ "id": "n2o3p4q5",
607
+ "metadata": {},
608
+ "outputs": [],
609
+ "source": [
610
+ "class TimeSeriesPredictor(Predictor):\n",
611
+ " \"\"\"Unified predictor for TimeSeriesModel supporting flexible construction.\"\"\"\n",
612
+ "\n",
613
+ " def __init__(\n",
614
+ " self,\n",
615
+ " model: TimeSeriesModel,\n",
616
+ " config: dict,\n",
617
+ " ds_prediction_length: int,\n",
618
+ " ds_freq: str,\n",
619
+ " batch_size: int = 32,\n",
620
+ " max_context_length: Optional[int] = None,\n",
621
+ " debug: bool = False,\n",
622
+ " ) -> None:\n",
623
+ " # Dataset-specific context (can be updated per dataset/term)\n",
624
+ " self.ds_prediction_length = ds_prediction_length\n",
625
+ " self.ds_freq = ds_freq\n",
626
+ " self.batch_size = batch_size\n",
627
+ " self.max_context_length = max_context_length\n",
628
+ " self.debug = debug\n",
629
+ "\n",
630
+ " # Persistent model/config (unwrap DDP if needed)\n",
631
+ " self.model = model.module if isinstance(model, DDP) else model\n",
632
+ " self.model.eval()\n",
633
+ " self.config = config\n",
634
+ "\n",
635
+ " # Initialize scaler (using same type as model)\n",
636
+ " scaler_type = self.config.get(\"TimeSeriesModel\", {}).get(\n",
637
+ " \"scaler\", \"custom_robust\"\n",
638
+ " )\n",
639
+ " epsilon = self.config.get(\"TimeSeriesModel\", {}).get(\"epsilon\", 1e-3)\n",
640
+ " if scaler_type == \"custom_robust\":\n",
641
+ " self.scaler = RobustScaler(epsilon=epsilon)\n",
642
+ " else:\n",
643
+ " raise ValueError(f\"Unsupported scaler type: {scaler_type}\")\n",
644
+ "\n",
645
+ " def set_dataset_context(\n",
646
+ " self,\n",
647
+ " prediction_length: Optional[int] = None,\n",
648
+ " freq: Optional[str] = None,\n",
649
+ " batch_size: Optional[int] = None,\n",
650
+ " max_context_length: Optional[int] = None,\n",
651
+ " ) -> None:\n",
652
+ " \"\"\"Update lightweight dataset-specific attributes without reloading the model.\"\"\"\n",
653
+ "\n",
654
+ " if prediction_length is not None:\n",
655
+ " self.ds_prediction_length = prediction_length\n",
656
+ " if freq is not None:\n",
657
+ " self.ds_freq = freq\n",
658
+ " if batch_size is not None:\n",
659
+ " self.batch_size = batch_size\n",
660
+ " if max_context_length is not None:\n",
661
+ " self.max_context_length = max_context_length\n",
662
+ "\n",
663
+ " @classmethod\n",
664
+ " def from_model(\n",
665
+ " cls,\n",
666
+ " model: TimeSeriesModel,\n",
667
+ " config: dict,\n",
668
+ " ds_prediction_length: int,\n",
669
+ " ds_freq: str,\n",
670
+ " batch_size: int = 32,\n",
671
+ " max_context_length: Optional[int] = None,\n",
672
+ " debug: bool = False,\n",
673
+ " ) -> \"TimeSeriesPredictor\":\n",
674
+ " return cls(\n",
675
+ " model=model,\n",
676
+ " config=config,\n",
677
+ " ds_prediction_length=ds_prediction_length,\n",
678
+ " ds_freq=ds_freq,\n",
679
+ " batch_size=batch_size,\n",
680
+ " max_context_length=max_context_length,\n",
681
+ " debug=debug,\n",
682
+ " )\n",
683
+ "\n",
684
+ " @classmethod\n",
685
+ " def from_paths(\n",
686
+ " cls,\n",
687
+ " model_path: str,\n",
688
+ " config_path: str,\n",
689
+ " ds_prediction_length: int,\n",
690
+ " ds_freq: str,\n",
691
+ " batch_size: int = 32,\n",
692
+ " max_context_length: Optional[int] = None,\n",
693
+ " debug: bool = False,\n",
694
+ " ) -> \"TimeSeriesPredictor\":\n",
695
+ " with open(config_path, \"r\") as f:\n",
696
+ " config = yaml.safe_load(f)\n",
697
+ " model = cls._load_model_from_path(config=config, model_path=model_path)\n",
698
+ " return cls(\n",
699
+ " model=model,\n",
700
+ " config=config,\n",
701
+ " ds_prediction_length=ds_prediction_length,\n",
702
+ " ds_freq=ds_freq,\n",
703
+ " batch_size=batch_size,\n",
704
+ " max_context_length=max_context_length,\n",
705
+ " debug=debug,\n",
706
+ " )\n",
707
+ "\n",
708
+ " @staticmethod\n",
709
+ " def _load_model_from_path(config: dict, model_path: str) -> TimeSeriesModel:\n",
710
+ " try:\n",
711
+ " model = TimeSeriesModel(**config[\"TimeSeriesModel\"]).to(device)\n",
712
+ " checkpoint = torch.load(model_path, map_location=device)\n",
713
+ " model.load_state_dict(checkpoint[\"model_state_dict\"])\n",
714
+ " model.eval()\n",
715
+ " logger.info(f\"Successfully loaded model from {model_path}\")\n",
716
+ " return model\n",
717
+ " except Exception as exc: # pragma: no cover - logging path\n",
718
+ " logger.error(f\"Failed to load model from {model_path}: {exc}\")\n",
719
+ " raise\n",
720
+ "\n",
721
+ " def predict(self, test_data_input) -> Iterator[QuantileForecast]:\n",
722
+ " \"\"\"Generate forecasts for the test data.\"\"\"\n",
723
+ "\n",
724
+ " if hasattr(test_data_input, \"__iter__\") and not isinstance(test_data_input, list):\n",
725
+ " test_data_input = list(test_data_input)\n",
726
+ " logger.debug(f\"Processing {len(test_data_input)} time series\")\n",
727
+ "\n",
728
+ " # Group series by their effective length (after optional truncation),\n",
729
+ " # then process each uniform-length group in sub-batches up to batch_size.\n",
730
+ " def _effective_length(entry) -> int:\n",
731
+ " target = entry[\"target\"]\n",
732
+ " if target.ndim == 1:\n",
733
+ " seq_len = len(target)\n",
734
+ " else:\n",
735
+ " # target shape is [num_channels, seq_len]\n",
736
+ " seq_len = target.shape[1]\n",
737
+ " if self.max_context_length is not None:\n",
738
+ " seq_len = min(seq_len, self.max_context_length)\n",
739
+ " return seq_len\n",
740
+ "\n",
741
+ " length_to_items: dict[int, List[tuple[int, object]]] = {}\n",
742
+ " for idx, entry in enumerate(test_data_input):\n",
743
+ " seq_len = _effective_length(entry)\n",
744
+ " length_to_items.setdefault(seq_len, []).append((idx, entry))\n",
745
+ "\n",
746
+ " total = len(test_data_input)\n",
747
+ " ordered_results: List[Optional[QuantileForecast]] = [None] * total\n",
748
+ "\n",
749
+ " for _, items in length_to_items.items():\n",
750
+ " for i in range(0, len(items), self.batch_size):\n",
751
+ " chunk = items[i : i + self.batch_size]\n",
752
+ " entries = [entry for (_orig_idx, entry) in chunk]\n",
753
+ " batch_forecasts = self._predict_batch(entries)\n",
754
+ " for forecast_idx, (orig_idx, _entry) in enumerate(chunk):\n",
755
+ " ordered_results[orig_idx] = batch_forecasts[forecast_idx]\n",
756
+ "\n",
757
+ " return ordered_results # type: ignore[return-value]\n",
758
+ "\n",
759
+ " def _predict_batch(self, test_data_batch: List) -> List[QuantileForecast]:\n",
760
+ " \"\"\"Generate predictions for a batch of time series.\"\"\"\n",
761
+ "\n",
762
+ " logger.debug(f\"Processing batch of size: {len(test_data_batch)}\")\n",
763
+ "\n",
764
+ " try:\n",
765
+ " batch_container = self._convert_to_batch_container(test_data_batch)\n",
766
+ "\n",
767
+ " if isinstance(device, torch.device):\n",
768
+ " device_type = device.type\n",
769
+ " else:\n",
770
+ " device_type = \"cuda\" if \"cuda\" in str(device).lower() else \"cpu\"\n",
771
+ " enable_autocast = device_type == \"cuda\"\n",
772
+ "\n",
773
+ " with torch.autocast(\n",
774
+ " device_type=device_type,\n",
775
+ " dtype=torch.bfloat16,\n",
776
+ " enabled=enable_autocast,\n",
777
+ " ):\n",
778
+ " with torch.no_grad():\n",
779
+ " model_output = self.model(batch_container, drop_enc_allow=False)\n",
780
+ "\n",
781
+ " forecasts = self._convert_to_forecasts(\n",
782
+ " model_output, test_data_batch, batch_container\n",
783
+ " )\n",
784
+ "\n",
785
+ " logger.debug(f\"Generated {len(forecasts)} forecasts\")\n",
786
+ " return forecasts\n",
787
+ " except Exception as exc: # pragma: no cover - logging path\n",
788
+ " logger.error(f\"Error in batch prediction: {exc}\")\n",
789
+ " raise\n",
790
+ "\n",
791
+ " def _convert_to_batch_container(\n",
792
+ " self, test_data_batch: List\n",
793
+ " ) -> BatchTimeSeriesContainer:\n",
794
+ " \"\"\"Convert gluonts test data to BatchTimeSeriesContainer.\"\"\"\n",
795
+ "\n",
796
+ " batch_size = len(test_data_batch)\n",
797
+ " history_values_list = []\n",
798
+ " start_dates = []\n",
799
+ " frequencies = []\n",
800
+ "\n",
801
+ " for entry in test_data_batch:\n",
802
+ " target = entry[\"target\"]\n",
803
+ "\n",
804
+ " if target.ndim == 1:\n",
805
+ " target = target.reshape(-1, 1)\n",
806
+ " else:\n",
807
+ " target = target.T\n",
808
+ "\n",
809
+ " if (\n",
810
+ " self.max_context_length is not None\n",
811
+ " and len(target) > self.max_context_length\n",
812
+ " ):\n",
813
+ " target = target[-self.max_context_length :]\n",
814
+ "\n",
815
+ " history_values_list.append(target)\n",
816
+ " start_dates.append(entry[\"start\"].to_timestamp().to_datetime64())\n",
817
+ " frequencies.append(parse_frequency(entry[\"freq\"]))\n",
818
+ "\n",
819
+ " history_values_np = np.stack(history_values_list, axis=0)\n",
820
+ " num_channels = history_values_np.shape[2]\n",
821
+ "\n",
822
+ " history_values = torch.tensor(\n",
823
+ " history_values_np, dtype=torch.float32, device=device\n",
824
+ " )\n",
825
+ "\n",
826
+ " future_values = torch.zeros(\n",
827
+ " (batch_size, self.ds_prediction_length, num_channels),\n",
828
+ " dtype=torch.float32,\n",
829
+ " device=device,\n",
830
+ " )\n",
831
+ "\n",
832
+ " return BatchTimeSeriesContainer(\n",
833
+ " history_values=history_values,\n",
834
+ " future_values=future_values,\n",
835
+ " start=start_dates,\n",
836
+ " frequency=frequencies,\n",
837
+ " )\n",
838
+ "\n",
839
+ " def _convert_to_forecasts(\n",
840
+ " self,\n",
841
+ " model_output: dict,\n",
842
+ " test_data_batch: List,\n",
843
+ " batch_container: BatchTimeSeriesContainer,\n",
844
+ " ) -> List[QuantileForecast]:\n",
845
+ " \"\"\"Convert model predictions to QuantileForecast objects.\"\"\"\n",
846
+ "\n",
847
+ " predictions = model_output[\"result\"]\n",
848
+ " scale_statistics = model_output[\"scale_statistics\"]\n",
849
+ "\n",
850
+ " if predictions.ndim == 4:\n",
851
+ " predictions_unscaled = self.scaler.inverse_scale(\n",
852
+ " predictions, scale_statistics\n",
853
+ " )\n",
854
+ " is_quantile = True\n",
855
+ " quantile_levels = self.model.quantiles\n",
856
+ " else:\n",
857
+ " predictions_unscaled = self.scaler.inverse_scale(\n",
858
+ " predictions, scale_statistics\n",
859
+ " )\n",
860
+ " is_quantile = False\n",
861
+ " quantile_levels = [0.5]\n",
862
+ "\n",
863
+ " forecasts: List[QuantileForecast] = []\n",
864
+ " for idx, entry in enumerate(test_data_batch):\n",
865
+ " history_length = int(batch_container.history_values.shape[1])\n",
866
+ " start_date = entry[\"start\"]\n",
867
+ " forecast_start = start_date + history_length\n",
868
+ "\n",
869
+ " if is_quantile:\n",
870
+ " pred_array = predictions_unscaled[idx].cpu().numpy()\n",
871
+ "\n",
872
+ " if pred_array.shape[1] == 1:\n",
873
+ " pred_array = pred_array.squeeze(1)\n",
874
+ " forecast_arrays = pred_array.T\n",
875
+ " else:\n",
876
+ " forecast_arrays = pred_array.transpose(2, 0, 1)\n",
877
+ "\n",
878
+ " forecast = QuantileForecast(\n",
879
+ " forecast_arrays=forecast_arrays,\n",
880
+ " forecast_keys=[str(q) for q in quantile_levels],\n",
881
+ " start_date=forecast_start,\n",
882
+ " )\n",
883
+ " else:\n",
884
+ " pred_array = predictions_unscaled[idx].cpu().numpy()\n",
885
+ "\n",
886
+ " if pred_array.shape[1] == 1:\n",
887
+ " pred_array = pred_array.squeeze(1)\n",
888
+ " forecast_arrays = pred_array.reshape(1, -1)\n",
889
+ " else:\n",
890
+ " forecast_arrays = pred_array.reshape(1, *pred_array.shape)\n",
891
+ "\n",
892
+ " forecast = QuantileForecast(\n",
893
+ " forecast_arrays=forecast_arrays,\n",
894
+ " forecast_keys=[\"0.5\"],\n",
895
+ " start_date=forecast_start,\n",
896
+ " )\n",
897
+ "\n",
898
+ " forecasts.append(forecast)\n",
899
+ "\n",
900
+ " return forecasts"
901
+ ]
902
+ },
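+ {
+ "cell_type": "markdown",
+ "id": "a5b6c7d8",
+ "metadata": {},
+ "source": [
+ "A minimal sketch of the intended call pattern (paths are placeholders; Section 4 shows how the real checkpoint is resolved, and `ds` is a `Dataset` as in Section 3.3):\n",
+ "\n",
+ "```python\n",
+ "predictor = TimeSeriesPredictor.from_paths(\n",
+ "    model_path=\"models/checkpoint_38M.pth\",\n",
+ "    config_path=\"configs/example.yaml\",\n",
+ "    ds_prediction_length=ds.prediction_length,\n",
+ "    ds_freq=ds.freq,\n",
+ ")\n",
+ "forecasts = list(predictor.predict(ds.test_data.input))\n",
+ "```"
+ ]
+ },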
903
+ {
904
+ "cell_type": "markdown",
905
+ "id": "o3p4q5r6",
906
+ "metadata": {},
907
+ "source": [
908
+ "### 3.5. Result Handling \n",
909
+ "\n",
910
+ "These functions handle writing the per-dataset metrics to CSV files and aggregating all results into a single `all_results.csv` at the end."
911
+ ]
912
+ },
913
+ {
914
+ "cell_type": "code",
915
+ "execution_count": null,
916
+ "id": "p4q5r6s7",
917
+ "metadata": {},
918
+ "outputs": [],
919
+ "source": [
920
+ "def _ensure_results_csv(csv_file_path: Path) -> None:\n",
921
+ " if not csv_file_path.exists():\n",
922
+ " csv_file_path.parent.mkdir(parents=True, exist_ok=True)\n",
923
+ " with open(csv_file_path, \"w\", newline=\"\") as csvfile:\n",
924
+ " writer = csv.writer(csvfile)\n",
925
+ " header = (\n",
926
+ " [\"dataset\", \"model\"]\n",
927
+ " + [f\"eval_metrics/{name}\" for name in STANDARD_METRIC_NAMES]\n",
928
+ " + [\"domain\", \"num_variates\"]\n",
929
+ " )\n",
930
+ " writer.writerow(header)\n",
931
+ "\n",
932
+ "\n",
933
+ "def write_results_to_disk(\n",
934
+ " items: List[EvaluationItem],\n",
935
+ " dataset_name: str,\n",
936
+ " output_dir: Path,\n",
937
+ " model_name: str,\n",
938
+ " create_plots: bool,\n",
939
+ ") -> None:\n",
940
+ " output_dir = output_dir / dataset_name\n",
941
+ " output_dir.mkdir(parents=True, exist_ok=True)\n",
942
+ " output_csv_path = output_dir / \"results.csv\"\n",
943
+ " _ensure_results_csv(output_csv_path)\n",
944
+ "\n",
945
+ " with open(output_csv_path, \"a\", newline=\"\") as csvfile:\n",
946
+ " writer = csv.writer(csvfile)\n",
947
+ " for item in items:\n",
948
+ " md: DatasetMetadata = item.dataset_metadata\n",
949
+ " metric_values: List[Optional[float]] = []\n",
950
+ " for metric_name in STANDARD_METRIC_NAMES:\n",
951
+ " value = item.metrics.get(metric_name, None)\n",
952
+ " if value is None:\n",
953
+ " metric_values.append(None)\n",
954
+ " else:\n",
955
+ " if (\n",
956
+ " hasattr(value, \"__len__\")\n",
957
+ " and not isinstance(value, (str, bytes))\n",
958
+ " and len(value) == 1\n",
959
+ " ):\n",
960
+ " value = value[0]\n",
961
+ " elif hasattr(value, \"item\"):\n",
962
+ " value = value.item()\n",
963
+ " metric_values.append(value)\n",
964
+ "\n",
965
+ " ds_key = md.key.lower()\n",
966
+ " props = DATASET_PROPERTIES.get(ds_key, {})\n",
967
+ " domain = props.get(\"domain\", \"unknown\")\n",
968
+ " num_variates = props.get(\n",
969
+ " \"num_variates\", 1 if md.to_univariate else md.target_dim\n",
970
+ " )\n",
971
+ "\n",
972
+ " row = [md.full_name, model_name] + metric_values + [domain, num_variates]\n",
973
+ " writer.writerow(row)\n",
974
+ "\n",
975
+ " if create_plots and item.figures and plt is not None:\n",
976
+ " plots_dir = output_dir / \"plots\" / md.key / md.term\n",
977
+ " plots_dir.mkdir(parents=True, exist_ok=True)\n",
978
+ " for fig, filename in item.figures:\n",
979
+ " filepath = plots_dir / filename\n",
980
+ " fig.savefig(filepath, dpi=300, bbox_inches=\"tight\")\n",
981
+ " plt.close(fig)\n",
982
+ "\n",
983
+ " logger.info(\n",
984
+ " \"Evaluation complete for dataset '%s'. Results saved to %s\",\n",
985
+ " dataset_name,\n",
986
+ " output_csv_path,\n",
987
+ " )\n",
988
+ " if create_plots:\n",
989
+ " logger.info(\"Plots saved under %s\", output_dir / \"plots\")\n",
990
+ "\n",
991
+ "\n",
992
+ "def get_all_datasets_full_name() -> List[str]:\n",
993
+ " \"\"\"Get all possible dataset full names for validation.\"\"\"\n",
994
+ "\n",
995
+ " terms = [\"short\", \"medium\", \"long\"]\n",
996
+ " datasets_full_names: List[str] = []\n",
997
+ "\n",
998
+ " for name in ALL_DATASETS:\n",
999
+ " for term in terms:\n",
1000
+ " if term in [\"medium\", \"long\"] and name not in MED_LONG_DATASETS:\n",
1001
+ " continue\n",
1002
+ "\n",
1003
+ " if \"/\" in name:\n",
1004
+ " ds_key, ds_freq = name.split(\"/\")\n",
1005
+ " ds_key = ds_key.lower()\n",
1006
+ " ds_key = PRETTY_NAMES.get(ds_key, ds_key)\n",
1007
+ " else:\n",
1008
+ " ds_key = name.lower()\n",
1009
+ " ds_key = PRETTY_NAMES.get(ds_key, ds_key)\n",
1010
+ " ds_freq = DATASET_PROPERTIES.get(ds_key, {}).get(\"frequency\")\n",
1011
+ "\n",
1012
+ " datasets_full_names.append(\n",
1013
+ " f\"{ds_key}/{ds_freq if ds_freq else 'unknown'}/{term}\"\n",
1014
+ " )\n",
1015
+ "\n",
1016
+ " return datasets_full_names\n",
1017
+ "\n",
1018
+ "\n",
1019
+ "def aggregate_results(result_root_dir: str | Path) -> pd.DataFrame | None:\n",
1020
+ " \"\"\"Aggregate results from multiple CSV files into a single dataframe.\"\"\"\n",
1021
+ "\n",
1022
+ " result_root = Path(result_root_dir)\n",
1023
+ "\n",
1024
+ " logger.info(\"Aggregating results in: %s\", result_root)\n",
1025
+ "\n",
1026
+ " result_files = glob.glob(f\"{result_root}/**/results.csv\", recursive=True)\n",
1027
+ "\n",
1028
+ " if not result_files:\n",
1029
+ " logger.error(\"No result files found!\")\n",
1030
+ " return None\n",
1031
+ "\n",
1032
+ " dataframes: List[pd.DataFrame] = []\n",
1033
+ " for file in result_files:\n",
1034
+ " try:\n",
1035
+ " df = pd.read_csv(file)\n",
1036
+ " if len(df) > 0:\n",
1037
+ " dataframes.append(df)\n",
1038
+ " else:\n",
1039
+ " logger.warning(\"Empty file: %s\", file)\n",
1040
+ " except pd.errors.EmptyDataError:\n",
1041
+ " logger.warning(\"Skipping empty file: %s\", file)\n",
1042
+ " except Exception as exc:\n",
1043
+ " logger.error(\"Error reading %s: %s\", file, exc)\n",
1044
+ "\n",
1045
+ " if not dataframes:\n",
1046
+ " logger.warning(\"No valid CSV files found to combine\")\n",
1047
+ " return None\n",
1048
+ "\n",
1049
+ " combined_df = pd.concat(dataframes, ignore_index=True).sort_values(\"dataset\")\n",
1050
+ "\n",
1051
+ " if len(combined_df) != len(set(combined_df.dataset)):\n",
1052
+ " duplicate_datasets = combined_df.dataset[\n",
1053
+ " combined_df.dataset.duplicated()\n",
1054
+ " ].tolist()\n",
1055
+ " logger.warning(\"Warning: Duplicate datasets found: %s\", duplicate_datasets)\n",
1056
+ " combined_df = combined_df.drop_duplicates(subset=[\"dataset\"], keep=\"first\")\n",
1057
+ " logger.info(\n",
1058
+ " \"Removed duplicates, %s unique datasets remaining\", len(combined_df)\n",
1059
+ " )\n",
1060
+ "\n",
1061
+ " logger.info(\"Combined results: %s datasets\", len(combined_df))\n",
1062
+ "\n",
1063
+ " all_datasets_full_name = get_all_datasets_full_name()\n",
1064
+ " completed_experiments = combined_df.dataset.tolist()\n",
1065
+ "\n",
1066
+ " completed_experiments_clean = [\n",
1067
+ " exp for exp in completed_experiments if exp in all_datasets_full_name\n",
1068
+ " ]\n",
1069
+ " missing_or_failed_experiments = [\n",
1070
+ " exp for exp in all_datasets_full_name if exp not in completed_experiments_clean\n",
1071
+ " ]\n",
1072
+ "\n",
1073
+ " logger.info(\"=== EXPERIMENT SUMMARY ===\")\n",
1074
+ " logger.info(\"Total expected datasets: %s\", len(all_datasets_full_name))\n",
1075
+ " logger.info(\"Completed experiments: %s\", len(completed_experiments_clean))\n",
1076
+ " logger.info(\"Missing/failed experiments: %s\", len(missing_or_failed_experiments))\n",
1077
+ "\n",
1078
+ " output_file = result_root / \"all_results.csv\"\n",
1079
+ " combined_df.to_csv(output_file, index=False)\n",
1080
+ " logger.info(\"Combined results saved to: %s\", output_file)\n",
1081
+ "\n",
1082
+ " return combined_df"
1083
+ ]
1084
+ },
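+ {
+ "cell_type": "markdown",
+ "id": "b6c7d8e9",
+ "metadata": {},
+ "source": [
+ "All per-dataset CSVs share the header written by `_ensure_results_csv`, so any single file can be inspected directly (the path below is illustrative, matching the layout produced by `write_results_to_disk`):\n",
+ "\n",
+ "```python\n",
+ "df = pd.read_csv(\"gift_eval_results/TempoPFN/m4_weekly/results.csv\")\n",
+ "print(df[[\"dataset\", \"eval_metrics/MASE[0.5]\", \"domain\", \"num_variates\"]])\n",
+ "```"
+ ]
+ },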
1085
+ {
1086
+ "cell_type": "markdown",
1087
+ "id": "q5r6s7t8",
1088
+ "metadata": {},
1089
+ "source": [
1090
+ "### 3.6. Evaluation Harness (`evaluate.py`)\n",
1091
+ "\n",
1092
+ "This is the main evaluation logic that iterates over dataset terms, prepares the data, calls the predictor, and gathers metrics."
1093
+ ]
1094
+ },
1095
+ {
1096
+ "cell_type": "code",
1097
+ "execution_count": null,
1098
+ "id": "r6s7t8u9",
1099
+ "metadata": {},
1100
+ "outputs": [],
1101
+ "source": [
1102
+ "def construct_evaluation_data(\n",
1103
+ " dataset_name: str,\n",
1104
+ " dataset_storage_path: str,\n",
1105
+ " terms: List[str] = [\"short\", \"medium\", \"long\"],\n",
1106
+ " max_windows: Optional[int] = None,\n",
1107
+ ") -> List[Tuple[Dataset, DatasetMetadata]]:\n",
1108
+ " \"\"\"Build datasets and rich metadata per term for a dataset name.\"\"\"\n",
1109
+ " sub_datasets: List[Tuple[Dataset, DatasetMetadata]] = []\n",
1110
+ "\n",
1111
+ " if \"/\" in dataset_name:\n",
1112
+ " ds_key, ds_freq = dataset_name.split(\"/\")\n",
1113
+ " ds_key = ds_key.lower()\n",
1114
+ " ds_key = PRETTY_NAMES.get(ds_key, ds_key)\n",
1115
+ " else:\n",
1116
+ " ds_key = dataset_name.lower()\n",
1117
+ " ds_key = PRETTY_NAMES.get(ds_key, ds_key)\n",
1118
+ " ds_freq = DATASET_PROPERTIES.get(ds_key, {}).get(\"frequency\")\n",
1119
+ "\n",
1120
+ " for term in terms:\n",
1121
+ " # Skip medium/long terms for datasets that don't support them\n",
1122
+ " if (\n",
1123
+ " term == \"medium\" or term == \"long\"\n",
1124
+ " ) and dataset_name not in MED_LONG_DATASETS:\n",
1125
+ " continue\n",
1126
+ "\n",
1127
+ " # Probe once to determine dimensionality\n",
1128
+ " probe_dataset = Dataset(\n",
1129
+ " name=dataset_name,\n",
1130
+ " term=term,\n",
1131
+ " to_univariate=False,\n",
1132
+ " storage_path=dataset_storage_path,\n",
1133
+ " max_windows=max_windows,\n",
1134
+ " )\n",
1135
+ "\n",
1136
+ " to_univariate = probe_dataset.target_dim > 1\n",
1137
+ "\n",
1138
+ " dataset = Dataset(\n",
1139
+ " name=dataset_name,\n",
1140
+ " term=term,\n",
1141
+ " to_univariate=to_univariate,\n",
1142
+ " storage_path=dataset_storage_path,\n",
1143
+ " max_windows=max_windows,\n",
1144
+ " )\n",
1145
+ "\n",
1146
+ " # Compute metadata\n",
1147
+ " season_length = get_seasonality(dataset.freq)\n",
1148
+ " actual_freq = ds_freq if ds_freq else dataset.freq\n",
1149
+ " \n",
1150
+ " metadata = DatasetMetadata(\n",
1151
+ " full_name=f\"{ds_key}/{actual_freq}/{term}\",\n",
1152
+ " key=ds_key,\n",
1153
+ " freq=actual_freq,\n",
1154
+ " term=term,\n",
1155
+ " season_length=season_length,\n",
1156
+ " target_dim=probe_dataset.target_dim,\n",
1157
+ " to_univariate=to_univariate,\n",
1158
+ " prediction_length=dataset.prediction_length,\n",
1159
+ " windows=dataset.windows,\n",
1160
+ " )\n",
1161
+ "\n",
1162
+ " sub_datasets.append((dataset, metadata))\n",
1163
+ "\n",
1164
+ " return sub_datasets\n",
1165
+ "\n",
1166
+ "\n",
1167
+ "def evaluate_datasets(\n",
1168
+ " predictor: TimeSeriesPredictor,\n",
1169
+ " dataset: str,\n",
1170
+ " dataset_storage_path: str,\n",
1171
+ " terms: List[str] = [\"short\", \"medium\", \"long\"],\n",
1172
+ " max_windows: Optional[int] = None,\n",
1173
+ " batch_size: int = 48,\n",
1174
+ " max_context_length: Optional[int] = 1024,\n",
1175
+ " create_plots: bool = False,\n",
1176
+ " max_plots_per_dataset: int = 10,\n",
1177
+ ") -> List[EvaluationItem]:\n",
1178
+ " \"\"\"Evaluate predictor on one dataset across the requested terms.\"\"\"\n",
1179
+ " sub_datasets = construct_evaluation_data(\n",
1180
+ " dataset_name=dataset,\n",
1181
+ " dataset_storage_path=dataset_storage_path,\n",
1182
+ " terms=terms,\n",
1183
+ " max_windows=max_windows,\n",
1184
+ " )\n",
1185
+ "\n",
1186
+ " results: List[EvaluationItem] = []\n",
1187
+ " for i, (sub_dataset, metadata) in enumerate(sub_datasets):\n",
1188
+ " logger.info(f\"Evaluating {i + 1}/{len(sub_datasets)}: {metadata.full_name}\")\n",
1189
+ " logger.info(f\" Dataset size: {len(sub_dataset.test_data)}\")\n",
1190
+ " logger.info(f\" Frequency: {sub_dataset.freq}\")\n",
1191
+ " logger.info(f\" Term: {metadata.term}\")\n",
1192
+ " logger.info(f\" Prediction length: {sub_dataset.prediction_length}\")\n",
1193
+ " logger.info(f\" Target dimensions: {sub_dataset.target_dim}\")\n",
1194
+ " logger.info(f\" Windows: {sub_dataset.windows}\")\n",
1195
+ "\n",
1196
+ " # Update context on the reusable predictor\n",
1197
+ " predictor.set_dataset_context(\n",
1198
+ " prediction_length=sub_dataset.prediction_length,\n",
1199
+ " freq=sub_dataset.freq,\n",
1200
+ " batch_size=batch_size,\n",
1201
+ " max_context_length=max_context_length,\n",
1202
+ " )\n",
1203
+ "\n",
1204
+ " res = evaluate_model(\n",
1205
+ " model=predictor,\n",
1206
+ " test_data=sub_dataset.test_data,\n",
1207
+ " metrics=METRICS,\n",
1208
+ " axis=None,\n",
1209
+ " mask_invalid_label=True,\n",
1210
+ " allow_nan_forecast=False,\n",
1211
+ " seasonality=metadata.season_length,\n",
1212
+ " )\n",
1213
+ "\n",
1214
+ " figs: List[Tuple[object, str]] = []\n",
1215
+ " if create_plots:\n",
1216
+ " # We are missing `src.plotting.gift_eval_utils.create_plots_for_dataset`\n",
1217
+ " # As this was not provided, plotting will be skipped.\n",
1218
+ " logger.warning(\"Plotting is enabled but `create_plots_for_dataset` is not defined. Skipping plot generation.\")\n",
1219
+ " pass\n",
1220
+ "\n",
1221
+ " results.append(\n",
1222
+ " EvaluationItem(dataset_metadata=metadata, metrics=res, figures=figs)\n",
1223
+ " )\n",
1224
+ "\n",
1225
+ " return results"
1226
+ ]
1227
+ },
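+ {
+ "cell_type": "markdown",
+ "id": "c7d8e9f0",
+ "metadata": {},
+ "source": [
+ "Before the full run in Section 5, the harness can be smoke-tested on a single small dataset (the values below are illustrative, and `predictor` must already be constructed):\n",
+ "\n",
+ "```python\n",
+ "items = evaluate_datasets(\n",
+ "    predictor=predictor,\n",
+ "    dataset=\"m4_weekly\",\n",
+ "    dataset_storage_path=os.getenv(\"GIFT_EVAL_DATASET_STORAGE_PATH\"),\n",
+ "    terms=[\"short\"],\n",
+ "    max_windows=1,\n",
+ ")\n",
+ "print(items[0].metrics)\n",
+ "```"
+ ]
+ },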
1228
+ {
1229
+ "cell_type": "markdown",
1230
+ "id": "s7t8u9v0",
1231
+ "metadata": {},
1232
+ "source": [
1233
+ "## 4. Configuration\n",
1234
+ "\n",
1235
+ "Set the parameters for the evaluation run. Update `config_path` and `checkpoint_url` to point to your model's files."
1236
+ ]
1237
+ },
1238
+ {
1239
+ "cell_type": "code",
1240
+ "execution_count": null,
1241
+ "id": "t8u9v0w1",
1242
+ "metadata": {},
1243
+ "outputs": [],
1244
+ "source": [
1245
+ "# --- Parameters ---\n",
1246
+ "model_path = None # e.g., \"/path/to/checkpoint.pth\"; if None, try checkpoint_url\n",
1247
+ "config_path = Path.cwd().parent.parent / \"configs/example.yaml\" \n",
1248
+ "checkpoint_url = \"https://www.dropbox.com/scl/fi/mqsni5lehooyaw93y3uzq/checkpoint_38M.pth?rlkey=3uyehvmtted02xkha24zgpzb6&st=seevsbkn&dl=0\" \n",
1249
+ "\n",
1250
+ "# --- Datasets and evaluation controls ---\n",
1251
+ "# Use a small subset for testing, e.g., [\"m4_weekly\"]\n",
1252
+ "datasets_arg = [\"all\"] # list of dataset names or [\"all\"]. \n",
1253
+ "terms = [\"short\", \"medium\", \"long\"]\n",
1254
+ "dataset_storage_path = os.getenv(\"GIFT_EVAL_DATASET_STORAGE_PATH\")\n",
1255
+ "max_windows = 20\n",
1256
+ "batch_size = 64\n",
1257
+ "max_context_length = 3072 \n",
1258
+ "\n",
1259
+ "# --- Output ---\n",
1260
+ "after_each_dataset_flush = True # write CSV as each dataset completes\n",
1261
+ "model_name = \"TempoPFN\"\n",
1262
+ "download_dir = Path.cwd().parent / \"models\"\n",
1263
+ "output_dir = Path.cwd().parent / \"gift_eval_results\" / model_name\n",
1264
+ "\n",
1265
+ "# --- Helper Functions ---\n",
1266
+ "\n",
1267
+ "def download_checkpoint_if_needed(url: str, target_dir: Path, target_filename: str = \"checkpoint.pth\") -> Path:\n",
1268
+ " \"\"\"Downloads a file from a URL if it doesn't exist.\"\"\"\n",
1269
+ " try:\n",
1270
+ " import requests\n",
1271
+ " except ImportError:\n",
1272
+ " logger.error(\"requests package not found. Please install it: pip install requests\")\n",
1273
+ " raise\n",
1274
+ " \n",
1275
+ " target_dir.mkdir(parents=True, exist_ok=True)\n",
1276
+ " target_file_path = target_dir / target_filename\n",
1277
+ " \n",
1278
+ " if target_file_path.exists():\n",
1279
+ " logger.info(f\"Checkpoint already exists: {target_file_path}\")\n",
1280
+ " return target_file_path\n",
1281
+ " \n",
1282
+ " logger.info(f\"Downloading checkpoint from {url} to {target_file_path}...\")\n",
1283
+ " \n",
1284
+ " # Handle Dropbox links\n",
1285
+ " if \"dropbox.com\" in url:\n",
1286
+ " url = url.replace(\"dl=0\", \"dl=1\").replace(\"st=\", \"dl=1&st=\")\n",
1287
+ " \n",
1288
+ " try:\n",
1289
+ " with requests.get(url, stream=True) as r:\n",
1290
+ " r.raise_for_status()\n",
1291
+ " with open(target_file_path, 'wb') as f:\n",
1292
+ " for chunk in r.iter_content(chunk_size=8192):\n",
1293
+ " f.write(chunk)\n",
1294
+ " logger.info(\"Download complete.\")\n",
1295
+ " return target_file_path\n",
1296
+ " except Exception as e:\n",
1297
+ " logger.error(f\"Failed to download checkpoint: {e}\")\n",
1298
+ " if target_file_path.exists():\n",
1299
+ " os.remove(target_file_path) # Clean up partial download\n",
1300
+ " raise\n",
1301
+ "\n",
1302
+ "def _load_yaml(path: str) -> dict:\n",
1303
+ " with open(path, \"r\") as f:\n",
1304
+ " return yaml.safe_load(f)"
1305
+ ]
1306
+ },
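+ {
+ "cell_type": "markdown",
+ "id": "d8e9f0a1",
+ "metadata": {},
+ "source": [
+ "`dataset_storage_path` is read from the `GIFT_EVAL_DATASET_STORAGE_PATH` environment variable. If it is not set via `.env`, define it and re-run the cell above (the path is a placeholder):\n",
+ "\n",
+ "```python\n",
+ "os.environ.setdefault(\"GIFT_EVAL_DATASET_STORAGE_PATH\", \"/path/to/gift_eval/datasets\")\n",
+ "```"
+ ]
+ },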
1307
+ {
1308
+ "cell_type": "markdown",
1309
+ "id": "u9v0w1x2",
1310
+ "metadata": {},
1311
+ "source": [
1312
+ "## 5. Main Evaluation Loop\n",
1313
+ "\n",
1314
+ "This cell sets up the predictor and runs the main evaluation loop over all specified datasets."
1315
+ ]
1316
+ },
1317
+ {
1318
+ "cell_type": "code",
1319
+ "execution_count": null,
1320
+ "id": "v0w1x2y3",
1321
+ "metadata": {},
1322
+ "outputs": [],
1323
+ "source": [
1324
+ "logger.info(\"Starting evaluation for model: %s\", model_name)\n",
1325
+ "\n",
1326
+ "# 1. Build predictor from a checkpoint\n",
1327
+ "resolved_model_path = None\n",
1328
+ "if model_path:\n",
1329
+ " resolved_model_path = model_path\n",
1330
+ "elif checkpoint_url:\n",
1331
+ " resolved_model_path = download_checkpoint_if_needed(\n",
1332
+ " checkpoint_url, \n",
1333
+ " target_dir=download_dir,\n",
1334
+ " target_filename=f\"{model_name}_checkpoint.pth\"\n",
1335
+ " )\n",
1336
+ "\n",
1337
+ "if not resolved_model_path or not Path(resolved_model_path).exists():\n",
1338
+ " raise FileNotFoundError(\n",
1339
+ " f\"No model checkpoint found. Set `model_path` or `checkpoint_url`. Tried: {resolved_model_path}\"\n",
1340
+ " )\n",
1341
+ "\n",
1342
+ "assert Path(config_path).exists(), f\"Config not found: {config_path}\"\n",
1343
+ "logger.info(\"Loading predictor from checkpoint: %s\", resolved_model_path)\n",
1344
+ "\n",
1345
+ "predictor = TimeSeriesPredictor.from_paths(\n",
1346
+ " model_path=resolved_model_path,\n",
1347
+ " config_path=config_path,\n",
1348
+ " ds_prediction_length=1, # placeholder; set per dataset\n",
1349
+ " ds_freq=\"D\", # placeholder; set per dataset\n",
1350
+ " batch_size=batch_size,\n",
1351
+ " max_context_length=max_context_length,\n",
1352
+ ")\n",
1353
+ "\n",
1354
+ "# 2. Run evaluation loop\n",
1355
+ "datasets_to_run = expand_datasets_arg(datasets_arg)\n",
1356
+ "results_root = Path(output_dir)\n",
1357
+ "\n",
1358
+ "for ds_name in datasets_to_run:\n",
1359
+ " try:\n",
1360
+ " items = evaluate_datasets(\n",
1361
+ " predictor=predictor,\n",
1362
+ " dataset=ds_name,\n",
1363
+ " dataset_storage_path=dataset_storage_path,\n",
1364
+ " terms=terms,\n",
1365
+ " max_windows=max_windows,\n",
1366
+ " batch_size=batch_size,\n",
1367
+ " max_context_length=max_context_length,\n",
1368
+ " create_plots=False, # Set to True if you implement plotting\n",
1369
+ " max_plots_per_dataset=0,\n",
1370
+ " )\n",
1371
+ " write_results_to_disk(\n",
1372
+ " items=items,\n",
1373
+ " dataset_name=ds_name,\n",
1374
+ " output_dir=results_root,\n",
1375
+ " model_name=model_name,\n",
1376
+ " create_plots=False,\n",
1377
+ " )\n",
1378
+ " if after_each_dataset_flush:\n",
1379
+ " logger.info(\"Flushed results for %s\", ds_name)\n",
1380
+ " except Exception as e:\n",
1381
+ " logger.error(f\"FAILED evaluation for dataset: {ds_name}. Error: {e} !!!\")\n",
1382
+ " logger.exception(e)\n",
1383
+ " continue # Continue to the next dataset\n",
1384
+ "\n",
1385
+ "print(f\"\\nEvaluation complete. See results under: {output_dir}\")"
1386
+ ]
1387
+ },
1388
+ {
1389
+ "cell_type": "markdown",
1390
+ "id": "w1x2y3z4",
1391
+ "metadata": {},
1392
+ "source": [
1393
+ "## 6. Aggregate Results\n",
1394
+ "\n",
1395
+ "Finally, we'll aggregate the individual CSV files into a single `all_results.csv` file for easy analysis, following the `gift-eval` convention."
1396
+ ]
1397
+ },
1398
+ {
1399
+ "cell_type": "code",
1400
+ "execution_count": null,
1401
+ "id": "x2y3z4a5",
1402
+ "metadata": {},
1403
+ "outputs": [],
1404
+ "source": [
1405
+ "logger.info(\"Aggregating results from all datasets...\")\n",
1406
+ "combined_df = aggregate_results(result_root_dir=output_dir)\n",
1407
+ "\n",
1408
+ "if combined_df is not None:\n",
1409
+ " agg_path = Path(output_dir) / \"all_results.csv\"\n",
1410
+ " logger.info(\"Successfully created aggregated results file: %s\", agg_path)\n",
1411
+ " print(f\"\\n✅ Aggregated results saved to: {agg_path}\")\n",
1412
+ " print(combined_df.head())\n",
1413
+ "else:\n",
1414
+ " logger.warning(\"No results to aggregate. Check that evaluation completed successfully.\")"
1415
+ ]
1416
+ }
1417
+ ],
1418
+ "metadata": {
1419
+ "kernelspec": {
1420
+ "display_name": "Python 3 (ipykernel)",
1421
+ "language": "python",
1422
+ "name": "python3"
1423
+ },
1424
+ "language_info": {
1425
+ "codemirror_mode": {
1426
+ "name": "ipython",
1427
+ "version": 3
1428
+ },
1429
+ "file_extension": ".py",
1430
+ "mimetype": "text/x-python",
1431
+ "name": "python",
1432
+ "nbconvert_exporter": "python",
1433
+ "pygments_lexer": "ipython3",
1434
+ "version": "3.12.9"
1435
+ }
1436
+ },
1437
+ "nbformat": 4,
1438
+ "nbformat_minor": 5
1439
+ }
examples/quick_start_tempo_pfn.ipynb ADDED
@@ -0,0 +1,280 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "231c6227",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Quick Start: Univariate Quantile Forecasting (CUDA, bfloat16)\n",
9
+ "\n",
10
+ "This notebook demonstrates how to:\n",
11
+ "- Generate synthetic sine wave time series data\n",
12
+ "- Pack data into `BatchTimeSeriesContainer`\n",
13
+ "- Load a pretrained model (from Dropbox)\n",
14
+ "- Run inference with bfloat16 on CUDA\n",
15
+ "- Visualize predictions\n"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "markdown",
20
+ "id": "bb6c5424-1c63-4cb0-a818-45d4199914e5",
21
+ "metadata": {},
22
+ "source": [
23
+ "## 1) Setup"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": null,
29
+ "id": "612a78e8",
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
34
+ "import torch\n",
35
+ "import numpy as np\n",
36
+ "from pathlib import Path\n",
37
+ "\n",
38
+ "# Ensure CUDA is available\n",
39
+ "if not torch.cuda.is_available():\n",
40
+ " raise RuntimeError(\"CUDA is required to run this demo. No CUDA device detected.\")\n",
41
+ "\n",
42
+ "device = torch.device(\"cuda:0\")\n",
43
+ "\n",
44
+ "# Resolve repository root to be robust to running from subdirectories (e.g., examples/)\n",
45
+ "repo_root = Path.cwd()\n",
46
+ "if not (repo_root / \"configs\").exists():\n",
47
+ " repo_root = repo_root.parent\n",
48
+ "\n",
49
+ "# Inline plotting\n",
50
+ "%matplotlib inline\n"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "markdown",
55
+ "id": "3facf37d-0a77-4222-8464-6e42182547f8",
56
+ "metadata": {},
57
+ "source": [
58
+ "## 2) Define Checkpoint Path"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": null,
64
+ "id": "16dcb883",
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "CHECKPOINT_DIR = repo_root / \"models\"\n",
69
+ "CHECKPOINT_NAME = \"checkpoint_38M.pth\" \n",
70
+ "CHECKPOINT_PATH = CHECKPOINT_DIR / CHECKPOINT_NAME\n",
71
+ "\n",
72
+ "# Ensure the models directory exists\n",
73
+ "CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True) \n",
74
+ "\n",
75
+ "if not CHECKPOINT_PATH.exists():\n",
76
+ " print(f\"--- WARNING: Checkpoint not found at: {CHECKPOINT_PATH} ---\")\n",
77
+ " print(\"Please ensure 'checkpoint_38M.pth' is in the 'models/' directory.\")\n",
78
+ " print(\"If you cloned from Hugging Face, you may need to run 'git lfs pull'.\")\n",
79
+ " raise FileNotFoundError(f\"Model checkpoint not found at {CHECKPOINT_PATH}\")\n",
80
+ "else:\n",
81
+ " print(f\"Using existing checkpoint at {CHECKPOINT_PATH}\")"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "markdown",
86
+ "id": "9be77e34-0c7a-4056-822f-ed2e3e090c40",
87
+ "metadata": {},
88
+ "source": [
89
+ "## 3) Generate synthetic sine wave data"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "id": "1127526c",
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "from src.synthetic_generation.generator_params import SineWaveGeneratorParams\n",
100
+ "from src.synthetic_generation.sine_waves.sine_wave_generator_wrapper import (\n",
101
+ " SineWaveGeneratorWrapper,\n",
102
+ ")\n",
103
+ "\n",
104
+ "batch_size = 3\n",
105
+ "total_length = 1024\n",
106
+ "seed = 2025\n",
107
+ "\n",
108
+ "sine_params = SineWaveGeneratorParams(global_seed=seed, length=total_length)\n",
109
+ "wrapper = SineWaveGeneratorWrapper(sine_params)\n",
110
+ "\n",
111
+ "batch = wrapper.generate_batch(batch_size=batch_size, seed=seed)\n",
112
+ "values = torch.from_numpy(batch.values).to(torch.float32)\n",
113
+ "if values.ndim == 2:\n",
114
+ " values = values.unsqueeze(-1) # [B, S, 1]\n",
115
+ "\n",
116
+ "future_length = 256\n",
117
+ "history_values = values[:, :-future_length, :]\n",
118
+ "future_values = values[:, -future_length:, :]\n",
119
+ "\n",
120
+ "print(\"History:\", history_values.shape, \"Future:\", future_values.shape)"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "markdown",
125
+ "id": "a8844488-e51c-4805-baa9-491bfc67e8ca",
126
+ "metadata": {},
127
+ "source": [
128
+ "## 4) Build BatchTimeSeriesContainer"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": null,
134
+ "id": "f3b4d361",
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
+ "from src.data.containers import BatchTimeSeriesContainer\n",
139
+ "\n",
140
+ "container = BatchTimeSeriesContainer(\n",
141
+ " history_values=history_values.to(device),\n",
142
+ " future_values=future_values.to(device),\n",
143
+ " start=batch.start,\n",
144
+ " frequency=batch.frequency,\n",
145
+ ")\n",
146
+ "\n",
147
+ "container.batch_size, container.history_length, container.future_length"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "markdown",
152
+ "id": "b5e7e790-a9aa-49c2-9d45-2dc823036883",
153
+ "metadata": {},
154
+ "source": [
155
+ "## 5) Load model and run inference"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": null,
161
+ "id": "1dd4e0e4",
162
+ "metadata": {},
163
+ "outputs": [],
164
+ "source": [
165
+ "import yaml\n",
166
+ "from src.models.model import TimeSeriesModel\n",
167
+ "\n",
168
+ "with open(repo_root / \"configs/example.yaml\", \"r\") as f:\n",
169
+ " config = yaml.safe_load(f)\n",
170
+ "\n",
171
+ "model = TimeSeriesModel(**config[\"TimeSeriesModel\"]).to(device)\n",
172
+ "ckpt = torch.load(CHECKPOINT_PATH, map_location=device)\n",
173
+ "model.load_state_dict(ckpt[\"model_state_dict\"])\n",
174
+ "model.eval()\n",
175
+ "\n",
176
+ "# bfloat16 autocast on CUDA\n",
177
+ "with (\n",
178
+ " torch.no_grad(),\n",
179
+ " torch.autocast(device_type=\"cuda\", dtype=torch.bfloat16, enabled=True),\n",
180
+ "):\n",
181
+ " output = model(container)\n",
182
+ "\n",
183
+ "preds = output[\"result\"].to(torch.float32)\n",
184
+ "if hasattr(model, \"scaler\") and \"scale_statistics\" in output:\n",
185
+ " preds = model.scaler.inverse_scale(preds, output[\"scale_statistics\"])\n",
186
+ "\n",
187
+ "preds.shape"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "markdown",
192
+ "id": "ba16120f-27c8-4462-91cb-c9b3e0630a9d",
193
+ "metadata": {},
194
+ "source": [
195
+ "## 6) Plot predictions"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": null,
201
+ "id": "9bf02a0b",
202
+ "metadata": {},
203
+ "outputs": [],
204
+ "source": [
205
+ "import matplotlib.pyplot as plt\n",
206
+ "\n",
207
+ "plt.set_loglevel(\"error\")\n",
208
+ "\n",
209
+ "# preds: [B, P, N, Q] for quantiles (univariate -> N=1)\n",
210
+ "preds_np = preds.cpu().numpy()\n",
211
+ "\n",
212
+ "batch_size = preds_np.shape[0]\n",
213
+ "prediction_length = preds_np.shape[1]\n",
214
+ "num_quantiles = preds_np.shape[-1]\n",
215
+ "\n",
216
+ "for i in range(batch_size):\n",
217
+ " fig, ax = plt.subplots(figsize=(12, 4))\n",
218
+ "\n",
219
+ " history = container.history_values[i, :, 0].detach().cpu().numpy()\n",
220
+ " future = container.future_values[i, :, 0].detach().cpu().numpy()\n",
221
+ "\n",
222
+ " # Time axes\n",
223
+ " hist_t = np.arange(len(history))\n",
224
+ " fut_t = np.arange(len(history), len(history) + len(future))\n",
225
+ "\n",
226
+ " # Plot history and ground truth future\n",
227
+ " ax.plot(hist_t, history, label=\"History\", color=\"black\")\n",
228
+ " ax.plot(fut_t, future, label=\"Ground Truth\", color=\"blue\")\n",
229
+ "\n",
230
+ " # Plot quantiles\n",
231
+ " median_idx = num_quantiles // 2\n",
232
+ " ax.plot(\n",
233
+ " fut_t,\n",
234
+ " preds_np[i, :, 0, median_idx],\n",
235
+ " label=\"Prediction (Median)\",\n",
236
+ " color=\"orange\",\n",
237
+ " linestyle=\"--\",\n",
238
+ " )\n",
239
+ " if num_quantiles >= 3:\n",
240
+ " ax.fill_between(\n",
241
+ " fut_t,\n",
242
+ " preds_np[i, :, 0, 0],\n",
243
+ " preds_np[i, :, 0, -1],\n",
244
+ " color=\"orange\",\n",
245
+ " alpha=0.2,\n",
246
+ " label=\"Prediction Interval\",\n",
247
+ " )\n",
248
+ "\n",
249
+ " ax.axvline(x=len(history), color=\"k\", linestyle=\":\", alpha=0.7)\n",
250
+ " ax.set_xlabel(\"Time Steps\")\n",
251
+ " ax.set_ylabel(\"Value\")\n",
252
+ " ax.set.title(f\"Sample {i + 1}\")\n",
253
+ " ax.legend()\n",
254
+ " ax.grid(True, alpha=0.3)\n",
255
+ " plt.show()"
256
+ ]
257
+ }
258
+ ],
259
+ "metadata": {
260
+ "kernelspec": {
261
+ "display_name": "Python 3 (ipykernel)",
262
+ "language": "python",
263
+ "name": "python3"
264
+ },
265
+ "language_info": {
266
+ "codemirror_mode": {
267
+ "name": "ipython",
268
+ "version": 3
269
+ },
270
+ "file_extension": ".py",
271
+ "mimetype": "text/x-python",
272
+ "name": "python",
273
+ "nbconvert_exporter": "python",
274
+ "pygments_lexer": "ipython3",
275
+ "version": "3.12.9"
276
+ }
277
+ },
278
+ "nbformat": 4,
279
+ "nbformat_minor": 5
280
+ }
examples/quick_start_tempo_pfn.py ADDED
@@ -0,0 +1,101 @@
1
+ import argparse
2
+ import logging
3
+ import os
4
+
5
+ import torch
6
+
7
+ from examples.utils import (
8
+ load_model,
9
+ run_inference_and_plot,
10
+ )
11
+ from src.data.containers import BatchTimeSeriesContainer
12
+ from src.synthetic_generation.generator_params import SineWaveGeneratorParams
13
+ from src.synthetic_generation.sine_waves.sine_wave_generator_wrapper import (
14
+ SineWaveGeneratorWrapper,
15
+ )
16
+
17
+ # Configure logging
18
+ logging.basicConfig(
19
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
20
+ )
21
+ logger = logging.getLogger(__name__)
22
+
23
+
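+ # Example invocation (run from the repository root so `examples` and `src` are
+ # importable; the values shown are the script defaults):
+ #   python -m examples.quick_start_tempo_pfn --config configs/example.yaml \
+ #       --checkpoint models/checkpoint_38M.pth --batch_size 3 --seed 42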
24
+ def main():
25
+ """Main execution function."""
26
+ # CLI
27
+ parser = argparse.ArgumentParser(description="Quick start demo for TimeSeriesModel")
28
+ parser.add_argument(
29
+ "--config",
30
+ default="configs/example.yaml",
31
+ help="Path to model config YAML (default: configs/example.yaml)",
32
+ )
33
+ parser.add_argument(
34
+ "--checkpoint",
35
+ default="models/checkpoint_38M.pth",
36
+ help="Path to model checkpoint file (default: models/checkpoint_38M.pth)",
37
+ )
38
+ parser.add_argument("--batch_size", type=int, default=3)
39
+ parser.add_argument("--total_length", type=int, default=2048)
40
+ parser.add_argument("--seed", type=int, default=42)
41
+ parser.add_argument("--output_dir", default="outputs")
42
+ args = parser.parse_args()
43
+
44
+ # Configuration
45
+ batch_size = args.batch_size
46
+ total_length = args.total_length
47
+ output_dir = args.output_dir
48
+ seed = args.seed
49
+ config_path = args.config
50
+ model_path = args.checkpoint
51
+
52
+
53
+ # Check if the checkpoint file exists
54
+ if not os.path.exists(model_path):
55
+ logger.error(f"Checkpoint file not found at: {model_path}")
56
+ logger.error(
57
+ "Please ensure 'checkpoint_38M.pth' is in the root directory"
58
+ " (or that you've cloned the repo with Git LFS)."
59
+ )
60
+ logger.error("You can also specify a different path using --checkpoint.")
61
+ return # Exit if no model
+
+     logger.info("=== Time Series Model Demo (Univariate Quantile) ===")
+
+     # 1) Generate synthetic sine wave data
+     sine_params = SineWaveGeneratorParams(global_seed=seed, length=total_length)
+     sine_generator = SineWaveGeneratorWrapper(sine_params)
+     batch = sine_generator.generate_batch(batch_size=batch_size, seed=seed)
+     values = torch.from_numpy(batch.values).to(torch.float32)
+     if values.ndim == 2:
+         values = values.unsqueeze(-1)  # Ensure [B, S, 1] for univariate
+     future_length = 256
+     history_values = values[:, :-future_length, :]
+     future_values = values[:, -future_length:, :]
+
+     # 2) Load the pretrained model (CUDA-only). This demo requires a CUDA GPU.
+     if not torch.cuda.is_available():
+         raise RuntimeError(
+             "CUDA is required to run this demo. No CUDA device detected."
+         )
+     device = torch.device("cuda:0")
+     model = load_model(config_path=config_path, model_path=model_path, device=device)
+
+     # 3) Pack tensors into the model's input container
+     container = BatchTimeSeriesContainer(
+         history_values=history_values.to(device),
+         future_values=future_values.to(device),
+         start=batch.start,
+         frequency=batch.frequency,
+     )
+
+     # 4) Run inference (bfloat16 on CUDA) and plot results
+     run_inference_and_plot(
+         model=model, container=container, output_dir=output_dir, use_bfloat16=True
+     )
+
+     logger.info("=== Demo completed successfully! ===")
+
+
+ if __name__ == "__main__":
+     main()
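
A minimal sketch of adapting the demo to your own data instead of the sine-wave generator. The array contents, the `start` timestamp, and the `frequency` string below are illustrative placeholders, not values taken from this repo; match them to your series:

import numpy as np
import torch

from src.data.containers import BatchTimeSeriesContainer

# Hypothetical data: 2 univariate series of length 1024; forecast the last 128 steps.
values = torch.from_numpy(np.random.randn(2, 1024, 1).astype(np.float32))
container = BatchTimeSeriesContainer(
    history_values=values[:, :-128, :].to("cuda"),
    future_values=values[:, -128:, :].to("cuda"),
    start=np.datetime64("2020-01-01"),  # placeholder start timestamp
    frequency="h",                      # placeholder frequency string
)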
examples/utils.py ADDED
@@ -0,0 +1,115 @@
+ import logging
+ import os
+ import urllib.request
+ from typing import List
+
+ import numpy as np
+ import torch
+ import yaml
+
+ from src.data.containers import BatchTimeSeriesContainer
+ from src.models.model import TimeSeriesModel
+ from src.plotting.plot_timeseries import plot_from_container
+
+ logger = logging.getLogger(__name__)
+
+
+ def load_model(
+     config_path: str, model_path: str, device: torch.device
+ ) -> TimeSeriesModel:
+     """Load the TimeSeriesModel from config and checkpoint."""
+     with open(config_path, "r") as f:
+         config = yaml.safe_load(f)
+
+     model = TimeSeriesModel(**config["TimeSeriesModel"]).to(device)
+     checkpoint = torch.load(model_path, map_location=device)
+     model.load_state_dict(checkpoint["model_state_dict"])
+     model.eval()
+     logger.info(f"Successfully loaded TimeSeriesModel from {model_path} on {device}")
+     return model
+
+
+ def download_checkpoint_if_needed(url: str, target_dir: str = "models") -> str:
+     """Download checkpoint from URL into target_dir if not present and return its path.
+
+     Ensures direct download for Dropbox links by forcing dl=1.
+     """
+     os.makedirs(target_dir, exist_ok=True)
+     target_path = os.path.join(target_dir, "checkpoint.pth")
+
+     # Normalize Dropbox URL to force direct download
+     if "dropbox.com" in url and "dl=0" in url:
+         url = url.replace("dl=0", "dl=1")
+
+     if not os.path.exists(target_path):
+         logger.info(f"Downloading checkpoint from {url} to {target_path}...")
+         urllib.request.urlretrieve(url, target_path)
+         logger.info("Checkpoint downloaded successfully.")
+     else:
+         logger.info(f"Using existing checkpoint at {target_path}")
+
+     return target_path
+
+
+ def plot_with_library(
+     container: BatchTimeSeriesContainer,
+     predictions_np: np.ndarray,  # [B, P, N, Q]
+     model_quantiles: List[float] | None,
+     output_dir: str = "outputs",
+     show_plots: bool = True,
+     save_plots: bool = True,
+ ):
+     """Plot each sample in the batch using the shared plotting utilities."""
+     os.makedirs(output_dir, exist_ok=True)
+     batch_size = container.batch_size
+     for i in range(batch_size):
+         output_file = (
+             os.path.join(output_dir, f"sine_wave_prediction_sample_{i + 1}.png")
+             if save_plots
+             else None
+         )
+         plot_from_container(
+             batch=container,
+             sample_idx=i,
+             predicted_values=predictions_np,
+             model_quantiles=model_quantiles,
+             title=f"Sine Wave Time Series Prediction - Sample {i + 1}",
+             output_file=output_file,
+             show=show_plots,
+         )
+
+
+ def run_inference_and_plot(
+     model: TimeSeriesModel,
+     container: BatchTimeSeriesContainer,
+     output_dir: str = "outputs",
+     use_bfloat16: bool = True,
+ ) -> None:
+     """Run model inference with optional bfloat16 and plot using shared utilities."""
+     device_type = "cuda" if (container.history_values.device.type == "cuda") else "cpu"
+     autocast_enabled = use_bfloat16 and device_type == "cuda"
+     with (
+         torch.no_grad(),
+         torch.autocast(
+             device_type=device_type, dtype=torch.bfloat16, enabled=autocast_enabled
+         ),
+     ):
+         model_output = model(container)
+
+     preds_full = model_output["result"].to(torch.float32)
+     if hasattr(model, "scaler") and "scale_statistics" in model_output:
+         preds_full = model.scaler.inverse_scale(
+             preds_full, model_output["scale_statistics"]
+         )
+
+     preds_np = preds_full.detach().cpu().numpy()
+     model_quantiles = (
+         model.quantiles if getattr(model, "loss_type", None) == "quantile" else None
+     )
+     plot_with_library(
+         container=container,
+         predictions_np=preds_np,
+         model_quantiles=model_quantiles,
+         output_dir=output_dir,
+         show_plots=True,
+         save_plots=True,
+     )
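
A short sketch of chaining these helpers together. The checkpoint URL here is a placeholder, not a real release link:

import torch

from examples.utils import download_checkpoint_if_needed, load_model

# Placeholder URL: substitute a real checkpoint link before running.
ckpt_path = download_checkpoint_if_needed("https://www.dropbox.com/s/abc/checkpoint.pth?dl=0")
model = load_model("configs/example.yaml", ckpt_path, torch.device("cuda:0"))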
gitignore ADDED
@@ -0,0 +1,167 @@
+ logs/
+ *.png
+ *.pth
+ # *.sh
+ *.slurm
+ *.pkl
+
+ wandb/
+ AutogluonModels/
+ .vscode/
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ .idea/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Datasets, logs, plots, etc.
+ outputs/
+
+ *.arrow
+ *.csv
+ *.png
+ *.pdf
+ *.gif
+ .DS_Store
models/checkpoint_38M.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a520c07e6f4dc6583b25a7129251c81eef15f168003766adf6ae4983db7b575b
+ size 498752361
pyproject.toml ADDED
@@ -0,0 +1,62 @@
+ [project]
+ name = "TempoPFN"
+ version = "0.1.0"
+ description = "Univariate Time Series Forecasting Using Linear RNNs"
+ authors = [
+     { name = "Vladyslav Moroshan" },
+     { name = "Julien Siems" },
+ ]
+ readme = "README.md"
+ license = { file = "LICENSE" }
+ requires-python = ">=3.10,<3.13"
+
+ dependencies = [
+     "torch>=2.5.0",
+     "torchmetrics",
+     "triton==3.2.0",
+     "numpy",
+     "pandas",
+     "matplotlib",
+     "gpytorch",
+     "flash-linear-attention @ git+https://github.com/fla-org/flash-linear-attention@main",
+     "scikit-learn",
+     "gluonts",
+     "notebook",
+     "datasets",
+     "ujson",
+ ]
+
+ classifiers = [
+     "Intended Audience :: Science/Research",
+     "Intended Audience :: Developers",
+     "License :: OSI Approved :: Apache Software License",
+     "Programming Language :: Python",
+     "Topic :: Software Development",
+     "Topic :: Scientific/Engineering",
+     "Operating System :: POSIX",
+     "Operating System :: Unix",
+     "Operating System :: MacOS",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "wandb",
+     "build",
+     "pre-commit",
+     "ruff",
+     "mypy",
+     "commitizen",
+     "black",
+     "cupy-cuda12x",
+     "statsmodels",
+     "pyo", # Requires portaudio
+ ]
+
+ [build-system]
+ requires = ["setuptools>=68.2.2", "wheel>=0.41.2"]
+ build-backend = "setuptools.build_meta"
+
+ [tool.setuptools]
+ package-dir = {"" = "src"}
requirements.txt ADDED
@@ -0,0 +1,25 @@
+ # 'torch' must be installed separately first, using the command
+ # from the README.md to match your specific CUDA version.
+
+ torchmetrics
+ triton==3.2.0
+ numpy
+ pandas
+ matplotlib
+ flash-linear-attention @ git+https://github.com/fla-org/flash-linear-attention@main
+ scikit-learn
+ gluonts
+ notebook
+ datasets
+ ujson
+ pyyaml
+ wandb
+ build
+ pre-commit
+ ruff
+ mypy
+ commitizen
+ black
+ cupy-cuda12x
+ statsmodels
+ pyo # Requires portaudio
src/__init__.py ADDED
File without changes
src/data/__init__.py ADDED
File without changes
src/data/augmentations.py ADDED
@@ -0,0 +1,1318 @@
+ import logging
+ import math
+ from collections import Counter
+ from pathlib import Path
+ from typing import Dict, List, Optional, Tuple
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from joblib import Parallel, delayed
+ from torch.quasirandom import SobolEngine
+
+ from src.gift_eval.data import Dataset
+
+ logger = logging.getLogger(__name__)
+
+
+ def find_consecutive_nan_lengths(series: np.ndarray) -> list[int]:
+     """Finds the lengths of all consecutive NaN blocks in a 1D array."""
+     if series.ndim > 1:
+         # For multivariate series, flatten to treat it as one long sequence
+         series = series.flatten()
+
+     is_nan = np.isnan(series)
+     padded_is_nan = np.concatenate(([False], is_nan, [False]))
+     diffs = np.diff(padded_is_nan.astype(int))
+
+     start_indices = np.where(diffs == 1)[0]
+     end_indices = np.where(diffs == -1)[0]
+
+     return (end_indices - start_indices).tolist()
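
As a quick illustration of the padded-diff trick above, a standalone sketch with a made-up input (padding with False means the diff is +1 at every run start and -1 one past every run end):

import numpy as np

# [1, NaN, NaN, 4, NaN] -> one run of length 2 and one of length 1
x = np.array([1.0, np.nan, np.nan, 4.0, np.nan])
print(find_consecutive_nan_lengths(x))  # [2, 1]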
+
+
+ def analyze_datasets_for_augmentation(gift_eval_path_str: str) -> dict:
+     """
+     Analyzes all datasets to derive statistics needed for NaN augmentation.
+     This version collects the full distribution of NaN ratios.
+     """
+     logger.info(
+         "--- Starting Dataset Analysis for Augmentation (Full Distribution) ---"
+     )
+     path = Path(gift_eval_path_str)
+     if not path.exists():
+         raise FileNotFoundError(
+             f"Provided raw data path for augmentation analysis does not exist: {gift_eval_path_str}"
+         )
+
+     dataset_names = []
+     for dataset_dir in path.iterdir():
+         if dataset_dir.name.startswith(".") or not dataset_dir.is_dir():
+             continue
+         freq_dirs = [d for d in dataset_dir.iterdir() if d.is_dir()]
+         if freq_dirs:
+             for freq_dir in freq_dirs:
+                 dataset_names.append(f"{dataset_dir.name}/{freq_dir.name}")
+         else:
+             dataset_names.append(dataset_dir.name)
+
+     total_series_count = 0
+     series_with_nans_count = 0
+     nan_ratio_distribution = []
+     all_consecutive_nan_lengths = Counter()
+
+     for ds_name in sorted(dataset_names):
+         try:
+             ds = Dataset(name=ds_name, term="short", to_univariate=False)
+             for series_data in ds.training_dataset:
+                 total_series_count += 1
+                 target = np.atleast_1d(series_data["target"])
+                 num_nans = np.isnan(target).sum()
+
+                 if num_nans > 0:
+                     series_with_nans_count += 1
+                     nan_ratio = num_nans / target.size
+                     nan_ratio_distribution.append(float(nan_ratio))
+
+                     nan_lengths = find_consecutive_nan_lengths(target)
+                     all_consecutive_nan_lengths.update(nan_lengths)
+         except Exception as e:
+             logger.warning(
+                 f"Could not process {ds_name} for augmentation analysis: {e}"
+             )
+
+     if total_series_count == 0:
+         raise ValueError(
+             "No series were found during augmentation analysis. Check dataset path."
+         )
+
+     p_series_has_nan = (
+         series_with_nans_count / total_series_count if total_series_count > 0 else 0
+     )
+
+     logger.info("--- Augmentation Analysis Complete ---")
+     # Print summary statistics
+     logger.info(f"Total series analyzed: {total_series_count}")
+     logger.info(f"Series with NaNs: {series_with_nans_count} ({p_series_has_nan:.4f})")
+     logger.info(f"NaN ratio distribution: {Counter(nan_ratio_distribution)}")
+     logger.info(f"Consecutive NaN lengths distribution: {all_consecutive_nan_lengths}")
+     logger.info("--- End of Dataset Analysis for Augmentation ---")
+     return {
+         "p_series_has_nan": p_series_has_nan,
+         "nan_ratio_distribution": nan_ratio_distribution,
+         "nan_length_distribution": all_consecutive_nan_lengths,
+     }
+
+
+ class NanAugmenter:
+     """
+     Applies realistic NaN augmentation by generating and caching NaN patterns on-demand
+     during the first transform call for a given data shape.
+     """
+
+     def __init__(
+         self,
+         p_series_has_nan: float,
+         nan_ratio_distribution: List[float],
+         nan_length_distribution: Counter,
+         num_patterns: int = 100000,
+         n_jobs: int = -1,
+         nan_patterns_path: Optional[str] = None,
+     ):
+         """
+         Initializes the augmenter. NaN patterns are not generated at this stage.
+
+         Args:
+             p_series_has_nan (float): Probability that a series in a batch will be augmented.
+             nan_ratio_distribution (List[float]): A list of NaN ratios observed in the dataset.
+             nan_length_distribution (Counter): A Counter of consecutive NaN block lengths.
+             num_patterns (int): The number of unique NaN patterns to generate per data shape.
+             n_jobs (int): The number of CPU cores to use for parallel pattern generation (-1 for all cores).
+             nan_patterns_path (Optional[str]): Optional file path used to locate and store cached patterns.
+         """
+         self.p_series_has_nan = p_series_has_nan
+         self.nan_ratio_distribution = nan_ratio_distribution
+         self.num_patterns = num_patterns
+         self.n_jobs = n_jobs
+         self.max_length = 2048
+         self.nan_patterns_path = nan_patterns_path
+         # Cache to store patterns: Dict[shape_tuple -> pattern_tensor]
+         self.pattern_cache: Dict[Tuple[int, ...], torch.BoolTensor] = {}
+
+         if not nan_length_distribution or sum(nan_length_distribution.values()) == 0:
+             self._has_block_distribution = False
+             logger.warning("NaN length distribution is empty. Augmentation disabled.")
+         else:
+             self._has_block_distribution = True
+             total_blocks = sum(nan_length_distribution.values())
+             self.dist_lengths = list(int(i) for i in nan_length_distribution.keys())
+             self.dist_probs = [
+                 count / total_blocks for count in nan_length_distribution.values()
+             ]
+
+         if not self.nan_ratio_distribution:
+             logger.warning("NaN ratio distribution is empty. Augmentation disabled.")
+
+         # Try to load existing patterns from disk
+         self._load_existing_patterns()
+
+     def _load_existing_patterns(self):
+         """Load existing NaN patterns from disk if they exist."""
+         # Determine where to look for patterns
+         explicit_path: Optional[Path] = (
+             Path(self.nan_patterns_path).resolve()
+             if self.nan_patterns_path is not None
+             else None
+         )
+
+         candidate_files: List[Path] = []
+         if explicit_path is not None:
+             # If the explicit path exists, use it directly
+             if explicit_path.is_file():
+                 candidate_files.append(explicit_path)
+             # Also search the directory of the explicit path for matching files
+             explicit_dir = explicit_path.parent
+             explicit_dir.mkdir(exist_ok=True, parents=True)
+             candidate_files.extend(
+                 list(explicit_dir.glob(f"nan_patterns_{self.max_length}_*.pt"))
+             )
+         else:
+             # Default to the ./data directory
+             data_dir = Path("data")
+             data_dir.mkdir(exist_ok=True)
+             candidate_files.extend(
+                 list(data_dir.glob(f"nan_patterns_{self.max_length}_*.pt"))
+             )
+
+         # De-duplicate candidate files while preserving order
+         seen: set[str] = set()
+         unique_candidates: List[Path] = []
+         for f in candidate_files:
+             key = str(f.resolve())
+             if key not in seen:
+                 seen.add(key)
+                 unique_candidates.append(f)
+
+         for pattern_file in unique_candidates:
+             try:
+                 # Extract num_channels from a filename of the form
+                 # nan_patterns_<max_length>_<num_channels>.pt
+                 filename = pattern_file.stem
+                 parts = filename.split("_")
+                 if len(parts) >= 4:
+                     num_channels = int(parts[-1])
+
+                     # Load patterns
+                     patterns = torch.load(pattern_file, map_location="cpu")
+                     cache_key = (self.max_length, num_channels)
+                     self.pattern_cache[cache_key] = patterns
+
+                     logger.info(
+                         f"Loaded {patterns.shape[0]} patterns for shape {cache_key} from {pattern_file}"
+                     )
+             except (ValueError, RuntimeError, FileNotFoundError) as e:
+                 logger.warning(f"Failed to load patterns from {pattern_file}: {e}")
+
+     def _get_pattern_file_path(self, num_channels: int) -> Path:
+         """Resolve the target file path for storing/loading patterns for a given channel count."""
+         # If the user provided a file path, use its directory as the base directory
+         if self.nan_patterns_path is not None:
+             base_dir = Path(self.nan_patterns_path).resolve().parent
+         else:
+             base_dir = Path("data").resolve()
+         base_dir.mkdir(exist_ok=True, parents=True)
+
+         return base_dir / f"nan_patterns_{self.max_length}_{num_channels}.pt"
+
+     def _generate_nan_mask(self, series_shape: Tuple[int, ...]) -> np.ndarray:
+         """Generates a single boolean NaN mask for a given series shape."""
+         series_size = int(np.prod(series_shape))
+         sampled_ratio = np.random.choice(self.nan_ratio_distribution)
+         n_nans_to_add = int(round(series_size * sampled_ratio))
+
+         if n_nans_to_add == 0:
+             return np.zeros(series_shape, dtype=bool)
+
+         mask_flat = np.zeros(series_size, dtype=bool)
+         nans_added = 0
+         max_attempts = n_nans_to_add * 2
+         attempts = 0
+         while nans_added < n_nans_to_add and attempts < max_attempts:
+             attempts += 1
+             block_length = np.random.choice(self.dist_lengths, p=self.dist_probs)
+
+             if nans_added + block_length > n_nans_to_add:
+                 block_length = n_nans_to_add - nans_added
+                 if block_length <= 0:
+                     break
+
+             # Count existing NaNs in every window of size block_length; a window
+             # with zero NaNs is a valid place to insert the new block.
+             nan_counts_in_window = np.convolve(
+                 mask_flat, np.ones(block_length), mode="valid"
+             )
+             valid_starts = np.where(nan_counts_in_window == 0)[0]
+
+             if valid_starts.size == 0:
+                 continue
+
+             start_pos = np.random.choice(valid_starts)
+             mask_flat[start_pos : start_pos + block_length] = True
+             nans_added += block_length
+
+         return mask_flat.reshape(series_shape)
+
+     def _pregenerate_patterns(self, series_shape: Tuple[int, ...]) -> torch.BoolTensor:
+         """Uses joblib to parallelize the generation of NaN masks for a given shape."""
+         if not self._has_block_distribution or not self.nan_ratio_distribution:
+             return torch.empty(0, *series_shape, dtype=torch.bool)
+
+         logger.info(
+             f"Generating {self.num_patterns} NaN patterns for shape {series_shape}..."
+         )
+
+         with Parallel(n_jobs=self.n_jobs, backend="loky") as parallel:
+             masks_list = parallel(
+                 delayed(self._generate_nan_mask)(series_shape)
+                 for _ in range(self.num_patterns)
+             )
+
+         logger.info(f"Pattern generation complete for shape {series_shape}.")
+         return torch.from_numpy(np.stack(masks_list)).bool()
+
+     def transform(self, time_series_batch: torch.Tensor) -> torch.Tensor:
+         """
+         Applies NaN patterns to a batch, generating them on-demand if the shape is new.
+         Note: the input batch is modified in place for the augmented series.
+         """
+         if self.p_series_has_nan == 0:
+             return time_series_batch
+
+         history_length, num_channels = time_series_batch.shape[1:]
+         assert history_length <= self.max_length, (
+             f"History length {history_length} exceeds maximum allowed {self.max_length}."
+         )
+
+         # 1. Check the cache, loading or generating patterns if the shape is new
+         cache_key = (self.max_length, num_channels)
+         if cache_key not in self.pattern_cache:
+             target_file = self._get_pattern_file_path(num_channels)
+             patterns = None
+             if target_file.exists():
+                 try:
+                     patterns = torch.load(target_file, map_location="cpu")
+                     logger.info(
+                         f"Loaded NaN patterns from {target_file} for shape {cache_key}"
+                     )
+                 except (RuntimeError, FileNotFoundError):
+                     # Fall back to generating if loading fails
+                     patterns = None
+             if patterns is None:
+                 patterns = self._pregenerate_patterns((self.max_length, num_channels))
+                 torch.save(patterns, target_file)
+                 logger.info(
+                     f"Generated and saved {patterns.shape[0]} NaN patterns to {target_file}"
+                 )
+             self.pattern_cache[cache_key] = patterns
+         patterns = self.pattern_cache[cache_key][:, :history_length, :]
+
+         # Early exit if patterns are empty (e.g., generation failed or was disabled)
+         if patterns.numel() == 0:
+             return time_series_batch
+
+         batch_size = time_series_batch.shape[0]
+         device = time_series_batch.device
+
+         # 2. Vectorized decision on which series to augment
+         augment_mask = torch.rand(batch_size, device=device) < self.p_series_has_nan
+         indices_to_augment = torch.where(augment_mask)[0]
+         num_to_augment = indices_to_augment.numel()
+
+         if num_to_augment == 0:
+             return time_series_batch
+
+         # 3. Randomly sample patterns for each series being augmented
+         pattern_indices = torch.randint(
+             0, patterns.shape[0], (num_to_augment,), device=device
+         )
+         # 4. Select patterns and apply them in a single vectorized operation
+         # (this writes NaNs into the input tensor in place)
+         selected_patterns = patterns[pattern_indices].to(device)
+
+         time_series_batch[indices_to_augment] = time_series_batch[
+             indices_to_augment
+         ].masked_fill(selected_patterns, float("nan"))
+
+         return time_series_batch
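
A minimal usage sketch; the statistics dict below is toy data, not the output of a real analysis run, and `num_patterns` is kept small for illustration:

import torch
from collections import Counter

augmenter = NanAugmenter(
    p_series_has_nan=0.2,
    nan_ratio_distribution=[0.05, 0.1],               # toy ratios
    nan_length_distribution=Counter({1: 50, 5: 10}),  # toy block lengths
    num_patterns=100,
)
batch = torch.randn(8, 512, 1)
augmented = augmenter.transform(batch.clone())  # clone(): transform mutates in place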
+
+
+ class CensorAugmenter:
+     """
+     Applies censor augmentation by clipping values from above, below, or both.
+     """
+
+     def __init__(self):
+         """Initializes the CensorAugmenter."""
+         pass
+
+     def transform(self, time_series_batch: torch.Tensor) -> torch.Tensor:
+         """
+         Applies a vectorized censor augmentation to a batch of time series.
+         """
+         batch_size, seq_len, num_channels = time_series_batch.shape
+         assert num_channels == 1
+         time_series_batch = time_series_batch.squeeze(-1)
+         with torch.no_grad():
+             batch_size, seq_len = time_series_batch.shape
+             device = time_series_batch.device
+
+             # Step 1: Choose an op mode for each series
+             # (0: leave unchanged, 1: clip from above, 2: clip from below)
+             op_mode = torch.randint(0, 3, (batch_size, 1), device=device)
+
+             # Step 2: Calculate potential thresholds for all series
+             q1 = torch.rand(batch_size, device=device)
+             q2 = torch.rand(batch_size, device=device)
+             q_low = torch.minimum(q1, q2)
+             q_high = torch.maximum(q1, q2)
+
+             sorted_series = torch.sort(time_series_batch, dim=1).values
+             indices_low = (q_low * (seq_len - 1)).long()
+             indices_high = (q_high * (seq_len - 1)).long()
+
+             c_low = torch.gather(sorted_series, 1, indices_low.unsqueeze(1))
+             c_high = torch.gather(sorted_series, 1, indices_high.unsqueeze(1))
+
+             # Step 3: Compute results for all possible clipping operations
+             clip_above = torch.minimum(time_series_batch, c_high)
+             clip_below = torch.maximum(time_series_batch, c_low)
+
+             # Step 4: Select the final result based on the op_mode
+             # (the innermost branch already returns the unchanged series for mode 0)
+             augmented_batch = torch.where(
+                 op_mode == 1,
+                 clip_above,
+                 torch.where(op_mode == 2, clip_below, time_series_batch),
+             )
+
+             return augmented_batch.unsqueeze(-1)
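
For intuition, clipping at an empirical quantile looks like this (a standalone sketch with a made-up series and threshold):

import torch

x = torch.tensor([[-2.0, -1.0, 0.0, 1.0, 2.0]])
c_high = torch.quantile(x, 0.8, dim=1, keepdim=True)  # ~1.2 for this series
print(torch.minimum(x, c_high))  # values above the 0.8-quantile are censored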
+
+
+ class QuantizationAugmenter:
+     """
+     Applies non-equidistant quantization using a Sobol sequence to generate
+     uniformly distributed levels. This implementation is fully vectorized.
+     """
+
+     def __init__(
+         self,
+         p_quantize: float,
+         level_range: Tuple[int, int],
+         seed: Optional[int] = None,
+     ):
+         """
+         Initializes the augmenter.
+
+         Args:
+             p_quantize (float): Probability of applying quantization to a series.
+             level_range (Tuple[int, int]): Inclusive range [min, max] to sample the
+                 number of quantization levels from.
+             seed (Optional[int]): Seed for the Sobol sequence generator for reproducibility.
+         """
+         assert 0.0 <= p_quantize <= 1.0, "Probability must be between 0 and 1."
+         assert level_range[0] >= 2, "Minimum number of levels must be at least 2."
+         assert level_range[0] <= level_range[1], (
+             "Min levels cannot be greater than max."
+         )
+
+         self.p_quantize = p_quantize
+         self.level_range = level_range
+
+         # Initialize a SobolEngine. The dimension is the max number of random
+         # levels we might need to generate for a single series.
+         max_intermediate_levels = self.level_range[1] - 2
+         if max_intermediate_levels > 0:
+             # SobolEngine must be created on CPU
+             self.sobol_engine = SobolEngine(
+                 dimension=max_intermediate_levels, scramble=True, seed=seed
+             )
+         else:
+             self.sobol_engine = None
+
+     def transform(self, time_series_batch: torch.Tensor) -> torch.Tensor:
+         """
+         Applies augmentation in a fully vectorized way on the batch's device.
+         Handles input shape (batch, length, 1).
+         """
+         # Handle input shape (batch, length, 1)
+         if time_series_batch.dim() == 3 and time_series_batch.shape[2] == 1:
+             is_3d = True
+             time_series_squeezed = time_series_batch.squeeze(-1)
+         else:
+             is_3d = False
+             time_series_squeezed = time_series_batch
+
+         if self.p_quantize == 0 or self.sobol_engine is None:
+             return time_series_batch
+
+         n_series, _ = time_series_squeezed.shape
+         device = time_series_squeezed.device
+
+         # 1. Decide which series to augment
+         augment_mask = torch.rand(n_series, device=device) < self.p_quantize
+         n_augment = int(augment_mask.sum().item())
+         if n_augment == 0:
+             return time_series_batch
+
+         series_to_augment = time_series_squeezed[augment_mask]
+
+         # 2. Determine a variable n_levels for EACH series
+         min_l, max_l = self.level_range
+         n_levels_per_series = torch.randint(
+             min_l, max_l + 1, size=(n_augment,), device=device
+         )
+         max_levels_in_batch = n_levels_per_series.max().item()
+
+         # 3. Find min/max for each series
+         min_vals = torch.amin(series_to_augment, dim=1, keepdim=True)
+         max_vals = torch.amax(series_to_augment, dim=1, keepdim=True)
+         value_range = max_vals - min_vals
+         is_flat = value_range == 0
+
+         # 4. Generate quasi-random levels using the Sobol sequence
+         num_intermediate_levels = max_levels_in_batch - 2
+         if num_intermediate_levels > 0:
+             # Draw points from the Sobol engine (on CPU) and move to target device
+             sobol_points = self.sobol_engine.draw(n_augment).to(device)
+             # We only need the first `num_intermediate_levels` dimensions
+             quasi_rand_points = sobol_points[:, :num_intermediate_levels]
+         else:
+             # Handle case where max_levels_in_batch is 2 (no intermediate points needed)
+             quasi_rand_points = torch.empty(n_augment, 0, device=device)
+
+         scaled_quasi_rand_levels = min_vals + value_range * quasi_rand_points
+         level_values = torch.cat([min_vals, max_vals, scaled_quasi_rand_levels], dim=1)
+         level_values, _ = torch.sort(level_values, dim=1)
+
+         # 5. Find the closest level using a mask to ignore padded values
+         series_expanded = series_to_augment.unsqueeze(2)
+         levels_expanded = level_values.unsqueeze(1)
+         diff = torch.abs(series_expanded - levels_expanded)
+
+         arange_mask = torch.arange(max_levels_in_batch, device=device).unsqueeze(0)
+         valid_levels_mask = arange_mask < n_levels_per_series.unsqueeze(1)
+         masked_diff = torch.where(valid_levels_mask.unsqueeze(1), diff, float("inf"))
+         closest_level_indices = torch.argmin(masked_diff, dim=2)
+
+         # 6. Gather the results from the original level values
+         quantized_subset = torch.gather(level_values, 1, closest_level_indices)
+
+         # 7. For flat series, revert to their original values
+         final_subset = torch.where(is_flat, series_to_augment, quantized_subset)
+
+         # 8. Place augmented data back into a copy of the original batch
+         augmented_batch_squeezed = time_series_squeezed.clone()
+         augmented_batch_squeezed[augment_mask] = final_subset
+
+         # Restore original shape before returning
+         if is_3d:
+             return augmented_batch_squeezed.unsqueeze(-1)
+         else:
+             return augmented_batch_squeezed
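
A small standalone check of the behaviour (the input is an illustrative ramp; with `p_quantize=1.0` every series is quantized):

import torch

aug = QuantizationAugmenter(p_quantize=1.0, level_range=(3, 5), seed=0)
x = torch.linspace(0, 1, 32).reshape(1, 32, 1)
y = aug.transform(x)
print(torch.unique(y).numel())  # prints a small number of distinct levels (at most 5)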
+
+
+ class MixUpAugmenter:
+     """
+     Applies mixup augmentation by creating a weighted average of multiple time series.
+
+     This version includes an option for time-dependent mixup using Simplex Path
+     Interpolation, creating a smooth transition between different mixing weights.
+     """
+
+     def __init__(
+         self,
+         max_n_series_to_combine: int = 10,
+         p_combine: float = 0.4,
+         p_time_dependent: float = 0.5,
+         randomize_k_per_series: bool = True,
+         dirichlet_alpha_range: Tuple[float, float] = (0.1, 5.0),
+     ):
+         """
+         Initializes the augmenter.
+
+         Args:
+             max_n_series_to_combine (int): The maximum number of series to combine.
+                 The actual number k will be sampled from [2, max].
+             p_combine (float): The probability of replacing a series with a combination.
+             p_time_dependent (float): The probability of using the time-dependent
+                 simplex path method for a given mixup operation. Defaults to 0.5.
+             randomize_k_per_series (bool): If True, each augmented series will be a
+                 combination of a different number of series (k).
+                 If False, one k is chosen for the whole batch.
+             dirichlet_alpha_range (Tuple[float, float]): The [min, max] range to sample the
+                 Dirichlet 'alpha' from. A smaller alpha (e.g., 0.2) creates mixes
+                 dominated by one series. A larger alpha (e.g., 5.0) creates
+                 more uniform weights.
+         """
+         assert max_n_series_to_combine >= 2, "Must combine at least 2 series."
+         assert 0.0 <= p_combine <= 1.0, "p_combine must be between 0 and 1."
+         assert 0.0 <= p_time_dependent <= 1.0, (
+             "p_time_dependent must be between 0 and 1."
+         )
+         assert (
+             dirichlet_alpha_range[0] > 0
+             and dirichlet_alpha_range[0] <= dirichlet_alpha_range[1]
+         )
+         self.max_k = max_n_series_to_combine
+         self.p_combine = p_combine
+         self.p_time_dependent = p_time_dependent
+         self.randomize_k = randomize_k_per_series
+         self.alpha_range = dirichlet_alpha_range
+
+     def _sample_alpha(self) -> float:
+         log_alpha_min = math.log10(self.alpha_range[0])
+         log_alpha_max = math.log10(self.alpha_range[1])
+         log_alpha = log_alpha_min + np.random.rand() * (log_alpha_max - log_alpha_min)
+         return float(10**log_alpha)
+
+     def _sample_k(self) -> int:
+         return int(torch.randint(2, self.max_k + 1, (1,)).item())
+
+     def _static_mix(
+         self,
+         source_series: torch.Tensor,
+         alpha: float,
+         return_weights: bool = False,
+     ):
+         """Mixes k source series using a single, static set of Dirichlet weights."""
+         k = int(source_series.shape[0])
+         device = source_series.device
+         concentration = torch.full((k,), float(alpha), device=device)
+         weights = torch.distributions.Dirichlet(concentration).sample()
+         weights_view = weights.view(k, 1, 1)
+         mixed_series = (source_series * weights_view).sum(dim=0, keepdim=True)
+         if return_weights:
+             return mixed_series, weights
+         return mixed_series
+
+     def _simplex_path_mix(
+         self,
+         source_series: torch.Tensor,
+         alpha: float,
+         return_weights: bool = False,
+     ):
+         """Mixes k series using time-varying weights interpolated along a simplex path."""
+         k, length, _ = source_series.shape
+         device = source_series.device
+
+         # 1. Sample two endpoint weight vectors from the Dirichlet distribution
+         concentration = torch.full((k,), float(alpha), device=device)
+         dirichlet_dist = torch.distributions.Dirichlet(concentration)
+         w_start = dirichlet_dist.sample()
+         w_end = dirichlet_dist.sample()
+
+         # 2. Create a linear ramp from 0 to 1
+         alpha_ramp = torch.linspace(0, 1, length, device=device)
+
+         # 3. Interpolate between the endpoint weights over time
+         # Reshape for broadcasting: w vectors become [k, 1], ramp becomes [1, length]
+         time_varying_weights = w_start.unsqueeze(1) * (
+             1 - alpha_ramp.unsqueeze(0)
+         ) + w_end.unsqueeze(1) * alpha_ramp.unsqueeze(0)
+         # The result `time_varying_weights` has shape [k, length]
+
+         # 4. Apply the time-varying weights
+         weights_view = time_varying_weights.unsqueeze(-1)  # Shape: [k, length, 1]
+         mixed_series = (source_series * weights_view).sum(dim=0, keepdim=True)
+
+         if return_weights:
+             return mixed_series, time_varying_weights
+         return mixed_series
+
+     def transform(
+         self, time_series_batch: torch.Tensor, return_debug_info: bool = False
+     ):
+         """
+         Applies the mixup augmentation, randomly choosing between static and
+         time-dependent mixing methods.
+         """
+         with torch.no_grad():
+             if self.p_combine == 0:
+                 return (
+                     (time_series_batch, {}) if return_debug_info else time_series_batch
+                 )
+
+             batch_size, _, _ = time_series_batch.shape
+             device = time_series_batch.device
+
+             if batch_size <= self.max_k:
+                 return (
+                     (time_series_batch, {}) if return_debug_info else time_series_batch
+                 )
+
+             # 1. Decide which series to replace
+             augment_mask = torch.rand(batch_size, device=device) < self.p_combine
+             indices_to_replace = torch.where(augment_mask)[0]
+             n_augment = indices_to_replace.numel()
+
+             if n_augment == 0:
+                 return (
+                     (time_series_batch, {}) if return_debug_info else time_series_batch
+                 )
+
+             # 2. Determine k for each series to augment
+             if self.randomize_k:
+                 k_values = torch.randint(2, self.max_k + 1, (n_augment,), device=device)
+             else:
+                 k = self._sample_k()
+                 k_values = torch.full((n_augment,), k, device=device)
+
+             # 3. Augment series one by one
+             new_series_list = []
+             all_batch_indices = torch.arange(batch_size, device=device)
+             debug_info = {}
+
+             for i, target_idx in enumerate(indices_to_replace):
+                 current_k = k_values[i].item()
+
+                 # Sample source indices
+                 candidate_mask = all_batch_indices != target_idx
+                 candidates = all_batch_indices[candidate_mask]
+                 perm = torch.randperm(candidates.shape[0], device=device)
+                 source_indices = candidates[perm[:current_k]]
+                 source_series = time_series_batch[source_indices]
+
+                 alpha = self._sample_alpha()
+                 mix_type = "static"
+
+                 # Randomly choose between static and time-dependent mixup
+                 if torch.rand(1).item() < self.p_time_dependent:
+                     mixed_series, weights = self._simplex_path_mix(
+                         source_series, alpha=alpha, return_weights=True
+                     )
+                     mix_type = "simplex"
+                 else:
+                     mixed_series, weights = self._static_mix(
+                         source_series, alpha=alpha, return_weights=True
+                     )
+
+                 new_series_list.append(mixed_series)
+
+                 if return_debug_info:
+                     debug_info[target_idx.item()] = {
+                         "source_indices": source_indices.cpu().numpy(),
+                         "weights": weights.cpu().numpy(),
+                         "alpha": alpha,
+                         "k": current_k,
+                         "mix_type": mix_type,
+                     }
+
+             # 4. Place augmented series back into a clone of the original batch
+             augmented_batch = time_series_batch.clone()
+             if new_series_list:
+                 new_series_tensor = torch.cat(new_series_list, dim=0)
+                 augmented_batch[indices_to_replace] = new_series_tensor
+
+             if return_debug_info:
+                 return augmented_batch.detach(), debug_info
+             return augmented_batch.detach()
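
To see what the simplex path produces, a standalone sketch of the interpolation in step 3 above (the endpoint weights are toy values standing in for Dirichlet draws with k = 2):

import torch

w_start = torch.tensor([0.7, 0.3])
w_end = torch.tensor([0.1, 0.9])
ramp = torch.linspace(0, 1, 5)
w_t = w_start.unsqueeze(1) * (1 - ramp) + w_end.unsqueeze(1) * ramp  # shape [2, 5]
print(w_t.sum(dim=0))  # each time step's weights still sum to 1

Because a convex combination of two points on the probability simplex stays on the simplex, the mixed series remains a proper weighted average at every time step.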
+
+
+ class TimeFlipAugmenter:
+     """
+     Applies time-reversal augmentation to a random subset of time series in a batch.
+     """
+
+     def __init__(self, p_flip: float = 0.5):
+         """
+         Initializes the TimeFlipAugmenter.
+
+         Args:
+             p_flip (float): The probability of flipping a single time series in the batch.
+                 Defaults to 0.5.
+         """
+         assert 0.0 <= p_flip <= 1.0, "Probability must be between 0 and 1."
+         self.p_flip = p_flip
+
+     def transform(self, time_series_batch: torch.Tensor) -> torch.Tensor:
+         """
+         Applies time-reversal augmentation to a batch of time series.
+
+         Args:
+             time_series_batch (torch.Tensor): The input batch of time series with
+                 shape (batch_size, seq_len, num_channels).
+
+         Returns:
+             torch.Tensor: The batch with some series potentially flipped.
+         """
+         with torch.no_grad():
+             if self.p_flip == 0:
+                 return time_series_batch
+
+             batch_size = time_series_batch.shape[0]
+             device = time_series_batch.device
+
+             # 1. Decide which series in the batch to flip
+             flip_mask = torch.rand(batch_size, device=device) < self.p_flip
+             indices_to_flip = torch.where(flip_mask)[0]
+
+             if indices_to_flip.numel() == 0:
+                 return time_series_batch
+
+             # 2. Select the series to be flipped
+             series_to_flip = time_series_batch[indices_to_flip]
+
+             # 3. Flip them along the time dimension (dim=1)
+             flipped_series = torch.flip(series_to_flip, dims=[1])
+
+             # 4. Create a copy of the batch and place the flipped series into it
+             augmented_batch = time_series_batch.clone()
+             augmented_batch[indices_to_flip] = flipped_series
+
+             return augmented_batch
+
+
+ class YFlipAugmenter:
+     """
+     Applies y-axis reflection (sign flip) to a random subset of time series in a batch.
+     """
+
+     def __init__(self, p_flip: float = 0.5):
+         """
+         Initializes the YFlipAugmenter.
+
+         Args:
+             p_flip (float): The probability of flipping a single time series in the batch.
+                 Defaults to 0.5.
+         """
+         assert 0.0 <= p_flip <= 1.0, "Probability must be between 0 and 1."
+         self.p_flip = p_flip
+
+     def transform(self, time_series_batch: torch.Tensor) -> torch.Tensor:
+         """
+         Applies y-axis reflection to a batch of time series.
+
+         Args:
+             time_series_batch (torch.Tensor): The input batch of time series with
+                 shape (batch_size, seq_len, num_channels).
+
+         Returns:
+             torch.Tensor: The batch with some series potentially flipped.
+         """
+         with torch.no_grad():
+             if self.p_flip == 0:
+                 return time_series_batch
+
+             batch_size = time_series_batch.shape[0]
+             device = time_series_batch.device
+
+             # 1. Decide which series in the batch to flip
+             flip_mask = torch.rand(batch_size, device=device) < self.p_flip
+             indices_to_flip = torch.where(flip_mask)[0]
+
+             if indices_to_flip.numel() == 0:
+                 return time_series_batch
+
+             # 2. Select the series to be flipped
+             series_to_flip = time_series_batch[indices_to_flip]
+
+             # 3. Negate the values (reflection across the time axis)
+             flipped_series = -series_to_flip
+
+             # 4. Create a copy of the batch and place the flipped series into it
+             augmented_batch = time_series_batch.clone()
+             augmented_batch[indices_to_flip] = flipped_series
+
+             return augmented_batch
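
A standalone sketch contrasting the two flips on a toy series:

import torch

x = torch.tensor([[1.0, 2.0, 3.0]]).unsqueeze(-1)  # shape [1, 3, 1]
print(torch.flip(x, dims=[1]).squeeze(-1))  # time flip: [[3., 2., 1.]]
print((-x).squeeze(-1))                     # y flip:    [[-1., -2., -3.]]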
+
+
+ class DifferentialAugmenter:
+     """
+     Applies calculus-inspired augmentations. This version includes up to the
+     fourth derivative and uses nn.Conv1d with built-in 'reflect' padding for
+     cleaner and more efficient convolutions.
+
+     The Gaussian kernel size and sigma for the initial smoothing are randomly
+     sampled at every transform() call from user-defined ranges.
+     """
+
+     def __init__(
+         self,
+         p_transform: float,
+         gaussian_kernel_size_range: Tuple[int, int] = (5, 51),
+         gaussian_sigma_range: Tuple[float, float] = (2.0, 20.0),
+     ):
+         """
+         Initializes the augmenter.
+
+         Args:
+             p_transform (float): The probability of applying an augmentation to any given
+                 time series in a batch.
+             gaussian_kernel_size_range (Tuple[int, int]): The [min, max] inclusive range
+                 for the Gaussian kernel size. Sizes will be forced to be odd.
+             gaussian_sigma_range (Tuple[float, float]): The [min, max] inclusive range
+                 for the Gaussian sigma.
+         """
+         self.p_transform = p_transform
+         self.kernel_size_range = gaussian_kernel_size_range
+         self.sigma_range = gaussian_sigma_range
+
+         # Validate ranges
+         if not (
+             self.kernel_size_range[0] <= self.kernel_size_range[1]
+             and self.kernel_size_range[0] >= 3
+         ):
+             raise ValueError(
+                 "Invalid kernel size range. Ensure min <= max and min >= 3."
+             )
+         if not (self.sigma_range[0] <= self.sigma_range[1] and self.sigma_range[0] > 0):
+             raise ValueError("Invalid sigma range. Ensure min <= max and min > 0.")
+
+         # Cache for fixed-kernel convolution layers (Sobel, Laplace, etc.)
+         self.conv_cache: Dict[Tuple[int, torch.device], Dict[str, nn.Module]] = {}
+
+     def _create_fixed_kernel_layers(
+         self, num_channels: int, device: torch.device
+     ) -> dict:
+         """
+         Creates and configures nn.Conv1d layers for fixed-kernel derivative operations.
+         These layers are cached to improve performance.
+         """
+         sobel_conv = nn.Conv1d(
+             in_channels=num_channels,
+             out_channels=num_channels,
+             kernel_size=3,
+             padding="same",
+             padding_mode="reflect",
+             groups=num_channels,
+             bias=False,
+             device=device,
+         )
+         laplace_conv = nn.Conv1d(
+             in_channels=num_channels,
+             out_channels=num_channels,
+             kernel_size=3,
+             padding="same",
+             padding_mode="reflect",
+             groups=num_channels,
+             bias=False,
+             device=device,
+         )
+         d3_conv = nn.Conv1d(
+             in_channels=num_channels,
+             out_channels=num_channels,
+             kernel_size=5,
+             padding="same",
+             padding_mode="reflect",
+             groups=num_channels,
+             bias=False,
+             device=device,
+         )
+         d4_conv = nn.Conv1d(
+             in_channels=num_channels,
+             out_channels=num_channels,
+             kernel_size=5,
+             padding="same",
+             padding_mode="reflect",
+             groups=num_channels,
+             bias=False,
+             device=device,
+         )
+
+         sobel_kernel = (
+             torch.tensor([-1, 0, 1], device=device, dtype=torch.float32)
+             .view(1, 1, -1)
+             .repeat(num_channels, 1, 1)
+         )
+         laplace_kernel = (
+             torch.tensor([1, -2, 1], device=device, dtype=torch.float32)
+             .view(1, 1, -1)
+             .repeat(num_channels, 1, 1)
+         )
+         d3_kernel = (
+             torch.tensor([-1, 2, 0, -2, 1], device=device, dtype=torch.float32)
+             .view(1, 1, -1)
+             .repeat(num_channels, 1, 1)
+         )
+         d4_kernel = (
+             torch.tensor([1, -4, 6, -4, 1], device=device, dtype=torch.float32)
+             .view(1, 1, -1)
+             .repeat(num_channels, 1, 1)
+         )
+
+         sobel_conv.weight.data = sobel_kernel
+         laplace_conv.weight.data = laplace_kernel
+         d3_conv.weight.data = d3_kernel
+         d4_conv.weight.data = d4_kernel
+
+         for layer in [sobel_conv, laplace_conv, d3_conv, d4_conv]:
+             layer.weight.requires_grad = False
+
+         return {
+             "sobel": sobel_conv,
+             "laplace": laplace_conv,
+             "d3": d3_conv,
+             "d4": d4_conv,
+         }
+
+     def _create_gaussian_layer(
+         self, kernel_size: int, sigma: float, num_channels: int, device: torch.device
+     ) -> nn.Module:
+         """Creates a single Gaussian convolution layer with the given dynamic parameters."""
+         gauss_conv = nn.Conv1d(
+             in_channels=num_channels,
+             out_channels=num_channels,
+             kernel_size=kernel_size,
+             padding="same",
+             padding_mode="reflect",
+             groups=num_channels,
+             bias=False,
+             device=device,
+         )
+         ax = torch.arange(
+             -(kernel_size // 2),
+             kernel_size // 2 + 1,
+             device=device,
+             dtype=torch.float32,
+         )
+         gauss_kernel = torch.exp(-0.5 * (ax / sigma) ** 2)
+         gauss_kernel /= gauss_kernel.sum()
+         gauss_kernel = gauss_kernel.view(1, 1, -1).repeat(num_channels, 1, 1)
+         gauss_conv.weight.data = gauss_kernel
+         gauss_conv.weight.requires_grad = False
+         return gauss_conv
+
+     def _rescale_signal(
+         self, processed_signal: torch.Tensor, original_signal: torch.Tensor
+     ) -> torch.Tensor:
+         """Rescales the processed signal to match the min/max range of the original."""
+         original_min = torch.amin(original_signal, dim=2, keepdim=True)
+         original_max = torch.amax(original_signal, dim=2, keepdim=True)
+         processed_min = torch.amin(processed_signal, dim=2, keepdim=True)
+         processed_max = torch.amax(processed_signal, dim=2, keepdim=True)
+
+         original_range = original_max - original_min
+         processed_range = processed_max - processed_min
+         epsilon = 1e-8
+         rescaled_signal = (
+             (processed_signal - processed_min) / (processed_range + epsilon)
+         ) * original_range + original_min
+         return torch.where(original_range < epsilon, original_signal, rescaled_signal)
+
+     def transform(self, time_series_batch: torch.Tensor) -> torch.Tensor:
+         """Applies a random augmentation to a subset of the batch."""
+         with torch.no_grad():
+             if self.p_transform == 0:
+                 return time_series_batch
+
+             batch_size, seq_len, num_channels = time_series_batch.shape
+             device = time_series_batch.device
+
+             augment_mask = torch.rand(batch_size, device=device) < self.p_transform
+             indices_to_augment = torch.where(augment_mask)[0]
+             num_to_augment = indices_to_augment.numel()
+
+             if num_to_augment == 0:
+                 return time_series_batch
+
+             # --- 🎲 Randomly sample Gaussian parameters for this call ---
+             min_k, max_k = self.kernel_size_range
+             kernel_size = torch.randint(min_k, max_k + 1, (1,)).item()
+             kernel_size = kernel_size // 2 * 2 + 1  # Ensure kernel size is odd
+
+             min_s, max_s = self.sigma_range
+             sigma = (min_s + (max_s - min_s) * torch.rand(1)).item()
+
+             # --- Get/Create Convolution Layers ---
+             gauss_conv = self._create_gaussian_layer(
+                 kernel_size, sigma, num_channels, device
+             )
+
+             cache_key = (num_channels, device)
+             if cache_key not in self.conv_cache:
+                 self.conv_cache[cache_key] = self._create_fixed_kernel_layers(
+                     num_channels, device
+                 )
+             fixed_layers = self.conv_cache[cache_key]
+
+             # --- Apply Augmentations ---
+             subset_to_augment = time_series_batch[indices_to_augment]
+             subset_permuted = subset_to_augment.permute(0, 2, 1)
+
+             op_choices = torch.randint(0, 6, (num_to_augment,), device=device)
+
+             smoothed_subset = gauss_conv(subset_permuted)
+             sobel_on_smoothed = fixed_layers["sobel"](smoothed_subset)
+             laplace_on_smoothed = fixed_layers["laplace"](smoothed_subset)
+             d3_on_smoothed = fixed_layers["d3"](smoothed_subset)
+             d4_on_smoothed = fixed_layers["d4"](smoothed_subset)
+
+             gauss_result = self._rescale_signal(smoothed_subset, subset_permuted)
+             sobel_result = self._rescale_signal(sobel_on_smoothed, subset_permuted)
+             laplace_result = self._rescale_signal(laplace_on_smoothed, subset_permuted)
+             d3_result = self._rescale_signal(d3_on_smoothed, subset_permuted)
+             d4_result = self._rescale_signal(d4_on_smoothed, subset_permuted)
+
+             use_right_integral = torch.rand(num_to_augment, 1, 1, device=device) > 0.5
+             flipped_subset = torch.flip(subset_permuted, dims=[2])
+             right_integral = torch.flip(torch.cumsum(flipped_subset, dim=2), dims=[2])
+             left_integral = torch.cumsum(subset_permuted, dim=2)
+             integral_result = torch.where(
+                 use_right_integral, right_integral, left_integral
+             )
+             integral_result_normalized = self._rescale_signal(
+                 integral_result, subset_permuted
+             )
+
+             # --- Assemble the results based on op_choices ---
+             op_choices_view = op_choices.view(-1, 1, 1)
+             augmented_subset = torch.where(
+                 op_choices_view == 0, gauss_result, subset_permuted
+             )
+             augmented_subset = torch.where(
+                 op_choices_view == 1, sobel_result, augmented_subset
+             )
+             augmented_subset = torch.where(
+                 op_choices_view == 2, laplace_result, augmented_subset
+             )
+             augmented_subset = torch.where(
+                 op_choices_view == 3, integral_result_normalized, augmented_subset
+             )
+             augmented_subset = torch.where(
+                 op_choices_view == 4, d3_result, augmented_subset
+             )
+             augmented_subset = torch.where(
+                 op_choices_view == 5, d4_result, augmented_subset
+             )
+
+             augmented_subset_final = augmented_subset.permute(0, 2, 1)
+             augmented_batch = time_series_batch.clone()
+             augmented_batch[indices_to_augment] = augmented_subset_final
+
+             return augmented_batch
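
For intuition on the fixed kernels above, the central-difference kernel [-1, 0, 1] approximates a first derivative. A standalone sketch using the same reflect-padding idea with a plain functional conv (the ramp input is made up):

import torch
import torch.nn.functional as F

x = torch.linspace(0, 1, 8).view(1, 1, -1)          # a linear ramp
kernel = torch.tensor([-1.0, 0.0, 1.0]).view(1, 1, 3)
dx = F.conv1d(F.pad(x, (1, 1), mode="reflect"), kernel)
print(dx)  # dx[i] = x[i+1] - x[i-1]: roughly constant in the interior, as expected for a ramp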
+
+
+ class RandomConvAugmenter:
+     """
+     Applies a stack of 1-to-N random 1D convolutions to a time series batch.
+
+     This augmenter is inspired by the principles of ROCKET and RandConv,
+     randomizing nearly every aspect of the convolution process to create a
+     highly diverse set of transformations. This version includes multiple
+     kernel generation strategies, random padding modes, and optional non-linearities.
+     """
+
+     def __init__(
+         self,
+         p_transform: float = 0.5,
+         kernel_size_range: Tuple[int, int] = (3, 31),
+         dilation_range: Tuple[int, int] = (1, 8),
+         layer_range: Tuple[int, int] = (1, 3),
+         sigma_range: Tuple[float, float] = (0.5, 5.0),
+         bias_range: Tuple[float, float] = (-0.5, 0.5),
+     ):
+         """
+         Initializes the augmenter.
+
+         Args:
+             p_transform (float): Probability of applying the augmentation to a series.
+             kernel_size_range (Tuple[int, int]): [min, max] range for kernel sizes.
+                 Must be odd numbers.
+             dilation_range (Tuple[int, int]): [min, max] range for dilation factors.
+             layer_range (Tuple[int, int]): [min, max] range for the number of
+                 stacked convolution layers.
+             sigma_range (Tuple[float, float]): [min, max] range for the sigma of
+                 Gaussian kernels.
+             bias_range (Tuple[float, float]): [min, max] range for the bias term.
+         """
+         assert kernel_size_range[0] % 2 == 1 and kernel_size_range[1] % 2 == 1, (
+             "Kernel sizes must be odd."
+         )
+
+         self.p_transform = p_transform
+         self.kernel_size_range = kernel_size_range
+         self.dilation_range = dilation_range
+         self.layer_range = layer_range
+         self.sigma_range = sigma_range
+         self.bias_range = bias_range
+         self.padding_modes = ["reflect", "replicate", "circular"]
+
+     def _rescale_signal(
+         self, processed_signal: torch.Tensor, original_signal: torch.Tensor
+     ) -> torch.Tensor:
+         """Rescales the processed signal to match the min/max range of the original."""
+         original_min = torch.amin(original_signal, dim=-1, keepdim=True)
+         original_max = torch.amax(original_signal, dim=-1, keepdim=True)
+         processed_min = torch.amin(processed_signal, dim=-1, keepdim=True)
+         processed_max = torch.amax(processed_signal, dim=-1, keepdim=True)
+
+         original_range = original_max - original_min
+         processed_range = processed_max - processed_min
+         epsilon = 1e-8
+
+         is_flat = processed_range < epsilon
+
+         rescaled_signal = (
+             (processed_signal - processed_min) / (processed_range + epsilon)
+         ) * original_range + original_min
+
+         original_mean = torch.mean(original_signal, dim=-1, keepdim=True)
+         flat_rescaled = original_mean.expand_as(original_signal)
+
+         return torch.where(is_flat, flat_rescaled, rescaled_signal)
+
+     def _apply_random_conv_stack(self, series: torch.Tensor) -> torch.Tensor:
+         """
+         Applies a randomly configured stack of convolutions to a single time series.
+
+         Args:
+             series (torch.Tensor): A single time series of shape (1, num_channels, seq_len).
+
+         Returns:
+             torch.Tensor: The augmented time series.
+         """
+         num_channels = series.shape[1]
+         device = series.device
+
+         num_layers = torch.randint(
+             self.layer_range[0], self.layer_range[1] + 1, (1,)
+         ).item()
+
+         processed_series = series
+         for i in range(num_layers):
+             # 1. Sample kernel size
+             k_min, k_max = self.kernel_size_range
+             kernel_size = torch.randint(k_min // 2, k_max // 2 + 1, (1,)).item() * 2 + 1
+
+             # 2. Sample dilation
+             d_min, d_max = self.dilation_range
+             dilation = torch.randint(d_min, d_max + 1, (1,)).item()
+
+             # 3. Sample bias
+             b_min, b_max = self.bias_range
+             bias_val = (b_min + (b_max - b_min) * torch.rand(1)).item()
+
+             # 4. Sample padding mode
+             padding_mode = np.random.choice(self.padding_modes)
+
+             conv_layer = nn.Conv1d(
+                 in_channels=num_channels,
+                 out_channels=num_channels,
+                 kernel_size=kernel_size,
+                 dilation=dilation,
+                 padding="same",  # Let PyTorch handle padding calculation
+                 padding_mode=padding_mode,
+                 groups=num_channels,
+                 bias=True,
+                 device=device,
+             )
+
+             # 5. Sample kernel weights from a wider variety of types
+             weight_type = torch.randint(0, 4, (1,)).item()
+             if weight_type == 0:  # Gaussian kernel
+                 s_min, s_max = self.sigma_range
+                 sigma = (s_min + (s_max - s_min) * torch.rand(1)).item()
+                 ax = torch.arange(
+                     -(kernel_size // 2),
+                     kernel_size // 2 + 1,
+                     device=device,
+                     dtype=torch.float32,
+                 )
+                 kernel = torch.exp(-0.5 * (ax / sigma) ** 2)
+             elif weight_type == 1:  # Standard normal kernel
+                 kernel = torch.randn(kernel_size, device=device)
+             elif weight_type == 2:  # Polynomial kernel
+                 coeffs = torch.randn(3, device=device)  # a, b, c for ax^2+bx+c
+                 x_vals = torch.linspace(-1, 1, kernel_size, device=device)
+                 kernel = coeffs[0] * x_vals**2 + coeffs[1] * x_vals + coeffs[2]
+             else:  # Noisy Sobel kernel
+                 # Ensure kernel is large enough for a Sobel filter
+                 actual_kernel_size = 3 if kernel_size < 3 else kernel_size
+ sobel_base = torch.tensor(
1245
+ [-1, 0, 1], dtype=torch.float32, device=device
1246
+ )
1247
+ noise = torch.randn(3, device=device) * 0.1
1248
+ noisy_sobel = sobel_base + noise
1249
+ # Pad if the random kernel size is larger than 3
1250
+ pad_total = actual_kernel_size - 3
1251
+ pad_left = pad_total // 2
1252
+ pad_right = pad_total - pad_left
1253
+ kernel = F.pad(noisy_sobel, (pad_left, pad_right), "constant", 0)
1254
+
1255
+ # 6. Probabilistic normalization
1256
+ if torch.rand(1).item() < 0.8: # 80% chance to normalize
1257
+ kernel /= torch.sum(torch.abs(kernel)) + 1e-8
1258
+
1259
+ kernel = kernel.view(1, 1, -1).repeat(num_channels, 1, 1)
1260
+
1261
+ conv_layer.weight.data = kernel
1262
+ conv_layer.bias.data.fill_(bias_val)
1263
+ conv_layer.weight.requires_grad = False
1264
+ conv_layer.bias.requires_grad = False
1265
+
1266
+ # Apply convolution
1267
+ processed_series = conv_layer(processed_series)
1268
+
1269
+ # 7. Optional non-linearity (not on the last layer)
1270
+ if i < num_layers - 1:
1271
+ activation_type = torch.randint(0, 3, (1,)).item()
1272
+ if activation_type == 1:
1273
+ processed_series = F.relu(processed_series)
1274
+ elif activation_type == 2:
1275
+ processed_series = torch.tanh(processed_series)
1276
+ # if 0, do nothing (linear)
1277
+
1278
+ return processed_series
1279
+
1280
+ def transform(self, time_series_batch: torch.Tensor) -> torch.Tensor:
1281
+ """Applies a random augmentation to a subset of the batch."""
1282
+ with torch.no_grad():
1283
+ if self.p_transform == 0:
1284
+ return time_series_batch
1285
+
1286
+ batch_size, seq_len, num_channels = time_series_batch.shape
1287
+ device = time_series_batch.device
1288
+
1289
+ augment_mask = torch.rand(batch_size, device=device) < self.p_transform
1290
+ indices_to_augment = torch.where(augment_mask)[0]
1291
+ num_to_augment = indices_to_augment.numel()
1292
+
1293
+ if num_to_augment == 0:
1294
+ return time_series_batch
1295
+
1296
+ subset_to_augment = time_series_batch[indices_to_augment]
1297
+
1298
+ subset_permuted = subset_to_augment.permute(0, 2, 1)
1299
+
1300
+ augmented_subset_list = []
1301
+ for i in range(num_to_augment):
1302
+ original_series = subset_permuted[i : i + 1]
1303
+ augmented_series = self._apply_random_conv_stack(original_series)
1304
+
1305
+ rescaled_series = self._rescale_signal(
1306
+ augmented_series.squeeze(0), original_series.squeeze(0)
1307
+ )
1308
+ augmented_subset_list.append(rescaled_series.unsqueeze(0))
1309
+
1310
+ if augmented_subset_list:
1311
+ augmented_subset = torch.cat(augmented_subset_list, dim=0)
1312
+ augmented_subset_final = augmented_subset.permute(0, 2, 1)
1313
+
1314
+ augmented_batch = time_series_batch.clone()
1315
+ augmented_batch[indices_to_augment] = augmented_subset_final
1316
+ return augmented_batch
1317
+ else:
1318
+ return time_series_batch
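
A minimal usage sketch for RandomConvAugmenter (the batch shape and parameter values are illustrative, not taken from the training configs):

import torch
from src.data.augmentations import RandomConvAugmenter

augmenter = RandomConvAugmenter(p_transform=0.5, layer_range=(1, 3))
batch = torch.randn(16, 512, 1)  # [batch_size, seq_len, num_channels]
augmented = augmenter.transform(batch)
# Shape is preserved; roughly half of the series are passed through a random
# conv stack and min/max-rescaled back to their original range.
assert augmented.shape == batch.shape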
src/data/batch_composer.py ADDED
@@ -0,0 +1,705 @@
+ import json
+ import logging
+ import random
+ from typing import Any, Dict, Optional, Tuple
+
+ import numpy as np
+ import pandas as pd
+ import torch
+
+ from src.data.augmentations import (
+     NanAugmenter,
+ )
+ from src.data.constants import DEFAULT_NAN_STATS_PATH, LENGTH_CHOICES, LENGTH_WEIGHTS
+ from src.data.containers import BatchTimeSeriesContainer
+ from src.data.datasets import CyclicalBatchDataset
+ from src.data.frequency import Frequency
+ from src.data.scalers import MeanScaler, MedianScaler, MinMaxScaler, RobustScaler
+ from src.data.utils import sample_future_length
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ class BatchComposer:
+     """
+     Composes batches from saved generator data according to specified proportions.
+     Manages multiple CyclicalBatchDataset instances and creates uniform or mixed batches.
+     """
+
+     def __init__(
+         self,
+         base_data_dir: str,
+         generator_proportions: Optional[Dict[str, float]] = None,
+         mixed_batches: bool = True,
+         device: Optional[torch.device] = None,
+         augmentations: Optional[Dict[str, bool]] = None,
+         augmentation_probabilities: Optional[Dict[str, float]] = None,
+         nan_stats_path: Optional[str] = None,
+         nan_patterns_path: Optional[str] = None,
+         global_seed: int = 42,
+         chosen_scaler_name: Optional[str] = None,
+         rank: int = 0,
+         world_size: int = 1,
+     ):
+         """
+         Initialize the BatchComposer.
+
+         Args:
+             base_data_dir: Base directory containing generator subdirectories
+             generator_proportions: Dict mapping generator names to proportions
+             mixed_batches: If True, create mixed batches; if False, uniform batches
+             device: Device to load tensors to
+             augmentations: Dict mapping augmentation names to booleans
+             augmentation_probabilities: Dict mapping augmentation names to probabilities
+             global_seed: Global random seed
+             chosen_scaler_name: Name of the scaler used in training
+             rank: Rank of the current process for distributed data loading
+             world_size: Total number of processes for distributed data loading
+         """
+         self.base_data_dir = base_data_dir
+         self.mixed_batches = mixed_batches
+         self.device = device
+         self.global_seed = global_seed
+         self.nan_stats_path = nan_stats_path
+         self.nan_patterns_path = nan_patterns_path
+         self.rank = rank
+         self.world_size = world_size
+         self.augmentation_probabilities = augmentation_probabilities or {
+             "noise_augmentation": 0.3,
+             "scaler_augmentation": 0.5,
+         }
+         # Optional preferred scaler name provided by training config
+         self.chosen_scaler_name = (
+             chosen_scaler_name.lower() if chosen_scaler_name is not None else None
+         )
+
+         # Setup random state
+         self.rng = np.random.default_rng(global_seed)
+         random.seed(global_seed)
+         torch.manual_seed(global_seed)
+
+         # Setup augmentations
+         self._setup_augmentations(augmentations)
+
+         # Setup generator proportions
+         self._setup_proportions(generator_proportions)
+
+         # Initialize datasets
+         self.datasets = self._initialize_datasets()
+
+         logger.info(
+             f"Initialized BatchComposer with {len(self.datasets)} generators, "
+             f"mixed_batches={mixed_batches}, proportions={self.generator_proportions}, "
+             f"augmentations={self.augmentations}, "
+             f"augmentation_probabilities={self.augmentation_probabilities}"
+         )
+
+     def _setup_augmentations(self, augmentations: Optional[Dict[str, bool]]):
+         """Setup only the augmentations that should remain online (NaN)."""
+         default_augmentations = {
+             "nan_augmentation": False,
+             "scaler_augmentation": False,
+             "length_shortening": False,
+         }
+
+         self.augmentations = augmentations or default_augmentations
+
+         # Initialize NaN augmenter if needed
+         self.nan_augmenter = None
+         if self.augmentations.get("nan_augmentation", False):
+             stats_path_to_use = self.nan_stats_path or DEFAULT_NAN_STATS_PATH
+             stats = json.load(open(stats_path_to_use, "r"))
+             self.nan_augmenter = NanAugmenter(
+                 p_series_has_nan=stats["p_series_has_nan"],
+                 nan_ratio_distribution=stats["nan_ratio_distribution"],
+                 nan_length_distribution=stats["nan_length_distribution"],
+                 nan_patterns_path=self.nan_patterns_path,
+             )
+
+     def _should_apply_scaler_augmentation(self) -> bool:
+         """
+         Decide whether to apply scaler augmentation for a single series based on
+         the boolean toggle and probability from the configuration.
+         """
+         if not self.augmentations.get("scaler_augmentation", False):
+             return False
+         probability = float(
+             self.augmentation_probabilities.get("scaler_augmentation", 0.0)
+         )
+         probability = max(0.0, min(1.0, probability))
+         return bool(self.rng.random() < probability)
+
+     def _choose_random_scaler(self) -> Optional[object]:
+         """
+         Choose a random scaler for augmentation, explicitly avoiding the one that
+         is already selected in the training configuration (if any).
+
+         Returns an instance of the selected scaler or None when no valid option exists.
+         """
+         chosen: Optional[str] = None
+         if self.chosen_scaler_name is not None:
+             chosen = self.chosen_scaler_name.strip().lower()
+
+         candidates = ["custom_robust", "minmax", "median", "mean"]
+
+         # Remove the chosen scaler from the candidates
+         if chosen in candidates:
+             candidates = [c for c in candidates if c != chosen]
+         if not candidates:
+             return None
+
+         pick = str(self.rng.choice(candidates))
+         if pick == "custom_robust":
+             return RobustScaler()
+         if pick == "minmax":
+             return MinMaxScaler()
+         if pick == "median":
+             return MedianScaler()
+         if pick == "mean":
+             return MeanScaler()
+         return None
+
+     def _setup_proportions(self, generator_proportions):
+         """Setup default or custom generator proportions."""
+         default_proportions = {
+             "forecast_pfn": 1.0,
+             "gp": 1.0,
+             "kernel": 1.0,
+             "sinewave": 1.0,
+             "sawtooth": 1.0,
+             "step": 0.1,
+             "anomaly": 1.0,
+             "spike": 2.0,
+             "cauker_univariate": 2.0,
+             "cauker_multivariate": 0.00,
+             "lmc": 0.00,  # multivariate
+             "ou_process": 1.0,
+             "audio_financial_volatility": 0.1,
+             "audio_multi_scale_fractal": 0.1,
+             "audio_network_topology": 0.5,
+             "audio_stochastic_rhythm": 1.0,
+             "augmented_per_sample_2048": 3.0,
+             "augmented_temp_batch_2048": 3.0,
+         }
+         self.generator_proportions = generator_proportions or default_proportions
+
+         # Normalize proportions
+         total = sum(self.generator_proportions.values())
+         if total <= 0:
+             raise ValueError("Total generator proportions must be positive")
+         self.generator_proportions = {
+             k: v / total for k, v in self.generator_proportions.items()
+         }
+
+     def _initialize_datasets(self) -> Dict[str, CyclicalBatchDataset]:
+         """Initialize a CyclicalBatchDataset for each generator with proportion > 0."""
+         datasets = {}
+
+         for generator_name, proportion in self.generator_proportions.items():
+             # Only initialize datasets for generators with positive proportion
+             if proportion <= 0:
+                 logger.info(f"Skipping {generator_name} (proportion = {proportion})")
+                 continue
+
+             batches_dir = f"{self.base_data_dir}/{generator_name}"
+
+             try:
+                 dataset = CyclicalBatchDataset(
+                     batches_dir=batches_dir,
+                     generator_type=generator_name,
+                     device=None,
+                     prefetch_next=True,
+                     prefetch_threshold=32,
+                     rank=self.rank,
+                     world_size=self.world_size,
+                 )
+                 datasets[generator_name] = dataset
+                 logger.info(
+                     f"Loaded dataset for {generator_name} (proportion = {proportion})"
+                 )
+
+             except Exception as e:
+                 logger.warning(f"Failed to load dataset for {generator_name}: {e}")
+                 continue
+
+         if not datasets:
+             raise ValueError(
+                 f"No valid datasets found in {self.base_data_dir} or all generators have proportion <= 0"
+             )
+
+         return datasets
+
+     def _convert_sample_to_tensors(
+         self, sample: dict, future_length: Optional[int] = None
+     ) -> Tuple[torch.Tensor, np.datetime64, Frequency]:
+         """
+         Convert a sample dict to tensors and metadata.
+
+         Args:
+             sample: Sample dict from CyclicalBatchDataset
+             future_length: Desired future length (if None, use default split)
+
+         Returns:
+             Tuple of (values, start, frequency)
+         """
+         # Handle both old and new data formats
+         num_channels = sample.get("num_channels", 1)
+         values_data = sample["values"]
+         generator_type = sample.get("generator_type", "unknown")
+
+         if num_channels == 1:
+             # Univariate data
+             if isinstance(values_data[0], list):
+                 # New format: [[channel_values]]
+                 values = torch.tensor(values_data[0], dtype=torch.float32)
+                 logger.debug(
+                     f"{generator_type}: Using new univariate format, shape: {values.shape}"
+                 )
+             else:
+                 # Old format: [values]
+                 values = torch.tensor(values_data, dtype=torch.float32)
+             values = values.unsqueeze(0).unsqueeze(-1)  # Shape: [1, seq_len, 1]
+         else:
+             # Multivariate data (LMC) - new format: [[ch1_values], [ch2_values], ...]
+             channel_tensors = []
+             for channel_values in values_data:
+                 channel_tensor = torch.tensor(channel_values, dtype=torch.float32)
+                 channel_tensors.append(channel_tensor)
+
+             # Stack channels: [1, seq_len, num_channels]
+             values = torch.stack(channel_tensors, dim=-1).unsqueeze(0)
+             logger.debug(
+                 f"{generator_type}: Using multivariate format, {num_channels} channels, shape: {values.shape}"
+             )
+
+         # Handle frequency conversion
+         freq_str = sample["frequency"]
+         try:
+             frequency = Frequency(freq_str)
+         except ValueError:
+             # Map common frequency strings to Frequency enum
+             freq_mapping = {
+                 "h": Frequency.H,
+                 "D": Frequency.D,
+                 "W": Frequency.W,
+                 "M": Frequency.M,
+                 "Q": Frequency.Q,
+                 "A": Frequency.A,
+                 "Y": Frequency.A,  # Annual
+                 "1min": Frequency.T1,
+                 "5min": Frequency.T5,
+                 "10min": Frequency.T10,
+                 "15min": Frequency.T15,
+                 "30min": Frequency.T30,
+                 "s": Frequency.S,
+             }
+             frequency = freq_mapping.get(freq_str, Frequency.H)  # Default to hourly
+
+         # Handle start timestamp
+         if isinstance(sample["start"], pd.Timestamp):
+             start = sample["start"].to_numpy()
+         else:
+             start = np.datetime64(sample["start"])
+
+         return values, start, frequency
+
+     def _effective_proportions_for_length(
+         self, total_length_for_batch: int
+     ) -> Dict[str, float]:
+         """
+         Build a simple, length-aware proportion map for the current batch.
+
+         Rules:
+         - For generators named 'augmented{L}', keep only the one matching the
+           chosen length L; zero out others.
+         - Keep non-augmented generators as-is.
+         - Drop generators that are unavailable (not loaded) or zero-weight.
+         - If nothing remains, fall back to 'augmented{L}' if available, else any dataset.
+         - Normalize the final map to sum to 1.
+         """
+
+         def augmented_length_from_name(name: str) -> Optional[int]:
+             if not name.startswith("augmented"):
+                 return None
+             suffix = name[len("augmented") :]
+             if not suffix:
+                 return None
+             try:
+                 return int(suffix)
+             except ValueError:
+                 return None
+
+         # 1) Adjust proportions with the length-aware rule
+         adjusted: Dict[str, float] = {}
+         for name, proportion in self.generator_proportions.items():
+             aug_len = augmented_length_from_name(name)
+             if aug_len is None:
+                 adjusted[name] = proportion
+             else:
+                 adjusted[name] = (
+                     proportion if aug_len == total_length_for_batch else 0.0
+                 )
+
+         # 2) Keep only available, positive-weight datasets
+         adjusted = {
+             name: p for name, p in adjusted.items() if name in self.datasets and p > 0.0
+         }
+
+         # 3) Fallback if empty
+         if not adjusted:
+             preferred = f"augmented{total_length_for_batch}"
+             if preferred in self.datasets:
+                 adjusted = {preferred: 1.0}
+             elif self.datasets:
+                 # Choose any available dataset deterministically (first key)
+                 first_key = next(iter(self.datasets.keys()))
+                 adjusted = {first_key: 1.0}
+             else:
+                 raise ValueError("No datasets available to create batch")
+
+         # 4) Normalize
+         total = sum(adjusted.values())
+         return {name: p / total for name, p in adjusted.items()}
+
+     def _compute_sample_counts_for_batch(
+         self, proportions: Dict[str, float], batch_size: int
+     ) -> Dict[str, int]:
+         """
+         Convert a proportion map into integer sample counts that sum to batch_size.
+
+         Strategy: allocate floor(batch_size * p) to each generator in order, and let the
+         last generator absorb any remainder to ensure the total matches exactly.
+         """
+         counts: Dict[str, int] = {}
+         remaining = batch_size
+         names = list(proportions.keys())
+         values = list(proportions.values())
+         for index, (name, p) in enumerate(zip(names, values)):
+             if index == len(names) - 1:
+                 counts[name] = remaining
+             else:
+                 n = int(batch_size * p)
+                 counts[name] = n
+                 remaining -= n
+         return counts
+
+     def _calculate_generator_samples(self, batch_size: int) -> Dict[str, int]:
+         """
+         Calculate the number of samples each generator should contribute.
+
+         Args:
+             batch_size: Total batch size
+
+         Returns:
+             Dict mapping generator names to sample counts
+         """
+         generator_samples = {}
+         remaining_samples = batch_size
+
+         generators = list(self.generator_proportions.keys())
+         proportions = list(self.generator_proportions.values())
+
+         # Calculate base samples for each generator
+         for i, (generator, proportion) in enumerate(zip(generators, proportions)):
+             if generator not in self.datasets:
+                 continue
+
+             if i == len(generators) - 1:  # Last generator gets remaining samples
+                 samples = remaining_samples
+             else:
+                 samples = int(batch_size * proportion)
+                 remaining_samples -= samples
+             generator_samples[generator] = samples
+
+         return generator_samples
+
+     def create_batch(
+         self,
+         batch_size: int = 128,
+         seed: Optional[int] = None,
+         future_length: Optional[int] = None,
+     ) -> Tuple[BatchTimeSeriesContainer, str]:
+         """
+         Create a batch of the specified size.
+
+         Args:
+             batch_size: Size of the batch to create
+             seed: Random seed for this batch
+             future_length: Fixed future length to use. If None, samples from the gift_eval range
+
+         Returns:
+             Tuple of (batch_container, generator_info)
+         """
+         if seed is not None:
+             batch_rng = np.random.default_rng(seed)
+             random.seed(seed)
+         else:
+             batch_rng = self.rng
+
+         if self.mixed_batches:
+             return self._create_mixed_batch(batch_size, future_length)
+         else:
+             return self._create_uniform_batch(batch_size, batch_rng, future_length)
+
+     def _create_mixed_batch(
+         self, batch_size: int, future_length: Optional[int] = None
+     ) -> Tuple[BatchTimeSeriesContainer, str]:
+         """Create a mixed batch with samples from multiple generators, rejecting NaNs."""
+
+         # Choose total length for this batch; respect length_shortening flag.
+         # When disabled, always use the maximum to avoid shortening.
+         if self.augmentations.get("length_shortening", False):
+             lengths = list(LENGTH_WEIGHTS.keys())
+             probs = list(LENGTH_WEIGHTS.values())
+             total_length_for_batch = int(self.rng.choice(lengths, p=probs))
+         else:
+             total_length_for_batch = int(max(LENGTH_CHOICES))
+
+         if future_length is None:
+             prediction_length = int(
+                 sample_future_length(
+                     range="gift_eval", total_length=total_length_for_batch
+                 )
+             )
+         else:
+             prediction_length = future_length
+
+         history_length = total_length_for_batch - prediction_length
+
+         # Calculate samples per generator using simple, per-batch length-aware proportions
+         effective_props = self._effective_proportions_for_length(total_length_for_batch)
+         generator_samples = self._compute_sample_counts_for_batch(
+             effective_props, batch_size
+         )
+
+         all_values = []
+         all_starts = []
+         all_frequencies = []
+         actual_proportions = {}
+
+         # Collect valid samples from each generator using batched fetches to reduce I/O overhead
+         for generator_name, num_samples in generator_samples.items():
+             if num_samples == 0 or generator_name not in self.datasets:
+                 continue
+
+             dataset = self.datasets[generator_name]
+
+             # Lists to hold valid samples for the current generator
+             generator_values = []
+             generator_starts = []
+             generator_frequencies = []
+
+             # Loop until we have collected the required number of VALID samples
+             max_attempts = 50
+             attempts = 0
+             while len(generator_values) < num_samples and attempts < max_attempts:
+                 attempts += 1
+                 # Fetch a batch larger than needed to reduce round-trips
+                 need = num_samples - len(generator_values)
+                 fetch_n = max(need * 2, 8)
+                 samples = dataset.get_samples(fetch_n)
+
+                 for sample in samples:
+                     if len(generator_values) >= num_samples:
+                         break
+
+                     values, sample_start, sample_freq = self._convert_sample_to_tensors(
+                         sample, future_length
+                     )
+
+                     # Skip if NaNs exist (we inject NaNs later in history only)
+                     if torch.isnan(values).any():
+                         continue
+
+                     # Resize to target batch length when longer
+                     if total_length_for_batch < values.shape[1]:
+                         strategy = self.rng.choice(["cut", "subsample"])  # 50/50
+                         if strategy == "cut":
+                             max_start_idx = values.shape[1] - total_length_for_batch
+                             start_idx = int(self.rng.integers(0, max_start_idx + 1))
+                             values = values[
+                                 :, start_idx : start_idx + total_length_for_batch, :
+                             ]
+                         else:
+                             indices = np.linspace(
+                                 0,
+                                 values.shape[1] - 1,
+                                 total_length_for_batch,
+                                 dtype=int,
+                             )
+                             values = values[:, indices, :]
+
+                     # Optionally apply scaler augmentation according to configuration
+                     if self._should_apply_scaler_augmentation():
+                         scaler = self._choose_random_scaler()
+                         if scaler is not None:
+                             values = scaler.scale(
+                                 values, scaler.compute_statistics(values)
+                             )
+
+                     generator_values.append(values)
+                     generator_starts.append(sample_start)
+                     generator_frequencies.append(sample_freq)
+
+             if len(generator_values) < num_samples:
+                 logger.warning(
+                     f"Generator {generator_name}: collected {len(generator_values)}/{num_samples} after {attempts} attempts"
+                 )
+
+             # Add the collected valid samples to the main batch lists
+             if generator_values:
+                 all_values.extend(generator_values)
+                 all_starts.extend(generator_starts)
+                 all_frequencies.extend(generator_frequencies)
+                 actual_proportions[generator_name] = len(generator_values)
+
+         if not all_values:
+             raise RuntimeError(
+                 "No valid samples could be collected from any generator."
+             )
+
+         combined_values = torch.cat(all_values, dim=0)
+         # Split into history and future
+         combined_history = combined_values[:, :history_length, :]
+         combined_future = combined_values[
+             :, history_length : history_length + prediction_length, :
+         ]
+
+         if self.nan_augmenter is not None:
+             combined_history = self.nan_augmenter.transform(combined_history)
+
+         # Create container
+         container = BatchTimeSeriesContainer(
+             history_values=combined_history,
+             future_values=combined_future,
+             start=all_starts,
+             frequency=all_frequencies,
+         )
+
+         return container, "MixedBatch"
+
+     def _create_uniform_batch(
+         self,
+         batch_size: int,
+         batch_rng: np.random.Generator,
+         future_length: Optional[int] = None,
+     ) -> Tuple[BatchTimeSeriesContainer, str]:
+         """Create a uniform batch with samples from a single generator."""
+
+         # Select generator based on proportions
+         generators = list(self.datasets.keys())
+         proportions = [self.generator_proportions[gen] for gen in generators]
+         selected_generator = batch_rng.choice(generators, p=proportions)
+
+         # Sample future length
+         if future_length is None:
+             future_length = sample_future_length(range="gift_eval")
+
+         # Get samples from selected generator
+         dataset = self.datasets[selected_generator]
+         samples = dataset.get_samples(batch_size)
+
+         all_history_values = []
+         all_future_values = []
+         all_starts = []
+         all_frequencies = []
+
+         for sample in samples:
+             values, sample_start, sample_freq = self._convert_sample_to_tensors(
+                 sample, future_length
+             )
+
+             total_length = values.shape[1]
+             history_length = max(1, total_length - future_length)
+
+             # Optionally apply scaler augmentation according to configuration
+             if self._should_apply_scaler_augmentation():
+                 scaler = self._choose_random_scaler()
+                 if scaler is not None:
+                     values = scaler.scale(values, scaler.compute_statistics(values))
+
+             # Split into history and future slices of shape [1, seq_len, num_channels]
+             hist_vals = values[:, :history_length, :]
+             fut_vals = values[:, history_length : history_length + future_length, :]
+
+             all_history_values.append(hist_vals)
+             all_future_values.append(fut_vals)
+             all_starts.append(sample_start)
+             all_frequencies.append(sample_freq)
+
+         # Combine samples
+         combined_history = torch.cat(all_history_values, dim=0)
+         combined_future = torch.cat(all_future_values, dim=0)
+
+         # Create container
+         container = BatchTimeSeriesContainer(
+             history_values=combined_history,
+             future_values=combined_future,
+             start=all_starts,
+             frequency=all_frequencies,
+         )
+
+         return container, selected_generator
+
+     def get_dataset_info(self) -> Dict[str, dict]:
+         """Get information about all datasets."""
+         info = {}
+         for name, dataset in self.datasets.items():
+             info[name] = dataset.get_info()
+         return info
+
+     def get_generator_info(self) -> Dict[str, Any]:
+         """Get information about the composer configuration."""
+         return {
+             "mixed_batches": self.mixed_batches,
+             "generator_proportions": self.generator_proportions,
+             "active_generators": list(self.datasets.keys()),
+             "total_generators": len(self.datasets),
+             "augmentations": self.augmentations,
+             "augmentation_probabilities": self.augmentation_probabilities,
+             "nan_augmenter_enabled": self.nan_augmenter is not None,
+         }
+
+
+ class ComposedDataset(torch.utils.data.Dataset):
+     """
+     PyTorch Dataset wrapper around BatchComposer for training pipeline integration.
+     """
+
+     def __init__(
+         self,
+         batch_composer: BatchComposer,
+         num_batches_per_epoch: int = 100,
+         batch_size: int = 128,
+     ):
+         """
+         Initialize the dataset.
+
+         Args:
+             batch_composer: The BatchComposer instance
+             num_batches_per_epoch: Number of batches to generate per epoch
+             batch_size: Size of each batch
+         """
+         self.batch_composer = batch_composer
+         self.num_batches_per_epoch = num_batches_per_epoch
+         self.batch_size = batch_size
+
+     def __len__(self) -> int:
+         return self.num_batches_per_epoch
+
+     def __getitem__(self, idx: int) -> BatchTimeSeriesContainer:
+         """
+         Get a batch by index.
+
+         Args:
+             idx: Batch index (used as seed for reproducibility)
+
+         Returns:
+             BatchTimeSeriesContainer
+         """
+         # Use index as seed for reproducible batches
+         batch, _ = self.batch_composer.create_batch(
+             batch_size=self.batch_size, seed=self.batch_composer.global_seed + idx
+         )
+         return batch
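
A sketch of how these classes might be wired together; the data directory, proportion keys, and augmentation flags below are illustrative assumptions, not values from the repository's configs:

from src.data.batch_composer import BatchComposer, ComposedDataset

composer = BatchComposer(
    base_data_dir="./data/generators",  # hypothetical layout: one subdirectory per generator
    generator_proportions={"gp": 1.0, "kernel": 1.0, "sinewave": 0.5},
    mixed_batches=True,
    augmentations={
        "nan_augmentation": False,
        "scaler_augmentation": True,
        "length_shortening": True,
    },
)
dataset = ComposedDataset(composer, num_batches_per_epoch=100, batch_size=128)
batch = dataset[0]  # idx seeds create_batch, so batches are reproducible
print(batch.history_values.shape, batch.future_values.shape)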
src/data/constants.py ADDED
@@ -0,0 +1,25 @@
+ from datetime import date
+ from typing import Dict
+
+ import numpy as np
+
+ DEFAULT_START_DATE = date(1700, 1, 1)
+ DEFAULT_END_DATE = date(2200, 1, 1)
+ BASE_START_DATE = np.datetime64(DEFAULT_START_DATE)
+ BASE_END_DATE = np.datetime64(DEFAULT_END_DATE)
+
+ # Maximum years to prevent timestamp overflow
+ MAX_YEARS = 500
+
+ LENGTH_CHOICES = [128, 256, 512, 1024, 1536, 2048]
+
+ DEFAULT_NAN_STATS_PATH: str = "./data/nan_stats.json"
+
+ LENGTH_WEIGHTS: Dict[int, float] = {
+     128: 0.05,
+     256: 0.10,
+     512: 0.10,
+     1024: 0.10,
+     1536: 0.15,
+     2048: 0.50,
+ }
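
The weights sum to 1.0 and act as a sampling distribution over total batch lengths; a small sketch of the draw performed when length shortening is enabled, mirroring the logic in BatchComposer._create_mixed_batch:

import numpy as np

from src.data.constants import LENGTH_WEIGHTS

rng = np.random.default_rng(42)
total_length = int(
    rng.choice(list(LENGTH_WEIGHTS.keys()), p=list(LENGTH_WEIGHTS.values()))
)
# e.g. 2048 with probability 0.50, 1536 with probability 0.15, ...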
src/data/containers.py ADDED
@@ -0,0 +1,204 @@
+ from dataclasses import dataclass
+ from typing import List, Optional
+
+ import numpy as np
+ import torch
+
+ from src.data.frequency import Frequency
+
+
+ @dataclass
+ class BatchTimeSeriesContainer:
+     """
+     Container for a batch of multivariate time series data and their associated features.
+
+     Attributes:
+         history_values: Tensor of historical observations.
+             Shape: [batch_size, seq_len, num_channels]
+         future_values: Tensor of future observations to predict.
+             Shape: [batch_size, pred_len, num_channels]
+         start: Timestamp of the first history value for each series in the batch.
+             Type: List[np.datetime64]
+         frequency: Frequency of each series in the batch.
+             Type: List[Frequency]
+         history_mask: Optional boolean/float tensor indicating missing entries in history_values across channels.
+             Shape: [batch_size, seq_len]
+         future_mask: Optional boolean/float tensor indicating missing entries in future_values across channels.
+             Shape: [batch_size, pred_len]
+     """
+
+     history_values: torch.Tensor
+     future_values: torch.Tensor
+     start: List[np.datetime64]
+     frequency: List[Frequency]
+
+     history_mask: Optional[torch.Tensor] = None
+     future_mask: Optional[torch.Tensor] = None
+
+     def __post_init__(self):
+         """Validate all tensor shapes and consistency."""
+         # --- Tensor Type Checks ---
+         if not isinstance(self.history_values, torch.Tensor):
+             raise TypeError("history_values must be a torch.Tensor")
+         if not isinstance(self.future_values, torch.Tensor):
+             raise TypeError("future_values must be a torch.Tensor")
+         if not isinstance(self.start, list) or not all(
+             isinstance(x, np.datetime64) for x in self.start
+         ):
+             raise TypeError("start must be a List[np.datetime64]")
+         if not isinstance(self.frequency, list) or not all(
+             isinstance(x, Frequency) for x in self.frequency
+         ):
+             raise TypeError("frequency must be a List[Frequency]")
+
+         batch_size, seq_len, num_channels = self.history_values.shape
+         pred_len = self.future_values.shape[1]
+
+         # --- Core Shape Checks ---
+         if self.future_values.shape[0] != batch_size:
+             raise ValueError("Batch size mismatch between history and future_values")
+         if self.future_values.shape[2] != num_channels:
+             raise ValueError("Channel size mismatch between history and future_values")
+
+         # --- Optional Mask Checks ---
+         if self.history_mask is not None:
+             if not isinstance(self.history_mask, torch.Tensor):
+                 raise TypeError("history_mask must be a Tensor or None")
+             if self.history_mask.shape[:2] != (batch_size, seq_len):
+                 raise ValueError(
+                     f"Shape mismatch in history_mask: {self.history_mask.shape[:2]} vs {(batch_size, seq_len)}"
+                 )
+
+         if self.future_mask is not None:
+             if not isinstance(self.future_mask, torch.Tensor):
+                 raise TypeError("future_mask must be a Tensor or None")
+             if not (
+                 self.future_mask.shape == (batch_size, pred_len)
+                 or self.future_mask.shape == self.future_values.shape
+             ):
+                 raise ValueError(
+                     f"Shape mismatch in future_mask: expected {(batch_size, pred_len)} or {self.future_values.shape}, got {self.future_mask.shape}"
+                 )
+
+     def to_device(
+         self, device: torch.device, attributes: Optional[List[str]] = None
+     ) -> None:
+         """
+         Move specified tensors to the target device in place.
+
+         Args:
+             device: Target device (e.g., 'cpu', 'cuda').
+             attributes: Optional list of attribute names to move. If None, move all tensors.
+
+         Raises:
+             ValueError: If an invalid attribute is specified or device transfer fails.
+         """
+         all_tensors = {
+             "history_values": self.history_values,
+             "future_values": self.future_values,
+             "history_mask": self.history_mask,
+             "future_mask": self.future_mask,
+         }
+
+         if attributes is None:
+             attributes = [k for k, v in all_tensors.items() if v is not None]
+
+         for attr in attributes:
+             if attr not in all_tensors:
+                 raise ValueError(f"Invalid attribute: {attr}")
+             if all_tensors[attr] is not None:
+                 setattr(self, attr, all_tensors[attr].to(device))
+
+     def to(self, device: torch.device, attributes: Optional[List[str]] = None):
+         """
+         Alias for the to_device method, for consistency with PyTorch conventions.
+
+         Args:
+             device: Target device (e.g., 'cpu', 'cuda').
+             attributes: Optional list of attribute names to move. If None, move all tensors.
+         """
+         self.to_device(device, attributes)
+         return self
+
+     @property
+     def batch_size(self) -> int:
+         return self.history_values.shape[0]
+
+     @property
+     def history_length(self) -> int:
+         return self.history_values.shape[1]
+
+     @property
+     def future_length(self) -> int:
+         return self.future_values.shape[1]
+
+     @property
+     def num_channels(self) -> int:
+         return self.history_values.shape[2]
+
+
+ @dataclass
+ class TimeSeriesContainer:
+     """
+     Container for a batch of time series data without an explicit history/future split.
+
+     This container is used for storing generated synthetic time series data where
+     the entire series is treated as a single entity, typically for further processing
+     or splitting into history/future components later.
+
+     Attributes:
+         values: np.ndarray of time series values.
+             Shape: [batch_size, seq_len, num_channels] for multivariate series,
+                    [batch_size, seq_len] for univariate series
+         start: List of start timestamps for each series in the batch.
+             Type: List[np.datetime64], length should match batch_size
+         frequency: List of frequencies for each series in the batch.
+             Type: List[Frequency], length should match batch_size
+     """
+
+     values: np.ndarray
+     start: List[np.datetime64]
+     frequency: List[Frequency]
+
+     def __post_init__(self):
+         """Validate all shapes and consistency."""
+         # --- Numpy Type Checks ---
+         if not isinstance(self.values, np.ndarray):
+             raise TypeError("values must be a np.ndarray")
+         if not isinstance(self.start, list) or not all(
+             isinstance(x, np.datetime64) for x in self.start
+         ):
+             raise TypeError("start must be a List[np.datetime64]")
+         if not isinstance(self.frequency, list) or not all(
+             isinstance(x, Frequency) for x in self.frequency
+         ):
+             raise TypeError("frequency must be a List[Frequency]")
+
+         # --- Shape and Length Consistency Checks ---
+         if len(self.values.shape) < 2 or len(self.values.shape) > 3:
+             raise ValueError(
+                 f"values must have 2 or 3 dimensions [batch_size, seq_len] or [batch_size, seq_len, num_channels], got shape {self.values.shape}"
+             )
+
+         batch_size = self.values.shape[0]
+
+         if len(self.start) != batch_size:
+             raise ValueError(
+                 f"Length of start ({len(self.start)}) must match batch_size ({batch_size})"
+             )
+         if len(self.frequency) != batch_size:
+             raise ValueError(
+                 f"Length of frequency ({len(self.frequency)}) must match batch_size ({batch_size})"
+             )
+
+     @property
+     def batch_size(self) -> int:
+         return self.values.shape[0]
+
+     @property
+     def seq_length(self) -> int:
+         return self.values.shape[1]
+
+     @property
+     def num_channels(self) -> int:
+         return self.values.shape[2] if len(self.values.shape) == 3 else 1
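
A minimal construction sketch for BatchTimeSeriesContainer; the shapes and timestamps are illustrative:

import numpy as np
import torch

from src.data.containers import BatchTimeSeriesContainer
from src.data.frequency import Frequency

batch = BatchTimeSeriesContainer(
    history_values=torch.randn(4, 96, 1),  # [batch_size, seq_len, num_channels]
    future_values=torch.randn(4, 24, 1),   # [batch_size, pred_len, num_channels]
    start=[np.datetime64("2020-01-01")] * 4,
    frequency=[Frequency.H] * 4,
)
batch.to(torch.device("cpu"))  # moves all non-None tensors in place
print(batch.batch_size, batch.history_length, batch.future_length)  # 4 96 24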
src/data/datasets.py ADDED
@@ -0,0 +1,267 @@
+ import logging
+ import os
+ import random
+ from typing import List, Optional
+
+ import pyarrow.feather as feather
+ import torch
+
+ logger = logging.getLogger(__name__)
+
+
+ class CyclicalBatchDataset:
+     """
+     Dataset class that loads saved batches from continuous generation script.
+     Maintains a pointer and provides cyclical access to individual samples.
+     Includes enhanced logging to track data shard cycling during training.
+     Supports per-rank file sharding for large-scale distributed training.
+     """
+
+     def __init__(
+         self,
+         batches_dir: str,
+         generator_type: str,
+         device: Optional[torch.device] = None,
+         prefetch_next: bool = True,
+         prefetch_threshold: int = 32,
+         rank: int = 0,
+         world_size: int = 1,
+     ):
+         """
+         Initialize the cyclical batch dataset.
+
+         Args:
+             batches_dir: Directory containing the batch arrow files
+             generator_type: Type of generator (for logging)
+             device: Device to load tensors to
+             prefetch_next: Whether to prefetch the next batch
+             prefetch_threshold: Number of remaining samples to trigger prefetching
+             rank: Rank of the current process (for file sharding)
+             world_size: Total number of processes (for file sharding)
+         """
+         self.batches_dir = batches_dir
+         self.generator_type = generator_type
+         self.device = device
+         self.prefetch_next = prefetch_next
+         self.prefetch_threshold = prefetch_threshold
+         self.rank = rank
+         self.world_size = world_size
+
+         self.batch_files = self._find_batch_files()
+         if not self.batch_files:
+             raise ValueError(f"No batch files found in {batches_dir}")
+
+         # --- State tracking ---
+         self.current_batch_idx = 0
+         self.current_sample_idx = 0
+         self.current_batch_data = None
+         self.next_batch_data = None
+         self.prefetching_in_progress = False
+
+         # --- NEW: Logging and cycle tracking ---
+         self.visited_batch_indices = set()
+         self.full_cycles_completed = 0
+
+         # Load first batch and update tracking
+         self._load_current_batch()
+         self.visited_batch_indices.add(self.current_batch_idx)
+
+         logger.info(
+             f"Initialized '{self.generator_type}' dataset with {len(self.batch_files)} batches. "
+             f"Current batch file: '{os.path.basename(self.batch_files[self.current_batch_idx])}' "
+             f"has {len(self.current_batch_data)} samples."
+         )
+
+     def _find_batch_files(self) -> List[str]:
+         """
+         Find and sort batch files with per-rank sharding for distributed training.
+
+         Each rank gets a disjoint subset of files to minimize I/O contention
+         when scaling to hundreds of GPUs.
+         """
+         import glob
+
+         pattern = os.path.join(self.batches_dir, "batch_*.arrow")
+         all_files = sorted(glob.glob(pattern))  # Sort for deterministic sharding
+
+         if not all_files:
+             return []
+
+         # Shard files across ranks: each rank gets every world_size-th file
+         # Example with 4 ranks: rank0=[0,4,8,...], rank1=[1,5,9,...], etc.
+         rank_files = [
+             f for i, f in enumerate(all_files) if i % self.world_size == self.rank
+         ]
+
+         # Shuffle only within this rank's shard for variety
+         random.shuffle(rank_files)
+
+         logger.info(
+             f"[Rank {self.rank}] '{self.generator_type}': Sharded {len(all_files)} files → "
+             f"{len(rank_files)} files for this rank ({len(rank_files) / len(all_files) * 100:.1f}%)"
+         )
+
+         return rank_files
+
+     def _load_batch_from_file(self, batch_file: str) -> List[dict]:
+         """Load a batch from arrow file."""
+         try:
+             table = feather.read_table(batch_file)
+             has_num_channels = "num_channels" in table.column_names
+             batch_data = []
+             for i in range(len(table)):
+                 row = {
+                     "series_id": table["series_id"][i].as_py(),
+                     "values": table["values"][i].as_py(),
+                     "length": table["length"][i].as_py(),
+                     "generator_type": table["generator_type"][i].as_py(),
+                     "start": table["start"][i].as_py(),
+                     "frequency": table["frequency"][i].as_py(),
+                     "generation_timestamp": table["generation_timestamp"][i].as_py(),
+                 }
+                 if has_num_channels:
+                     row["num_channels"] = table["num_channels"][i].as_py()
+                 else:
+                     row["num_channels"] = 1
+                 batch_data.append(row)
+             return batch_data
+         except Exception as e:
+             logger.error(f"Error loading batch from {batch_file}: {e}")
+             raise
+
+     def _load_current_batch(self):
+         """Load the current batch into memory."""
+         if hasattr(self, "current_batch_data") and self.current_batch_data is not None:
+             del self.current_batch_data
+         batch_file = self.batch_files[self.current_batch_idx]
+         self.current_batch_data = self._load_batch_from_file(batch_file)
+         self.current_sample_idx = 0
+         logger.debug(
+             f"Loaded batch {self.current_batch_idx} for {self.generator_type} "
+             f"with {len(self.current_batch_data)} samples"
+         )
+
+     def _trigger_smart_prefetch(self):
+         """Trigger prefetching when batch is almost exhausted."""
+         if not self.prefetch_next or len(self.batch_files) <= 1:
+             return
+         remaining_samples = self.get_remaining_samples_in_current_batch()
+         should_prefetch = (
+             remaining_samples <= self.prefetch_threshold
+             and self.next_batch_data is None
+             and not self.prefetching_in_progress
+         )
+         if should_prefetch:
+             self._prefetch_next_batch()
+
+     def _prefetch_next_batch(self):
+         """Prefetch the next batch."""
+         if self.prefetching_in_progress:
+             return
+         self.prefetching_in_progress = True
+         next_batch_idx = (self.current_batch_idx + 1) % len(self.batch_files)
+         next_batch_file = self.batch_files[next_batch_idx]
+         try:
+             self.next_batch_data = self._load_batch_from_file(next_batch_file)
+             logger.debug(
+                 f"Prefetched next batch {next_batch_idx} for {self.generator_type}"
+             )
+         except Exception as e:
+             logger.warning(f"Failed to prefetch batch {next_batch_idx}: {e}")
+             self.next_batch_data = None
+         finally:
+             self.prefetching_in_progress = False
+
+     def _advance_to_next_batch(self):
+         """Advance to the next batch and log the transition."""
+         if hasattr(self, "current_batch_data") and self.current_batch_data is not None:
+             del self.current_batch_data
+
+         previous_batch_idx = self.current_batch_idx
+         self.current_batch_idx = (self.current_batch_idx + 1) % len(self.batch_files)
+
+         if hasattr(self, "next_batch_data") and self.next_batch_data is not None:
+             self.current_batch_data = self.next_batch_data
+             self.next_batch_data = None
+         else:
+             self._load_current_batch()
+
+         self.current_sample_idx = 0
+         self.prefetching_in_progress = False
+
+         # --- NEW: Enhanced Logging Logic ---
+         self.visited_batch_indices.add(self.current_batch_idx)
+
+         # Calculate progress
+         total_files = len(self.batch_files)
+         visited_count = len(self.visited_batch_indices)
+         progress_percent = (visited_count / total_files) * 100
+
+         # Log the shard cycle event
+         logger.info(
+             f"\nDATA SHARD CYCLED for '{self.generator_type}': "
+             f"Moved from file index {previous_batch_idx} to {self.current_batch_idx}. "
+             f"Unique files visited: {visited_count}/{total_files} ({progress_percent:.1f}%)."
+         )
+
+         # Check if a full cycle has been completed
+         if visited_count == total_files:
+             self.full_cycles_completed += 1
+             logger.info(
+                 f"🎉 FULL CYCLE #{self.full_cycles_completed} COMPLETED for '{self.generator_type}'! "
+                 f"All {total_files} data files have been visited at least once. "
+                 "Resetting visited set to track the next cycle."
+             )
+             # Reset for the next cycle count
+             self.visited_batch_indices.clear()
+             self.visited_batch_indices.add(self.current_batch_idx)
+
+     def get_sample(self) -> dict:
+         """Get the current sample and advance pointer."""
+         if not hasattr(self, "current_batch_data") or self.current_batch_data is None:
+             self._load_current_batch()
+         if self.current_batch_data is None:
+             raise RuntimeError("No batch data loaded")
+         if self.current_sample_idx >= len(self.current_batch_data):
+             self._advance_to_next_batch()
+         self._trigger_smart_prefetch()
+         sample = self.current_batch_data[self.current_sample_idx]
+         self.current_sample_idx += 1
+         return sample
+
+     def get_samples(self, num_samples: int) -> List[dict]:
+         """Get multiple samples."""
+         samples = []
+         for _ in range(num_samples):
+             samples.append(self.get_sample())
+         return samples
+
+     def get_total_samples_in_current_batch(self) -> int:
+         """Get total samples in current batch."""
+         if not hasattr(self, "current_batch_data") or self.current_batch_data is None:
+             return 0
+         return len(self.current_batch_data)
+
+     def get_remaining_samples_in_current_batch(self) -> int:
+         """Get remaining samples in current batch."""
+         if not hasattr(self, "current_batch_data") or self.current_batch_data is None:
+             return 0
+         return max(0, len(self.current_batch_data) - self.current_sample_idx)
+
+     def get_info(self) -> dict:
+         """Get extended dataset info, including cycle progress."""
+         total_files = len(self.batch_files)
+         visited_count = len(self.visited_batch_indices)
+         return {
+             "generator_type": self.generator_type,
+             "total_batch_files": total_files,
+             "current_batch_idx": self.current_batch_idx,
+             "current_sample_idx": self.current_sample_idx,
+             "current_batch_size": self.get_total_samples_in_current_batch(),
+             "remaining_in_batch": self.get_remaining_samples_in_current_batch(),
+             "unique_files_visited": visited_count,
+             "cycle_progress_percent": (visited_count / total_files) * 100
+             if total_files > 0
+             else 0,
+             "full_cycles_completed": self.full_cycles_completed,
+         }
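
The modulo sharding rule in _find_batch_files is easy to verify in isolation; a small sketch with hypothetical file names and world_size=4:

all_files = [f"batch_{i:04d}.arrow" for i in range(10)]
world_size = 4
shards = {
    rank: [f for i, f in enumerate(all_files) if i % world_size == rank]
    for rank in range(world_size)
}
# rank 0 -> files 0, 4, 8; rank 1 -> files 1, 5, 9; rank 2 -> files 2, 6; rank 3 -> files 3, 7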
src/data/filter.py ADDED
@@ -0,0 +1,73 @@
+ import numpy as np
+ import torch
+ from scipy import signal
+ from statsmodels.tsa.stattools import acf
+
+
+ def lempel_ziv_complexity(binary_sequence: np.ndarray) -> int:
+     """Computes the Lempel-Ziv complexity of a binary sequence."""
+     sub_strings = set()
+     n = len(binary_sequence)
+     i = 0
+     count = 0
+     while i < n:
+         sub_str = ""
+         for j in range(i, n):
+             sub_str += str(binary_sequence[j])
+             if sub_str not in sub_strings:
+                 sub_strings.add(sub_str)
+                 count += 1
+                 i = j + 1
+                 break
+         else:
+             i += 1
+     return count
+
+
+ def is_low_quality(
+     series: torch.Tensor,
+     autocorr_threshold: float = 0.2,
+     snr_threshold: float = 0.5,
+     complexity_threshold: float = 0.4,
+ ) -> bool:
+     """
+     Returns True if the series appears non-forecastable (noise-like):
+     - weak autocorrelation
+     - low SNR proxy
+     - high normalized Lempel-Ziv complexity
+     """
+     x = series.squeeze().detach().cpu().numpy()
+     if x.size < 20:
+         return True
+     if np.var(x) < 1e-10:
+         return True
+
+     x_detrended = signal.detrend(x)
+
+     try:
+         max_lags = min(len(x_detrended) // 4, 40)
+         if max_lags < 1:
+             autocorr_strength = 0.0
+         else:
+             acf_vals = acf(x_detrended, nlags=max_lags, fft=True)[1:]
+             autocorr_strength = float(np.max(np.abs(acf_vals)))
+     except Exception:
+         autocorr_strength = 0.0
+
+     win_size = max(3, min(len(x) // 10, 15))
+     signal_est = np.convolve(x, np.ones(win_size) / win_size, mode="valid")
+     noise_est = x[win_size - 1 :] - signal_est
+     var_signal = float(np.var(signal_est))
+     var_noise = float(np.var(noise_est))
+     snr_proxy = var_signal / var_noise if var_noise > 1e-8 else 1.0
+
+     median_val = float(np.median(x_detrended))
+     binary_seq = (x_detrended > median_val).astype(np.uint8)
+     complexity_score = lempel_ziv_complexity(binary_seq)
+     normalized_complexity = complexity_score / max(1, len(binary_seq))
+
+     is_random_like = (snr_proxy < snr_threshold) and (
+         normalized_complexity > complexity_threshold
+     )
+     is_uncorrelated = autocorr_strength < autocorr_threshold
+     return bool(is_uncorrelated and is_random_like)
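
A quick sanity check of the filter with default thresholds; the noise result depends on the random draw, so the second print can occasionally differ:

import torch

from src.data.filter import is_low_quality

t = torch.linspace(0, 8 * torch.pi, 512)
sine = torch.sin(t)          # strong autocorrelation -> kept
noise = torch.randn(512)     # uncorrelated and complex -> likely rejected
print(is_low_quality(sine))  # expected: False
print(is_low_quality(noise))  # expected: True in most draws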
src/data/frequency.py ADDED
@@ -0,0 +1,538 @@
1
+ """
2
+ Comprehensive frequency management module for time series forecasting.
3
+
4
+ This module centralizes all frequency-related functionality including:
5
+ - Frequency enum with helper methods
6
+ - Frequency parsing and validation
7
+ - Pandas frequency string conversion
8
+ - Safety checks for date ranges
9
+ - Frequency selection utilities
10
+ - All frequency constants and mappings
11
+ """
12
+
13
+ import logging
14
+ import re
15
+ from enum import Enum
16
+ from typing import Dict, Tuple
17
+
18
+ import numpy as np
19
+ import pandas as pd
20
+ from numpy.random import Generator
21
+
22
+ from src.data.constants import BASE_END_DATE, BASE_START_DATE, MAX_YEARS
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ class Frequency(Enum):
28
+ """
29
+ Enhanced Frequency enum with comprehensive helper methods.
30
+
31
+ Each frequency includes methods for pandas conversion, safety checks,
32
+ and other frequency-specific operations.
33
+ """
34
+
35
+ A = "A" # Annual
36
+ Q = "Q" # Quarterly
37
+ M = "M" # Monthly
38
+ W = "W" # Weekly
39
+ D = "D" # Daily
40
+ H = "h" # Hourly
41
+ S = "s" # Seconds
42
+ T1 = "1min" # 1 minute
43
+ T5 = "5min" # 5 minutes
44
+ T10 = "10min" # 10 minutes
45
+ T15 = "15min" # 15 minutes
46
+ T30 = "30min" # 30 minutes
47
+
48
+ def to_pandas_freq(self, for_date_range: bool = True) -> str:
49
+ """
50
+ Convert to pandas frequency string.
51
+
52
+ Args:
53
+ for_date_range: If True, use strings suitable for pd.date_range().
54
+ If False, use strings suitable for pd.PeriodIndex().
55
+
56
+ Returns:
57
+ Pandas frequency string
58
+ """
59
+ base, prefix, _ = FREQUENCY_MAPPING[self]
60
+
61
+ # Special handling for date_range vs period compatibility
62
+ if for_date_range:
63
+ # For date_range, use modern pandas frequency strings
64
+ if self == Frequency.M:
65
+ return "ME" # Month End
66
+ elif self == Frequency.A:
67
+ return "YE" # Year End
68
+ elif self == Frequency.Q:
69
+ return "QE" # Quarter End
70
+ else:
71
+ # For periods, use legacy frequency strings
72
+ if self == Frequency.M:
73
+ return "M" # Month for periods
74
+ elif self == Frequency.A:
75
+ return "Y" # Year for periods (not YE)
76
+ elif self == Frequency.Q:
77
+ return "Q" # Quarter for periods (not QE)
78
+
79
+ # Construct frequency string for other frequencies
80
+ if prefix:
81
+ return f"{prefix}{base}"
82
+ else:
83
+ return base
84
+
85
+ def to_pandas_offset(self) -> str:
86
+ """Get pandas offset string for time delta calculations."""
87
+ return FREQUENCY_TO_OFFSET[self]
88
+
89
+ def get_days_per_period(self) -> float:
90
+ """Get approximate days per period for this frequency."""
91
+ _, _, days = FREQUENCY_MAPPING[self]
92
+ return days
93
+
94
+ def get_max_safe_length(self) -> int:
95
+ """Get maximum safe sequence length to prevent timestamp overflow."""
96
+ return ALL_FREQUENCY_MAX_LENGTHS.get(self, float("inf"))
97
+
98
+ def is_high_frequency(self) -> bool:
99
+ """Check if this is a high frequency (minute/second level)."""
100
+ return self in [
101
+ Frequency.S,
102
+ Frequency.T1,
103
+ Frequency.T5,
104
+ Frequency.T10,
105
+ Frequency.T15,
106
+ Frequency.T30,
107
+ ]
108
+
109
+ def is_low_frequency(self) -> bool:
110
+ """Check if this is a low frequency (annual/quarterly/monthly)."""
111
+ return self in [Frequency.A, Frequency.Q, Frequency.M]
112
+
113
+ def get_seasonality(self) -> int:
114
+ """Get typical seasonality for this frequency."""
115
+ seasonality_map = {
116
+ Frequency.S: 3600, # 1 hour of seconds
117
+ Frequency.T1: 60, # 1 hour of minutes
118
+ Frequency.T5: 12, # 1 hour of 5-minute intervals
119
+ Frequency.T10: 6, # 1 hour of 10-minute intervals
120
+ Frequency.T15: 4, # 1 hour of 15-minute intervals
121
+ Frequency.T30: 2, # 1 hour of 30-minute intervals
122
+ Frequency.H: 24, # 1 day of hours
123
+ Frequency.D: 7, # 1 week of days
124
+ Frequency.W: 52, # 1 year of weeks
125
+ Frequency.M: 12, # 1 year of months
126
+ Frequency.Q: 4, # 1 year of quarters
127
+ Frequency.A: 1, # No clear seasonality for annual
128
+ }
129
+ return seasonality_map.get(self, 1)
130
+
131
+ def get_gift_eval_weight(self) -> float:
132
+ """Get GIFT eval dataset frequency weight."""
133
+ return GIFT_EVAL_FREQUENCY_WEIGHTS.get(self, 0.1)
134
+
135
+ def get_length_range(self) -> Tuple[int, int, int, int]:
136
+ """Get (min_length, max_length, optimal_start, optimal_end) for this frequency."""
137
+ return GIFT_EVAL_LENGTH_RANGES.get(self, (50, 1000, 100, 500))
138
+
139
+
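A quick sketch of the enum helpers in use; the return values follow the mapping tables defined below:

    from src.data.frequency import Frequency

    print(Frequency.M.to_pandas_freq())                      # "ME", for pd.date_range
    print(Frequency.M.to_pandas_freq(for_date_range=False))  # "M", for pd.PeriodIndex
    print(Frequency.T5.to_pandas_freq())                     # "5min"
    print(Frequency.W.get_seasonality())                     # 52
    print(Frequency.T30.is_high_frequency())                 # True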
140
+ # ============================================================================
141
+ # Frequency Mappings and Constants
142
+ # ============================================================================
143
+
144
+ # Core frequency mapping: (pandas_base, prefix, days_per_period)
145
+ FREQUENCY_MAPPING: Dict[Frequency, Tuple[str, str, float]] = {
146
+ Frequency.A: (
147
+ "YE",
148
+ "",
149
+ 365.25,
150
+ ), # Average days per year (accounting for leap years)
151
+ Frequency.Q: ("Q", "", 91.3125), # 365.25/4 - average days per quarter
152
+ Frequency.M: ("M", "", 30.4375), # 365.25/12 - average days per month
153
+ Frequency.W: ("W", "", 7),
154
+ Frequency.D: ("D", "", 1),
155
+ Frequency.H: ("h", "", 1 / 24),
156
+ Frequency.S: ("s", "", 1 / 86400), # 24*60*60
157
+ Frequency.T1: ("min", "1", 1 / 1440), # 24*60
158
+ Frequency.T5: ("min", "5", 1 / 288), # 24*60/5
159
+ Frequency.T10: ("min", "10", 1 / 144), # 24*60/10
160
+ Frequency.T15: ("min", "15", 1 / 96), # 24*60/15
161
+ Frequency.T30: ("min", "30", 1 / 48), # 24*60/30
162
+ }
163
+
164
+ # Frequency to pandas offset mapping for calculating time deltas
165
+ FREQUENCY_TO_OFFSET: Dict[Frequency, str] = {
166
+ Frequency.A: "AS", # Annual start
167
+ Frequency.Q: "QS", # Quarter start
168
+ Frequency.M: "MS", # Month start
169
+ Frequency.W: "W", # Weekly
170
+ Frequency.D: "D", # Daily
171
+ Frequency.H: "H", # Hourly
172
+ Frequency.T1: "1T", # 1 minute
173
+ Frequency.T5: "5T", # 5 minutes
174
+ Frequency.T10: "10T", # 10 minutes
175
+ Frequency.T15: "15T", # 15 minutes
176
+ Frequency.T30: "30T", # 30 minutes
177
+ Frequency.S: "S", # Seconds
178
+ }
179
+
180
+ # Maximum sequence lengths to avoid pandas OutOfBoundsDatetime errors
181
+ SHORT_FREQUENCY_MAX_LENGTHS = {
182
+ Frequency.A: MAX_YEARS,
183
+ Frequency.Q: MAX_YEARS * 4,
184
+ Frequency.M: MAX_YEARS * 12,
185
+ Frequency.W: int(MAX_YEARS * 52.1775),
186
+ Frequency.D: int(MAX_YEARS * 365.2425),
187
+ }
188
+
189
+ HIGH_FREQUENCY_MAX_LENGTHS = {
190
+ Frequency.H: int(MAX_YEARS * 365.2425 * 24),
191
+ Frequency.S: int(MAX_YEARS * 365.2425 * 24 * 60 * 60),
192
+ Frequency.T1: int(MAX_YEARS * 365.2425 * 24 * 60),
193
+ Frequency.T5: int(MAX_YEARS * 365.2425 * 24 * 12),
194
+ Frequency.T10: int(MAX_YEARS * 365.2425 * 24 * 6),
195
+ Frequency.T15: int(MAX_YEARS * 365.2425 * 24 * 4),
196
+ Frequency.T30: int(MAX_YEARS * 365.2425 * 24 * 2),
197
+ }
198
+
199
+ # Combined max lengths for all frequencies
200
+ ALL_FREQUENCY_MAX_LENGTHS = {
201
+ **SHORT_FREQUENCY_MAX_LENGTHS,
202
+ **HIGH_FREQUENCY_MAX_LENGTHS,
203
+ }
204
+
205
+ # GIFT eval-based frequency weights from actual dataset analysis
206
+ GIFT_EVAL_FREQUENCY_WEIGHTS: Dict[Frequency, float] = {
207
+ Frequency.H: 25.0, # Hourly - most common
208
+ Frequency.D: 23.4, # Daily - second most common
209
+ Frequency.W: 12.9, # Weekly - third most common
210
+ Frequency.T15: 9.7, # 15-minute
211
+ Frequency.T5: 9.7, # 5-minute
212
+ Frequency.M: 7.3, # Monthly
213
+ Frequency.T10: 4.8, # 10-minute
214
+    Frequency.S: 4.8,  # Seconds (GIFT eval's second-level data is 10-second)
215
+ Frequency.T1: 1.6, # 1-minute
216
+ Frequency.Q: 0.8, # Quarterly
217
+ Frequency.A: 0.8, # Annual
218
+ }
219
+
220
+ # GIFT eval-based length ranges derived from actual dataset analysis
221
+ # Format: (min_length, max_length, optimal_start, optimal_end)
222
+ GIFT_EVAL_LENGTH_RANGES: Dict[Frequency, Tuple[int, int, int, int]] = {
223
+ # Low frequency ranges (based on actual GIFT eval data + logical extensions)
224
+ Frequency.A: (25, 100, 30, 70),
225
+ Frequency.Q: (25, 150, 50, 120),
226
+ Frequency.M: (40, 1000, 100, 600),
227
+ Frequency.W: (50, 3500, 100, 1500),
228
+ # Medium frequency ranges
229
+ Frequency.D: (150, 25000, 300, 7000), # Daily: covers 1-year+ scenarios
230
+ Frequency.H: (600, 35000, 700, 17000),
231
+ # High frequency ranges (extended for shorter realistic scenarios)
232
+ Frequency.T1: (200, 2500, 1200, 1800), # 1-minute: day to few days
233
+ Frequency.S: (7500, 9500, 7900, 9000),
234
+ Frequency.T15: (1000, 140000, 50000, 130000),
235
+ Frequency.T5: (200, 105000, 20000, 95000),
236
+ Frequency.T10: (40000, 55000, 47000, 52000),
237
+ Frequency.T30: (100, 50000, 10000, 40000),
238
+ }
239
+
240
+
241
+ # ============================================================================
242
+ # Frequency Parsing and Validation
243
+ # ============================================================================
244
+
245
+
246
+ def parse_frequency(freq_str: str) -> Frequency:
247
+ """
248
+ Parse frequency string to Frequency enum, robust to variations.
249
+
250
+ Handles various frequency string formats:
251
+ - Standard: "A", "Q", "M", "W", "D", "H", "S"
252
+ - Pandas-style: "A-DEC", "W-SUN", "QE-MAR"
253
+ - Minutes: "5T", "10min", "1T"
254
+ - Case variations: "a", "h", "D"
255
+
256
+ Args:
257
+ freq_str: The frequency string to parse (e.g., "5T", "W-SUN", "M")
258
+
259
+ Returns:
260
+ Corresponding Frequency enum member
261
+
262
+ Raises:
263
+ ValueError: If the frequency string is not supported
264
+ """
265
+ # Handle minute-based frequencies BEFORE pandas standardization
266
+ # because pandas converts "5T" to just "min", losing the multiplier
267
+ minute_match = re.match(r"^(\d*)T$", freq_str, re.IGNORECASE) or re.match(
268
+ r"^(\d*)min$", freq_str, re.IGNORECASE
269
+ )
270
+ if minute_match:
271
+ multiplier = int(minute_match.group(1)) if minute_match.group(1) else 1
272
+ enum_key = f"T{multiplier}"
273
+ try:
274
+ return Frequency[enum_key]
275
+ except KeyError:
276
+ logger.warning(
277
+ f"Unsupported minute frequency '{freq_str}' (multiplier: {multiplier}). "
278
+ f"Falling back to '1min' ({Frequency.T1.value})."
279
+ )
280
+ return Frequency.T1
281
+
282
+ # Now standardize frequency string for other cases
283
+ try:
284
+ offset = pd.tseries.frequencies.to_offset(freq_str)
285
+ standardized_freq = offset.name
286
+ except Exception:
287
+ standardized_freq = freq_str
288
+
289
+ # Handle other frequencies by their base (e.g., 'W-SUN' -> 'W', 'A-DEC' -> 'A')
290
+ base_freq = standardized_freq.split("-")[0].upper()
291
+
292
+ freq_map = {
293
+ "A": Frequency.A,
294
+ "Y": Frequency.A, # Alias for Annual
295
+ "YE": Frequency.A, # Alias for Annual
296
+ "Q": Frequency.Q,
297
+ "QE": Frequency.Q, # Alias for Quarterly
298
+ "M": Frequency.M,
299
+ "ME": Frequency.M, # Alias for Monthly
300
+ "W": Frequency.W,
301
+ "D": Frequency.D,
302
+ "H": Frequency.H,
303
+ "S": Frequency.S,
304
+ }
305
+
306
+ if base_freq in freq_map:
307
+ return freq_map[base_freq]
308
+
309
+ raise NotImplementedError(f"Frequency '{standardized_freq}' is not supported.")
310
+
311
+
312
+ def validate_frequency_safety(
313
+ start_date: np.datetime64, total_length: int, frequency: Frequency
314
+ ) -> bool:
315
+ """
316
+ Check if start date and frequency combination is safe for pandas datetime operations.
317
+
318
+ This function verifies that pd.date_range(start=start_date, periods=total_length, freq=freq_str)
319
+ will not raise an OutOfBoundsDatetime error, accounting for pandas' datetime bounds
320
+ (1677-09-21 to 2262-04-11) and realistic frequency limitations.
321
+
322
+ Args:
323
+ start_date: The proposed start date for the time series
324
+ total_length: Total length of the time series
325
+ frequency: The frequency of the time series
326
+
327
+ Returns:
328
+ True if the combination is safe, False otherwise
329
+ """
330
+ try:
331
+ # Get the pandas frequency string
332
+ freq_str = frequency.to_pandas_freq(for_date_range=True)
333
+
334
+ # Convert numpy datetime64 to pandas Timestamp for date_range
335
+ start_pd = pd.Timestamp(start_date)
336
+
337
+ # Check if start date is within pandas' valid datetime range
338
+ if start_pd < pd.Timestamp.min or start_pd > pd.Timestamp.max:
339
+ return False
340
+
341
+ # Check maximum length constraints
342
+ max_length = frequency.get_max_safe_length()
343
+ if total_length > max_length:
344
+ return False
345
+
346
+ # For low frequencies, be extra conservative
347
+ if frequency.is_low_frequency():
348
+ if frequency == Frequency.A and total_length > 500: # Max ~500 years
349
+ return False
350
+ elif frequency == Frequency.Q and total_length > 2000: # Max ~500 years
351
+ return False
352
+ elif frequency == Frequency.M and total_length > 6000: # Max ~500 years
353
+ return False
354
+
355
+ # Calculate approximate end date
356
+ days_per_period = frequency.get_days_per_period()
357
+ approx_days = total_length * days_per_period
358
+
359
+ # For annual/quarterly frequencies, add extra safety margin
360
+ if frequency in [Frequency.A, Frequency.Q]:
361
+ approx_days *= 1.1 # 10% safety margin
362
+
363
+ end_date = start_pd + pd.Timedelta(days=approx_days)
364
+
365
+ # Check if end date is within pandas' valid datetime range
366
+ if end_date < pd.Timestamp.min or end_date > pd.Timestamp.max:
367
+ return False
368
+
369
+ # Try to create the date range as final validation
370
+ pd.date_range(start=start_pd, periods=total_length, freq=freq_str)
371
+ return True
372
+
373
+ except (pd.errors.OutOfBoundsDatetime, OverflowError, ValueError):
374
+ return False
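A sketch of the guard in action; the exact cut-offs depend on the MAX_YEARS window in src.data.constants, but the asymmetry between low and high frequencies is the point:

    import numpy as np
    from src.data.frequency import Frequency, validate_frequency_safety

    start = np.datetime64("2000-01-01")
    # 10,000 annual steps would run ~10,000 years past the 2262 pandas bound:
    print(validate_frequency_safety(start, 10_000, Frequency.A))  # False
    # 10,000 daily steps span only ~27 years:
    print(validate_frequency_safety(start, 10_000, Frequency.D))  # True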
375
+
376
+
377
+ # ============================================================================
378
+ # Frequency Selection Utilities
379
+ # ============================================================================
380
+
381
+
382
+ def select_safe_random_frequency(total_length: int, rng: Generator) -> Frequency:
383
+ """
384
+ Select a random frequency suitable for a given total length of a time series,
385
+ based on actual GIFT eval dataset patterns and distributions.
386
+
387
+ The selection logic:
388
+ 1. Filters frequencies that can handle the given total_length
389
+ 2. Applies base weights derived from actual GIFT eval frequency distribution
390
+ 3. Strongly boosts frequencies that are in their optimal length ranges
391
+ 4. Handles edge cases gracefully with fallbacks
392
+
393
+ Args:
394
+ total_length: The total length of the time series (history + future)
395
+ rng: A numpy random number generator instance
396
+
397
+ Returns:
398
+ A randomly selected frequency that matches GIFT eval patterns
399
+ """
400
+ # Find valid frequencies and calculate weighted scores
401
+ valid_frequencies = []
402
+ frequency_scores = []
403
+
404
+ for freq in Frequency:
405
+ # Check basic timestamp overflow limits
406
+ max_allowed = freq.get_max_safe_length()
407
+ if total_length > max_allowed:
408
+ continue
409
+
410
+ # Check if frequency has defined ranges
411
+ min_len, max_len, optimal_start, optimal_end = freq.get_length_range()
412
+
413
+ # Must be within the frequency's realistic range
414
+ if total_length < min_len or total_length > max_len:
415
+ continue
416
+
417
+ valid_frequencies.append(freq)
418
+
419
+ # Calculate fitness score based on GIFT eval patterns
420
+ base_weight = freq.get_gift_eval_weight()
421
+
422
+ # Enhanced length-based fitness scoring
423
+ if optimal_start <= total_length <= optimal_end:
424
+ # In optimal range - very strong preference
425
+ length_multiplier = 5.0
426
+ else:
427
+ # Outside optimal but within valid range - calculate penalty
428
+ if total_length < optimal_start:
429
+ # Below optimal range
430
+ distance_ratio = (optimal_start - total_length) / (
431
+ optimal_start - min_len
432
+ )
433
+ else:
434
+ # Above optimal range
435
+ distance_ratio = (total_length - optimal_end) / (max_len - optimal_end)
436
+
437
+ # Apply graduated penalty: closer to optimal = higher score
438
+ length_multiplier = 0.3 + 1.2 * (1.0 - distance_ratio) # Range: 0.3-1.5
439
+
440
+ final_score = base_weight * length_multiplier
441
+ frequency_scores.append(final_score)
442
+
443
+ # Handle edge cases with smart fallbacks
444
+ if not valid_frequencies:
445
+ # Fallback strategy based on typical length patterns
446
+ if total_length <= 100:
447
+ # Very short series - prefer low frequencies
448
+ fallback_order = [
449
+ Frequency.A,
450
+ Frequency.Q,
451
+ Frequency.M,
452
+ Frequency.W,
453
+ Frequency.D,
454
+ ]
455
+ elif total_length <= 1000:
456
+ # Medium short series - prefer daily/weekly
457
+ fallback_order = [Frequency.D, Frequency.W, Frequency.H, Frequency.M]
458
+ else:
459
+ # Longer series - prefer higher frequencies
460
+ fallback_order = [Frequency.H, Frequency.D, Frequency.T15, Frequency.T5]
461
+
462
+ for fallback_freq in fallback_order:
463
+ max_allowed = fallback_freq.get_max_safe_length()
464
+ if total_length <= max_allowed:
465
+ return fallback_freq
466
+ # Last resort
467
+ return Frequency.D
468
+
469
+ if len(valid_frequencies) == 1:
470
+ return valid_frequencies[0]
471
+
472
+ # Select based on weighted probabilities
473
+ scores = np.array(frequency_scores)
474
+ probabilities = scores / scores.sum()
475
+
476
+ return rng.choice(valid_frequencies, p=probabilities)
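A usage sketch; the draws are random, and the comments indicate typical outcomes given the weights and length ranges above:

    import numpy as np
    from src.data.frequency import select_safe_random_frequency

    rng = np.random.default_rng(42)
    print(select_safe_random_frequency(60, rng))      # short: usually A/Q/M/W
    print(select_safe_random_frequency(20_000, rng))  # long: usually D/H or minute-level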
477
+
478
+
479
+ def select_safe_start_date(
480
+ total_length: int,
481
+ frequency: Frequency,
482
+ rng: Generator = np.random.default_rng(),
483
+ max_retries: int = 10,
484
+ ) -> np.datetime64:
485
+ """
486
+ Select a safe start date that ensures the entire time series (history + future)
487
+ will not exceed pandas' datetime bounds.
488
+
489
+ Args:
490
+ total_length: Total length of the time series (history + future)
491
+ frequency: Time series frequency
492
+ rng: Random number generator instance
493
+ max_retries: Maximum number of retry attempts
494
+
495
+ Returns:
496
+ A safe start date that prevents timestamp overflow
497
+
498
+ Raises:
499
+ ValueError: If no safe start date is found after max_retries or if the required
500
+ time span exceeds the available date window
501
+ """
502
+ days_per_period = frequency.get_days_per_period()
503
+
504
+ # Calculate approximate duration in days
505
+ total_days = total_length * days_per_period
506
+
507
+ # Define safe bounds: ensure end date doesn't exceed BASE_END_DATE
508
+ latest_safe_start = BASE_END_DATE - np.timedelta64(int(total_days), "D")
509
+ earliest_safe_start = BASE_START_DATE
510
+
511
+ # Check if the required time span exceeds the available window
512
+ if latest_safe_start < earliest_safe_start:
513
+ available_days = (
514
+ (BASE_END_DATE - BASE_START_DATE).astype("timedelta64[D]").astype(int)
515
+ )
516
+ available_years = available_days / 365.25
517
+ required_years = total_days / 365.25
518
+ raise ValueError(
519
+ f"Required time span ({required_years:.1f} years, {total_days:.0f} days) "
520
+ f"exceeds available date window ({available_years:.1f} years, {available_days} days). "
521
+ f"Reduce total_length ({total_length}) or extend the date window."
522
+ )
523
+
524
+ # Convert to nanoseconds for random sampling
525
+ earliest_ns = earliest_safe_start.astype("datetime64[ns]").astype(np.int64)
526
+ latest_ns = latest_safe_start.astype("datetime64[ns]").astype(np.int64)
527
+
528
+ for _ in range(max_retries):
529
+ # Uniformly sample a start date within bounds
530
+ random_ns = rng.integers(earliest_ns, latest_ns + 1)
531
+ start_date = np.datetime64(int(random_ns), "ns")
532
+
533
+ # Verify safety
534
+ if validate_frequency_safety(start_date, total_length, frequency):
535
+ return start_date
536
+
537
+ # Default to base start date if no safe start date is found
538
+ return BASE_START_DATE
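And a sketch for the start-date sampler (5,000 daily steps need roughly 14 years of headroom inside the BASE_START_DATE/BASE_END_DATE window, which is assumed to be wide enough here):

    import numpy as np
    from src.data.frequency import Frequency, select_safe_start_date

    rng = np.random.default_rng(0)
    start = select_safe_start_date(5_000, Frequency.D, rng)
    # Sampled so the full 5,000-step daily range ends before BASE_END_DATE.
    print(np.datetime_as_string(start, unit="D"))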
src/data/loaders.py ADDED
@@ -0,0 +1,661 @@
+ import logging
2
+ import random
3
+ from typing import Dict, Iterator, List, Optional
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import torch
8
+
9
+ from src.data.batch_composer import BatchComposer, ComposedDataset
10
+ from src.data.containers import BatchTimeSeriesContainer
11
+ from src.data.frequency import parse_frequency
12
+ from src.gift_eval.constants import ALL_DATASETS
13
+ from src.gift_eval.data import Dataset as GiftEvalDataset
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ class GiftEvalDataLoader:
19
+ """
20
+ Data loader for GIFT-eval datasets, converting them to BatchTimeSeriesContainer format.
21
+ Supports both training and validation modes.
22
+ """
23
+
24
+ TERMS = ["short", "medium", "long"]
25
+
26
+ def __init__(
27
+ self,
28
+ mode: str = "train",
29
+ batch_size: int = 32,
30
+ device: Optional[torch.device] = None,
31
+ shuffle: bool = True,
32
+ to_univariate: bool = False,
33
+ max_context_length: Optional[int] = None,
34
+ max_windows: int = 20,
35
+ skip_datasets_with_nans: bool = False,
36
+ datasets_to_use: Optional[List[str]] = None,
37
+ dataset_storage_path: Optional[str] = None,
38
+ ):
39
+ """
40
+ Initialize GIFT-eval data loader.
41
+
42
+ Args:
43
+ mode: Either "train" or "validation"
44
+ batch_size: Number of samples per batch
45
+ device: Device to load data to
46
+ shuffle: Whether to shuffle data
47
+ to_univariate: Whether to convert multivariate data to multiple univariate series
48
+ max_context_length: Optional maximum total window length (context + forecast) to prevent memory issues
49
+ max_windows: Number of windows to use for training/validation
50
+ skip_datasets_with_nans: Whether to skip datasets/series that contain NaN values
51
+ datasets_to_use: Optional list of dataset names to use. If None, uses all available datasets
52
+ dataset_storage_path: Path on disk where GIFT-eval HuggingFace datasets are stored
53
+ """
54
+ # Use specified datasets or all available datasets if none specified
55
+ if datasets_to_use is not None and len(datasets_to_use) > 0:
56
+ # Validate that requested datasets are available
57
+ invalid_datasets = [ds for ds in datasets_to_use if ds not in ALL_DATASETS]
58
+ if invalid_datasets:
59
+ logger.warning(f"Invalid datasets requested: {invalid_datasets}")
60
+ logger.warning(f"Available datasets: {ALL_DATASETS}")
61
+ # Use only valid datasets
62
+ self.dataset_names = [
63
+ ds for ds in datasets_to_use if ds in ALL_DATASETS
64
+ ]
65
+ else:
66
+ self.dataset_names = datasets_to_use
67
+ else:
68
+ self.dataset_names = ALL_DATASETS
69
+
70
+ # Log dataset selection
71
+ if datasets_to_use is not None and len(datasets_to_use) > 0:
72
+ logger.info(
73
+ f"Using subset of datasets: {len(self.dataset_names)}/{len(ALL_DATASETS)} datasets"
74
+ )
75
+ logger.info(f"Selected datasets: {self.dataset_names}")
76
+ else:
77
+ logger.info(
78
+ f"Using all available datasets: {len(self.dataset_names)} datasets"
79
+ )
80
+
81
+ self.terms = self.TERMS
82
+ self.mode = mode
83
+ self.batch_size = batch_size
84
+ self.device = device
85
+ self.shuffle = shuffle
86
+ self.to_univariate = to_univariate
87
+ self.max_context_length = max_context_length
88
+ self.skip_datasets_with_nans = skip_datasets_with_nans
89
+
90
+ # Window configuration based on mode
91
+ self.max_windows = max_windows
92
+ self.dataset_storage_path = dataset_storage_path
93
+
94
+ # Load all datasets and prepare data
95
+ self._load_datasets()
96
+
97
+ # Create iterator state
98
+ self._current_idx = 0
99
+ self._epoch_data = []
100
+ self._prepare_epoch_data()
101
+
102
+ def _load_datasets(self) -> None:
103
+ """Load all specified GIFT-eval datasets."""
104
+ self.datasets = {}
105
+ self.dataset_prediction_lengths = {}
106
+
107
+ for dataset_name in self.dataset_names:
108
+ if dataset_name.startswith("m4_"):
109
+ max_windows = 1
110
+ else:
111
+ max_windows = self.max_windows
112
+ try:
113
+ # Determine if we need univariate conversion
114
+ # First check with multivariate to see target dimension
115
+ temp_dataset = GiftEvalDataset(
116
+ name=dataset_name,
117
+ term=self.terms[0], # Use first term to check dimensionality
118
+ to_univariate=False,
119
+ max_windows=max_windows,
120
+ storage_path=self.dataset_storage_path,
121
+ )
122
+
123
+ # Convert to univariate if needed
124
+ to_univariate = self.to_univariate and temp_dataset.target_dim > 1
125
+
126
+ # Load datasets for all terms
127
+ for term in self.terms:
128
+ dataset_key = f"{dataset_name}_{term}"
129
+ dataset = GiftEvalDataset(
130
+ name=dataset_name,
131
+ term=term,
132
+ to_univariate=to_univariate,
133
+ max_windows=max_windows,
134
+ storage_path=self.dataset_storage_path,
135
+ )
136
+
137
+ self.datasets[dataset_key] = dataset
138
+ self.dataset_prediction_lengths[dataset_key] = (
139
+ dataset.prediction_length
140
+ )
141
+
142
+ logger.info(
143
+ f"Loaded {dataset_key} - prediction_length: {dataset.prediction_length}, "
144
+ f"frequency: {dataset.freq}, target_dim: {dataset.target_dim}, "
145
+ f"min_length: {dataset._min_series_length}, windows: {dataset.windows}"
146
+ )
147
+
148
+ except Exception as e:
149
+ logger.warning(f"Failed to load dataset {dataset_name}: {str(e)}")
150
+ continue
151
+
152
+ def _contains_nan(self, data_entry: dict) -> bool:
153
+ """Check if a data entry contains NaN values."""
154
+ target = data_entry.get("target")
155
+ if target is None:
156
+ return False
157
+
158
+ # Convert to numeric numpy array for robust NaN checking
159
+ try:
160
+ target_np = np.asarray(target, dtype=np.float32)
161
+ return np.isnan(target_np).any()
162
+ except Exception:
163
+ logger.warning(
164
+ "NaN check: failed to coerce target to float32; skipping entry"
165
+ )
166
+ return True
167
+
168
+ def _convert_to_container(
169
+ self, data_entries: List[dict], prediction_length: int, dataset_freq: str
170
+ ) -> BatchTimeSeriesContainer:
171
+ """Convert a batch of data entries to BatchTimeSeriesContainer format with fixed future length."""
172
+ batch_size = len(data_entries)
173
+ max_history_len = 0
174
+
175
+ # First pass: determine max history length after truncation
176
+ for entry in data_entries:
177
+ target = np.asarray(entry["target"], dtype=np.float32)
178
+ if target.ndim == 1:
179
+ target = target.reshape(1, -1)
180
+
181
+ _, seq_len = target.shape
182
+
183
+ # Only consider up to the last (max_context_length) values
184
+ effective_max_context = (
185
+ self.max_context_length
186
+ if self.max_context_length is not None
187
+ else seq_len
188
+ )
189
+ if seq_len > effective_max_context:
190
+ seq_len = effective_max_context
191
+
192
+ # History is up to (max_context_length - prediction_length)
193
+ history_len = max(
194
+ 0, min(seq_len, effective_max_context) - prediction_length
195
+ )
196
+ max_history_len = max(max_history_len, history_len)
197
+
198
+ # Get number of channels from first entry
199
+ first_target = np.asarray(data_entries[0]["target"], dtype=np.float32)
200
+ if first_target.ndim == 1:
201
+ # Shape to [channels, time]
202
+ first_target = first_target.reshape(1, -1)
203
+ num_channels = first_target.shape[0]
204
+
205
+ # Allocate arrays
206
+ history_values = np.full(
207
+ (batch_size, max_history_len, num_channels), np.nan, dtype=np.float32
208
+ )
209
+ future_values = np.full(
210
+ (batch_size, prediction_length, num_channels), np.nan, dtype=np.float32
211
+ )
212
+ history_mask = np.zeros((batch_size, max_history_len), dtype=bool)
213
+
214
+ # Second pass: fill arrays
215
+ for i, entry in enumerate(data_entries):
216
+ target = np.asarray(entry["target"], dtype=np.float32)
217
+ if target.ndim == 1:
218
+ target = target.reshape(1, -1)
219
+
220
+ # Truncate to last effective_max_context points if needed
221
+ full_seq_len = target.shape[1]
222
+ total_len_allowed = (
223
+ self.max_context_length
224
+ if self.max_context_length is not None
225
+ else full_seq_len
226
+ )
227
+ total_len_for_entry = min(full_seq_len, total_len_allowed)
228
+
229
+ if total_len_for_entry < prediction_length + 1:
230
+ # Not enough length to build (history + future). Signal to caller.
231
+ raise ValueError(
232
+ "Entry too short after max_context_length truncation to form history+future window"
233
+ )
234
+
235
+ truncated = target[:, -total_len_for_entry:]
236
+ cur_history_len = total_len_for_entry - prediction_length
237
+
238
+ hist = truncated[:, :cur_history_len] # [C, H]
239
+ fut = truncated[
240
+ :, cur_history_len : cur_history_len + prediction_length
241
+ ] # [C, P]
242
+
243
+ # Write into batch arrays with time last -> transpose to [H, C] / [P, C]
244
+ history_values[i, :cur_history_len, :] = hist.T
245
+ future_values[i, :, :] = fut.T
246
+ history_mask[i, :cur_history_len] = True
247
+
248
+ # Get start timestamp and frequency (replicate across batch)
249
+ start_timestamp = data_entries[0]["start"]
250
+ if hasattr(start_timestamp, "to_timestamp"):
251
+ start_numpy = start_timestamp.to_timestamp().to_numpy()
252
+ else:
253
+ start_numpy = pd.Timestamp(start_timestamp).to_numpy()
254
+ start_list = [start_numpy for _ in range(batch_size)]
255
+
256
+ # Get frequency enum and replicate across batch
257
+ frequency_enum = parse_frequency(dataset_freq)
258
+ frequency_list = [frequency_enum for _ in range(batch_size)]
259
+
260
+ # Create the container
261
+ return BatchTimeSeriesContainer(
262
+ history_values=torch.tensor(history_values, dtype=torch.float32),
263
+ future_values=torch.tensor(future_values, dtype=torch.float32),
264
+ start=start_list,
265
+ frequency=frequency_list,
266
+ history_mask=torch.tensor(history_mask, dtype=torch.bool)
267
+ if self.mode == "train"
268
+ else None,
269
+ )
270
+
271
+ def _prepare_epoch_data(self) -> None:
272
+ """Prepare all batches for one epoch."""
273
+ self._epoch_data = []
274
+
275
+ for dataset_key, dataset in self.datasets.items():
276
+ try:
277
+ # Get appropriate dataset based on mode
278
+ if self.mode == "train":
279
+ data = dataset.training_dataset
280
+ else:
281
+ data = dataset.validation_dataset
282
+
283
+ # Collect all valid data entries
284
+ valid_entries = []
285
+ dataset_freq = dataset.freq
286
+ prediction_length = self.dataset_prediction_lengths[dataset_key]
287
+
288
+ for entry in data:
289
+ # Skip if contains NaN and configured to do so
290
+ if self.skip_datasets_with_nans and self._contains_nan(entry):
291
+ continue
292
+
293
+ # Check if we have enough data
294
+ target = np.asarray(entry["target"])
295
+ if target.ndim == 1:
296
+ seq_len = len(target)
297
+ else:
298
+ seq_len = target.shape[1]
299
+
300
+ # Need at least prediction_length + 1 for training
301
+ if self.mode == "train" and seq_len < prediction_length + 1:
302
+ continue
303
+
304
+ valid_entries.append(entry)
305
+
306
+ if not valid_entries:
307
+ logger.warning(f"No valid entries found for {dataset_key}")
308
+ continue
309
+
310
+ # Create batches
311
+ for i in range(0, len(valid_entries), self.batch_size):
312
+ batch_entries = valid_entries[i : i + self.batch_size]
313
+ try:
314
+ batch_container = self._convert_to_container(
315
+ batch_entries, prediction_length, dataset_freq
316
+ )
317
+ self._epoch_data.append((dataset_key, batch_container))
318
+ except Exception as e:
319
+ logger.warning(
320
+ f"Failed to create batch for {dataset_key}: {str(e)}"
321
+ )
322
+ continue
323
+
324
+ except Exception as e:
325
+ logger.warning(
326
+ f"Failed to process dataset {dataset_key}: {str(e)}. "
327
+ f"Dataset may be too short for the required offset."
328
+ )
329
+ continue
330
+
331
+ # Shuffle if in training mode
332
+ if self.mode == "train" and self.shuffle:
333
+ random.shuffle(self._epoch_data)
334
+
335
+ logger.info(f"Prepared {len(self._epoch_data)} batches for {self.mode} mode")
336
+
337
+ def __iter__(self) -> Iterator[BatchTimeSeriesContainer]:
338
+ """Iterate through batches for one epoch."""
339
+ # Reset index at the start of each epoch
340
+ self._current_idx = 0
341
+
342
+ # Reshuffle data for each new epoch if in training mode
343
+ if self.mode == "train" and self.shuffle:
344
+ random.shuffle(self._epoch_data)
345
+
346
+ return self
347
+
348
+ def __next__(self) -> BatchTimeSeriesContainer:
349
+ """Get next batch."""
350
+ if not self._epoch_data:
351
+ raise StopIteration("No valid data available")
352
+
353
+ # Check if we've exhausted the epoch
354
+ if self._current_idx >= len(self._epoch_data):
355
+ raise StopIteration
356
+
357
+ # Get current batch
358
+ dataset_key, batch = self._epoch_data[self._current_idx]
359
+ self._current_idx += 1
360
+
361
+ # Move to device if specified
362
+ if self.device is not None:
363
+ batch.to_device(self.device)
364
+
365
+ return batch
366
+
367
+ def __len__(self) -> int:
368
+ """Return number of batches per epoch."""
369
+ return len(self._epoch_data)
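A usage sketch; the dataset name and storage path are placeholders, and a real run needs the GIFT-eval datasets downloaded to disk:

    import torch
    from src.data.loaders import GiftEvalDataLoader

    loader = GiftEvalDataLoader(
        mode="train",
        batch_size=32,
        device=torch.device("cpu"),
        max_context_length=2048,
        datasets_to_use=["us_births"],           # placeholder dataset name
        dataset_storage_path="/data/gift_eval",  # placeholder path
    )
    for batch in loader:                   # one pass over the prepared batches
        print(batch.history_values.shape)  # [batch, history_len, channels]
        break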
370
+
371
+
372
+ class CyclicGiftEvalDataLoader:
373
+ """
374
+ Wrapper for GiftEvalDataLoader that provides cycling behavior for training.
375
+ This allows training for a fixed number of iterations per epoch, cycling through
376
+ the available data as needed.
377
+ """
378
+
379
+ def __init__(self, base_loader: GiftEvalDataLoader, num_iterations_per_epoch: int):
380
+ """
381
+ Initialize the cyclic data loader.
382
+
383
+ Args:
384
+ base_loader: The underlying GiftEvalDataLoader
385
+ num_iterations_per_epoch: Number of iterations to run per epoch
386
+ """
387
+ self.base_loader = base_loader
388
+ self.num_iterations_per_epoch = num_iterations_per_epoch
389
+ self.dataset_names = base_loader.dataset_names
390
+ self.device = base_loader.device
391
+
392
+ def __iter__(self) -> Iterator[BatchTimeSeriesContainer]:
393
+ """Iterate for exactly num_iterations_per_epoch iterations."""
394
+ self._current_iteration = 0
395
+ self._base_iter = iter(self.base_loader)
396
+ return self
397
+
398
+ def __next__(self) -> BatchTimeSeriesContainer:
399
+ """Get next batch, cycling through base loader as needed."""
400
+ if self._current_iteration >= self.num_iterations_per_epoch:
401
+ raise StopIteration
402
+
403
+ try:
404
+ batch = next(self._base_iter)
405
+ except StopIteration:
406
+ # Restart the base iterator when exhausted
407
+ self._base_iter = iter(self.base_loader)
408
+ batch = next(self._base_iter)
409
+
410
+ self._current_iteration += 1
411
+ return batch
412
+
413
+ def __len__(self) -> int:
414
+ """Return the configured number of iterations per epoch."""
415
+ return self.num_iterations_per_epoch
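Continuing the sketch above, the cyclic wrapper pins an epoch to a fixed number of steps regardless of how many batches the base loader holds:

    from src.data.loaders import CyclicGiftEvalDataLoader

    cyclic = CyclicGiftEvalDataLoader(loader, num_iterations_per_epoch=500)
    assert len(cyclic) == 500
    for batch in cyclic:  # base iterator restarts whenever it is exhausted
        pass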
416
+
417
+
418
+ def create_synthetic_dataloader(
419
+ base_data_dir: str,
420
+ batch_size: int = 128,
421
+ num_batches_per_epoch: int = 1000,
422
+ generator_proportions: Optional[Dict[str, float]] = None,
423
+ mixed_batches: bool = True,
424
+ augmentations: Optional[Dict[str, bool]] = None,
425
+ augmentation_probabilities: Optional[Dict[str, float]] = None,
426
+ device: Optional[torch.device] = None,
427
+ num_workers: int = 0,
428
+ pin_memory: bool = True,
429
+ global_seed: int = 42,
430
+ nan_stats_path: Optional[str] = None,
431
+ nan_patterns_path: Optional[str] = None,
432
+ chosen_scaler_name: Optional[str] = None,
433
+ ) -> torch.utils.data.DataLoader:
434
+ """
435
+ Create a PyTorch DataLoader for training with saved generator batches.
436
+
437
+ Args:
438
+ base_data_dir: Base directory containing generator subdirectories
439
+ batch_size: Size of each training batch
440
+ num_batches_per_epoch: Number of batches per epoch
441
+ generator_proportions: Dict mapping generator names to proportions
442
+ mixed_batches: Whether to create mixed or uniform batches
443
+ augmentations: Dict mapping augmentation names to booleans
444
+ augmentation_probabilities: Dict mapping augmentation names to probabilities
445
+ device: Target device
446
+ num_workers: Number of DataLoader workers
447
+ pin_memory: Whether to pin memory
448
+ global_seed: Global random seed
449
+ nan_stats_path: Path to nan stats file
450
+        nan_patterns_path: Path to nan patterns file
+        chosen_scaler_name: Name of the scaler used in training
451
+
452
+ Returns:
453
+ PyTorch DataLoader
454
+ """
455
+
456
+ # Create batch composer
457
+ composer = BatchComposer(
458
+ base_data_dir=base_data_dir,
459
+ generator_proportions=generator_proportions,
460
+ mixed_batches=mixed_batches,
461
+ device=device,
462
+ augmentations=augmentations,
463
+ augmentation_probabilities=augmentation_probabilities,
464
+ global_seed=global_seed,
465
+ nan_stats_path=nan_stats_path,
466
+ nan_patterns_path=nan_patterns_path,
467
+ chosen_scaler_name=chosen_scaler_name,
468
+ )
469
+
470
+ # Create dataset
471
+ dataset = ComposedDataset(
472
+ batch_composer=composer,
473
+ num_batches_per_epoch=num_batches_per_epoch,
474
+ batch_size=batch_size,
475
+ )
476
+
477
+ # Custom collate function for BatchTimeSeriesContainer
478
+ def collate_fn(batch):
479
+ """Custom collate function that returns a single BatchTimeSeriesContainer."""
480
+ # Since each item is already a BatchTimeSeriesContainer with batch_size samples,
481
+ # and DataLoader batch_size=1, we just return the first (and only) item
482
+ return batch[0]
483
+
484
+ # Create DataLoader
485
+ dataloader = torch.utils.data.DataLoader(
486
+ dataset,
487
+ batch_size=1, # Each dataset item is already a complete batch
488
+ shuffle=False,
489
+ num_workers=num_workers,
490
+ pin_memory=pin_memory,
491
+ collate_fn=collate_fn,
492
+ drop_last=False,
493
+ )
494
+
495
+ logger.info(
496
+ f"Created DataLoader with {len(dataset)} batches per epoch, "
497
+ f"batch_size={batch_size}, mixed_batches={mixed_batches}"
498
+ )
499
+
500
+ return dataloader
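A sketch with hypothetical generator names; the keys of generator_proportions must match subdirectories under base_data_dir:

    from src.data.loaders import create_synthetic_dataloader

    dataloader = create_synthetic_dataloader(
        base_data_dir="data/synthetic",                      # placeholder
        batch_size=128,
        num_batches_per_epoch=1000,
        generator_proportions={"gp": 0.7, "sawtooth": 0.3},  # assumed names
        mixed_batches=True,
        global_seed=42,
    )
    batch = next(iter(dataloader))  # one BatchTimeSeriesContainer per step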
501
+
502
+
503
+ class SyntheticValidationDataset(torch.utils.data.Dataset):
504
+ """
505
+ Fixed synthetic validation dataset that generates a small number of batches
506
+ using the same composition approach as training data.
507
+ """
508
+
509
+ def __init__(
510
+ self,
511
+ base_data_dir: str,
512
+ batch_size: int = 128,
513
+ num_batches: int = 2,
514
+ future_length: int = 512,
515
+ generator_proportions: Optional[Dict[str, float]] = None,
516
+ augmentations: Optional[Dict[str, bool]] = None,
517
+ augmentation_probabilities: Optional[Dict[str, float]] = None,
518
+ device: Optional[torch.device] = None,
519
+ global_seed: int = 42,
520
+ chosen_scaler_name: Optional[str] = None,
521
+ nan_stats_path: Optional[str] = None,
522
+ nan_patterns_path: Optional[str] = None,
523
+ rank: int = 0,
524
+ world_size: int = 1,
525
+ ):
526
+ """
527
+ Initialize the validation dataset.
528
+
529
+ Args:
530
+ base_data_dir: Base directory containing generator subdirectories
531
+ batch_size: Size of each validation batch
532
+ num_batches: Number of validation batches to generate (1 or 2)
533
+ generator_proportions: Dict mapping generator names to proportions
534
+ device: Device to load tensors to
535
+ global_seed: Global random seed
536
+            chosen_scaler_name: Name of the scaler used in training
537
+ """
538
+ self.batch_size = batch_size
539
+ self.num_batches = num_batches
540
+ self.device = device
541
+
542
+ # Create batch composer; force validation to use max-length windows (no length shortening)
543
+ val_augmentations = dict(augmentations or {})
544
+ val_augmentations["length_shortening"] = False
545
+
546
+ self.batch_composer = BatchComposer(
547
+ base_data_dir=base_data_dir,
548
+ generator_proportions=generator_proportions,
549
+ mixed_batches=True, # Use mixed batches for validation
550
+ device=device,
551
+ global_seed=global_seed + 999999,
552
+ augmentations=val_augmentations,
553
+ augmentation_probabilities=augmentation_probabilities,
554
+ nan_stats_path=nan_stats_path,
555
+ nan_patterns_path=nan_patterns_path,
556
+ chosen_scaler_name=chosen_scaler_name,
557
+ rank=rank,
558
+ world_size=world_size,
559
+ )
560
+
561
+ # Pre-generate fixed validation batches
562
+ self.validation_batches = []
563
+ for i in range(num_batches):
564
+ batch, _ = self.batch_composer.create_batch(
565
+ batch_size=batch_size,
566
+ future_length=future_length,
567
+ seed=global_seed
568
+ + 999999
569
+ + i, # Fixed seeds for reproducible validation
570
+ )
571
+ self.validation_batches.append(batch)
572
+
573
+ logger.info(
574
+ f"Created {num_batches} fixed validation batches with batch_size={batch_size}"
575
+ )
576
+
577
+ def __len__(self) -> int:
578
+ return self.num_batches
579
+
580
+ def __getitem__(self, idx: int) -> BatchTimeSeriesContainer:
581
+ """
582
+ Get a pre-generated validation batch by index.
583
+
584
+ Args:
585
+ idx: Batch index
586
+
587
+ Returns:
588
+ BatchTimeSeriesContainer
589
+ """
590
+ if idx >= len(self.validation_batches):
591
+ raise IndexError(f"Batch index {idx} out of range")
592
+
593
+ batch = self.validation_batches[idx]
594
+
595
+ # Move to device if needed
596
+ if self.device is not None:
597
+ batch.to_device(self.device)
598
+
599
+ return batch
600
+
601
+
602
+ def create_synthetic_dataset(
603
+ base_data_dir: str,
604
+ batch_size: int = 128,
605
+ num_batches_per_epoch: int = 1000,
606
+ generator_proportions: Optional[Dict[str, float]] = None,
607
+ mixed_batches: bool = True,
608
+ augmentations: Optional[Dict[str, bool]] = None,
609
+ augmentation_probabilities: Optional[Dict[str, float]] = None,
610
+ global_seed: int = 42,
611
+ nan_stats_path: Optional[str] = None,
612
+ nan_patterns_path: Optional[str] = None,
613
+ chosen_scaler_name: Optional[str] = None,
614
+ rank: int = 0,
615
+ world_size: int = 1,
616
+ ) -> ComposedDataset:
617
+ """
618
+ Creates the ComposedDataset for training with saved generator batches.
619
+
620
+ Args:
621
+ base_data_dir: Base directory containing generator subdirectories.
622
+ batch_size: Size of each training batch.
623
+ num_batches_per_epoch: Number of batches per epoch.
624
+ generator_proportions: Dict mapping generator names to proportions.
625
+ mixed_batches: Whether to create mixed or uniform batches.
626
+ augmentations: Dict mapping augmentation names to booleans.
627
+ global_seed: Global random seed.
628
+ nan_stats_path: Path to nan stats file.
629
+ chosen_scaler_name: Name of the scaler to use.
630
+ Returns:
631
+ A ComposedDataset instance.
632
+ """
633
+ # Create batch composer
634
+ composer = BatchComposer(
635
+ base_data_dir=base_data_dir,
636
+ generator_proportions=generator_proportions,
637
+ mixed_batches=mixed_batches,
638
+ device=None, # Device is handled in the training loop
639
+ augmentations=augmentations,
640
+ augmentation_probabilities=augmentation_probabilities,
641
+ global_seed=global_seed,
642
+ nan_stats_path=nan_stats_path,
643
+ nan_patterns_path=nan_patterns_path,
644
+ chosen_scaler_name=chosen_scaler_name,
645
+ rank=rank,
646
+ world_size=world_size,
647
+ )
648
+
649
+ # Create and return the dataset
650
+ dataset = ComposedDataset(
651
+ batch_composer=composer,
652
+ num_batches_per_epoch=num_batches_per_epoch,
653
+ batch_size=batch_size,
654
+ )
655
+
656
+ logger.info(
657
+ f"Created ComposedDataset with {len(dataset)} batches per epoch, "
658
+ f"batch_size={batch_size}, mixed_batches={mixed_batches}"
659
+ )
660
+
661
+ return dataset
src/data/scalers.py ADDED
@@ -0,0 +1,360 @@
+ from abc import ABC, abstractmethod
2
+ from typing import Dict, Optional
3
+
4
+ import torch
5
+
6
+
7
+ class BaseScaler(ABC):
8
+ """
9
+ Abstract base class for time series scalers.
10
+
11
+ Defines the interface for scaling multivariate time series data with support
12
+ for masked values and channel-wise scaling.
13
+ """
14
+
15
+ @abstractmethod
16
+ def compute_statistics(
17
+ self, history_values: torch.Tensor, history_mask: Optional[torch.Tensor] = None
18
+ ) -> Dict[str, torch.Tensor]:
19
+ """
20
+ Compute scaling statistics from historical data.
21
+ """
22
+ pass
23
+
24
+ @abstractmethod
25
+ def scale(
26
+ self, data: torch.Tensor, statistics: Dict[str, torch.Tensor]
27
+ ) -> torch.Tensor:
28
+ """
29
+ Apply scaling transformation to data.
30
+ """
31
+ pass
32
+
33
+ @abstractmethod
34
+ def inverse_scale(
35
+ self, scaled_data: torch.Tensor, statistics: Dict[str, torch.Tensor]
36
+ ) -> torch.Tensor:
37
+ """
38
+ Apply inverse scaling transformation to recover original scale.
39
+ """
40
+ pass
41
+
42
+
43
+ class RobustScaler(BaseScaler):
44
+ """
45
+ Robust scaler using median and IQR for normalization.
46
+ """
47
+
48
+ def __init__(self, epsilon: float = 1e-6, min_scale: float = 1e-3):
49
+ if epsilon <= 0:
50
+ raise ValueError("epsilon must be positive")
51
+ if min_scale <= 0:
52
+ raise ValueError("min_scale must be positive")
53
+ self.epsilon = epsilon
54
+ self.min_scale = min_scale
55
+
56
+ def compute_statistics(
57
+ self, history_values: torch.Tensor, history_mask: Optional[torch.Tensor] = None
58
+ ) -> Dict[str, torch.Tensor]:
59
+ """
60
+ Compute median and IQR statistics from historical data with improved numerical stability.
61
+ """
62
+ batch_size, seq_len, num_channels = history_values.shape
63
+ device = history_values.device
64
+
65
+ medians = torch.zeros(batch_size, 1, num_channels, device=device)
66
+ iqrs = torch.ones(batch_size, 1, num_channels, device=device)
67
+
68
+ for b in range(batch_size):
69
+ for c in range(num_channels):
70
+ channel_data = history_values[b, :, c]
71
+
72
+ if history_mask is not None:
73
+ mask = history_mask[b, :].bool()
74
+ valid_data = channel_data[mask]
75
+ else:
76
+ valid_data = channel_data
77
+
78
+ if len(valid_data) == 0:
79
+ continue
80
+
81
+ valid_data = valid_data[torch.isfinite(valid_data)]
82
+
83
+ if len(valid_data) == 0:
84
+ continue
85
+
86
+ median_val = torch.median(valid_data)
87
+ medians[b, 0, c] = median_val
88
+
89
+ if len(valid_data) > 1:
90
+ try:
91
+ q75 = torch.quantile(valid_data, 0.75)
92
+ q25 = torch.quantile(valid_data, 0.25)
93
+ iqr_val = q75 - q25
94
+ iqr_val = torch.max(
95
+ iqr_val, torch.tensor(self.min_scale, device=device)
96
+ )
97
+ iqrs[b, 0, c] = iqr_val
98
+ except Exception:
99
+ std_val = torch.std(valid_data)
100
+ iqrs[b, 0, c] = torch.max(
101
+ std_val, torch.tensor(self.min_scale, device=device)
102
+ )
103
+ else:
104
+ iqrs[b, 0, c] = self.min_scale
105
+
106
+ return {"median": medians, "iqr": iqrs}
107
+
108
+ def scale(
109
+ self, data: torch.Tensor, statistics: Dict[str, torch.Tensor]
110
+ ) -> torch.Tensor:
111
+ """
112
+ Apply robust scaling: (data - median) / (iqr + epsilon).
113
+ """
114
+ median = statistics["median"]
115
+ iqr = statistics["iqr"]
116
+
117
+ denominator = torch.max(
118
+ iqr + self.epsilon, torch.tensor(self.min_scale, device=iqr.device)
119
+ )
120
+ scaled_data = (data - median) / denominator
121
+ scaled_data = torch.clamp(scaled_data, -50.0, 50.0)
122
+
123
+ return scaled_data
124
+
125
+ def inverse_scale(
126
+ self, scaled_data: torch.Tensor, statistics: Dict[str, torch.Tensor]
127
+ ) -> torch.Tensor:
128
+ """
129
+ Apply inverse robust scaling, now compatible with 3D or 4D tensors.
130
+ """
131
+ median = statistics["median"]
132
+ iqr = statistics["iqr"]
133
+
134
+ denominator = torch.max(
135
+ iqr + self.epsilon, torch.tensor(self.min_scale, device=iqr.device)
136
+ )
137
+
138
+ if scaled_data.ndim == 4:
139
+ denominator = denominator.unsqueeze(-1)
140
+ median = median.unsqueeze(-1)
141
+
142
+ return scaled_data * denominator + median
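scale and inverse_scale are exact inverses as long as no value hits the ±50 clamp applied in scale; a minimal round-trip check:

    import torch
    from src.data.scalers import RobustScaler

    scaler = RobustScaler()
    x = torch.randn(2, 100, 3)  # [batch, time, channels]
    stats = scaler.compute_statistics(x)
    x_rec = scaler.inverse_scale(scaler.scale(x, stats), stats)
    print(torch.max(torch.abs(x - x_rec)).item())  # ~1e-7 for unclipped inputs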
143
+
144
+
145
+ class MinMaxScaler(BaseScaler):
146
+ """
147
+ Min-Max scaler that normalizes data to the range [-1, 1].
148
+ """
149
+
150
+ def __init__(self, epsilon: float = 1e-8):
151
+ if epsilon <= 0:
152
+ raise ValueError("epsilon must be positive")
153
+ self.epsilon = epsilon
154
+
155
+ def compute_statistics(
156
+ self, history_values: torch.Tensor, history_mask: Optional[torch.Tensor] = None
157
+ ) -> Dict[str, torch.Tensor]:
158
+ """
159
+ Compute min and max statistics from historical data.
160
+ """
161
+ batch_size, seq_len, num_channels = history_values.shape
162
+ device = history_values.device
163
+
164
+ mins = torch.zeros(batch_size, 1, num_channels, device=device)
165
+ maxs = torch.ones(batch_size, 1, num_channels, device=device)
166
+
167
+ for b in range(batch_size):
168
+ for c in range(num_channels):
169
+ channel_data = history_values[b, :, c]
170
+
171
+ if history_mask is not None:
172
+ mask = history_mask[b, :].bool()
173
+ valid_data = channel_data[mask]
174
+ else:
175
+ valid_data = channel_data
176
+
177
+ if len(valid_data) == 0:
178
+ continue
179
+
180
+ min_val = torch.min(valid_data)
181
+ max_val = torch.max(valid_data)
182
+
183
+ mins[b, 0, c] = min_val
184
+ maxs[b, 0, c] = max_val
185
+
186
+ if torch.abs(max_val - min_val) < self.epsilon:
187
+ maxs[b, 0, c] = min_val + 1.0
188
+
189
+ return {"min": mins, "max": maxs}
190
+
191
+ def scale(
192
+ self, data: torch.Tensor, statistics: Dict[str, torch.Tensor]
193
+ ) -> torch.Tensor:
194
+ """
195
+ Apply min-max scaling to range [-1, 1].
196
+ """
197
+ min_val = statistics["min"]
198
+ max_val = statistics["max"]
199
+
200
+ normalized = (data - min_val) / (max_val - min_val + self.epsilon)
201
+ return normalized * 2.0 - 1.0
202
+
203
+ def inverse_scale(
204
+ self, scaled_data: torch.Tensor, statistics: Dict[str, torch.Tensor]
205
+ ) -> torch.Tensor:
206
+ """
207
+ Apply inverse min-max scaling, now compatible with 3D or 4D tensors.
208
+ """
209
+ min_val = statistics["min"]
210
+ max_val = statistics["max"]
211
+
212
+ if scaled_data.ndim == 4:
213
+ min_val = min_val.unsqueeze(-1)
214
+ max_val = max_val.unsqueeze(-1)
215
+
216
+ normalized = (scaled_data + 1.0) / 2.0
217
+ return normalized * (max_val - min_val + self.epsilon) + min_val
218
+
219
+
220
+ class MeanScaler(BaseScaler):
221
+ """
222
+ A scaler that centers the data by subtracting the channel-wise mean.
223
+
224
+ This scaler only performs centering and does not affect the scale of the data.
225
+ """
226
+
227
+ def compute_statistics(
228
+ self, history_values: torch.Tensor, history_mask: Optional[torch.Tensor] = None
229
+ ) -> Dict[str, torch.Tensor]:
230
+ """
231
+ Compute the mean for each channel from historical data.
232
+ """
233
+ batch_size, seq_len, num_channels = history_values.shape
234
+ device = history_values.device
235
+
236
+ # Initialize a tensor to store the mean for each channel in each batch item
237
+ means = torch.zeros(batch_size, 1, num_channels, device=device)
238
+
239
+ for b in range(batch_size):
240
+ for c in range(num_channels):
241
+ channel_data = history_values[b, :, c]
242
+
243
+ # Use the mask to select only valid (observed) data points
244
+ if history_mask is not None:
245
+ mask = history_mask[b, :].bool()
246
+ valid_data = channel_data[mask]
247
+ else:
248
+ valid_data = channel_data
249
+
250
+ # Skip if there's no valid data for this channel
251
+ if len(valid_data) == 0:
252
+ continue
253
+
254
+ # Filter out non-finite values like NaN or Inf before computing
255
+ valid_data = valid_data[torch.isfinite(valid_data)]
256
+
257
+ if len(valid_data) == 0:
258
+ continue
259
+
260
+ # Compute the mean and store it
261
+ means[b, 0, c] = torch.mean(valid_data)
262
+
263
+ return {"mean": means}
264
+
265
+ def scale(
266
+ self, data: torch.Tensor, statistics: Dict[str, torch.Tensor]
267
+ ) -> torch.Tensor:
268
+ """
269
+ Apply mean centering: data - mean.
270
+ """
271
+ mean = statistics["mean"]
272
+ return data - mean
273
+
274
+ def inverse_scale(
275
+ self, scaled_data: torch.Tensor, statistics: Dict[str, torch.Tensor]
276
+ ) -> torch.Tensor:
277
+ """
278
+ Apply inverse mean centering: scaled_data + mean.
279
+
280
+ Handles both 3D (e.g., training input) and 4D (e.g., model output samples) tensors.
281
+ """
282
+ mean = statistics["mean"]
283
+
284
+ # Adjust shape for 4D tensors (batch, seq_len, channels, samples)
285
+ if scaled_data.ndim == 4:
286
+ mean = mean.unsqueeze(-1)
287
+
288
+ return scaled_data + mean
289
+
290
+
291
+ class MedianScaler(BaseScaler):
292
+ """
293
+ A scaler that centers the data by subtracting the channel-wise median.
294
+
295
+ This scaler only performs centering and does not affect the scale of the data.
296
+ It is more robust to outliers than the MeanScaler.
297
+ """
298
+
299
+ def compute_statistics(
300
+ self, history_values: torch.Tensor, history_mask: Optional[torch.Tensor] = None
301
+ ) -> Dict[str, torch.Tensor]:
302
+ """
303
+ Compute the median for each channel from historical data.
304
+ """
305
+ batch_size, seq_len, num_channels = history_values.shape
306
+ device = history_values.device
307
+
308
+ # Initialize a tensor to store the median for each channel in each batch item
309
+ medians = torch.zeros(batch_size, 1, num_channels, device=device)
310
+
311
+ for b in range(batch_size):
312
+ for c in range(num_channels):
313
+ channel_data = history_values[b, :, c]
314
+
315
+ # Use the mask to select only valid (observed) data points
316
+ if history_mask is not None:
317
+ mask = history_mask[b, :].bool()
318
+ valid_data = channel_data[mask]
319
+ else:
320
+ valid_data = channel_data
321
+
322
+ # Skip if there's no valid data for this channel
323
+ if len(valid_data) == 0:
324
+ continue
325
+
326
+ # Filter out non-finite values like NaN or Inf before computing
327
+ valid_data = valid_data[torch.isfinite(valid_data)]
328
+
329
+ if len(valid_data) == 0:
330
+ continue
331
+
332
+ # Compute the median and store it
333
+ medians[b, 0, c] = torch.median(valid_data)
334
+
335
+ return {"median": medians}
336
+
337
+ def scale(
338
+ self, data: torch.Tensor, statistics: Dict[str, torch.Tensor]
339
+ ) -> torch.Tensor:
340
+ """
341
+ Apply median centering: data - median.
342
+ """
343
+ median = statistics["median"]
344
+ return data - median
345
+
346
+ def inverse_scale(
347
+ self, scaled_data: torch.Tensor, statistics: Dict[str, torch.Tensor]
348
+ ) -> torch.Tensor:
349
+ """
350
+ Apply inverse median centering: scaled_data + median.
351
+
352
+ Handles both 3D (e.g., training input) and 4D (e.g., model output samples) tensors.
353
+ """
354
+ median = statistics["median"]
355
+
356
+ # Adjust shape for 4D tensors (batch, seq_len, channels, samples)
357
+ if scaled_data.ndim == 4:
358
+ median = median.unsqueeze(-1)
359
+
360
+ return scaled_data + median
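The practical difference from MeanScaler shows up with outliers; a small sketch:

    import torch
    from src.data.scalers import MeanScaler, MedianScaler

    x = torch.zeros(1, 10, 1)
    x[0, 0, 0] = 100.0  # one outlier in an otherwise flat series
    print(MeanScaler().compute_statistics(x)["mean"].item())      # 10.0
    print(MedianScaler().compute_statistics(x)["median"].item())  # 0.0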
src/data/time_features.py ADDED
@@ -0,0 +1,564 @@
+ import logging
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ import scipy.fft as fft
7
+ import torch
8
+ from gluonts.time_feature import time_features_from_frequency_str
9
+ from gluonts.time_feature._base import (
10
+ day_of_month,
11
+ day_of_month_index,
12
+ day_of_week,
13
+ day_of_week_index,
14
+ day_of_year,
15
+ hour_of_day,
16
+ hour_of_day_index,
17
+ minute_of_hour,
18
+ minute_of_hour_index,
19
+ month_of_year,
20
+ month_of_year_index,
21
+ second_of_minute,
22
+ second_of_minute_index,
23
+ week_of_year,
24
+ week_of_year_index,
25
+ )
26
+ from gluonts.time_feature.holiday import (
27
+ BLACK_FRIDAY,
28
+ CHRISTMAS_DAY,
29
+ CHRISTMAS_EVE,
30
+ CYBER_MONDAY,
31
+ EASTER_MONDAY,
32
+ EASTER_SUNDAY,
33
+ GOOD_FRIDAY,
34
+ INDEPENDENCE_DAY,
35
+ LABOR_DAY,
36
+ MEMORIAL_DAY,
37
+ NEW_YEARS_DAY,
38
+ NEW_YEARS_EVE,
39
+ THANKSGIVING,
40
+ SpecialDateFeatureSet,
41
+ exponential_kernel,
42
+ squared_exponential_kernel,
43
+ )
44
+ from gluonts.time_feature.seasonality import get_seasonality
45
+ from scipy.signal import find_peaks
46
+
47
+ from src.data.constants import BASE_END_DATE, BASE_START_DATE
48
+ from src.data.frequency import (
49
+ Frequency,
50
+ validate_frequency_safety,
51
+ )
52
+ from src.utils.utils import device
53
+
54
+ # Configure logging
55
+ logging.basicConfig(
56
+ level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
57
+ )
58
+ logger = logging.getLogger(__name__)
59
+
60
+
61
+ # Enhanced feature sets for different frequencies
62
+ ENHANCED_TIME_FEATURES = {
63
+ # High-frequency features (seconds, minutes)
64
+ "high_freq": {
65
+ "normalized": [
66
+ second_of_minute,
67
+ minute_of_hour,
68
+ hour_of_day,
69
+ day_of_week,
70
+ day_of_month,
71
+ ],
72
+ "index": [
73
+ second_of_minute_index,
74
+ minute_of_hour_index,
75
+ hour_of_day_index,
76
+ day_of_week_index,
77
+ ],
78
+ },
79
+ # Medium-frequency features (hourly, daily)
80
+ "medium_freq": {
81
+ "normalized": [
82
+ hour_of_day,
83
+ day_of_week,
84
+ day_of_month,
85
+ day_of_year,
86
+ month_of_year,
87
+ ],
88
+ "index": [
89
+ hour_of_day_index,
90
+ day_of_week_index,
91
+ day_of_month_index,
92
+ week_of_year_index,
93
+ ],
94
+ },
95
+ # Low-frequency features (weekly, monthly)
96
+ "low_freq": {
97
+ "normalized": [day_of_week, day_of_month, month_of_year, week_of_year],
98
+ "index": [day_of_week_index, month_of_year_index, week_of_year_index],
99
+ },
100
+ }
101
+
102
+ # Holiday features for different markets/regions
103
+ HOLIDAY_FEATURE_SETS = {
104
+ "us_business": [
105
+ NEW_YEARS_DAY,
106
+ MEMORIAL_DAY,
107
+ INDEPENDENCE_DAY,
108
+ LABOR_DAY,
109
+ THANKSGIVING,
110
+ CHRISTMAS_EVE,
111
+ CHRISTMAS_DAY,
112
+ NEW_YEARS_EVE,
113
+ ],
114
+ "us_retail": [
115
+ NEW_YEARS_DAY,
116
+ EASTER_SUNDAY,
117
+ MEMORIAL_DAY,
118
+ INDEPENDENCE_DAY,
119
+ LABOR_DAY,
120
+ THANKSGIVING,
121
+ BLACK_FRIDAY,
122
+ CYBER_MONDAY,
123
+ CHRISTMAS_EVE,
124
+ CHRISTMAS_DAY,
125
+ NEW_YEARS_EVE,
126
+ ],
127
+ "christian": [
128
+ NEW_YEARS_DAY,
129
+ GOOD_FRIDAY,
130
+ EASTER_SUNDAY,
131
+ EASTER_MONDAY,
132
+ CHRISTMAS_EVE,
133
+ CHRISTMAS_DAY,
134
+ NEW_YEARS_EVE,
135
+ ],
136
+ }
137
+
138
+
139
+ class TimeFeatureGenerator:
140
+ """
141
+ Enhanced time feature generator that leverages full GluonTS capabilities.
142
+ """
143
+
144
+ def __init__(
145
+ self,
146
+ use_enhanced_features: bool = True,
147
+ use_holiday_features: bool = True,
148
+ holiday_set: str = "us_business",
149
+ holiday_kernel: str = "exponential",
150
+ holiday_kernel_alpha: float = 1.0,
151
+ use_index_features: bool = True,
152
+ k_max: int = 15,
153
+ include_seasonality_info: bool = True,
154
+ use_auto_seasonality: bool = False,
155
+ max_seasonal_periods: int = 3,
156
+ ):
157
+ """
158
+ Initialize enhanced time feature generator.
159
+
160
+ Parameters
161
+ ----------
162
+ use_enhanced_features : bool
163
+ Whether to use frequency-specific enhanced features
164
+ use_holiday_features : bool
165
+ Whether to include holiday features
166
+ holiday_set : str
167
+ Which holiday set to use ('us_business', 'us_retail', 'christian')
168
+ holiday_kernel : str
169
+ Holiday kernel type ('indicator', 'exponential', 'squared_exponential')
170
+ holiday_kernel_alpha : float
171
+ Kernel parameter for exponential kernels
172
+ use_index_features : bool
173
+ Whether to include index-based features alongside normalized ones
174
+ k_max : int
175
+ Maximum number of time features to pad to
176
+ include_seasonality_info : bool
177
+ Whether to include seasonality information as features
178
+ use_auto_seasonality : bool
179
+ Whether to use automatic FFT-based seasonality detection
180
+ max_seasonal_periods : int
181
+ Maximum number of seasonal periods to detect automatically
182
+ """
183
+ self.use_enhanced_features = use_enhanced_features
184
+ self.use_holiday_features = use_holiday_features
185
+ self.holiday_set = holiday_set
186
+ self.use_index_features = use_index_features
187
+ self.k_max = k_max
188
+ self.include_seasonality_info = include_seasonality_info
189
+ self.use_auto_seasonality = use_auto_seasonality
190
+ self.max_seasonal_periods = max_seasonal_periods
191
+
192
+ # Initialize holiday feature set
193
+ self.holiday_feature_set = None
194
+ if use_holiday_features and holiday_set in HOLIDAY_FEATURE_SETS:
195
+ kernel_func = self._get_holiday_kernel(holiday_kernel, holiday_kernel_alpha)
196
+ self.holiday_feature_set = SpecialDateFeatureSet(
197
+ HOLIDAY_FEATURE_SETS[holiday_set], kernel_func
198
+ )
199
+
200
+ def _get_holiday_kernel(self, kernel_type: str, alpha: float):
201
+ """Get holiday kernel function."""
202
+ if kernel_type == "exponential":
203
+ return exponential_kernel(alpha)
204
+ elif kernel_type == "squared_exponential":
205
+ return squared_exponential_kernel(alpha)
206
+ else:
207
+ # Default indicator kernel
208
+ return lambda x: float(x == 0)
209
+
210
+ def _get_feature_category(self, freq_str: str) -> str:
211
+ """Determine feature category based on frequency."""
212
+ if freq_str in ["s", "1min", "5min", "10min", "15min"]:
213
+ return "high_freq"
214
+ elif freq_str in ["h", "D"]:
215
+ return "medium_freq"
216
+ else:
217
+ return "low_freq"
218
+
219
+ def _compute_enhanced_features(
220
+ self, period_index: pd.PeriodIndex, freq_str: str
221
+ ) -> np.ndarray:
222
+ """Compute enhanced time features based on frequency."""
223
+ if not self.use_enhanced_features:
224
+ return np.array([]).reshape(len(period_index), 0)
225
+
226
+ category = self._get_feature_category(freq_str)
227
+ feature_config = ENHANCED_TIME_FEATURES[category]
228
+
229
+ features = []
230
+
231
+ # Add normalized features
232
+ for feat_func in feature_config["normalized"]:
233
+ try:
234
+ feat_values = feat_func(period_index)
235
+ features.append(feat_values)
236
+ except Exception:
237
+ continue
238
+
239
+ # Add index features if enabled
240
+ if self.use_index_features:
241
+ for feat_func in feature_config["index"]:
242
+ try:
243
+ feat_values = feat_func(period_index)
244
+ # Normalize index features to [0, 1] range
245
+ if feat_values.max() > 0:
246
+ feat_values = feat_values / feat_values.max()
247
+ features.append(feat_values)
248
+ except Exception:
249
+ continue
250
+
251
+ if features:
252
+ return np.stack(features, axis=-1)
253
+ else:
254
+ return np.array([]).reshape(len(period_index), 0)
255
+
256
+ def _compute_holiday_features(self, date_range: pd.DatetimeIndex) -> np.ndarray:
257
+ """Compute holiday features."""
258
+ if not self.use_holiday_features or self.holiday_feature_set is None:
259
+ return np.array([]).reshape(len(date_range), 0)
260
+
261
+ try:
262
+ holiday_features = self.holiday_feature_set(date_range)
263
+ return holiday_features.T # Transpose to get [time, features] shape
264
+ except Exception:
265
+ return np.array([]).reshape(len(date_range), 0)
266
+
267
+ def _detect_auto_seasonality(self, time_series_values: np.ndarray) -> list:
268
+ """
269
+ Detect seasonal periods automatically using FFT analysis.
270
+
271
+ Parameters
272
+ ----------
273
+ time_series_values : np.ndarray
274
+ Time series values for seasonality detection
275
+
276
+ Returns
277
+ -------
278
+ list
279
+ List of detected seasonal periods
280
+ """
281
+ if not self.use_auto_seasonality or len(time_series_values) < 10:
282
+ return []
283
+
284
+ try:
285
+ # Remove NaN values
286
+ values = time_series_values[~np.isnan(time_series_values)]
287
+ if len(values) < 10:
288
+ return []
289
+
290
+ # Simple linear detrending
291
+ x = np.arange(len(values))
292
+ coeffs = np.polyfit(x, values, 1)
293
+ trend = np.polyval(coeffs, x)
294
+ detrended = values - trend
295
+
296
+ # Apply Hann window to reduce spectral leakage
297
+ window = np.hanning(len(detrended))
298
+ windowed = detrended * window
299
+
300
+ # Zero padding for better frequency resolution
301
+ padded_length = len(windowed) * 2
302
+ padded_values = np.zeros(padded_length)
303
+ padded_values[: len(windowed)] = windowed
304
+
305
+ # Compute FFT
306
+ fft_values = fft.rfft(padded_values)
307
+ fft_magnitudes = np.abs(fft_values)
308
+ freqs = np.fft.rfftfreq(padded_length)
309
+
310
+ # Exclude DC component
311
+ fft_magnitudes[0] = 0.0
312
+
313
+ # Find peaks with threshold (5% of max magnitude)
314
+ threshold = 0.05 * np.max(fft_magnitudes)
315
+ peak_indices, _ = find_peaks(fft_magnitudes, height=threshold)
316
+
317
+ if len(peak_indices) == 0:
318
+ return []
319
+
320
+ # Sort by magnitude and take top periods
321
+ sorted_indices = peak_indices[
322
+ np.argsort(fft_magnitudes[peak_indices])[::-1]
323
+ ]
324
+ top_indices = sorted_indices[: self.max_seasonal_periods]
325
+
326
+ # Convert frequencies to periods
327
+ periods = []
328
+ for idx in top_indices:
329
+ if freqs[idx] > 0:
330
+ period = 1.0 / freqs[idx]
331
+ # rfftfreq is in cycles per sample, so 1/freq is already the period in samples
332
+ period = round(period)
333
+ if 2 <= period <= len(values) // 2: # Reasonable period range
334
+ periods.append(period)
335
+
336
+ return list(set(periods)) # Remove duplicates
337
+
338
+ except Exception:
339
+ return []
340
+
341
+ def _compute_seasonality_features(
342
+ self,
343
+ period_index: pd.PeriodIndex,
344
+ freq_str: str,
345
+ time_series_values: Optional[np.ndarray] = None,
346
+ ) -> np.ndarray:
347
+ """Compute seasonality-aware features."""
348
+ if not self.include_seasonality_info:
349
+ return np.array([]).reshape(len(period_index), 0)
350
+
351
+ all_seasonal_features = []
352
+
353
+ # Original frequency-based seasonality
354
+ try:
355
+ seasonality = get_seasonality(freq_str)
356
+ if seasonality > 1:
357
+ positions = np.arange(len(period_index))
358
+ sin_feat = np.sin(2 * np.pi * positions / seasonality)
359
+ cos_feat = np.cos(2 * np.pi * positions / seasonality)
360
+ all_seasonal_features.extend([sin_feat, cos_feat])
361
+ except Exception:
362
+ pass
363
+
364
+ # Automatic seasonality detection
365
+ if self.use_auto_seasonality and time_series_values is not None:
366
+ auto_periods = self._detect_auto_seasonality(time_series_values)
367
+ for period in auto_periods:
368
+ try:
369
+ positions = np.arange(len(period_index))
370
+ sin_feat = np.sin(2 * np.pi * positions / period)
371
+ cos_feat = np.cos(2 * np.pi * positions / period)
372
+ all_seasonal_features.extend([sin_feat, cos_feat])
373
+ except Exception:
374
+ continue
375
+
376
+ if all_seasonal_features:
377
+ return np.stack(all_seasonal_features, axis=-1)
378
+ else:
379
+ return np.array([]).reshape(len(period_index), 0)
380
+
381
+ def compute_features(
382
+ self,
383
+ period_index: pd.PeriodIndex,
384
+ date_range: pd.DatetimeIndex,
385
+ freq_str: str,
386
+ time_series_values: Optional[np.ndarray] = None,
387
+ ) -> np.ndarray:
388
+ """
389
+ Compute all time features for given period index.
390
+
391
+ Parameters
392
+ ----------
393
+ period_index : pd.PeriodIndex
394
+ Period index for computing features
395
+ date_range : pd.DatetimeIndex
396
+ Corresponding datetime index for holiday features
397
+ freq_str : str
398
+ Frequency string
399
+ time_series_values : np.ndarray, optional
400
+ Time series values for automatic seasonality detection
401
+
402
+ Returns
403
+ -------
404
+ np.ndarray
405
+ Time features array of shape [time_steps, num_features]
406
+ """
407
+ all_features = []
408
+
409
+ # Standard GluonTS features
410
+ try:
411
+ standard_features = time_features_from_frequency_str(freq_str)
412
+ if standard_features:
413
+ std_feat = np.stack(
414
+ [feat(period_index) for feat in standard_features], axis=-1
415
+ )
416
+ all_features.append(std_feat)
417
+ except Exception:
418
+ pass
419
+
420
+ # Enhanced features
421
+ enhanced_feat = self._compute_enhanced_features(period_index, freq_str)
422
+ if enhanced_feat.shape[1] > 0:
423
+ all_features.append(enhanced_feat)
424
+
425
+ # Holiday features
426
+ holiday_feat = self._compute_holiday_features(date_range)
427
+ if holiday_feat.shape[1] > 0:
428
+ all_features.append(holiday_feat)
429
+
430
+ # Seasonality features (including auto-detected)
431
+ seasonality_feat = self._compute_seasonality_features(
432
+ period_index, freq_str, time_series_values
433
+ )
434
+ if seasonality_feat.shape[1] > 0:
435
+ all_features.append(seasonality_feat)
436
+
437
+ if all_features:
438
+ combined_features = np.concatenate(all_features, axis=-1)
439
+ else:
440
+ combined_features = np.zeros((len(period_index), 1))
441
+
442
+ return combined_features
443
+
444
+
445
+ def compute_batch_time_features(
446
+ start: List[np.datetime64],
447
+ history_length: int,
448
+ future_length: int,
449
+ batch_size: int,
450
+ frequency: List[Frequency],
451
+ K_max: int = 6,
452
+ time_feature_config: Optional[Dict[str, Any]] = None,
453
+ ):
454
+ """
455
+ Compute time features from start timestamps and frequency.
456
+
457
+ Parameters
458
+ ----------
459
+ start : array-like, shape (batch_size,)
460
+ Start timestamps for each batch item.
461
+ history_length : int
462
+ Length of history sequence.
463
+ future_length : int
464
+ Length of target sequence.
465
+ batch_size : int
466
+ Batch size.
467
+ frequency : array-like, shape (batch_size,)
468
+ Frequency of the time series.
469
+ K_max : int, optional
470
+ Maximum number of time features to pad to (default: 6).
471
+ time_feature_config : dict, optional
472
+ Configuration for enhanced time features.
473
+
474
+ Returns
475
+ -------
476
+ tuple
477
+ (history_time_features, target_time_features) where each is a torch.Tensor
478
+ of shape (batch_size, length, K_max).
479
+ """
480
+ # Initialize enhanced feature generator
481
+ feature_config = time_feature_config or {}
482
+ feature_generator = TimeFeatureGenerator(**feature_config)
483
+
484
+ # Generate timestamps and features
485
+ history_features_list = []
486
+ future_features_list = []
487
+ total_length = history_length + future_length
488
+ for i in range(batch_size):
489
+ frequency_i = frequency[i]
490
+ freq_str = frequency_i.to_pandas_freq(for_date_range=True)
491
+ period_freq_str = frequency_i.to_pandas_freq(for_date_range=False)
492
+
493
+ # Validate start timestamp is within safe bounds
494
+ start_ts = pd.Timestamp(start[i])
495
+ if not validate_frequency_safety(start_ts, total_length, frequency_i):
496
+ logger.debug(
497
+ f"Start date {start_ts} not safe for total_length={total_length}, frequency={frequency_i}. "
498
+ f"Using BASE_START_DATE instead."
499
+ )
500
+ start_ts = BASE_START_DATE
501
+
502
+ # Create history range with bounds checking
503
+ history_range = pd.date_range(
504
+ start=start_ts, periods=history_length, freq=freq_str
505
+ )
506
+
507
+ # Check if history range goes beyond safe bounds
508
+ if history_range[-1] > BASE_END_DATE:
509
+ safe_start = BASE_END_DATE - pd.tseries.frequencies.to_offset(freq_str) * (
510
+ history_length + future_length
511
+ )
512
+ if safe_start < BASE_START_DATE:
513
+ safe_start = BASE_START_DATE
514
+ history_range = pd.date_range(
515
+ start=safe_start, periods=history_length, freq=freq_str
516
+ )
517
+
518
+ future_start = history_range[-1] + pd.tseries.frequencies.to_offset(freq_str)
519
+ future_range = pd.date_range(
520
+ start=future_start, periods=future_length, freq=freq_str
521
+ )
522
+
523
+ # Convert to period indices
524
+ history_period_idx = history_range.to_period(period_freq_str)
525
+ future_period_idx = future_range.to_period(period_freq_str)
526
+
527
+ # Compute enhanced features
528
+ history_features = feature_generator.compute_features(
529
+ history_period_idx, history_range, freq_str
530
+ )
531
+ future_features = feature_generator.compute_features(
532
+ future_period_idx, future_range, freq_str
533
+ )
534
+
535
+ # Pad or truncate to K_max
536
+ history_features = _pad_or_truncate_features(history_features, K_max)
537
+ future_features = _pad_or_truncate_features(future_features, K_max)
538
+
539
+ history_features_list.append(history_features)
540
+ future_features_list.append(future_features)
541
+
542
+ # Stack into batch tensors
543
+ history_time_features = np.stack(history_features_list, axis=0)
544
+ future_time_features = np.stack(future_features_list, axis=0)
545
+
546
+ return (
547
+ torch.from_numpy(history_time_features).float().to(device),
548
+ torch.from_numpy(future_time_features).float().to(device),
549
+ )
550
+
551
+
552
+ def _pad_or_truncate_features(features: np.ndarray, K_max: int) -> np.ndarray:
553
+ """Pad with zeros or truncate features to K_max dimensions."""
554
+ seq_len, num_features = features.shape
555
+
556
+ if num_features < K_max:
557
+ # Pad with zeros
558
+ padding = np.zeros((seq_len, K_max - num_features))
559
+ features = np.concatenate([features, padding], axis=-1)
560
+ elif num_features > K_max:
561
+ # Truncate to K_max (keep most important features first)
562
+ features = features[:, :K_max]
563
+
564
+ return features
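A minimal usage sketch for the generator above; holiday features are switched off only to keep the example light, and the dates are illustrative:

import pandas as pd

from src.data.time_features import TimeFeatureGenerator

gen = TimeFeatureGenerator(use_holiday_features=False)
rng = pd.date_range("2024-01-01", periods=14, freq="D")

# Shape is [14, num_features]; compute_batch_time_features then pads or
# truncates the feature axis to K_max.
feats = gen.compute_features(rng.to_period("D"), rng, "D")
print(feats.shape)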
src/data/utils.py ADDED
@@ -0,0 +1,75 @@
1
+ import random
2
+ from typing import Optional, Tuple, Union
3
+
4
+
5
+ def sample_future_length(
6
+ range: Union[Tuple[int, int], str] = "gift_eval",
7
+ total_length: Optional[int] = None,
8
+ ) -> int:
9
+ """
10
+ Sample a forecast length.
11
+
12
+ - If `range` is a tuple, uniformly sample in [min, max]. When `total_length` is
13
+ provided, enforce a cap so the result is at most floor(0.45 * total_length).
14
+ - If `range` is "gift_eval", sample from a pre-defined weighted set. When
15
+ `total_length` is provided, filter out candidates greater than
16
+ floor(0.45 * total_length) before sampling.
17
+ """
18
+ # Compute the cap when total_length is provided
19
+ cap: Optional[int] = None
20
+ if total_length is not None:
21
+ cap = max(1, int(0.45 * int(total_length)))
22
+
23
+ if isinstance(range, tuple):
24
+ min_len, max_len = range
25
+ if cap is not None:
26
+ effective_max_len = min(max_len, cap)
27
+ # Ensure valid bounds
28
+ if min_len > effective_max_len:
29
+ return effective_max_len
30
+ return random.randint(min_len, effective_max_len)
31
+ return random.randint(min_len, max_len)
32
+ elif range == "gift_eval":
33
+ # Gift eval forecast lengths with their frequencies
34
+ GIFT_EVAL_FORECAST_LENGTHS = {
35
+ 48: 5,
36
+ 720: 38,
37
+ 480: 38,
38
+ 30: 3,
39
+ 300: 16,
40
+ 8: 2,
41
+ 120: 3,
42
+ 450: 8,
43
+ 80: 8,
44
+ 12: 2,
45
+ 900: 10,
46
+ 180: 3,
47
+ 600: 10,
48
+ 60: 3,
49
+ 210: 3,
50
+ 195: 3,
51
+ 140: 3,
52
+ 130: 3,
53
+ 14: 1,
54
+ 18: 1,
55
+ 13: 1,
56
+ 6: 1,
57
+ }
58
+
59
+ lengths = list(GIFT_EVAL_FORECAST_LENGTHS.keys())
60
+ weights = list(GIFT_EVAL_FORECAST_LENGTHS.values())
61
+
62
+ if cap is not None:
63
+ filtered = [
64
+ (length_candidate, weight)
65
+ for length_candidate, weight in zip(lengths, weights)
66
+ if length_candidate <= cap
67
+ ]
68
+ if filtered:
69
+ lengths, weights = zip(*filtered)
70
+ lengths = list(lengths)
71
+ weights = list(weights)
72
+
73
+ return random.choices(lengths, weights=weights)[0]
74
+ else:
75
+ raise ValueError(f"Invalid range: {range}")
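For example:

from src.data.utils import sample_future_length

# Uniform in [24, 96], but capped at floor(0.45 * 128) = 57.
h = sample_future_length(range=(24, 96), total_length=128)
assert 24 <= h <= 57

# Weighted draw from the GIFT-Eval horizon distribution above.
h = sample_future_length(range="gift_eval", total_length=2000)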
src/gift_eval/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ """Public API for the GIFT-Eval utilities."""
2
+
3
+ from .core import DatasetMetadata, EvaluationItem, expand_datasets_arg
4
+ from .predictor import TimeSeriesPredictor
5
+ from .results import aggregate_results, get_all_datasets_full_name, write_results_to_disk
6
+
7
+ __all__ = [
8
+ "DatasetMetadata",
9
+ "EvaluationItem",
10
+ "TimeSeriesPredictor",
11
+ "aggregate_results",
12
+ "expand_datasets_arg",
13
+ "get_all_datasets_full_name",
14
+ "write_results_to_disk",
15
+ ]
src/gift_eval/constants.py ADDED
@@ -0,0 +1,186 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ from pathlib import Path
5
+
6
+ from gluonts.ev.metrics import (
7
+ MAE,
8
+ MAPE,
9
+ MASE,
10
+ MSE,
11
+ MSIS,
12
+ ND,
13
+ NRMSE,
14
+ RMSE,
15
+ SMAPE,
16
+ MeanWeightedSumQuantileLoss,
17
+ )
18
+
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ # Environment setup
24
+ os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
25
+
26
+
27
+ # Use absolute path relative to the project root
28
+ _MODULE_DIR = Path(__file__).parent.parent.parent # Goes to project root
29
+ DATASET_PROPERTIES_PATH = _MODULE_DIR / "data" / "dataset_properties.json"
30
+
31
+
32
+ try:
33
+ with open(DATASET_PROPERTIES_PATH, "r") as f:
34
+ DATASET_PROPERTIES = json.load(f)
35
+ except Exception as exc: # pragma: no cover - logging path
36
+ DATASET_PROPERTIES = {}
37
+ logger.warning(
38
+ "Could not load dataset properties from %s: %s. Domain and num_variates will fall back to defaults.",
39
+ DATASET_PROPERTIES_PATH,
40
+ exc,
41
+ )
42
+
43
+
44
+ # Datasets
45
+ SHORT_DATASETS = (
46
+ "m4_yearly",
47
+ "m4_quarterly",
48
+ "m4_monthly",
49
+ "m4_weekly",
50
+ "m4_daily",
51
+ "m4_hourly",
52
+ "electricity/15T",
53
+ "electricity/H",
54
+ "electricity/D",
55
+ "electricity/W",
56
+ "solar/10T",
57
+ "solar/H",
58
+ "solar/D",
59
+ "solar/W",
60
+ "hospital",
61
+ "covid_deaths",
62
+ "us_births/D",
63
+ "us_births/M",
64
+ "us_births/W",
65
+ "saugeenday/D",
66
+ "saugeenday/M",
67
+ "saugeenday/W",
68
+ "temperature_rain_with_missing",
69
+ "kdd_cup_2018_with_missing/H",
70
+ "kdd_cup_2018_with_missing/D",
71
+ "car_parts_with_missing",
72
+ "restaurant",
73
+ "hierarchical_sales/D",
74
+ "hierarchical_sales/W",
75
+ "LOOP_SEATTLE/5T",
76
+ "LOOP_SEATTLE/H",
77
+ "LOOP_SEATTLE/D",
78
+ "SZ_TAXI/15T",
79
+ "SZ_TAXI/H",
80
+ "M_DENSE/H",
81
+ "M_DENSE/D",
82
+ "ett1/15T",
83
+ "ett1/H",
84
+ "ett1/D",
85
+ "ett1/W",
86
+ "ett2/15T",
87
+ "ett2/H",
88
+ "ett2/D",
89
+ "ett2/W",
90
+ "jena_weather/10T",
91
+ "jena_weather/H",
92
+ "jena_weather/D",
93
+ "bitbrains_fast_storage/5T",
94
+ "bitbrains_fast_storage/H",
95
+ "bitbrains_rnd/5T",
96
+ "bitbrains_rnd/H",
97
+ "bizitobs_application",
98
+ "bizitobs_service",
99
+ "bizitobs_l2c/5T",
100
+ "bizitobs_l2c/H",
101
+ )
102
+
103
+ MED_LONG_DATASETS = (
104
+ "electricity/15T",
105
+ "electricity/H",
106
+ "solar/10T",
107
+ "solar/H",
108
+ "kdd_cup_2018_with_missing/H",
109
+ "LOOP_SEATTLE/5T",
110
+ "LOOP_SEATTLE/H",
111
+ "SZ_TAXI/15T",
112
+ "M_DENSE/H",
113
+ "ett1/15T",
114
+ "ett1/H",
115
+ "ett2/15T",
116
+ "ett2/H",
117
+ "jena_weather/10T",
118
+ "jena_weather/H",
119
+ "bitbrains_fast_storage/5T",
120
+ "bitbrains_rnd/5T",
121
+ "bizitobs_application",
122
+ "bizitobs_service",
123
+ "bizitobs_l2c/5T",
124
+ "bizitobs_l2c/H",
125
+ )
126
+
127
+ # Preserve insertion order from SHORT_DATASETS followed by MED_LONG_DATASETS
128
+ ALL_DATASETS = list(dict.fromkeys(SHORT_DATASETS + MED_LONG_DATASETS))
129
+
130
+
131
+ # Evaluation terms
132
+ TERMS = ("short", "medium", "long")
133
+
134
+
135
+ # Pretty names mapping (following GIFT eval standard)
136
+ PRETTY_NAMES = {
137
+ "saugeenday": "saugeen",
138
+ "temperature_rain_with_missing": "temperature_rain",
139
+ "kdd_cup_2018_with_missing": "kdd_cup_2018",
140
+ "car_parts_with_missing": "car_parts",
141
+ }
142
+
143
+
144
+ METRICS = (
145
+ MSE(forecast_type="mean"),
146
+ MSE(forecast_type=0.5),
147
+ MAE(),
148
+ MASE(),
149
+ MAPE(),
150
+ SMAPE(),
151
+ MSIS(),
152
+ RMSE(),
153
+ NRMSE(),
154
+ ND(),
155
+ MeanWeightedSumQuantileLoss(
156
+ quantile_levels=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
157
+ ),
158
+ )
159
+
160
+
161
+ STANDARD_METRIC_NAMES = (
162
+ "MSE[mean]",
163
+ "MSE[0.5]",
164
+ "MAE[0.5]",
165
+ "MASE[0.5]",
166
+ "MAPE[0.5]",
167
+ "sMAPE[0.5]",
168
+ "MSIS",
169
+ "RMSE[mean]",
170
+ "NRMSE[mean]",
171
+ "ND[0.5]",
172
+ "mean_weighted_sum_quantile_loss",
173
+ )
174
+
175
+
176
+ __all__ = [
177
+ "ALL_DATASETS",
178
+ "DATASET_PROPERTIES",
179
+ "DATASET_PROPERTIES_PATH",
180
+ "MED_LONG_DATASETS",
181
+ "METRICS",
182
+ "PRETTY_NAMES",
183
+ "SHORT_DATASETS",
184
+ "STANDARD_METRIC_NAMES",
185
+ "TERMS",
186
+ ]
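Taken together, the lists above define the evaluation grid: every dataset gets a short term, and only MED_LONG_DATASETS additionally get medium and long terms, which yields the benchmark's 97 (dataset, term) configurations:

from src.gift_eval.constants import ALL_DATASETS, MED_LONG_DATASETS

n_configs = len(ALL_DATASETS) + 2 * len(MED_LONG_DATASETS)
print(len(ALL_DATASETS), n_configs)  # 55 datasets, 97 configurations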
src/gift_eval/core.py ADDED
@@ -0,0 +1,64 @@
1
+ """Core data structures and helpers shared across GIFT-Eval modules."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Dict, List, Tuple, Union
5
+
6
+ from src.gift_eval.constants import ALL_DATASETS
7
+
8
+
9
+ @dataclass
10
+ class DatasetMetadata:
11
+ """Structured description of a dataset/term combination."""
12
+
13
+ full_name: str
14
+ key: str
15
+ freq: str
16
+ term: str
17
+ season_length: int
18
+ target_dim: int
19
+ to_univariate: bool
20
+ prediction_length: int
21
+ windows: int
22
+
23
+
24
+ @dataclass
25
+ class EvaluationItem:
26
+ """Container for evaluation results and optional figures."""
27
+
28
+ dataset_metadata: DatasetMetadata
29
+ metrics: Dict
30
+ figures: List[Tuple[object, str]]
31
+
32
+
33
+ DatasetSelection = Union[List[str], Tuple[str, ...], str]
34
+
35
+
36
+ def expand_datasets_arg(datasets: DatasetSelection) -> List[str]:
37
+ """Normalize dataset selection strings to explicit lists."""
38
+
39
+ if isinstance(datasets, str):
40
+ dataset_list = [datasets]
41
+ else:
42
+ dataset_list = list(datasets)
43
+
44
+ if not dataset_list:
45
+ return []
46
+
47
+ if dataset_list[0] == "all":
48
+ return list(ALL_DATASETS)
49
+
50
+ for dataset in dataset_list:
51
+ if dataset not in ALL_DATASETS:
52
+ raise ValueError(f"Invalid dataset: {dataset}. Use one of {ALL_DATASETS}")
53
+
54
+ return dataset_list
55
+
56
+
57
+ __all__ = [
58
+ "DatasetMetadata",
59
+ "EvaluationItem",
60
+ "DatasetSelection",
61
+ "expand_datasets_arg",
62
+ ]
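For example:

from src.gift_eval.core import expand_datasets_arg

print(expand_datasets_arg("m4_daily"))  # ["m4_daily"]
print(len(expand_datasets_arg("all")))  # size of ALL_DATASETS
# Unknown names raise ValueError instead of being silently dropped.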
63
+
64
+
src/gift_eval/data.py ADDED
@@ -0,0 +1,234 @@
1
+ # Copyright (c) 2023, Salesforce, Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import math
17
+ from collections.abc import Iterable, Iterator
18
+ from enum import Enum
19
+ from functools import cached_property
20
+ from pathlib import Path
21
+ from typing import Optional
22
+
23
+ import datasets
24
+ import pyarrow.compute as pc
25
+ from gluonts.dataset import DataEntry
26
+ from gluonts.dataset.common import ProcessDataEntry
27
+ from gluonts.dataset.split import TestData, TrainingDataset, split
28
+ from gluonts.itertools import Map
29
+ from gluonts.time_feature import norm_freq_str
30
+ from gluonts.transform import Transformation
31
+ from pandas.tseries.frequencies import to_offset
32
+ from toolz import compose
33
+
34
+ TEST_SPLIT = 0.1
35
+ MAX_WINDOW = 20
36
+
37
+ M4_PRED_LENGTH_MAP = {
38
+ "A": 6,
39
+ "Q": 8,
40
+ "M": 18,
41
+ "W": 13,
42
+ "D": 14,
43
+ "H": 48,
44
+ "h": 48,
45
+ "Y": 6,
46
+ }
47
+
48
+ PRED_LENGTH_MAP = {
49
+ "M": 12,
50
+ "W": 8,
51
+ "D": 30,
52
+ "H": 48,
53
+ "h": 48,
54
+ "T": 48,
55
+ "S": 60,
56
+ "s": 60,
57
+ "min": 48,
58
+ }
59
+
60
+ TFB_PRED_LENGTH_MAP = {
61
+ "A": 6,
62
+ "Y": 6,
63
+ "H": 48,
64
+ "h": 48,
65
+ "Q": 8,
66
+ "D": 14,
67
+ "M": 18,
68
+ "W": 13,
69
+ "U": 8,
70
+ "T": 8,
71
+ "min": 8,
72
+ "us": 8,
73
+ }
74
+
75
+
76
+ class Term(Enum):
77
+ SHORT = "short"
78
+ MEDIUM = "medium"
79
+ LONG = "long"
80
+
81
+ @property
82
+ def multiplier(self) -> int:
83
+ if self == Term.SHORT:
84
+ return 1
85
+ elif self == Term.MEDIUM:
86
+ return 10
87
+ elif self == Term.LONG:
88
+ return 15
89
+
90
+
91
+ def itemize_start(data_entry: DataEntry) -> DataEntry:
92
+ data_entry["start"] = data_entry["start"].item()
93
+ return data_entry
94
+
95
+
96
+ class MultivariateToUnivariate(Transformation):
97
+ def __init__(self, field):
98
+ self.field = field
99
+
100
+ def __call__(
101
+ self, data_it: Iterable[DataEntry], is_train: bool = False
102
+ ) -> Iterator:
103
+ for data_entry in data_it:
104
+ item_id = data_entry["item_id"]
105
+ val_ls = list(data_entry[self.field])
106
+ for dim, val in enumerate(val_ls):
107
+ univariate_entry = data_entry.copy()
108
+ univariate_entry[self.field] = val
109
+ univariate_entry["item_id"] = item_id + "_dim" + str(dim)
110
+ yield univariate_entry
111
+
112
+
113
+ class Dataset:
114
+ def __init__(
115
+ self,
116
+ name: str,
117
+ term: Term | str = Term.SHORT,
118
+ to_univariate: bool = False,
119
+ storage_path: Optional[str] = None,
120
+ max_windows: Optional[int] = None,
121
+ ):
122
+ storage_path = Path(storage_path)
123
+ self.hf_dataset = datasets.load_from_disk(str(storage_path / name)).with_format(
124
+ "numpy"
125
+ )
126
+ process = ProcessDataEntry(
127
+ self.freq,
128
+ one_dim_target=self.target_dim == 1,
129
+ )
130
+
131
+ self.gluonts_dataset = Map(compose(process, itemize_start), self.hf_dataset)
132
+ if to_univariate:
133
+ self.gluonts_dataset = MultivariateToUnivariate("target").apply(
134
+ self.gluonts_dataset
135
+ )
136
+
137
+ self.term = Term(term)
138
+ self.name = name
139
+ self.max_windows = max_windows if max_windows is not None else MAX_WINDOW
140
+
141
+ @cached_property
142
+ def prediction_length(self) -> int:
143
+ freq = norm_freq_str(to_offset(self.freq).name)
144
+ if freq.endswith("E"):
145
+ freq = freq[:-1]
146
+ pred_len = (
147
+ M4_PRED_LENGTH_MAP[freq] if "m4" in self.name else PRED_LENGTH_MAP[freq]
148
+ )
149
+ return self.term.multiplier * pred_len
150
+
151
+ @cached_property
152
+ def freq(self) -> str:
153
+ return self.hf_dataset[0]["freq"]
154
+
155
+ @cached_property
156
+ def target_dim(self) -> int:
157
+ return (
158
+ target.shape[0]
159
+ if len((target := self.hf_dataset[0]["target"]).shape) > 1
160
+ else 1
161
+ )
162
+
163
+ @cached_property
164
+ def past_feat_dynamic_real_dim(self) -> int:
165
+ if "past_feat_dynamic_real" not in self.hf_dataset[0]:
166
+ return 0
167
+ elif (
168
+ len(
169
+ (
170
+ past_feat_dynamic_real := self.hf_dataset[0][
171
+ "past_feat_dynamic_real"
172
+ ]
173
+ ).shape
174
+ )
175
+ > 1
176
+ ):
177
+ return past_feat_dynamic_real.shape[0]
178
+ else:
179
+ return 1
180
+
181
+ @cached_property
182
+ def windows(self) -> int:
183
+ if "m4" in self.name:
184
+ return 1
185
+ w = math.ceil(TEST_SPLIT * self._min_series_length / self.prediction_length)
186
+ return min(max(1, w), self.max_windows)
187
+
188
+ @cached_property
189
+ def _min_series_length(self) -> int:
190
+ if self.hf_dataset[0]["target"].ndim > 1:
191
+ lengths = pc.list_value_length(
192
+ pc.list_flatten(
193
+ pc.list_slice(self.hf_dataset.data.column("target"), 0, 1)
194
+ )
195
+ )
196
+ else:
197
+ lengths = pc.list_value_length(self.hf_dataset.data.column("target"))
198
+ return min(lengths.to_numpy())
199
+
200
+ @cached_property
201
+ def sum_series_length(self) -> int:
202
+ if self.hf_dataset[0]["target"].ndim > 1:
203
+ lengths = pc.list_value_length(
204
+ pc.list_flatten(self.hf_dataset.data.column("target"))
205
+ )
206
+ else:
207
+ lengths = pc.list_value_length(self.hf_dataset.data.column("target"))
208
+ return sum(lengths.to_numpy())
209
+
210
+ @property
211
+ def training_dataset(self) -> TrainingDataset:
212
+ training_dataset, _ = split(
213
+ self.gluonts_dataset, offset=-self.prediction_length * (self.windows + 1)
214
+ )
215
+ return training_dataset
216
+
217
+ @property
218
+ def validation_dataset(self) -> TrainingDataset:
219
+ validation_dataset, _ = split(
220
+ self.gluonts_dataset, offset=-self.prediction_length * self.windows
221
+ )
222
+ return validation_dataset
223
+
224
+ @property
225
+ def test_data(self) -> TestData:
226
+ _, test_template = split(
227
+ self.gluonts_dataset, offset=-self.prediction_length * self.windows
228
+ )
229
+ test_data = test_template.generate_instances(
230
+ prediction_length=self.prediction_length,
231
+ windows=self.windows,
232
+ distance=self.prediction_length,
233
+ )
234
+ return test_data
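As a worked example of the horizon logic above: an hourly series has a base prediction length of 48, so the three terms map to 48, 480, and 720 steps.

from src.gift_eval.data import PRED_LENGTH_MAP, Term

base = PRED_LENGTH_MAP["h"]  # 48 for hourly data
for term in Term:
    print(term.value, term.multiplier * base)  # short 48, medium 480, long 720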
src/gift_eval/evaluate.py ADDED
@@ -0,0 +1,421 @@
1
+ import argparse
2
+ import logging
3
+ import warnings
4
+ from pathlib import Path
5
+ from typing import List, Optional, Tuple
6
+
7
+ import matplotlib
8
+ from gluonts.model.evaluation import evaluate_model
9
+ from gluonts.time_feature import get_seasonality
10
+ from linear_operator.utils.cholesky import NumericalWarning
11
+
12
+ from src.gift_eval.constants import (
13
+ DATASET_PROPERTIES,
14
+ MED_LONG_DATASETS,
15
+ METRICS,
16
+ PRETTY_NAMES,
17
+ )
18
+ from src.gift_eval.core import DatasetMetadata, EvaluationItem, expand_datasets_arg
19
+ from src.gift_eval.data import Dataset
20
+ from src.gift_eval.predictor import TimeSeriesPredictor
21
+ from src.gift_eval.results import write_results_to_disk
22
+ from src.plotting.gift_eval_utils import create_plots_for_dataset
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ # Warnings configuration
27
+ warnings.filterwarnings("ignore", category=NumericalWarning)
28
+ warnings.filterwarnings("ignore", category=FutureWarning)
29
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
30
+ matplotlib.set_loglevel("WARNING")
31
+ logging.getLogger("matplotlib").setLevel(logging.WARNING)
32
+ logging.getLogger("matplotlib.font_manager").setLevel(logging.WARNING)
33
+ logging.getLogger("PIL").setLevel(logging.WARNING)
34
+
35
+
36
+ class WarningFilter(logging.Filter):
37
+ def __init__(self, text_to_filter: str) -> None:
38
+ super().__init__()
39
+ self.text_to_filter = text_to_filter
40
+
41
+ def filter(self, record: logging.LogRecord) -> bool:
42
+ return self.text_to_filter not in record.getMessage()
43
+
44
+
45
+ # Filter out gluonts warnings about mean predictions
46
+ gts_logger = logging.getLogger("gluonts.model.forecast")
47
+ gts_logger.addFilter(
48
+ WarningFilter("The mean prediction is not stored in the forecast data")
49
+ )
50
+
51
+
52
+ def construct_evaluation_data(
53
+ dataset_name: str,
54
+ dataset_storage_path: str,
55
+ terms: List[str] = ["short", "medium", "long"],
56
+ max_windows: Optional[int] = None,
57
+ ) -> List[Tuple[Dataset, DatasetMetadata]]:
58
+ """Build datasets and rich metadata per term for a dataset name."""
59
+ sub_datasets: List[Tuple[Dataset, DatasetMetadata]] = []
60
+
61
+ if "/" in dataset_name:
62
+ ds_key, ds_freq = dataset_name.split("/")
63
+ ds_key = ds_key.lower()
64
+ ds_key = PRETTY_NAMES.get(ds_key, ds_key)
65
+ else:
66
+ ds_key = dataset_name.lower()
67
+ ds_key = PRETTY_NAMES.get(ds_key, ds_key)
68
+ ds_freq = DATASET_PROPERTIES.get(ds_key, {}).get("frequency")
69
+
70
+ for term in terms:
71
+ # Skip medium/long terms for datasets that don't support them
72
+ if (
73
+ term == "medium" or term == "long"
74
+ ) and dataset_name not in MED_LONG_DATASETS:
75
+ continue
76
+
77
+ # Probe once to determine dimensionality
78
+ probe_dataset = Dataset(
79
+ name=dataset_name,
80
+ term=term,
81
+ to_univariate=False,
82
+ storage_path=dataset_storage_path,
83
+ max_windows=max_windows,
84
+ )
85
+
86
+ to_univariate = probe_dataset.target_dim > 1
87
+
88
+ dataset = Dataset(
89
+ name=dataset_name,
90
+ term=term,
91
+ to_univariate=to_univariate,
92
+ storage_path=dataset_storage_path,
93
+ max_windows=max_windows,
94
+ )
95
+
96
+ # Compute metadata
97
+ season_length = get_seasonality(dataset.freq)
98
+ actual_freq = ds_freq if ds_freq else dataset.freq
99
+
100
+ metadata = DatasetMetadata(
101
+ full_name=f"{ds_key}/{actual_freq}/{term}",
102
+ key=ds_key,
103
+ freq=actual_freq,
104
+ term=term,
105
+ season_length=season_length,
106
+ target_dim=probe_dataset.target_dim,
107
+ to_univariate=to_univariate,
108
+ prediction_length=dataset.prediction_length,
109
+ windows=dataset.windows,
110
+ )
111
+
112
+ sub_datasets.append((dataset, metadata))
113
+
114
+ return sub_datasets
115
+
116
+
117
+ def evaluate_datasets(
118
+ predictor: TimeSeriesPredictor,
119
+ dataset: str,
120
+ dataset_storage_path: str,
121
+ terms: List[str] = ["short", "medium", "long"],
122
+ max_windows: Optional[int] = None,
123
+ batch_size: int = 48,
124
+ max_context_length: Optional[int] = 1024,
125
+ create_plots: bool = False,
126
+ max_plots_per_dataset: int = 10,
127
+ ) -> List[EvaluationItem]:
128
+ """Evaluate predictor on one dataset across the requested terms."""
129
+ sub_datasets = construct_evaluation_data(
130
+ dataset_name=dataset,
131
+ dataset_storage_path=dataset_storage_path,
132
+ terms=terms,
133
+ max_windows=max_windows,
134
+ )
135
+
136
+ results: List[EvaluationItem] = []
137
+ for i, (sub_dataset, metadata) in enumerate(sub_datasets):
138
+ logger.info(f"Evaluating {i + 1}/{len(sub_datasets)}: {metadata.full_name}")
139
+ logger.info(f" Dataset size: {len(sub_dataset.test_data)}")
140
+ logger.info(f" Frequency: {sub_dataset.freq}")
141
+ logger.info(f" Term: {metadata.term}")
142
+ logger.info(f" Prediction length: {sub_dataset.prediction_length}")
143
+ logger.info(f" Target dimensions: {sub_dataset.target_dim}")
144
+ logger.info(f" Windows: {sub_dataset.windows}")
145
+
146
+ # Update context on the reusable predictor
147
+ predictor.set_dataset_context(
148
+ prediction_length=sub_dataset.prediction_length,
149
+ freq=sub_dataset.freq,
150
+ batch_size=batch_size,
151
+ max_context_length=max_context_length,
152
+ )
153
+
154
+ res = evaluate_model(
155
+ model=predictor,
156
+ test_data=sub_dataset.test_data,
157
+ metrics=METRICS,
158
+ axis=None,
159
+ mask_invalid_label=True,
160
+ allow_nan_forecast=False,
161
+ seasonality=metadata.season_length,
162
+ )
163
+
164
+ figs: List[Tuple[object, str]] = []
165
+ if create_plots:
166
+ forecasts = predictor.predict(sub_dataset.test_data.input)
167
+ figs = create_plots_for_dataset(
168
+ forecasts=forecasts,
169
+ test_data=sub_dataset.test_data,
170
+ dataset_metadata=metadata,
171
+ max_plots=max_plots_per_dataset,
172
+ max_context_length=max_context_length,
173
+ )
174
+
175
+ results.append(
176
+ EvaluationItem(dataset_metadata=metadata, metrics=res, figures=figs)
177
+ )
178
+
179
+ return results
180
+
181
+
182
+ def _run_evaluation(
183
+ predictor: TimeSeriesPredictor,
184
+ datasets: List[str] | str,
185
+ terms: List[str],
186
+ dataset_storage_path: str,
187
+ max_windows: Optional[int] = None,
188
+ batch_size: int = 48,
189
+ max_context_length: Optional[int] = 1024,
190
+ output_dir: str = "gift_eval_results",
191
+ model_name: str = "TimeSeriesModel",
192
+ create_plots: bool = False,
193
+ max_plots: int = 10,
194
+ ) -> None:
195
+ """Shared evaluation workflow used by both entry points."""
196
+ datasets_to_run = expand_datasets_arg(datasets)
197
+ results_root = Path(output_dir)
198
+
199
+ for ds_name in datasets_to_run:
200
+ items = evaluate_datasets(
201
+ predictor=predictor,
202
+ dataset=ds_name,
203
+ dataset_storage_path=dataset_storage_path,
204
+ terms=terms,
205
+ max_windows=max_windows,
206
+ batch_size=batch_size,
207
+ max_context_length=max_context_length,
208
+ create_plots=create_plots,
209
+ max_plots_per_dataset=max_plots,
210
+ )
211
+ write_results_to_disk(
212
+ items=items,
213
+ dataset_name=ds_name,
214
+ output_dir=results_root,
215
+ model_name=model_name,
216
+ create_plots=create_plots,
217
+ )
218
+
219
+
220
+ def evaluate_from_paths(
221
+ model_path: str,
222
+ config_path: str,
223
+ datasets: List[str] | str,
224
+ terms: List[str],
225
+ dataset_storage_path: str,
226
+ max_windows: Optional[int] = None,
227
+ batch_size: int = 48,
228
+ max_context_length: Optional[int] = 1024,
229
+ output_dir: str = "gift_eval_results",
230
+ model_name: str = "TimeSeriesModel",
231
+ create_plots: bool = False,
232
+ max_plots: int = 10,
233
+ ) -> None:
234
+ """Entry point: load model from disk and save metrics/plots to disk."""
235
+ # Validate inputs early
236
+ if not Path(model_path).exists():
237
+ raise FileNotFoundError(f"Model path does not exist: {model_path}")
238
+ if not Path(config_path).exists():
239
+ raise FileNotFoundError(f"Config path does not exist: {config_path}")
240
+
241
+ predictor = TimeSeriesPredictor.from_paths(
242
+ model_path=model_path,
243
+ config_path=config_path,
244
+ ds_prediction_length=1, # placeholder; set per dataset below
245
+ ds_freq="D", # placeholder; set per dataset below
246
+ batch_size=batch_size,
247
+ max_context_length=max_context_length,
248
+ )
249
+
250
+ _run_evaluation(
251
+ predictor=predictor,
252
+ datasets=datasets,
253
+ terms=terms,
254
+ dataset_storage_path=dataset_storage_path,
255
+ max_windows=max_windows,
256
+ batch_size=batch_size,
257
+ max_context_length=max_context_length,
258
+ output_dir=output_dir,
259
+ model_name=model_name,
260
+ create_plots=create_plots,
261
+ max_plots=max_plots,
262
+ )
263
+
264
+
265
+ def evaluate_in_memory(
266
+ model,
267
+ config: dict,
268
+ datasets: List[str] | str,
269
+ terms: List[str],
270
+ dataset_storage_path: str,
271
+ max_windows: Optional[int] = None,
272
+ batch_size: int = 48,
273
+ max_context_length: Optional[int] = 1024,
274
+ output_dir: str = "gift_eval_results",
275
+ model_name: str = "TimeSeriesModel",
276
+ create_plots: bool = False,
277
+ max_plots: int = 10,
278
+ ) -> None:
279
+ """Entry point: evaluate in-memory model and return results per dataset."""
280
+ predictor = TimeSeriesPredictor.from_model(
281
+ model=model,
282
+ config=config,
283
+ ds_prediction_length=1, # placeholder; set per dataset below
284
+ ds_freq="D", # placeholder; set per dataset below
285
+ batch_size=batch_size,
286
+ max_context_length=max_context_length,
287
+ )
288
+
289
+ _run_evaluation(
290
+ predictor=predictor,
291
+ datasets=datasets,
292
+ terms=terms,
293
+ dataset_storage_path=dataset_storage_path,
294
+ max_windows=max_windows,
295
+ batch_size=batch_size,
296
+ max_context_length=max_context_length,
297
+ output_dir=output_dir,
298
+ model_name=model_name,
299
+ create_plots=create_plots,
300
+ max_plots=max_plots,
301
+ )
302
+
303
+
304
+ def _parse_args() -> argparse.Namespace:
305
+ parser = argparse.ArgumentParser(
306
+ description="Evaluate TimeSeriesModel on GIFT-Eval datasets"
307
+ )
308
+
309
+ # Model configuration
310
+ parser.add_argument(
311
+ "--model_path",
312
+ type=str,
313
+ required=True,
314
+ help="Path to the trained model checkpoint",
315
+ )
316
+ parser.add_argument(
317
+ "--config_path",
318
+ type=str,
319
+ required=True,
320
+ help="Path to the model configuration YAML file",
321
+ )
322
+ parser.add_argument(
323
+ "--model_name",
324
+ type=str,
325
+ default="TimeSeriesModel",
326
+ help="Name identifier for the model",
327
+ )
328
+
329
+ # Dataset configuration
330
+ parser.add_argument(
331
+ "--datasets",
332
+ type=str,
333
+ default="all",
334
+ help="Comma-separated list of dataset names to evaluate (or 'all')",
335
+ )
336
+ parser.add_argument(
337
+ "--dataset_storage_path",
338
+ type=str,
339
+ default="/work/dlclarge2/moroshav-GiftEvalPretrain/gift_eval",
340
+ help="Path to the dataset storage directory (default: GIFT_EVAL)",
341
+ )
342
+ parser.add_argument(
343
+ "--terms",
344
+ type=str,
345
+ default="short,medium,long",
346
+ help="Comma-separated list of prediction terms to evaluate",
347
+ )
348
+ parser.add_argument(
349
+ "--max_windows",
350
+ type=int,
351
+ default=None,
352
+ help="Maximum number of windows to use for evaluation",
353
+ )
354
+
355
+ # Inference configuration
356
+ parser.add_argument(
357
+ "--batch_size", type=int, default=48, help="Batch size for model inference"
358
+ )
359
+ parser.add_argument(
360
+ "--max_context_length",
361
+ type=int,
362
+ default=1024,
363
+ help="Maximum context length to use (None for no limit)",
364
+ )
365
+
366
+ # Output configuration
367
+ parser.add_argument(
368
+ "--output_dir",
369
+ type=str,
370
+ default="gift_eval_results",
371
+ help="Directory to save evaluation results",
372
+ )
373
+
374
+ # Plotting configuration
375
+ parser.add_argument(
376
+ "--create_plots",
377
+ action="store_true",
378
+ help="Create and save plots for each evaluation window",
379
+ )
380
+ parser.add_argument(
381
+ "--max_plots_per_dataset",
382
+ type=int,
383
+ default=10,
384
+ help="Maximum number of plots to create per dataset term",
385
+ )
386
+
387
+ args = parser.parse_args()
388
+ args.terms = args.terms.split(",")
389
+ args.datasets = args.datasets.split(",")
390
+ return args
391
+
392
+
393
+ def _configure_logging() -> None:
394
+ logging.basicConfig(
395
+ level=logging.INFO,
396
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
397
+ )
398
+
399
+
400
+ if __name__ == "__main__":
401
+ _configure_logging()
402
+ args = _parse_args()
403
+ logger.info(f"Command Line Arguments: {vars(args)}")
404
+ try:
405
+ evaluate_from_paths(
406
+ model_path=args.model_path,
407
+ config_path=args.config_path,
408
+ datasets=args.datasets,
409
+ terms=args.terms,
410
+ dataset_storage_path=args.dataset_storage_path,
411
+ max_windows=args.max_windows,
412
+ batch_size=args.batch_size,
413
+ max_context_length=args.max_context_length,
414
+ output_dir=args.output_dir,
415
+ model_name=args.model_name,
416
+ create_plots=args.create_plots,
417
+ max_plots=args.max_plots_per_dataset,
418
+ )
419
+ except Exception as e:
420
+ logger.error(f"Evaluation failed: {str(e)}")
421
+ raise
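A minimal programmatic invocation of the entry point above, using the checkpoint and config shipped in this upload (the dataset storage path is a placeholder):

from src.gift_eval.evaluate import evaluate_from_paths

evaluate_from_paths(
    model_path="models/checkpoint_38M.pth",
    config_path="configs/example.yaml",
    datasets=["m4_daily", "electricity/H"],
    terms=["short"],
    dataset_storage_path="/path/to/gift_eval",  # placeholder
    output_dir="gift_eval_results",
    model_name="TempoPFN",
)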
src/gift_eval/predictor.py ADDED
@@ -0,0 +1,318 @@
1
+ """Predictor implementation wrapping the TimeSeriesModel for GIFT-Eval."""
2
+
3
+ import logging
4
+ from typing import Iterator, List, Optional
5
+
6
+ import numpy as np
7
+ import torch
8
+ import yaml
9
+ from gluonts.model.forecast import QuantileForecast
10
+ from gluonts.model.predictor import Predictor
11
+ from torch.nn.parallel import DistributedDataParallel as DDP
12
+
13
+ from src.data.containers import BatchTimeSeriesContainer
14
+ from src.data.frequency import parse_frequency
15
+ from src.data.scalers import RobustScaler
16
+ from src.models.model import TimeSeriesModel
17
+ from src.utils.utils import device
18
+
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ class TimeSeriesPredictor(Predictor):
24
+ """Unified predictor for TimeSeriesModel supporting flexible construction."""
25
+
26
+ def __init__(
27
+ self,
28
+ model: TimeSeriesModel,
29
+ config: dict,
30
+ ds_prediction_length: int,
31
+ ds_freq: str,
32
+ batch_size: int = 32,
33
+ max_context_length: Optional[int] = None,
34
+ debug: bool = False,
35
+ ) -> None:
36
+ # Dataset-specific context (can be updated per dataset/term)
37
+ self.ds_prediction_length = ds_prediction_length
38
+ self.ds_freq = ds_freq
39
+ self.batch_size = batch_size
40
+ self.max_context_length = max_context_length
41
+ self.debug = debug
42
+
43
+ # Persistent model/config (unwrap DDP if needed)
44
+ self.model = model.module if isinstance(model, DDP) else model
45
+ self.model.eval()
46
+ self.config = config
47
+
48
+ # Initialize scaler (using same type as model)
49
+ scaler_type = self.config.get("TimeSeriesModel", {}).get(
50
+ "scaler", "custom_robust"
51
+ )
52
+ epsilon = self.config.get("TimeSeriesModel", {}).get("epsilon", 1e-3)
53
+ if scaler_type == "custom_robust":
54
+ self.scaler = RobustScaler(epsilon=epsilon)
55
+ else:
56
+ raise ValueError(f"Unsupported scaler type: {scaler_type}")
57
+
58
+ def set_dataset_context(
59
+ self,
60
+ prediction_length: Optional[int] = None,
61
+ freq: Optional[str] = None,
62
+ batch_size: Optional[int] = None,
63
+ max_context_length: Optional[int] = None,
64
+ ) -> None:
65
+ """Update lightweight dataset-specific attributes without reloading the model."""
66
+
67
+ if prediction_length is not None:
68
+ self.ds_prediction_length = prediction_length
69
+ if freq is not None:
70
+ self.ds_freq = freq
71
+ if batch_size is not None:
72
+ self.batch_size = batch_size
73
+ if max_context_length is not None:
74
+ self.max_context_length = max_context_length
75
+
76
+ @classmethod
77
+ def from_model(
78
+ cls,
79
+ model: TimeSeriesModel,
80
+ config: dict,
81
+ ds_prediction_length: int,
82
+ ds_freq: str,
83
+ batch_size: int = 32,
84
+ max_context_length: Optional[int] = None,
85
+ debug: bool = False,
86
+ ) -> "TimeSeriesPredictor":
87
+ return cls(
88
+ model=model,
89
+ config=config,
90
+ ds_prediction_length=ds_prediction_length,
91
+ ds_freq=ds_freq,
92
+ batch_size=batch_size,
93
+ max_context_length=max_context_length,
94
+ debug=debug,
95
+ )
96
+
97
+ @classmethod
98
+ def from_paths(
99
+ cls,
100
+ model_path: str,
101
+ config_path: str,
102
+ ds_prediction_length: int,
103
+ ds_freq: str,
104
+ batch_size: int = 32,
105
+ max_context_length: Optional[int] = None,
106
+ debug: bool = False,
107
+ ) -> "TimeSeriesPredictor":
108
+ with open(config_path, "r") as f:
109
+ config = yaml.safe_load(f)
110
+ model = cls._load_model_from_path(config=config, model_path=model_path)
111
+ return cls(
112
+ model=model,
113
+ config=config,
114
+ ds_prediction_length=ds_prediction_length,
115
+ ds_freq=ds_freq,
116
+ batch_size=batch_size,
117
+ max_context_length=max_context_length,
118
+ debug=debug,
119
+ )
120
+
121
+ @staticmethod
122
+ def _load_model_from_path(config: dict, model_path: str) -> TimeSeriesModel:
123
+ try:
124
+ model = TimeSeriesModel(**config["TimeSeriesModel"]).to(device)
125
+ checkpoint = torch.load(model_path, map_location=device)
126
+ model.load_state_dict(checkpoint["model_state_dict"])
127
+ model.eval()
128
+ logger.info(f"Successfully loaded model from {model_path}")
129
+ return model
130
+ except Exception as exc: # pragma: no cover - logging path
131
+ logger.error(f"Failed to load model from {model_path}: {exc}")
132
+ raise
133
+
134
+ def predict(self, test_data_input) -> Iterator[QuantileForecast]:
135
+ """Generate forecasts for the test data."""
136
+
137
+ if hasattr(test_data_input, "__iter__") and not isinstance(test_data_input, list):
138
+ test_data_input = list(test_data_input)
139
+ logger.debug(f"Processing {len(test_data_input)} time series")
140
+
141
+ # Group series by their effective length (after optional truncation),
142
+ # then process each uniform-length group in sub-batches up to batch_size.
143
+ def _effective_length(entry) -> int:
144
+ target = entry["target"]
145
+ if target.ndim == 1:
146
+ seq_len = len(target)
147
+ else:
148
+ # target shape is [num_channels, seq_len]
149
+ seq_len = target.shape[1]
150
+ if self.max_context_length is not None:
151
+ seq_len = min(seq_len, self.max_context_length)
152
+ return seq_len
153
+
154
+ length_to_items: dict[int, List[tuple[int, object]]] = {}
155
+ for idx, entry in enumerate(test_data_input):
156
+ seq_len = _effective_length(entry)
157
+ length_to_items.setdefault(seq_len, []).append((idx, entry))
158
+
159
+ total = len(test_data_input)
160
+ ordered_results: List[Optional[QuantileForecast]] = [None] * total
161
+
162
+ for _, items in length_to_items.items():
163
+ for i in range(0, len(items), self.batch_size):
164
+ chunk = items[i : i + self.batch_size]
165
+ entries = [entry for (_orig_idx, entry) in chunk]
166
+ batch_forecasts = self._predict_batch(entries)
167
+ for forecast_idx, (orig_idx, _entry) in enumerate(chunk):
168
+ ordered_results[orig_idx] = batch_forecasts[forecast_idx]
169
+
170
+ return ordered_results # type: ignore[return-value]
171
+
172
+ def _predict_batch(self, test_data_batch: List) -> List[QuantileForecast]:
173
+ """Generate predictions for a batch of time series."""
174
+
175
+ logger.debug(f"Processing batch of size: {len(test_data_batch)}")
176
+
177
+ try:
178
+ batch_container = self._convert_to_batch_container(test_data_batch)
179
+
180
+ if isinstance(device, torch.device):
181
+ device_type = device.type
182
+ else:
183
+ device_type = "cuda" if "cuda" in str(device).lower() else "cpu"
184
+ enable_autocast = device_type == "cuda"
185
+
186
+ with torch.autocast(
187
+ device_type=device_type,
188
+ dtype=torch.bfloat16,
189
+ enabled=enable_autocast,
190
+ ):
191
+ with torch.no_grad():
192
+ model_output = self.model(batch_container, drop_enc_allow=False)
193
+
194
+ forecasts = self._convert_to_forecasts(
195
+ model_output, test_data_batch, batch_container
196
+ )
197
+
198
+ logger.debug(f"Generated {len(forecasts)} forecasts")
199
+ return forecasts
200
+ except Exception as exc: # pragma: no cover - logging path
201
+ logger.error(f"Error in batch prediction: {exc}")
202
+ raise
203
+
204
+ def _convert_to_batch_container(
205
+ self, test_data_batch: List
206
+ ) -> BatchTimeSeriesContainer:
207
+ """Convert gluonts test data to BatchTimeSeriesContainer."""
208
+
209
+ batch_size = len(test_data_batch)
210
+ history_values_list = []
211
+ start_dates = []
212
+ frequencies = []
213
+
214
+ for entry in test_data_batch:
215
+ target = entry["target"]
216
+
217
+ if target.ndim == 1:
218
+ target = target.reshape(-1, 1)
219
+ else:
220
+ target = target.T
221
+
222
+ if (
223
+ self.max_context_length is not None
224
+ and len(target) > self.max_context_length
225
+ ):
226
+ target = target[-self.max_context_length :]
227
+
228
+ history_values_list.append(target)
229
+ start_dates.append(entry["start"].to_timestamp().to_datetime64())
230
+ frequencies.append(parse_frequency(entry["freq"]))
231
+
232
+ history_values_np = np.stack(history_values_list, axis=0)
233
+ num_channels = history_values_np.shape[2]
234
+
235
+ history_values = torch.tensor(
236
+ history_values_np, dtype=torch.float32, device=device
237
+ )
238
+
239
+ future_values = torch.zeros(
240
+ (batch_size, self.ds_prediction_length, num_channels),
241
+ dtype=torch.float32,
242
+ device=device,
243
+ )
244
+
245
+ return BatchTimeSeriesContainer(
246
+ history_values=history_values,
247
+ future_values=future_values,
248
+ start=start_dates,
249
+ frequency=frequencies,
250
+ )
251
+
252
+ def _convert_to_forecasts(
253
+ self,
254
+ model_output: dict,
255
+ test_data_batch: List,
256
+ batch_container: BatchTimeSeriesContainer,
257
+ ) -> List[QuantileForecast]:
258
+ """Convert model predictions to QuantileForecast objects."""
259
+
260
+ predictions = model_output["result"]
261
+ scale_statistics = model_output["scale_statistics"]
262
+
263
+ if predictions.ndim == 4:
264
+ predictions_unscaled = self.scaler.inverse_scale(
265
+ predictions, scale_statistics
266
+ )
267
+ is_quantile = True
268
+ quantile_levels = self.model.quantiles
269
+ else:
270
+ predictions_unscaled = self.scaler.inverse_scale(
271
+ predictions, scale_statistics
272
+ )
273
+ is_quantile = False
274
+ quantile_levels = [0.5]
275
+
276
+ forecasts: List[QuantileForecast] = []
277
+ for idx, entry in enumerate(test_data_batch):
278
+ # Use the original (untruncated) target length so forecast
+ # timestamps stay correct when the history was clipped above.
+ target = entry["target"]
+ history_length = target.shape[-1] if target.ndim > 1 else len(target)
+ start_date = entry["start"]
+ forecast_start = start_date + history_length
281
+
282
+ if is_quantile:
283
+ pred_array = predictions_unscaled[idx].cpu().numpy()
284
+
285
+ if pred_array.shape[1] == 1:
286
+ pred_array = pred_array.squeeze(1)
287
+ forecast_arrays = pred_array.T
288
+ else:
289
+ forecast_arrays = pred_array.transpose(2, 0, 1)
290
+
291
+ forecast = QuantileForecast(
292
+ forecast_arrays=forecast_arrays,
293
+ forecast_keys=[str(q) for q in quantile_levels],
294
+ start_date=forecast_start,
295
+ )
296
+ else:
297
+ pred_array = predictions_unscaled[idx].cpu().numpy()
298
+
299
+ if pred_array.shape[1] == 1:
300
+ pred_array = pred_array.squeeze(1)
301
+ forecast_arrays = pred_array.reshape(1, -1)
302
+ else:
303
+ forecast_arrays = pred_array.reshape(1, *pred_array.shape)
304
+
305
+ forecast = QuantileForecast(
306
+ forecast_arrays=forecast_arrays,
307
+ forecast_keys=["0.5"],
308
+ start_date=forecast_start,
309
+ )
310
+
311
+ forecasts.append(forecast)
312
+
313
+ return forecasts
314
+
315
+
316
+ __all__ = ["TimeSeriesPredictor"]
317
+
318
+
src/gift_eval/results.py ADDED
@@ -0,0 +1,243 @@
1
+ """Utilities for persisting and aggregating GIFT-Eval results."""
2
+
3
+ import argparse
4
+ import csv
5
+ import glob
6
+ import logging
7
+ from pathlib import Path
8
+ from typing import List, Optional
9
+
10
+ import pandas as pd
11
+
12
+ from src.gift_eval.constants import (
13
+ ALL_DATASETS,
14
+ DATASET_PROPERTIES,
15
+ MED_LONG_DATASETS,
16
+ PRETTY_NAMES,
17
+ STANDARD_METRIC_NAMES,
18
+ )
19
+ from src.gift_eval.core import DatasetMetadata, EvaluationItem
20
+
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ def _ensure_results_csv(csv_file_path: Path) -> None:
26
+ if not csv_file_path.exists():
27
+ csv_file_path.parent.mkdir(parents=True, exist_ok=True)
28
+ with open(csv_file_path, "w", newline="") as csvfile:
29
+ writer = csv.writer(csvfile)
30
+ header = (
31
+ ["dataset", "model"]
32
+ + [f"eval_metrics/{name}" for name in STANDARD_METRIC_NAMES]
33
+ + ["domain", "num_variates"]
34
+ )
35
+ writer.writerow(header)
36
+
37
+
38
+ def write_results_to_disk(
39
+ items: List[EvaluationItem],
40
+ dataset_name: str,
41
+ output_dir: Path,
42
+ model_name: str,
43
+ create_plots: bool,
44
+ ) -> None:
45
+ output_dir = output_dir / dataset_name
46
+ output_dir.mkdir(parents=True, exist_ok=True)
47
+ output_csv_path = output_dir / "results.csv"
48
+ _ensure_results_csv(output_csv_path)
49
+
50
+ try:
51
+ import matplotlib.pyplot as plt # Local import to avoid unnecessary dependency at module import time
52
+ except ImportError: # pragma: no cover - guard for optional dependency
53
+ plt = None
54
+
55
+ with open(output_csv_path, "a", newline="") as csvfile:
56
+ writer = csv.writer(csvfile)
57
+ for item in items:
58
+ md: DatasetMetadata = item.dataset_metadata
59
+ metric_values: List[Optional[float]] = []
60
+ for metric_name in STANDARD_METRIC_NAMES:
61
+ value = item.metrics.get(metric_name, None)
62
+ if value is None:
63
+ metric_values.append(None)
64
+ else:
65
+ if (
66
+ hasattr(value, "__len__")
67
+ and not isinstance(value, (str, bytes))
68
+ and len(value) == 1
69
+ ):
70
+ value = value[0]
71
+ elif hasattr(value, "item"):
72
+ value = value.item()
73
+ metric_values.append(value)
74
+
75
+ ds_key = md.key.lower()
76
+ props = DATASET_PROPERTIES.get(ds_key, {})
77
+ domain = props.get("domain", "unknown")
78
+ num_variates = props.get(
79
+ "num_variates", 1 if md.to_univariate else md.target_dim
80
+ )
81
+
82
+ row = [md.full_name, model_name] + metric_values + [domain, num_variates]
83
+ writer.writerow(row)
84
+
85
+ if create_plots and item.figures and plt is not None:
86
+ plots_dir = output_dir / "plots" / md.key / md.term
87
+ plots_dir.mkdir(parents=True, exist_ok=True)
88
+ for fig, filename in item.figures:
89
+ filepath = plots_dir / filename
90
+ fig.savefig(filepath, dpi=300, bbox_inches="tight")
91
+ plt.close(fig)
92
+
93
+ logger.info(
94
+ "Evaluation complete for dataset '%s'. Results saved to %s",
95
+ dataset_name,
96
+ output_csv_path,
97
+ )
98
+ if create_plots:
99
+ logger.info("Plots saved under %s", output_dir / "plots")
100
+
101
+
102
+ def get_all_datasets_full_name() -> List[str]:
103
+ """Get all possible dataset full names for validation."""
104
+
105
+ terms = ["short", "medium", "long"]
106
+ datasets_full_names: List[str] = []
107
+
108
+ for name in ALL_DATASETS:
109
+ for term in terms:
110
+ if term in ["medium", "long"] and name not in MED_LONG_DATASETS:
111
+ continue
112
+
113
+ if "/" in name:
114
+ ds_key, ds_freq = name.split("/")
115
+ ds_key = ds_key.lower()
116
+ ds_key = PRETTY_NAMES.get(ds_key, ds_key)
117
+ else:
118
+ ds_key = name.lower()
119
+ ds_key = PRETTY_NAMES.get(ds_key, ds_key)
120
+ ds_freq = DATASET_PROPERTIES.get(ds_key, {}).get("frequency")
121
+
122
+ datasets_full_names.append(
123
+ f"{ds_key}/{ds_freq if ds_freq else 'unknown'}/{term}"
124
+ )
125
+
126
+ return datasets_full_names
127
+
128
+
129
+ def aggregate_results(result_root_dir: str | Path) -> pd.DataFrame | None:
130
+ """Aggregate results from multiple CSV files into a single dataframe."""
131
+
132
+ result_root = Path(result_root_dir)
133
+
134
+ logger.info("Aggregating results in: %s", result_root)
135
+
136
+ result_files = glob.glob(f"{result_root}/**/results.csv", recursive=True)
137
+
138
+ if not result_files:
139
+ logger.error("No result files found!")
140
+ return None
141
+
142
+ dataframes: List[pd.DataFrame] = []
143
+ for file in result_files:
144
+ try:
145
+ df = pd.read_csv(file)
146
+ if len(df) > 0:
147
+ dataframes.append(df)
148
+ else:
149
+ logger.warning("Empty file: %s", file)
150
+ except pd.errors.EmptyDataError:
151
+ logger.warning("Skipping empty file: %s", file)
152
+ except Exception as exc:
153
+ logger.error("Error reading %s: %s", file, exc)
154
+
155
+ if not dataframes:
156
+ logger.warning("No valid CSV files found to combine")
157
+ return None
158
+
159
+ combined_df = pd.concat(dataframes, ignore_index=True).sort_values("dataset")
160
+
161
+ if len(combined_df) != len(set(combined_df.dataset)):
162
+ duplicate_datasets = combined_df.dataset[
163
+ combined_df.dataset.duplicated()
164
+ ].tolist()
165
+ logger.warning("Warning: Duplicate datasets found: %s", duplicate_datasets)
166
+ combined_df = combined_df.drop_duplicates(subset=["dataset"], keep="first")
167
+ logger.info(
168
+ "Removed duplicates, %s unique datasets remaining", len(combined_df)
169
+ )
170
+
171
+ logger.info("Combined results: %s datasets", len(combined_df))
172
+
173
+ all_datasets_full_name = get_all_datasets_full_name()
174
+ completed_experiments = combined_df.dataset.tolist()
175
+
176
+ completed_experiments_clean = [
177
+ exp for exp in completed_experiments if exp in all_datasets_full_name
178
+ ]
179
+ missing_or_failed_experiments = [
180
+ exp for exp in all_datasets_full_name if exp not in completed_experiments_clean
181
+ ]
182
+
183
+ logger.info("=== EXPERIMENT SUMMARY ===")
184
+ logger.info("Total expected datasets: %s", len(all_datasets_full_name))
185
+ logger.info("Completed experiments: %s", len(completed_experiments_clean))
186
+ logger.info("Missing/failed experiments: %s", len(missing_or_failed_experiments))
187
+
188
+ logger.info("Completed experiments:")
189
+ for idx, exp in enumerate(completed_experiments_clean, start=1):
190
+ logger.info(" %3d: %s", idx, exp)
191
+
192
+ if missing_or_failed_experiments:
193
+ logger.info("Missing or failed experiments:")
194
+ for idx, exp in enumerate(missing_or_failed_experiments, start=1):
195
+ logger.info(" %3d: %s", idx, exp)
196
+
197
+ completion_rate = (
198
+ len(completed_experiments_clean) / len(all_datasets_full_name) * 100
199
+ if all_datasets_full_name
200
+ else 0.0
201
+ )
202
+ logger.info("Completion rate: %.1f%%", completion_rate)
203
+
204
+ output_file = result_root / "all_results.csv"
205
+ combined_df.to_csv(output_file, index=False)
206
+ logger.info("Combined results saved to: %s", output_file)
207
+
208
+ return combined_df
209
+
210
+
211
+ __all__ = [
212
+ "aggregate_results",
213
+ "get_all_datasets_full_name",
214
+ "write_results_to_disk",
215
+ ]
216
+
217
+
218
+ def main() -> None:
219
+ """CLI entry point for aggregating results from disk."""
220
+
221
+ parser = argparse.ArgumentParser(
222
+ description="Aggregate GIFT-Eval results from multiple CSV files"
223
+ )
224
+ parser.add_argument(
225
+ "--result_root_dir",
226
+ type=str,
227
+ required=True,
228
+ help="Root directory containing result subdirectories",
229
+ )
230
+
231
+ args = parser.parse_args()
232
+ result_root_dir = Path(args.result_root_dir)
233
+
234
+ logging.basicConfig(
235
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
236
+ )
237
+ logger.info("Searching in directory: %s", result_root_dir)
238
+
239
+ aggregate_results(result_root_dir=result_root_dir)
240
+
241
+
242
+ if __name__ == "__main__":
243
+ main()
src/models/__init__.py ADDED
File without changes
src/models/blocks.py ADDED
@@ -0,0 +1,62 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from src.models.gated_deltaproduct import GatedDeltaProductConfig
5
+ from src.models.gated_deltaproduct.modeling_gated_deltaproduct import (
6
+ GatedDeltaProductBlock,
7
+ )
8
+
9
+
10
+ class GatedDeltaProductEncoder(nn.Module):
11
+ """
12
+ GatedDeltaNet encoder using GatedDeltaProductBlock for sequence modeling.
13
+ """
14
+
15
+ def __init__(
16
+ self,
17
+ layer_idx: int,
18
+ token_embed_dim: int,
19
+ num_heads: int = 4,
20
+ attn_mode: str = "chunk",
21
+ expand_v: float = 1.0,
22
+ use_gate: bool = False,
23
+ use_short_conv: bool = True,
24
+ conv_size: int = 4,
25
+ hidden_ratio: float = 1.0,
26
+ allow_neg_eigval: bool = True,
27
+ use_forget_gate: bool = True,
28
+ num_householder: int = 1,
29
+ **kwargs,
30
+ ):
31
+ super().__init__()
32
+ config = GatedDeltaProductConfig(
33
+ attn_mode=attn_mode,
34
+ hidden_size=token_embed_dim,
35
+ expand_v=expand_v,
36
+ use_gate=use_gate,
37
+ use_short_conv=use_short_conv,
38
+ conv_size=conv_size,
39
+ head_dim=token_embed_dim // num_heads,
40
+ hidden_ratio=hidden_ratio,
41
+ num_heads=num_heads,
42
+ allow_neg_eigval=allow_neg_eigval,
43
+ use_forget_gate=use_forget_gate,
44
+ num_householder=num_householder,
45
+ )
46
+
47
+ self.encoder_layer = GatedDeltaProductBlock(layer_idx=layer_idx, config=config)
48
+
49
+ def forward(self, x, initial_state=None):
50
+ """
51
+ Forward pass through the GatedDeltaProductBlock.
52
+
53
+ Args:
54
+ x: Input tensor of shape [batch_size, seq_len, hidden_size]
+ initial_state: Optional initial recurrent state passed to the block
55
+
56
+ Returns:
57
+ Tuple of (output tensor with the same shape as the input,
+ final recurrent hidden state of the layer)
58
+ """
59
+ x, last_hidden_state, _ = self.encoder_layer(
60
+ x, output_attentions=True, initial_state=initial_state
61
+ )
62
+ return x, last_hidden_state
src/models/gated_deltaproduct/README.md ADDED
@@ -0,0 +1,344 @@
1
+ # Custom GatedDeltaProduct Implementation
2
+
3
+ This directory contains a custom implementation of the GatedDeltaProduct layer, based on the [Flash Linear Attention (FLA)](https://github.com/fla-org/flash-linear-attention) library, with modifications specifically designed for **time series forecasting** tasks.
4
+
5
+ ## Overview
6
+
7
+ Our custom implementation adds **hidden state weaving**, which lets information flow across encoder layers and preserves temporal continuity. This capability matters for time series forecasting but falls outside the general-purpose language modeling focus of the official FLA implementation.
8
+
9
+ ## Reference
10
+
11
+ This implementation is based on:
12
+ - **Official FLA Repository**: [https://github.com/fla-org/flash-linear-attention](https://github.com/fla-org/flash-linear-attention)
13
+ - **Original Paper**: [DeltaProduct: Improving State-Tracking in Linear RNNs via Householder Products](https://arxiv.org/html/2502.10297v3) (Siems et al., 2025)
14
+
15
+ ---
16
+
17
+ ## What is DeltaProduct?
18
+
19
+ DeltaProduct is a linear RNN architecture that uses **diagonal plus rank-nₕ** state-transition matrices, formed as products of `nₕ` generalized Householder transformations. This provides a tunable mechanism to balance expressivity and efficiency compared to diagonal-only architectures like Mamba or GLA.
20
+
21
+ ### Key Concepts
22
+
23
+ - **Householder transformations**: Enable simultaneous token-channel mixing, overcoming the expressivity limitations of purely diagonal state-transition matrices
24
+ - **Rank-nₕ structure**: Allows better expressivity than rank-1 (DeltaNet) while maintaining training efficiency. The parameter `nₕ` (number of Householder transformations) provides a tunable trade-off between expressivity and computational cost
25
+ - **Gated variant**: Adds gating mechanisms for improved performance, allowing the model to control information flow through forget gates and output gates
26
+
27
+ ### Architecture Overview
28
+
29
+ DeltaProduct improves upon earlier linear RNN architectures:
30
+
31
+ - **Diagonal architectures** (Mamba, GLA, mLSTM): Use diagonal state-transition matrices for fast runtime but suffer from limited expressivity
32
+ - **Rank-1 architectures** (DeltaNet, RWKV-7): Use diagonal plus rank-1 structure, enabling simultaneous token-channel mixing with only a slight decrease in training efficiency
33
+ - **DeltaProduct**: Extends this to diagonal plus rank-nₕ structure, where multiple Householder transformations (nₕ ≥ 1) provide greater expressivity while maintaining computational efficiency
34
+
35
+ The architecture interprets DeltaNet's recurrence as performing one step of online gradient descent per token on an associative recall loss. DeltaProduct instead takes multiple (`nₕ`) steps per token, naturally leading to the rank-nₕ structure.
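+
+ As a minimal illustration (a plain PyTorch sketch, not the fused Triton kernel used by this repository), a single DeltaProduct token update applies `nₕ` sequential delta-rule steps to the state matrix, each a generalized Householder update `S ← (I − β k kᵀ) S + β k vᵀ`. In this implementation `β = sigmoid(·)`, doubled to `β ∈ [0, 2]` when `allow_neg_eigval=True` (see `gated_deltaproduct.py`):
+
+ ```python
+ import torch
+
+ def deltaproduct_token_update(S, ks, vs, betas):
+     """One token update: n_h sequential generalized-Householder
+     (delta-rule) steps on the state matrix S of shape [d_k, d_v]."""
+     for k, v, beta in zip(ks, vs, betas):  # n_h sub-steps per token
+         k = k / k.norm()  # keys are L2-normalized (use_qk_l2norm_in_kernel=True)
+         # S <- (I - beta * k k^T) S + beta * k v^T
+         S = S - beta * torch.outer(k, k @ S) + beta * torch.outer(k, v)
+     return S
+ ```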
36
+
37
+ ---
38
+
39
+ ## State Weaving Mechanism
40
+
41
+ Unlike DeltaProduct's original design for autoregressive language modeling, time series forecasting across a full horizon does not require causal masking. To exploit this property, we introduce **state weaving**, a mechanism that enables bidirectional information flow across the entire sequence length without additional parameters or computational overhead.
42
+
43
+ <div align="center">
44
+ <img src="https://iili.io/Ks86Z0X.png" alt="State Weaving Architecture" width="450"/>
45
+ </div>
46
+
47
+ *Figure: The TempoPFN architecture using stacked GatedDeltaProduct blocks with learnable initial states H₀ⁱ and state-weaving. The final hidden state of each layer Hₜⁱ is added to the learnable initial state of the next layer H₀ⁱ⁺¹, enabling bidirectional information flow.*
48
+
49
+ ### How State Weaving Works
50
+
51
+ In our implementation, state weaving operates as follows:
52
+
53
+ 1. **Learnable Initial States**: Each encoder layer `i` has a learnable initial hidden state `H₀ⁱ` that is optimized during training.
54
+
55
+ 2. **State Propagation**: The final hidden state from layer `i`, denoted `Hₜⁱ`, is propagated forward and combined with the learnable initial state of the next layer:
56
+ ```
57
+ H₀ⁱ⁺¹ = H₀ⁱ⁺¹ + Hₜⁱ
58
+ ```
59
+
60
+ 3. **Bidirectional Information Flow**: This mechanism effectively lifts the causal constraint while maintaining computational efficiency. Information from later tokens can influence earlier layers through the accumulated hidden states, enabling the model to process the entire sequence (history + future horizon) coherently.
61
+
62
+ 4. **No Extra Overhead**: Unlike explicit bidirectional architectures, state weaving requires no additional parameters or computational overhead beyond the existing forward pass.
63
+
64
+ This design (see the sketch after the list below) is particularly powerful for time series forecasting, where:
65
+ - The full prediction horizon is known at inference time
66
+ - Coherent predictions across all future time steps are desired
67
+ - Historical context should inform all future predictions simultaneously
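+
+ A minimal sketch of the weaving loop, simplified from `TimeSeriesModel._generate_predictions` in `src/models/model.py` (here `h0[i]` stands for the learnable initial states built as in section 5 below; batch handling is elided):
+
+ ```python
+ import torch
+
+ def weave(x, encoder_layers, h0):
+     """Run stacked layers, adding each layer's carried-over final
+     state to its learnable initial state h0[i] (simplified sketch)."""
+     carried = torch.zeros_like(h0[0])  # first layer starts from h0[0] alone
+     for i, layer in enumerate(encoder_layers):
+         x, carried = layer(x, initial_state=carried + h0[i])
+     return x
+ ```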
68
+
69
+ ---
70
+
71
+ ## Key Differences from Official FLA
72
+
73
+ ### 1. **`initial_state` Parameter in Forward Method**
74
+
75
+ #### Official FLA (`fla/layers/gated_deltaproduct.py`)
76
+ ```python
77
+ def forward(
78
+ self,
79
+ hidden_states: torch.Tensor,
80
+ attention_mask: torch.Tensor | None = None,
81
+ past_key_values: Cache | None = None,
82
+ use_cache: bool | None = False,
83
+ output_attentions: bool | None = False,
84
+ **kwargs: Unpack[dict],
85
+ ) -> tuple[torch.Tensor, torch.Tensor | None, Cache | None]:
86
+ ```
87
+ **No `initial_state` parameter** - The official implementation only uses `recurrent_state` from `past_key_values`.
88
+
89
+ #### Our Custom Implementation (`gated_deltaproduct.py`)
90
+ ```python
91
+ def forward(
92
+ self,
93
+ hidden_states: torch.Tensor,
94
+ attention_mask: Optional[torch.Tensor] = None,
95
+ past_key_values: Optional[Cache] = None,
96
+ initial_state: Optional[torch.Tensor] = None, # ← ADDED
97
+ use_cache: Optional[bool] = False,
98
+ output_attentions: Optional[bool] = False,
99
+ **kwargs: Unpack[Dict],
100
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
101
+ ```
102
+ **Added `initial_state` parameter** - Allows external control of the initial recurrent state, enabling layer-to-layer state propagation.
103
+
104
+ ---
105
+
106
+ ### 2. **Usage of `initial_state` in Chunk Mode**
107
+
108
+ #### Official FLA
109
+ ```python
110
+ if mode == 'chunk':
111
+ o, recurrent_state = chunk_gated_delta_product(
112
+ q=q, k=k, v=v, g=g, beta=beta,
113
+ initial_state=recurrent_state, # ← Only from past_key_values
114
+ output_final_state=use_cache,
115
+ cu_seqlens=cu_seqlens,
116
+ num_householder=self.num_householder,
117
+ use_qk_l2norm_in_kernel=True,
118
+ )
119
+ ```
120
+
121
+ #### Our Custom Implementation
122
+ ```python
123
+ if mode == "chunk":
124
+ o, recurrent_state = chunk_gated_delta_product(
125
+ q=q, k=k, v=v, g=g, beta=beta,
126
+ initial_state=initial_state, # ← Uses external initial_state if provided
127
+ output_final_state=output_attentions,
128
+ cu_seqlens=cu_seqlens,
129
+ num_householder=self.num_householder,
130
+ use_qk_l2norm_in_kernel=True,
131
+ )
132
+ ```
133
+
134
+ **Key Difference**: Our implementation prioritizes the externally provided `initial_state` over `recurrent_state` from `past_key_values`, enabling layer-to-layer state propagation.
135
+
136
+ ---
137
+
138
+ ### 3. **Return Value: Hidden State Output**
139
+
140
+ #### Official FLA (`fla/models/gated_deltaproduct/modeling_gated_deltaproduct.py`)
141
+ ```python
142
+ def forward(
143
+ self,
144
+ hidden_states: torch.Tensor,
145
+ attention_mask: torch.Tensor | None = None,
146
+ past_key_values: Cache | list[torch.FloatTensor] | None = None,
147
+ use_cache: bool | None = False,
148
+ output_attentions: bool | None = False,
149
+ **kwargs: Unpack[dict],
150
+ ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
151
+ # ...
152
+ return outputs # Returns (hidden_states, attentions, past_key_values)
153
+ ```
154
+
155
+ **No `initial_state` parameter** - The block doesn't accept or return hidden states explicitly.
156
+
157
+ #### Our Custom Implementation (`modeling_gated_deltaproduct.py`)
158
+ ```python
159
+ def forward(
160
+ self,
161
+ hidden_states: torch.Tensor,
162
+ attention_mask: Optional[torch.Tensor] = None,
163
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
164
+ use_cache: Optional[bool] = False,
165
+ output_attentions: Optional[bool] = False,
166
+ initial_state: Optional[torch.FloatTensor] = None, # ← ADDED
167
+ **kwargs: Unpack[Dict],
168
+ ) -> Tuple[
169
+ torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
170
+ ]:
171
+ # ...
172
+ hidden_states, attentions, past_key_values = self.attn(
173
+ # ...
174
+ initial_state=initial_state, # ← Passed through
175
+ **kwargs,
176
+ )
177
+ # ...
178
+ return outputs # Returns (hidden_states, attentions, past_key_values)
179
+ ```
180
+
181
+ **Added `initial_state` parameter** - The block accepts and forwards `initial_state` to the attention layer.
182
+
183
+ ---
184
+
185
+ ### 4. **Hidden State Weaving Implementation**
186
+
187
+ Our implementation supports two modes of hidden state weaving (controlled by the `weaving` parameter in encoder config):
188
+
189
+ #### **Mode 1: Weaving Enabled (`weaving=True`)** - Default
190
+ ```python
191
+ if self.encoder_config.get("weaving", True):
192
+ # initial hidden state is learnable
193
+ hidden_state = torch.zeros_like(
194
+ self.initial_hidden_state[0].repeat(batch_size * num_channels, 1, 1, 1)
195
+ )
196
+ for layer_idx, encoder_layer in enumerate(self.encoder_layers):
197
+ x, hidden_state = encoder_layer(
198
+ x,
199
+ hidden_state + self.initial_hidden_state[layer_idx].repeat(
200
+ batch_size * num_channels, 1, 1, 1
201
+ ),
202
+ )
203
+ ```
204
+
205
+ **Key Features**:
206
+ - Hidden state accumulates across layers
207
+ - Each layer receives: `previous_hidden_state + learnable_initial_state[layer_idx]`
208
+ - State persists between layers, allowing information to flow through the network
209
+
210
+ #### **Mode 2: No Weaving (`weaving=False`)**
211
+ ```python
212
+ else:
213
+ # initial hidden state is separately learnable for each layer
214
+ for layer_idx, encoder_layer in enumerate(self.encoder_layers):
215
+ initial_hidden_state = self.initial_hidden_state[layer_idx].repeat(
216
+ batch_size * num_channels, 1, 1, 1
217
+ )
218
+ x, _ = encoder_layer(x, initial_hidden_state)
219
+ ```
220
+
221
+ **Key Features**:
222
+ - Each layer uses its own independent learnable initial state
223
+ - No accumulation between layers
224
+ - Hidden state is discarded after each layer
225
+
226
+ ---
227
+
228
+ ### 5. **Learnable Initial Hidden States**
229
+
230
+ Our implementation includes learnable initial states managed at the model level:
231
+
232
+ ```python
233
+ num_initial_hidden_states = self.num_encoder_layers
234
+ self.initial_hidden_state = nn.ParameterList(
235
+ [
236
+ nn.Parameter(
237
+ torch.randn(
238
+ 1, self.encoder_config["num_heads"], head_k_dim, head_v_dim
239
+ )
240
+ / head_k_dim,
241
+ requires_grad=True,
242
+ )
243
+ for _ in range(num_initial_hidden_states)
244
+ ]
245
+ )
246
+ ```
247
+
248
+ **Key Features**:
249
+ - One learnable parameter per encoder layer
250
+ - Shape: `[1, num_heads, head_k_dim, head_v_dim]`
251
+ - Initialized with small random values scaled by `head_k_dim`
252
+ - These are trainable parameters that can be optimized during training
253
+
254
+ ---
255
+
256
+ ### 6. **Parameter Name Differences**
257
+
258
+ - **Official FLA**: Uses `use_output_gate` parameter
259
+ - **Our Implementation**: Uses `use_gate` parameter (renamed for clarity)
260
+
261
+ ---
262
+
263
+ ### 7. **Return Value Differences**
264
+
265
+ #### Official FLA (`fla/layers/gated_deltaproduct.py`)
266
+ ```python
267
+ return o, None, past_key_values # Returns (output, None, past_key_values)
268
+ ```
269
+
270
+ #### Our Custom Implementation (`gated_deltaproduct.py`)
271
+ ```python
272
+ return o, recurrent_state, past_key_values # Returns (output, recurrent_state, past_key_values)
273
+ ```
274
+
275
+ **Key Difference**: Our implementation returns `recurrent_state` (the final hidden state) instead of `None`, enabling state propagation.
276
+
277
+ ---
278
+
279
+ ### 8. **Encoder Wrapper Return Values**
280
+
281
+ Our `GatedDeltaProductEncoder` (in `src/models/blocks.py`) returns both the output and hidden state:
282
+
283
+ ```python
284
+ x, last_hidden_state, _ = self.encoder_layer(
285
+ x, output_attentions=True, initial_state=initial_state
286
+ )
287
+ return x, last_hidden_state # ← Returns hidden state for weaving
288
+ ```
289
+
290
+ This allows state propagation between layers in the `TimeSeriesModel`.
291
+
292
+ ---
293
+
294
+ ## Summary Table
295
+
296
+ | Feature | Official FLA | Our Custom Implementation |
297
+ |---------|-------------|---------------------------|
298
+ | `initial_state` in `forward()` | ❌ No | ✅ Yes |
299
+ | `initial_state` in `GatedDeltaProductBlock.forward()` | ❌ No | ✅ Yes |
300
+ | Hidden state weaving | ❌ No | ✅ Yes (configurable) |
301
+ | Learnable initial states | ❌ No | ✅ Yes (`nn.ParameterList`) |
302
+ | Returns `recurrent_state` | ❌ No (returns `None`) | ✅ Yes |
303
+ | Layer-to-layer state propagation | ❌ No | ✅ Yes (when `weaving=True`) |
304
+ | Parameter name | `use_output_gate` | `use_gate` |
305
+
306
+ ---
307
+
308
+ ## Why These Differences Matter for Time Series Forecasting
309
+
310
+ 1. **Temporal Continuity**: Hidden state weaving allows information to flow across layers, maintaining temporal patterns across the encoder stack. This is crucial for time series where historical context matters.
311
+
312
+ 2. **Learnable Initialization**: Learnable initial states allow the model to learn optimal starting points for the recurrent computation, which can be crucial for capturing time series patterns.
313
+
314
+ 3. **Flexible State Management**: The `weaving` parameter allows switching between:
315
+ - **Weaving mode**: Better for capturing long-term dependencies across layers
316
+ - **Independent mode**: Each layer processes independently, potentially more stable
317
+
318
+ 4. **State Propagation**: Returning and propagating hidden states enables the model to maintain context across multiple encoder layers, which is beneficial for time series forecasting where historical context matters.
319
+
320
+ These modifications make our implementation better suited for time series forecasting tasks compared to the general-purpose language modeling focus of the official FLA implementation.
321
+
322
+ ---
323
+
324
+ ## Files in This Directory
325
+
326
+ - **`gated_deltaproduct.py`**: Core GatedDeltaProduct layer implementation with `initial_state` support
327
+ - **`modeling_gated_deltaproduct.py`**: GatedDeltaProductBlock wrapper that integrates the layer
328
+ - **`configuration_gated_deltaproduct.py`**: Configuration class for the model
329
+ - **`__init__.py`**: Module exports
330
+
331
+ ---
332
+
333
+ ## Usage
334
+
335
+ See `src/models/model.py` and `src/models/blocks.py` for examples of how to use this custom implementation with hidden state weaving.
336
+
337
+ To enable/disable weaving, set the `weaving` parameter in your encoder configuration:
338
+ ```python
339
+ encoder_config = {
340
+ "weaving": True, # Enable state propagation across layers
341
+ # ... other config parameters
342
+ }
343
+ ```
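+
+ A rough end-to-end sketch (assuming a CUDA device with the `fla` kernels available; constructor arguments as defined in `src/models/blocks.py`):
+
+ ```python
+ import torch
+ from src.models.blocks import GatedDeltaProductEncoder
+
+ layers = torch.nn.ModuleList(
+     GatedDeltaProductEncoder(layer_idx=i, token_embed_dim=128, num_heads=4)
+     for i in range(2)
+ ).cuda()
+ x = torch.randn(8, 64, 128, device="cuda")  # [batch, seq_len, embed]
+ state = None  # the full model adds learnable initial states here
+ for layer in layers:
+     x, state = layer(x, initial_state=state)  # weave the state forward
+ ```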
344
+
src/models/gated_deltaproduct/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ from src.models.gated_deltaproduct.configuration_gated_deltaproduct import (
2
+ GatedDeltaProductConfig,
3
+ )
4
+ from src.models.gated_deltaproduct.modeling_gated_deltaproduct import (
5
+ GatedDeltaProductBlock,
6
+ )
7
+
8
+ __all__ = [
9
+ "GatedDeltaProductConfig",
10
+ "GatedDeltaProductBlock",
11
+ ]
src/models/gated_deltaproduct/configuration_gated_deltaproduct.py ADDED
@@ -0,0 +1,108 @@
1
+ import warnings
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+
5
+
6
+ class GatedDeltaProductConfig(PretrainedConfig):
7
+ model_type = "gated_deltaproduct"
8
+ keys_to_ignore_at_inference = ["past_key_values"]
9
+
10
+ def __init__(
11
+ self,
12
+ attn_mode: str = "chunk",
13
+ conv_size: int = 4,
14
+ head_dim: int = 256,
15
+ num_heads: int = 6,
16
+ hidden_size: int = 2048,
17
+ expand_v: float = 2.0,
18
+ use_gate: bool = True, # Changed from use_output_gate to use_gate for custom implementation
19
+ use_short_conv: bool = True,
20
+ max_position_embeddings: int = 2048,
21
+ hidden_ratio: int | None = 4,
22
+ intermediate_size: int | None = None,
23
+ hidden_act: str = "swish",
24
+ num_hidden_layers: int = 21,
25
+ norm_eps: float = 1e-6,
26
+ attn: dict | None = None,
27
+ use_cache: bool = True,
28
+ pad_token_id: int | None = None,
29
+ bos_token_id: int = 1,
30
+ eos_token_id: int = 2,
31
+ tie_word_embeddings: bool = False,
32
+ initializer_range: float = 0.02,
33
+ fuse_norm: bool = True,
34
+ fuse_swiglu: bool = True,
35
+ fuse_cross_entropy: bool = True,
36
+ fuse_linear_cross_entropy: bool = False,
37
+ use_l2warp: bool = False,
38
+ vocab_size: int = 32000,
39
+ use_forget_gate: bool = False,
40
+ allow_neg_eigval: bool = False,
41
+ num_householder: int = 1,
42
+ **kwargs,
43
+ ):
44
+ self.attn_mode = attn_mode
45
+ self.conv_size = conv_size
46
+ self.head_dim = head_dim
47
+ self.num_heads = num_heads
48
+ self.hidden_size = hidden_size
49
+ self.expand_v = expand_v
50
+ self.use_gate = use_gate # Changed from use_output_gate to use_gate
51
+ self.use_short_conv = use_short_conv
52
+ self.max_position_embeddings = max_position_embeddings
53
+
54
+ self.hidden_ratio = hidden_ratio
55
+ self.intermediate_size = intermediate_size
56
+ self.hidden_act = hidden_act
57
+ self.num_hidden_layers = num_hidden_layers
58
+ self.norm_eps = norm_eps
59
+ self.attn = attn
60
+ self.use_cache = use_cache
61
+ self.initializer_range = initializer_range
62
+
63
+ self.fuse_norm = fuse_norm
64
+ self.fuse_swiglu = fuse_swiglu
65
+ self.fuse_cross_entropy = fuse_cross_entropy
66
+ self.fuse_linear_cross_entropy = fuse_linear_cross_entropy
67
+ self.use_l2warp = use_l2warp
68
+ self.vocab_size = vocab_size
69
+
70
+ if fuse_cross_entropy and fuse_linear_cross_entropy:
71
+ raise ValueError(
72
+ "`fuse_cross_entropy` and `fuse_linear_cross_entropy` cannot be True at the same time.",
73
+ )
74
+ if fuse_linear_cross_entropy:
75
+ warnings.warn(
76
+ "`fuse_linear_cross_entropy` is enabled, which can improves memory efficiency "
77
+ "at the potential cost of reduced precision. "
78
+ "If you observe issues like loss divergence, consider disabling this setting.",
79
+ )
80
+
81
+ # DeltaProduct specific
82
+ self.allow_neg_eigval = allow_neg_eigval
83
+ self.num_householder = num_householder
84
+ self.use_forget_gate = use_forget_gate
85
+
86
+ if attn is not None:
87
+ if not isinstance(attn, dict):
88
+ raise ValueError("attn must be a dictionary")
89
+ if "layers" not in attn:
90
+ raise ValueError(
91
+ "Layer indices must be provided to initialize hybrid attention layers"
92
+ )
93
+ if "num_heads" not in attn:
94
+ raise ValueError(
95
+ "Number of heads must be provided to initialize hybrid attention layers"
96
+ )
97
+ attn["num_kv_heads"] = attn.get("num_kv_heads", attn["num_heads"])
98
+ attn["qkv_bias"] = attn.get("qkv_bias", False)
99
+ attn["window_size"] = attn.get("window_size", None)
100
+ attn["rope_theta"] = attn.get("rope_theta", 10000.0)
101
+
102
+ super().__init__(
103
+ pad_token_id=pad_token_id,
104
+ bos_token_id=bos_token_id,
105
+ eos_token_id=eos_token_id,
106
+ tie_word_embeddings=tie_word_embeddings,
107
+ **kwargs,
108
+ )
src/models/gated_deltaproduct/gated_deltaproduct.py ADDED
@@ -0,0 +1,351 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from __future__ import annotations
5
+
6
+ import math
7
+ import warnings
8
+ from typing import TYPE_CHECKING, Dict, Optional, Tuple
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from einops import rearrange, repeat
13
+ from fla.layers.utils import get_unpad_data, index_first_axis, pad_input
14
+ from fla.modules import FusedRMSNormGated, RMSNorm, ShortConvolution
15
+ from fla.ops.delta_rule import fused_recurrent_delta_rule
16
+ from fla.ops.gated_delta_product import chunk_gated_delta_product
17
+ from fla.ops.gated_delta_rule import fused_recurrent_gated_delta_rule
18
+ from torch.nn import functional as F
19
+
20
+ if TYPE_CHECKING:
21
+ from fla.models.utils import Cache
22
+ from transformers.processing_utils import Unpack
23
+
24
+
25
+ class GatedDeltaProduct(nn.Module):
26
+ """
27
+ Generalized version of GatedDoubleDeltaNet that supports an arbitrary number of Householder transformations.
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ hidden_size: int = 2048,
33
+ expand_v: float = 2,
34
+ head_dim: int = 256,
35
+ num_heads: int = 6,
36
+ num_v_heads: Optional[int] = None,
37
+ mode: str = "chunk",
38
+ use_gate: bool = True,
39
+ use_short_conv: bool = True,
40
+ conv_size: int = 4,
41
+ conv_bias: bool = False,
42
+ layer_idx: Optional[int] = None,
43
+ norm_eps: float = 1e-5,
44
+ use_forget_gate: bool = True,
45
+ allow_neg_eigval: bool = True,
46
+ num_householder: int = 2,
47
+ **kwargs,
48
+ ) -> None:
49
+ super().__init__()
50
+
51
+ self.mode = mode
52
+
53
+ self.hidden_size = hidden_size
54
+ self.expand_v = expand_v
55
+
56
+ self.use_forget_gate = use_forget_gate
57
+ self.allow_neg_eigval = allow_neg_eigval
58
+ self.num_householder = num_householder
59
+ self.use_gate = use_gate
60
+ self.use_short_conv = use_short_conv
61
+ self.conv_size = conv_size
62
+ self.conv_bias = conv_bias
63
+
64
+ self.head_dim = head_dim
65
+ self.num_heads = num_heads
66
+ self.num_v_heads = num_v_heads if num_v_heads is not None else num_heads
67
+
68
+ self.head_k_dim = head_dim
69
+ self.head_v_dim = int(self.head_dim * self.expand_v)
70
+ self.key_dim = int(self.num_heads * self.head_k_dim)
71
+ self.value_dim = int(self.num_v_heads * self.head_v_dim)
72
+ self.layer_idx = layer_idx
73
+ self.init_hidden_state = nn.Parameter(
74
+ torch.randn(self.num_heads, self.head_dim, self.head_dim)
75
+ )
76
+
77
+ # Consistency check: Ensure expand_v produces integer values
78
+ if not math.isclose(
79
+ self.num_v_heads * self.head_dim * expand_v, self.value_dim, rel_tol=1e-5
80
+ ):
81
+ raise ValueError(
82
+ f"expand_v={expand_v} does not produce an integer value when multiplied by key_dim={self.key_dim}. "
83
+ f"Resulting value_dim would be {self.num_v_heads * self.head_dim * expand_v}, which is invalid for nn.Linear."
84
+ )
85
+ if self.num_v_heads > self.num_heads and self.num_v_heads % self.num_heads != 0:
86
+ raise ValueError(
87
+ f"num_v_heads={self.num_v_heads} must be divisible by num_heads={self.num_heads}."
88
+ )
89
+
90
+ if not math.isclose(head_dim * expand_v, self.head_v_dim, rel_tol=1e-5):
91
+ raise ValueError(
92
+ f"expand_v={expand_v} does not produce an integer value when multiplied by head_dim={head_dim}. "
93
+ f"Resulting head_v_dim would be {head_dim * expand_v}, which is invalid for FusedRMSNormGated."
94
+ )
95
+ assert mode in ["chunk", "fused_recurrent"], f"Unsupported mode `{mode}`."
96
+
97
+ self.q_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
98
+ self.k_proj = nn.Linear(hidden_size, self.key_dim * num_householder, bias=False)
99
+ self.v_proj = nn.Linear(
100
+ hidden_size, self.value_dim * num_householder, bias=False
101
+ )
102
+ self.b_proj = nn.Linear(
103
+ hidden_size, self.num_v_heads * num_householder, bias=False
104
+ )
105
+
106
+ if self.use_forget_gate:
107
+ self.a_proj = nn.Linear(hidden_size, self.num_v_heads, bias=False)
108
+ A = torch.empty(self.num_v_heads, dtype=torch.float32).uniform_(0, 16)
109
+ self.A_log = nn.Parameter(torch.log(A))
110
+ self.A_log._no_weight_decay = True
111
+ # hard coded for now
112
+ dt_min = 0.001
113
+ dt_max = 0.1
114
+ dt_init_floor = 1e-4
115
+ dt = torch.exp(
116
+ torch.rand(self.num_v_heads) * (math.log(dt_max) - math.log(dt_min))
117
+ + math.log(dt_min)
118
+ )
119
+ dt = torch.clamp(dt, min=dt_init_floor)
120
+ # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
121
+ inv_dt = dt + torch.log(-torch.expm1(-dt))
122
+ self.dt_bias = nn.Parameter(inv_dt)
123
+ # Just to be explicit. Without this we already don't put wd on dt_bias because of the check
124
+ # name.endswith("bias") in param_grouping.py
125
+ self.dt_bias._no_weight_decay = True
126
+
127
+ if use_short_conv:
128
+ self.conv_size = conv_size
129
+ self.q_conv1d = ShortConvolution(
130
+ hidden_size=self.key_dim,
131
+ kernel_size=conv_size,
132
+ bias=conv_bias,
133
+ activation="silu",
134
+ )
135
+ self.k_conv1d = ShortConvolution(
136
+ hidden_size=self.key_dim * num_householder,
137
+ kernel_size=conv_size,
138
+ bias=conv_bias,
139
+ activation="silu",
140
+ )
141
+ self.v_conv1d = ShortConvolution(
142
+ hidden_size=self.value_dim * num_householder,
143
+ kernel_size=conv_size,
144
+ bias=conv_bias,
145
+ activation="silu",
146
+ )
147
+ else:
148
+ warnings.warn(
149
+ "ShortConvolution is crucial to the performance. "
150
+ "Do not turn it off, i.e., setting `use_short_conv=False` unless you know what you are doing."
151
+ )
152
+ if use_gate:
153
+ self.g_proj = nn.Linear(hidden_size, self.value_dim, bias=False)
154
+ self.o_norm = FusedRMSNormGated(self.head_v_dim, eps=norm_eps)
155
+ else:
156
+ self.o_norm = RMSNorm(self.head_v_dim, eps=norm_eps)
157
+ self.o_proj = nn.Linear(self.value_dim, hidden_size, bias=False)
158
+
159
+ def _initialize_weights(self, module: nn.Module):
160
+ if getattr(module, "_is_hf_initialized", False):
161
+ return
162
+ if isinstance(module, nn.Linear):
163
+ nn.init.xavier_uniform_(module.weight, gain=2**-2.5)
164
+ if module.bias is not None:
165
+ nn.init.zeros_(module.bias)
166
+ module._is_hf_initialized = True
167
+
168
+ def forward(
169
+ self,
170
+ hidden_states: torch.Tensor,
171
+ attention_mask: Optional[torch.Tensor] = None,
172
+ past_key_values: Optional[Cache] = None,
173
+ initial_state: Optional[torch.Tensor] = None,
174
+ use_cache: Optional[bool] = False,
175
+ output_attentions: Optional[bool] = False,
176
+ **kwargs: Unpack[Dict],
177
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
178
+ if attention_mask is not None:
179
+ assert len(attention_mask.shape) == 2, (
180
+ "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
181
+ "for padding purposes (0 indicating padding). "
182
+ "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
183
+ )
184
+
185
+ batch_size, q_len, _ = hidden_states.shape
186
+ # change to inference mode.
187
+ mode = self.mode
188
+
189
+ if self.training:
190
+ assert mode == "chunk", "Only chunk mode is supported in training."
191
+
192
+ last_state = None
193
+ if past_key_values is not None and len(past_key_values) > self.layer_idx:
194
+ last_state = past_key_values[self.layer_idx]
195
+
196
+ cu_seqlens = kwargs.get("cu_seqlens", None)
197
+ if attention_mask is not None:
198
+ indices, cu_seqlens, _ = get_unpad_data(attention_mask[:, -q_len:])
199
+ hidden_states = index_first_axis(
200
+ rearrange(hidden_states, "b s ... -> (b s) ..."), indices
201
+ ).unsqueeze(0)
202
+
203
+ if self.use_short_conv:
204
+ conv_state_q, conv_state_k, conv_state_v = None, None, None
205
+ if last_state is not None:
206
+ conv_state_q, conv_state_k, conv_state_v = last_state["conv_state"]
207
+ q, conv_state_q = self.q_conv1d(
208
+ x=self.q_proj(hidden_states),
209
+ cache=conv_state_q,
210
+ output_final_state=use_cache,
211
+ cu_seqlens=cu_seqlens,
212
+ )
213
+ k, conv_state_k = self.k_conv1d(
214
+ x=self.k_proj(hidden_states),
215
+ cache=conv_state_k,
216
+ output_final_state=use_cache,
217
+ cu_seqlens=cu_seqlens,
218
+ )
219
+ v, conv_state_v = self.v_conv1d(
220
+ x=self.v_proj(hidden_states),
221
+ cache=conv_state_v,
222
+ output_final_state=use_cache,
223
+ cu_seqlens=cu_seqlens,
224
+ )
225
+ else:
226
+ q = F.silu(self.q_proj(hidden_states))
227
+ k = F.silu(self.k_proj(hidden_states))
228
+ v = F.silu(self.v_proj(hidden_states))
229
+
230
+ q = rearrange(q, "... (h d) -> ... h d", d=self.head_k_dim)
231
+ k = rearrange(
232
+ k,
233
+ "... l (n h d) -> ... (l n) h d",
234
+ n=self.num_householder,
235
+ d=self.head_k_dim,
236
+ )
237
+ v = rearrange(
238
+ v,
239
+ "... l (n h d) -> ... (l n) h d",
240
+ n=self.num_householder,
241
+ d=self.head_v_dim,
242
+ )
243
+
244
+ if self.num_v_heads > self.num_heads:
245
+ q, k = map(
246
+ lambda x: repeat(
247
+ x, "... h d -> ... (h g) d", g=self.num_v_heads // self.num_heads
248
+ ),
249
+ (q, k),
250
+ )
251
+
252
+ beta = self.b_proj(hidden_states).sigmoid()
253
+ if self.allow_neg_eigval:
254
+ beta = beta * 2.0
255
+
256
+ beta = rearrange(beta, "... l (n h) -> ... (l n) h", n=self.num_householder)
257
+ if self.use_forget_gate:
258
+ g = -self.A_log.float().exp() * F.softplus(
259
+ self.a_proj(hidden_states).float() + self.dt_bias
260
+ )
261
+ else:
262
+ g = None
263
+
264
+ recurrent_state = (
265
+ last_state["recurrent_state"] if last_state is not None else None
266
+ )
267
+ if mode == "chunk":
268
+ o, recurrent_state = chunk_gated_delta_product(
269
+ q=q,
270
+ k=k,
271
+ v=v,
272
+ g=g,
273
+ beta=beta,
274
+ initial_state=initial_state,
275
+ output_final_state=output_attentions,
276
+ cu_seqlens=cu_seqlens,
277
+ num_householder=self.num_householder,
278
+ use_qk_l2norm_in_kernel=True,
279
+ )
280
+
281
+ elif mode == "fused_recurrent":
282
+ if self.use_forget_gate:
283
+ g_new = torch.zeros(
284
+ g.shape[0],
285
+ g.shape[1],
286
+ self.num_householder,
287
+ g.shape[2],
288
+ device=g.device,
289
+ dtype=torch.float32,
290
+ )
291
+ g_new[:, :, 0] = g
292
+ g = rearrange(g_new, "... l n h -> ... (l n) h")
293
+
294
+ q_new = q.new_zeros(
295
+ q.shape[0], q.shape[1], self.num_householder, q.shape[2], q.shape[3]
296
+ )
297
+ q_new[:, :, -1] = q
298
+ q = rearrange(q_new, "... l n h d-> ... (l n) h d")
299
+ if self.use_forget_gate:
300
+ o, recurrent_state = fused_recurrent_gated_delta_rule(
301
+ q=q,
302
+ k=k,
303
+ v=v,
304
+ g=g,
305
+ beta=beta,
306
+ initial_state=recurrent_state,
307
+ output_final_state=use_cache,
308
+ cu_seqlens=cu_seqlens * self.num_householder
309
+ if cu_seqlens is not None
310
+ else None,
311
+ use_qk_l2norm_in_kernel=True,
312
+ )
313
+ else:
314
+ o, recurrent_state = fused_recurrent_delta_rule(
315
+ q=q,
316
+ k=k,
317
+ v=v,
318
+ beta=beta,
319
+ initial_state=recurrent_state,
320
+ output_final_state=use_cache,
321
+ cu_seqlens=cu_seqlens * self.num_householder
322
+ if cu_seqlens is not None
323
+ else None,
324
+ use_qk_l2norm_in_kernel=True,
325
+ )
326
+ o = rearrange(o, "... (l n) h d -> ... l n h d", n=self.num_householder)[
327
+ ..., -1, :, :
328
+ ].contiguous()
329
+
330
+ if past_key_values is not None:
331
+ past_key_values.update(
332
+ recurrent_state=recurrent_state,
333
+ conv_state=(conv_state_q, conv_state_k, conv_state_v)
334
+ if self.use_short_conv
335
+ else None,
336
+ layer_idx=self.layer_idx,
337
+ offset=q_len,
338
+ )
339
+
340
+ if self.use_gate:
341
+ g = rearrange(
342
+ self.g_proj(hidden_states), "... (h d) -> ... h d", d=self.head_v_dim
343
+ )
344
+ o = self.o_norm(o, g)
345
+ else:
346
+ o = self.o_norm(o)
347
+ o = rearrange(o, "b t h d -> b t (h d)")
348
+ o = self.o_proj(o)
349
+ if attention_mask is not None:
350
+ o = pad_input(o.squeeze(0), indices, batch_size, q_len)
351
+ return o, recurrent_state, past_key_values
src/models/gated_deltaproduct/modeling_gated_deltaproduct.py ADDED
@@ -0,0 +1,105 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from fla.layers.attn import Attention
10
+ from fla.models.utils import Cache
11
+ from fla.modules import GatedMLP as GatedDeltaProductMLP
12
+ from fla.modules import RMSNorm
13
+
14
+ from src.models.gated_deltaproduct.configuration_gated_deltaproduct import (
15
+ GatedDeltaProductConfig,
16
+ )
17
+ from src.models.gated_deltaproduct.gated_deltaproduct import GatedDeltaProduct
18
+
19
+ if TYPE_CHECKING:
20
+ from transformers.processing_utils import Unpack
21
+
22
+
23
+ class GatedDeltaProductBlock(nn.Module):
24
+ def __init__(self, config: GatedDeltaProductConfig, layer_idx: int):
25
+ super().__init__()
26
+
27
+ self.config = config
28
+ self.layer_idx = layer_idx
29
+
30
+ self.attn_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(
31
+ config.hidden_size, eps=config.norm_eps
32
+ )
33
+ if config.attn is not None and layer_idx in config.attn["layers"]:
34
+ self.attn = Attention(
35
+ hidden_size=config.hidden_size,
36
+ num_heads=config.attn["num_heads"],
37
+ num_kv_heads=config.attn["num_kv_heads"],
38
+ qkv_bias=config.attn["qkv_bias"],
39
+ window_size=config.attn["window_size"],
40
+ rope_theta=config.attn["rope_theta"],
41
+ max_position_embeddings=config.max_position_embeddings,
42
+ layer_idx=layer_idx,
43
+ )
44
+ else:
45
+ self.attn = GatedDeltaProduct(
46
+ mode=config.attn_mode,
47
+ hidden_size=config.hidden_size,
48
+ expand_v=config.expand_v,
49
+ head_dim=config.head_dim,
50
+ num_heads=config.num_heads,
51
+ use_gate=config.use_gate,
52
+ use_forget_gate=config.use_forget_gate,
53
+ use_short_conv=config.use_short_conv,
54
+ conv_size=config.conv_size,
55
+ norm_eps=config.norm_eps,
56
+ allow_neg_eigval=config.allow_neg_eigval,
57
+ num_householder=config.num_householder,
58
+ layer_idx=layer_idx,
59
+ )
60
+ self.mlp_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(
61
+ config.hidden_size, eps=config.norm_eps
62
+ )
63
+ self.mlp = GatedDeltaProductMLP(
64
+ hidden_size=config.hidden_size,
65
+ hidden_ratio=config.hidden_ratio,
66
+ intermediate_size=config.intermediate_size,
67
+ hidden_act=config.hidden_act,
68
+ fuse_swiglu=config.fuse_swiglu,
69
+ )
70
+
71
+ def forward(
72
+ self,
73
+ hidden_states: torch.Tensor,
74
+ attention_mask: Optional[torch.Tensor] = None,
75
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
76
+ use_cache: Optional[bool] = False,
77
+ output_attentions: Optional[bool] = False,
78
+ initial_state: Optional[torch.FloatTensor] = None,
79
+ **kwargs: Unpack[Dict],
80
+ ) -> Tuple[
81
+ torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
82
+ ]:
83
+ residual = hidden_states
84
+ hidden_states = self.attn_norm(hidden_states)
85
+ hidden_states, attentions, past_key_values = self.attn(
86
+ hidden_states=hidden_states,
87
+ attention_mask=attention_mask,
88
+ past_key_values=past_key_values,
89
+ use_cache=use_cache,
90
+ output_attentions=output_attentions,
91
+ initial_state=initial_state,
92
+ **kwargs,
93
+ )
94
+ if self.config.fuse_norm:
95
+ hidden_states, residual = self.mlp_norm(hidden_states, residual, True)
96
+ else:
97
+ hidden_states = residual + hidden_states
98
+ residual = hidden_states
99
+ hidden_states = self.mlp_norm(hidden_states)
100
+ hidden_states = self.mlp(hidden_states, **kwargs)
101
+ hidden_states = residual + hidden_states
102
+
103
+ outputs = (hidden_states, attentions, past_key_values)
104
+
105
+ return outputs
src/models/model.py ADDED
@@ -0,0 +1,427 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from fla.modules import GatedMLP
4
+
5
+ from src.data.containers import BatchTimeSeriesContainer
6
+ from src.data.scalers import MinMaxScaler, RobustScaler
7
+ from src.data.time_features import compute_batch_time_features
8
+ from src.models.blocks import GatedDeltaProductEncoder
9
+ from src.utils.utils import device
10
+
11
+
12
+ def create_scaler(scaler_type: str, epsilon: float = 1e-3):
13
+ """Create scaler instance based on type."""
14
+ if scaler_type == "custom_robust":
15
+ return RobustScaler(epsilon=epsilon)
16
+ elif scaler_type == "min_max":
17
+ return MinMaxScaler(epsilon=epsilon)
18
+ else:
19
+ raise ValueError(f"Unknown scaler: {scaler_type}")
20
+
21
+
22
+ def apply_channel_noise(values: torch.Tensor, noise_scale: float = 0.1):
23
+ """Add noise to constant channels to prevent model instability."""
24
+ is_constant = torch.all(values == values[:, 0:1, :], dim=1)
25
+ noise = torch.randn_like(values) * noise_scale * is_constant.unsqueeze(1)
26
+ return values + noise
27
+
28
+
29
+ class TimeSeriesModel(nn.Module):
30
+ """Time series forecasting model combining embedding, encoding, and prediction."""
31
+
32
+ def __init__(
33
+ self,
34
+ # Core architecture
35
+ embed_size: int = 128,
36
+ num_encoder_layers: int = 2,
37
+ # Scaling and preprocessing
38
+ scaler: str = "custom_robust",
39
+ epsilon: float = 1e-3,
40
+ scaler_clamp_value: float | None = None,
41
+ handle_constants: bool = False,
42
+ # Time features
43
+ K_max: int = 6,
44
+ time_feature_config: dict | None = None,
45
+ encoding_dropout: float = 0.0,
46
+ # Encoder configuration
47
+ encoder_config: dict | None = None,
48
+ # Loss configuration
49
+ loss_type: str = "huber", # "huber", "quantile"
50
+ quantiles: list[float] | None = None,
51
+ **kwargs,
52
+ ):
53
+ super().__init__()
54
+
55
+ # Core parameters
56
+ self.embed_size = embed_size
57
+ self.num_encoder_layers = num_encoder_layers
58
+ self.epsilon = epsilon
59
+ self.scaler_clamp_value = scaler_clamp_value
60
+ self.handle_constants = handle_constants
61
+ self.encoding_dropout = encoding_dropout
62
+ self.K_max = K_max
63
+ self.time_feature_config = time_feature_config or {}
64
+ self.encoder_config = encoder_config or {}
65
+
66
+ # Store loss parameters
67
+ self.loss_type = loss_type
68
+ self.quantiles = quantiles
69
+ if self.loss_type == "quantile" and self.quantiles is None:
70
+ raise ValueError("Quantiles must be provided for quantile loss.")
71
+ if self.quantiles:
72
+ self.register_buffer(
73
+ "qt", torch.tensor(self.quantiles, device=device).view(1, 1, 1, -1)
74
+ )
75
+
76
+ # Validate configuration before initialization
77
+ self._validate_configuration()
78
+
79
+ # Initialize components
80
+ self.scaler = create_scaler(scaler, epsilon)
81
+ self._init_embedding_layers()
82
+ self._init_encoder_layers(self.encoder_config, num_encoder_layers)
83
+ self._init_projection_layers()
84
+
85
+ def _validate_configuration(self):
86
+ """Validate essential model configuration parameters."""
87
+ if "num_heads" not in self.encoder_config:
88
+ raise ValueError("encoder_config must contain 'num_heads' parameter")
89
+
90
+ if self.embed_size % self.encoder_config["num_heads"] != 0:
91
+ raise ValueError(
92
+ f"embed_size ({self.embed_size}) must be divisible by "
93
+ f"num_heads ({self.encoder_config['num_heads']})"
94
+ )
95
+
96
+ def _init_embedding_layers(self):
97
+ """Initialize value and time feature embedding layers."""
98
+ self.expand_values = nn.Linear(1, self.embed_size, bias=True)
99
+ self.nan_embedding = nn.Parameter(
100
+ torch.randn(1, 1, 1, self.embed_size) / self.embed_size,
101
+ requires_grad=True,
102
+ )
103
+ self.time_feature_projection = nn.Linear(self.K_max, self.embed_size)
104
+
105
+ def _init_encoder_layers(self, encoder_config: dict, num_encoder_layers: int):
106
+ """Initialize encoder layers."""
107
+ self.num_encoder_layers = num_encoder_layers
108
+
109
+ # Ensure encoder_config has token_embed_dim
110
+ encoder_config = encoder_config.copy()
111
+ encoder_config["token_embed_dim"] = self.embed_size
112
+ self.encoder_layers = nn.ModuleList(
113
+ [
114
+ GatedDeltaProductEncoder(layer_idx=layer_idx, **encoder_config)
115
+ for layer_idx in range(self.num_encoder_layers)
116
+ ]
117
+ )
118
+
119
+ def _init_projection_layers(self):
120
+ if self.loss_type == "quantile":
121
+ output_dim = len(self.quantiles)
122
+ else:
123
+ output_dim = 1
124
+ self.final_output_layer = nn.Linear(self.embed_size, output_dim)
125
+
126
+ self.mlp = GatedMLP(
127
+ hidden_size=self.embed_size,
128
+ hidden_ratio=4,
129
+ hidden_act="swish",
130
+ fuse_swiglu=True,
131
+ )
132
+ # Initialize learnable initial hidden state for the first encoder layer
133
+ # This will be expanded to match batch size during forward pass
134
+ head_k_dim = self.embed_size // self.encoder_config["num_heads"]
135
+
136
+ # Get expand_v from encoder_config, default to 1.0 if not present
137
+ expand_v = self.encoder_config.get("expand_v", 1.0)
138
+ head_v_dim = int(head_k_dim * expand_v)
139
+
140
+ num_initial_hidden_states = self.num_encoder_layers
141
+ self.initial_hidden_state = nn.ParameterList(
142
+ [
143
+ nn.Parameter(
144
+ torch.randn(
145
+ 1, self.encoder_config["num_heads"], head_k_dim, head_v_dim
146
+ )
147
+ / head_k_dim,
148
+ requires_grad=True,
149
+ )
150
+ for _ in range(num_initial_hidden_states)
151
+ ]
152
+ )
+
+     def _preprocess_data(self, data_container: BatchTimeSeriesContainer):
+         """Extract data shapes and handle constants without padding."""
+         history_values = data_container.history_values
+         future_values = data_container.future_values
+         history_mask = data_container.history_mask
+
+         batch_size, history_length, num_channels = history_values.shape
+         future_length = future_values.shape[1] if future_values is not None else 0
+
+         # Handle constant channels
+         if self.handle_constants:
+             history_values = apply_channel_noise(history_values)
+
+         return {
+             "history_values": history_values,
+             "future_values": future_values,
+             "history_mask": history_mask,
+             "num_channels": num_channels,
+             "history_length": history_length,
+             "future_length": future_length,
+             "batch_size": batch_size,
+         }
+
+     def _compute_scaling(
+         self, history_values: torch.Tensor, history_mask: torch.Tensor = None
+     ):
+         """Compute scaling statistics from the history."""
+         scale_statistics = self.scaler.compute_statistics(history_values, history_mask)
+         return scale_statistics
+
+     def _apply_scaling_and_masking(
+         self, values: torch.Tensor, scale_statistics: dict, mask: torch.Tensor = None
+     ):
+         """Apply scaling and optional masking to values."""
+         scaled_values = self.scaler.scale(values, scale_statistics)
+
+         if mask is not None:
+             scaled_values = scaled_values * mask.unsqueeze(-1).float()
+
+         if self.scaler_clamp_value is not None:
+             scaled_values = torch.clamp(
+                 scaled_values, -self.scaler_clamp_value, self.scaler_clamp_value
+             )
+
+         return scaled_values
+
+     def _get_positional_embeddings(
+         self,
+         time_features: torch.Tensor,
+         num_channels: int,
+         batch_size: int,
+         drop_enc_allow: bool = False,
+     ):
+         """Generate positional embeddings from time features."""
+         seq_len = time_features.shape[1]
+
+         if (torch.rand(1).item() < self.encoding_dropout) and drop_enc_allow:
+             return torch.zeros(
+                 batch_size, seq_len, num_channels, self.embed_size, device=device
+             ).to(torch.float32)
+
+         pos_embed = self.time_feature_projection(time_features)
+         return pos_embed.unsqueeze(2).expand(-1, -1, num_channels, -1)
+
+     def _compute_embeddings(
+         self,
+         scaled_history: torch.Tensor,
+         history_pos_embed: torch.Tensor,
+         history_mask: torch.Tensor | None = None,
+     ):
+         """Compute value embeddings and combine with positional embeddings."""
+
+         nan_mask = torch.isnan(scaled_history)
+         history_for_embedding = torch.nan_to_num(scaled_history, nan=0.0)
+         channel_embeddings = self.expand_values(history_for_embedding.unsqueeze(-1))
+         channel_embeddings[nan_mask] = self.nan_embedding.to(channel_embeddings.dtype)
+         channel_embeddings = channel_embeddings + history_pos_embed
+
+         # Suppress padded time steps completely so padding is a pure batching artifact
+         # history_mask: [B, S] -> broadcast to [B, S, 1, 1]
+         if history_mask is not None:
+             mask_broadcast = (
+                 history_mask.unsqueeze(-1).unsqueeze(-1).to(channel_embeddings.dtype)
+             )
+             channel_embeddings = channel_embeddings * mask_broadcast
+
+         batch_size, seq_len = scaled_history.shape[:2]
+         all_channels_embedded = channel_embeddings.view(batch_size, seq_len, -1)
+
+         return all_channels_embedded
+
+     def _generate_predictions(
+         self,
+         embedded: torch.Tensor,
+         target_pos_embed: torch.Tensor,
+         prediction_length: int,
+         num_channels: int,
+         history_mask: torch.Tensor = None,
+     ):
+         """
+         Generate predictions for all channels using vectorized operations.
+         """
+         batch_size, seq_len, _ = embedded.shape
+         # embedded shape: [B, S, N*E] -> Reshape to [B, S, N, E]
+         embedded = embedded.view(batch_size, seq_len, num_channels, self.embed_size)
+
+         # Vectorize across channels by merging the batch and channel dimensions.
+         # [B, S, N, E] -> [B*N, S, E]
+         channel_embedded = (
+             embedded.permute(0, 2, 1, 3)
+             .contiguous()
+             .view(batch_size * num_channels, seq_len, self.embed_size)
+         )
+
+         # Reshape target positional embeddings similarly: [B, P, N, E] -> [B*N, P, E]
+         target_pos_embed = (
+             target_pos_embed.permute(0, 2, 1, 3)
+             .contiguous()
+             .view(batch_size * num_channels, prediction_length, self.embed_size)
+         )
+         x = channel_embedded
+         target_repr = target_pos_embed
+         x = torch.concatenate([x, target_repr], dim=1)
+         if self.encoder_config.get("weaving", True):
+             # Weaving: a single hidden state is threaded through all layers,
+             # with a learnable per-layer initial state added at each layer
+             hidden_state = torch.zeros_like(
+                 self.initial_hidden_state[0].repeat(batch_size * num_channels, 1, 1, 1)
+             )
+             for layer_idx, encoder_layer in enumerate(self.encoder_layers):
+                 x, hidden_state = encoder_layer(
+                     x,
+                     hidden_state
+                     + self.initial_hidden_state[layer_idx].repeat(
+                         batch_size * num_channels, 1, 1, 1
+                     ),
+                 )
+         else:
+             # No weaving: each layer starts from its own learnable initial hidden state
+             for layer_idx, encoder_layer in enumerate(self.encoder_layers):
+                 initial_hidden_state = self.initial_hidden_state[layer_idx].repeat(
+                     batch_size * num_channels, 1, 1, 1
+                 )
+                 x, _ = encoder_layer(x, initial_hidden_state)
+
+         # Use the last prediction_length positions
+         prediction_embeddings = x[:, -prediction_length:, :]
+
+         predictions = self.final_output_layer(self.mlp(prediction_embeddings))
+
+         # Reshape output to handle quantiles
+         # Encoder output shape: [B*N, P, Q] where Q is num_quantiles or 1
+         # Reshape the output back to [B, P, N, Q]
+         output_dim = len(self.quantiles) if self.loss_type == "quantile" else 1
+         predictions = predictions.view(
+             batch_size, num_channels, prediction_length, output_dim
+         )
+         predictions = predictions.permute(0, 2, 1, 3)  # [B, P, N, Q]
+         # Squeeze the last dimension if not in quantile mode for backward compatibility
+         if self.loss_type != "quantile":
+             predictions = predictions.squeeze(-1)  # [B, P, N]
+         return predictions
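The channel vectorization above is a pure permute/view round trip: channels are folded into the batch dimension so the recurrent encoder sees `B*N` independent univariate sequences, and the output is folded back afterwards. A minimal, self-contained sketch (all shapes hypothetical) confirming the reshapes are lossless:

```python
import torch

B, S, N, E = 2, 16, 3, 8  # hypothetical batch, sequence, channel, embed sizes
embedded = torch.randn(B, S, N, E)

# Fold channels into the batch dimension: [B, S, N, E] -> [B*N, S, E]
flat = embedded.permute(0, 2, 1, 3).contiguous().view(B * N, S, E)

# Unfold and verify the original layout is recovered exactly
restored = flat.view(B, N, S, E).permute(0, 2, 1, 3)
assert torch.equal(restored, embedded)
```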
+
+     def forward(
+         self, data_container: BatchTimeSeriesContainer, drop_enc_allow: bool = False
+     ):
+         """Main forward pass."""
+         # Preprocess data
+         preprocessed = self._preprocess_data(data_container)
+
+         # Compute time features dynamically based on actual lengths
+         history_time_features, target_time_features = compute_batch_time_features(
+             start=data_container.start,
+             history_length=preprocessed["history_length"],
+             future_length=preprocessed["future_length"],
+             batch_size=preprocessed["batch_size"],
+             frequency=data_container.frequency,
+             K_max=self.K_max,
+             time_feature_config=self.time_feature_config,
+         )
+
+         # Compute scaling
+         scale_statistics = self._compute_scaling(
+             preprocessed["history_values"], preprocessed["history_mask"]
+         )
+
+         # Apply scaling
+         history_scaled = self._apply_scaling_and_masking(
+             preprocessed["history_values"],
+             scale_statistics,
+             preprocessed["history_mask"],
+         )
+
+         # Scale future values if present
+         future_scaled = None
+         if preprocessed["future_values"] is not None:
+             future_scaled = self.scaler.scale(
+                 preprocessed["future_values"], scale_statistics
+             )
+
+         # Get positional embeddings
+         history_pos_embed = self._get_positional_embeddings(
+             history_time_features,
+             preprocessed["num_channels"],
+             preprocessed["batch_size"],
+             drop_enc_allow,
+         )
+         target_pos_embed = self._get_positional_embeddings(
+             target_time_features,
+             preprocessed["num_channels"],
+             preprocessed["batch_size"],
+             drop_enc_allow,
+         )
+
+         # Compute embeddings
+         history_embed = self._compute_embeddings(
+             history_scaled, history_pos_embed, preprocessed["history_mask"]
+         )
+
+         # Generate predictions
+         predictions = self._generate_predictions(
+             history_embed,
+             target_pos_embed,
+             preprocessed["future_length"],
+             preprocessed["num_channels"],
+             preprocessed["history_mask"],
+         )
+
+         return {
+             "result": predictions,
+             "scale_statistics": scale_statistics,
+             "future_scaled": future_scaled,
+             "history_length": preprocessed["history_length"],
+             "future_length": preprocessed["future_length"],
+         }
+
+     def _quantile_loss(self, y_true: torch.Tensor, y_pred: torch.Tensor):
+         """
+         Compute the quantile loss.
+         y_true: [B, P, N]
+         y_pred: [B, P, N, Q]
+         """
+         # Add a dimension to y_true to match y_pred: [B, P, N] -> [B, P, N, 1]
+         y_true = y_true.unsqueeze(-1)
+
+         # Calculate errors
+         errors = y_true - y_pred
+
+         # Calculate quantile loss
+         # The max operator implements the two cases of the quantile loss formula
+         loss = torch.max((self.qt - 1) * errors, self.qt * errors)
+
+         # Average the loss across all dimensions
+         return loss.mean()
+
+     def compute_loss(self, y_true: torch.Tensor, y_pred: dict):
+         """Compute loss between predictions and scaled ground truth."""
+         predictions = y_pred["result"]
+         scale_statistics = y_pred["scale_statistics"]
+
+         if y_true is None:
+             return torch.tensor(0.0, device=predictions.device)
+
+         future_scaled = self.scaler.scale(y_true, scale_statistics)
+
+         if self.loss_type == "huber":
+             if predictions.shape != future_scaled.shape:
+                 raise ValueError(
+                     f"Shape mismatch for Huber loss: predictions {predictions.shape} vs future_scaled {future_scaled.shape}"
+                 )
+             return nn.functional.huber_loss(predictions, future_scaled)
+         elif self.loss_type == "quantile":
+             return self._quantile_loss(future_scaled, predictions)
+         else:
+             raise ValueError(f"Unknown loss type: {self.loss_type}")
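The `torch.max((q - 1) * e, q * e)` expression in `_quantile_loss` is the standard pinball loss: for error `e = y_true - y_pred` it equals `q * e` when the model under-predicts (`e >= 0`) and `(q - 1) * e` otherwise. A small numeric check (all values hypothetical) against the piecewise definition:

```python
import torch

q = torch.tensor([0.1, 0.5, 0.9])       # hypothetical quantile levels
y_true = torch.tensor(2.0)
y_pred = torch.tensor([1.0, 2.5, 4.0])  # one prediction per quantile

errors = y_true - y_pred                # [1.0, -0.5, -2.0]
loss_max = torch.max((q - 1) * errors, q * errors)

# Piecewise form of the pinball loss
loss_piecewise = torch.where(errors >= 0, q * errors, (q - 1) * errors)
assert torch.allclose(loss_max, loss_piecewise)
print(loss_max)  # tensor([0.1000, 0.2500, 0.2000])
```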
src/optim/lr_scheduler.py ADDED
@@ -0,0 +1,360 @@
+ # src/optim/lr_scheduler.py
+
+ import math
+ from enum import Enum
+ from functools import partial
+ from typing import Optional
+
+ from torch.optim import Optimizer
+ from torch.optim.lr_scheduler import LambdaLR
+
+
+ class SchedulerType(Enum):
+     """Enumeration of available learning rate schedulers."""
+
+     COSINE = "cosine"
+     COSINE_WITH_WARMUP = "cosine_with_warmup"
+     COSINE_WITH_RESTARTS = "cosine_with_restarts"
+     WARMUP_STABLE_DECAY = "warmup_stable_decay"
+     POLYNOMIAL_WITH_WARMUP = "polynomial_with_warmup"
+     LINEAR_WITH_WARMUP = "linear_with_warmup"
+     CONSTANT_WITH_WARMUP = "constant_with_warmup"
+     INVERSE_SQRT = "inverse_sqrt"
+
+
+ def _get_warmup_stable_decay_lr_lambda(
+     current_step: int,
+     *,
+     num_warmup_steps: int,
+     num_stable_steps: int,
+     num_training_steps: int,
+     min_lr_ratio: float = 0.001,
+     decay_type: str = "cosine",
+ ):
+     """
+     Learning rate lambda function for the Warmup-Stable-Decay (WSD) schedule.
+
+     This scheduler implements three phases:
+     1. Warmup: Linear increase from 0 to peak learning rate
+     2. Stable: Constant learning rate for the majority of training
+     3. Decay: Gradual decrease using cosine or linear decay
+
+     Args:
+         current_step: Current training step
+         num_warmup_steps: Number of warmup steps
+         num_stable_steps: Number of stable learning rate steps
+         num_training_steps: Total number of training steps
+         min_lr_ratio: Minimum learning rate as ratio of peak learning rate
+         decay_type: Type of decay schedule ("cosine" or "linear")
+     """
+     if current_step < num_warmup_steps:
+         # Warmup phase: linear increase
+         return float(current_step) / float(max(1, num_warmup_steps))
+
+     elif current_step < num_warmup_steps + num_stable_steps:
+         # Stable phase: constant learning rate
+         return 1.0
+
+     else:
+         # Decay phase
+         decay_steps = num_training_steps - num_warmup_steps - num_stable_steps
+         if decay_steps <= 0:
+             return max(min_lr_ratio, 1.0)
+
+         progress = (current_step - num_warmup_steps - num_stable_steps) / decay_steps
+         progress = min(progress, 1.0)  # Clamp to [0, 1]
+
+         if decay_type == "cosine":
+             # Cosine decay
+             decay_factor = 0.5 * (1.0 + math.cos(math.pi * progress))
+             return max(min_lr_ratio, decay_factor)
+         elif decay_type == "linear":
+             # Linear decay
+             decay_factor = 1.0 - progress
+             return max(min_lr_ratio, decay_factor)
+         else:
+             raise ValueError(f"Unknown decay_type: {decay_type}")
+
+
+ def get_warmup_stable_decay_schedule(
+     optimizer: Optimizer,
+     num_warmup_steps: int,
+     num_stable_steps: int,
+     num_training_steps: int,
+     min_lr_ratio: float = 0.01,
+     decay_type: str = "cosine",
+     last_epoch: int = -1,
+ ):
+     """
+     Create a Warmup-Stable-Decay learning rate schedule.
+
+     This scheduler is particularly well-suited for foundation model training as it:
+     - Provides stable learning during the majority of training
+     - Doesn't require pre-committing to an exact training duration
+     - Allows for extended training without aggressive decay
+
+     Args:
+         optimizer: The optimizer for which to schedule the learning rate
+         num_warmup_steps: Number of steps for warmup phase
+         num_stable_steps: Number of steps for stable learning rate phase
+         num_training_steps: Total number of training steps
+         min_lr_ratio: Minimum learning rate as fraction of peak learning rate
+         decay_type: Type of decay ("cosine" or "linear")
+         last_epoch: The index of the last epoch when resuming training
+
+     Returns:
+         torch.optim.lr_scheduler.LambdaLR with the WSD schedule
+     """
+     lr_lambda = partial(
+         _get_warmup_stable_decay_lr_lambda,
+         num_warmup_steps=num_warmup_steps,
+         num_stable_steps=num_stable_steps,
+         num_training_steps=num_training_steps,
+         min_lr_ratio=min_lr_ratio,
+         decay_type=decay_type,
+     )
+     return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
+
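A minimal usage sketch of the WSD schedule (the model, optimizer, and step counts are hypothetical), checking the learning rate at the three phase boundaries:

```python
import torch

from src.optim.lr_scheduler import get_warmup_stable_decay_schedule

model = torch.nn.Linear(8, 1)  # hypothetical model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scheduler = get_warmup_stable_decay_schedule(
    optimizer, num_warmup_steps=10, num_stable_steps=80, num_training_steps=100
)

for step in range(100):
    optimizer.step()   # scheduler.step() must follow optimizer.step()
    scheduler.step()
    if step in (9, 49, 99):
        print(step, optimizer.param_groups[0]["lr"])
# 1e-3 once warmup completes, 1e-3 throughout the stable phase,
# and 1e-5 (min_lr_ratio * peak) at the final step
```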
+
+ def _get_cosine_schedule_with_warmup_lr_lambda(
+     current_step: int,
+     *,
+     num_warmup_steps: int,
+     num_training_steps: int,
+     num_cycles: float = 0.5,
+     min_lr_ratio: float = 0.0,
+ ):
+     """Enhanced cosine schedule with configurable minimum learning rate."""
+     if current_step < num_warmup_steps:
+         return float(current_step) / float(max(1, num_warmup_steps))
+
+     progress = float(current_step - num_warmup_steps) / float(
+         max(1, num_training_steps - num_warmup_steps)
+     )
+     cosine_factor = 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
+     return max(min_lr_ratio, cosine_factor)
+
+
+ def get_enhanced_cosine_schedule_with_warmup(
+     optimizer: Optimizer,
+     num_warmup_steps: int,
+     num_training_steps: int,
+     num_cycles: float = 0.5,
+     min_lr_ratio: float = 0.01,
+     last_epoch: int = -1,
+ ):
+     """
+     Enhanced cosine schedule with warmup and configurable minimum learning rate.
+
+     Args:
+         optimizer: The optimizer for which to schedule the learning rate
+         num_warmup_steps: Number of steps for warmup phase
+         num_training_steps: Total number of training steps
+         num_cycles: Number of cosine cycles (0.5 = half cosine)
+         min_lr_ratio: Minimum learning rate as fraction of peak learning rate
+         last_epoch: The index of the last epoch when resuming training
+     """
+     lr_lambda = partial(
+         _get_cosine_schedule_with_warmup_lr_lambda,
+         num_warmup_steps=num_warmup_steps,
+         num_training_steps=num_training_steps,
+         num_cycles=num_cycles,
+         min_lr_ratio=min_lr_ratio,
+     )
+     return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
+
+
+ def _get_cosine_with_restarts_lr_lambda(
+     current_step: int,
+     *,
+     num_warmup_steps: int,
+     num_training_steps: int,
+     num_cycles: int = 1,
+     min_lr_ratio: float = 0.0,
+ ):
+     """Cosine schedule with hard restarts and configurable minimum learning rate."""
+     if current_step < num_warmup_steps:
+         return float(current_step) / float(max(1, num_warmup_steps))
+
+     progress = float(current_step - num_warmup_steps) / float(
+         max(1, num_training_steps - num_warmup_steps)
+     )
+     if progress >= 1.0:
+         return min_lr_ratio
+
+     cosine_factor = 0.5 * (
+         1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))
+     )
+     return max(min_lr_ratio, cosine_factor)
+
+
+ def get_cosine_with_restarts_schedule(
+     optimizer: Optimizer,
+     num_warmup_steps: int,
+     num_training_steps: int,
+     num_cycles: int = 4,
+     min_lr_ratio: float = 0.01,
+     last_epoch: int = -1,
+ ):
+     """
+     Cosine schedule with hard restarts.
+
+     Args:
+         optimizer: The optimizer for which to schedule the learning rate
+         num_warmup_steps: Number of steps for warmup phase
+         num_training_steps: Total number of training steps
+         num_cycles: Number of restart cycles
+         min_lr_ratio: Minimum learning rate as fraction of peak learning rate
+         last_epoch: The index of the last epoch when resuming training
+     """
+     lr_lambda = partial(
+         _get_cosine_with_restarts_lr_lambda,
+         num_warmup_steps=num_warmup_steps,
+         num_training_steps=num_training_steps,
+         num_cycles=num_cycles,
+         min_lr_ratio=min_lr_ratio,
+     )
+     return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
+
+
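To see how the two cosine variants differ, the underlying lambdas can be evaluated directly (step counts hypothetical; these are private helpers, imported here only for illustration). The restart variant returns to the peak factor at each cycle boundary, while the plain warmup variant decays monotonically:

```python
from src.optim.lr_scheduler import (
    _get_cosine_schedule_with_warmup_lr_lambda,
    _get_cosine_with_restarts_lr_lambda,
)

kwargs = dict(num_warmup_steps=10, num_training_steps=110)
for step in (10, 35, 60, 85):
    plain = _get_cosine_schedule_with_warmup_lr_lambda(step, num_cycles=0.5, **kwargs)
    restarts = _get_cosine_with_restarts_lr_lambda(step, num_cycles=4, **kwargs)
    print(step, round(plain, 3), round(restarts, 3))
# plain decays smoothly (1.0, 0.854, 0.5, 0.146), while restarts is back
# at 1.0 on steps 35, 60, and 85, the start of each new cycle
```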
+ # Scheduler registry for easy lookup
+ SCHEDULER_REGISTRY = {
+     SchedulerType.WARMUP_STABLE_DECAY: get_warmup_stable_decay_schedule,
+     SchedulerType.COSINE_WITH_WARMUP: get_enhanced_cosine_schedule_with_warmup,
+     SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_restarts_schedule,
+ }
+
+
+ def get_scheduler(
+     scheduler_type: str | SchedulerType,
+     optimizer: Optimizer,
+     num_warmup_steps: int,
+     num_training_steps: int,
+     scheduler_kwargs: Optional[dict] = None,
+ ):
+     """
+     Unified interface to create learning rate schedulers.
+
+     Args:
+         scheduler_type: Type of scheduler to create
+         optimizer: The optimizer to schedule
+         num_warmup_steps: Number of warmup steps
+         num_training_steps: Total training steps
+         scheduler_kwargs: Additional scheduler-specific parameters
+
+     Returns:
+         Configured learning rate scheduler
+     """
+     if isinstance(scheduler_type, str):
+         scheduler_type = SchedulerType(scheduler_type)
+
+     if scheduler_kwargs is None:
+         scheduler_kwargs = {}
+
+     if scheduler_type not in SCHEDULER_REGISTRY:
+         raise ValueError(f"Unsupported scheduler type: {scheduler_type}")
+
+     scheduler_func = SCHEDULER_REGISTRY[scheduler_type]
+     return scheduler_func(
+         optimizer=optimizer,
+         num_warmup_steps=num_warmup_steps,
+         num_training_steps=num_training_steps,
+         **scheduler_kwargs,
+     )
+
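A short sketch of the unified entry point (optimizer and step counts hypothetical); scheduler-specific arguments such as num_stable_steps travel through scheduler_kwargs:

```python
import torch

from src.optim.lr_scheduler import get_scheduler

optimizer = torch.optim.AdamW(torch.nn.Linear(8, 1).parameters(), lr=1e-3)
scheduler = get_scheduler(
    "warmup_stable_decay",  # resolved via SchedulerType(...)
    optimizer,
    num_warmup_steps=1_000,
    num_training_steps=100_000,
    scheduler_kwargs={"num_stable_steps": 80_000, "decay_type": "linear"},
)
```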
+ class WarmupStableDecayScheduler:
+     """
+     Alternative implementation as a standalone scheduler class.
+
+     This provides more flexibility and better state management for
+     complex training scenarios with checkpointing.
+     """
+
+     def __init__(
+         self,
+         optimizer: Optimizer,
+         num_warmup_steps: int,
+         num_stable_steps: int,
+         total_steps: int,
+         min_lr_ratio: float = 0.01,
+         decay_type: str = "cosine",
+         verbose: bool = False,
+     ):
+         self.optimizer = optimizer
+         self.num_warmup_steps = num_warmup_steps
+         self.num_stable_steps = num_stable_steps
+         self.total_steps = total_steps
+         self.min_lr_ratio = min_lr_ratio
+         self.decay_type = decay_type
+         self.verbose = verbose
+
+         # Store initial learning rates
+         self.base_lrs = [group["lr"] for group in optimizer.param_groups]
+         self.current_step = 0
+
+     def get_lr_factor(self, step: int) -> float:
+         """Calculate the learning rate multiplication factor for a given step."""
+         if step < self.num_warmup_steps:
+             # Warmup phase
+             return step / max(1, self.num_warmup_steps)
+         elif step < self.num_warmup_steps + self.num_stable_steps:
+             # Stable phase
+             return 1.0
+         else:
+             # Decay phase
+             decay_steps = (
+                 self.total_steps - self.num_warmup_steps - self.num_stable_steps
+             )
+             if decay_steps <= 0:
+                 return max(self.min_lr_ratio, 1.0)
+
+             progress = (
+                 step - self.num_warmup_steps - self.num_stable_steps
+             ) / decay_steps
+             progress = min(progress, 1.0)
+
+             if self.decay_type == "cosine":
+                 decay_factor = 0.5 * (1.0 + math.cos(math.pi * progress))
+             elif self.decay_type == "linear":
+                 decay_factor = 1.0 - progress
+             else:
+                 raise ValueError(f"Unknown decay_type: {self.decay_type}")
+
+             return max(self.min_lr_ratio, decay_factor)
+
+     def step(self):
+         """Update learning rates for all parameter groups."""
+         lr_factor = self.get_lr_factor(self.current_step)
+
+         for param_group, base_lr in zip(self.optimizer.param_groups, self.base_lrs):
+             param_group["lr"] = base_lr * lr_factor
+
+         if self.verbose and self.current_step % 1000 == 0:
+             phase = self.get_phase()
+             print(
+                 f"Step {self.current_step}: LR factor = {lr_factor:.6f}, Phase = {phase}"
+             )
+
+         self.current_step += 1
+
+     def get_phase(self) -> str:
+         """Get current training phase."""
+         if self.current_step < self.num_warmup_steps:
+             return "warmup"
+         elif self.current_step < self.num_warmup_steps + self.num_stable_steps:
+             return "stable"
+         else:
+             return "decay"
+
+     def state_dict(self) -> dict:
+         """Return scheduler state for checkpointing."""
+         return {
+             "current_step": self.current_step,
+             "base_lrs": self.base_lrs,
+         }
+
+     def load_state_dict(self, state_dict: dict):
+         """Load scheduler state from checkpoint."""
+         self.current_step = state_dict["current_step"]
+         self.base_lrs = state_dict["base_lrs"]
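A brief sketch of the standalone class (optimizer, step counts, and the checkpoint path are hypothetical); the explicit state_dict/load_state_dict pair is what makes resumption straightforward:

```python
import torch

from src.optim.lr_scheduler import WarmupStableDecayScheduler

optimizer = torch.optim.AdamW(torch.nn.Linear(8, 1).parameters(), lr=1e-3)
scheduler = WarmupStableDecayScheduler(
    optimizer, num_warmup_steps=100, num_stable_steps=800, total_steps=1000
)

for _ in range(500):
    optimizer.step()
    scheduler.step()

torch.save(scheduler.state_dict(), "scheduler.pt")  # hypothetical path

# Later: rebuild the scheduler with the same arguments, then restore progress
scheduler.load_state_dict(torch.load("scheduler.pt"))
print(scheduler.get_phase())  # "stable" at step 500
```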
src/plotting/__init__.py ADDED
File without changes
src/plotting/gift_eval_utils.py ADDED
@@ -0,0 +1,215 @@
+ import logging
+ from typing import List, Optional, Tuple
+
+ import numpy as np
+ import pandas as pd
+ from gluonts.model.forecast import QuantileForecast
+
+ from src.data.frequency import parse_frequency
+ from src.plotting.plot_timeseries import (
+     plot_multivariate_timeseries,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def _prepare_data_for_plotting(
+     input_data: dict, label_data: dict, max_context_length: int
+ ):
+     history_values = np.asarray(input_data["target"], dtype=np.float32)
+     future_values = np.asarray(label_data["target"], dtype=np.float32)
+     start_period = input_data["start"]
+
+     def ensure_time_first(arr: np.ndarray) -> np.ndarray:
+         if arr.ndim == 1:
+             return arr.reshape(-1, 1)
+         elif arr.ndim == 2:
+             if arr.shape[0] < arr.shape[1]:
+                 return arr.T
+             return arr
+         else:
+             return arr.reshape(arr.shape[-1], -1).T
+
+     history_values = ensure_time_first(history_values)
+     future_values = ensure_time_first(future_values)
+
+     if max_context_length is not None and history_values.shape[0] > max_context_length:
+         history_values = history_values[-max_context_length:]
+
+     # Convert Period to Timestamp if needed
+     start_timestamp = (
+         start_period.to_timestamp()
+         if hasattr(start_period, "to_timestamp")
+         else pd.Timestamp(start_period)
+     )
+     return history_values, future_values, start_timestamp
+
+
+ def _extract_quantile_predictions(
+     forecast,
+ ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]:
+     def ensure_2d_time_first(arr):
+         if arr is None:
+             return None
+         arr = np.asarray(arr)
+         if arr.ndim == 1:
+             return arr.reshape(-1, 1)
+         elif arr.ndim == 2:
+             return arr
+         else:
+             return arr.reshape(arr.shape[0], -1)
+
+     if isinstance(forecast, QuantileForecast):
+         try:
+             median_pred = forecast.quantile(0.5)
+             try:
+                 lower_bound = forecast.quantile(0.1)
+                 upper_bound = forecast.quantile(0.9)
+             except (KeyError, ValueError):
+                 lower_bound = None
+                 upper_bound = None
+             median_pred = ensure_2d_time_first(median_pred)
+             lower_bound = ensure_2d_time_first(lower_bound)
+             upper_bound = ensure_2d_time_first(upper_bound)
+             return median_pred, lower_bound, upper_bound
+         except Exception:
+             try:
+                 median_pred = forecast.quantile(0.5)
+                 median_pred = ensure_2d_time_first(median_pred)
+                 return median_pred, None, None
+             except Exception:
+                 return None, None, None
+     else:
+         try:
+             samples = forecast.samples
+             if samples.ndim == 1:
+                 median_pred = samples
+             elif samples.ndim == 2:
+                 if samples.shape[0] == 1:
+                     median_pred = samples[0]
+                 else:
+                     median_pred = np.median(samples, axis=0)
+             elif samples.ndim == 3:
+                 median_pred = np.median(samples, axis=0)
+             else:
+                 median_pred = samples[0] if len(samples) > 0 else samples
+             median_pred = ensure_2d_time_first(median_pred)
+             return median_pred, None, None
+         except Exception:
+             return None, None, None
+
+
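The sample-based fallback above reduces a forecast's sample paths to a per-step median in the time-first 2-D layout the plotting helpers expect. A minimal numpy sketch of that reduction (shapes hypothetical):

```python
import numpy as np

# Hypothetical sample-based forecast: 100 sample paths over 24 future steps
samples = np.random.default_rng(0).normal(size=(100, 24))

median_pred = np.median(samples, axis=0)  # collapse the sample dimension
median_pred = median_pred.reshape(-1, 1)  # time-first 2-D: [time, channels]
print(median_pred.shape)                  # (24, 1)
```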
+ def _create_plot(
+     input_data: dict,
+     label_data: dict,
+     forecast,
+     dataset_full_name: str,
+     dataset_freq: str,
+     max_context_length: int,
+     title: Optional[str] = None,
+ ):
+     try:
+         history_values, future_values, start_timestamp = _prepare_data_for_plotting(
+             input_data, label_data, max_context_length
+         )
+         median_pred, lower_bound, upper_bound = _extract_quantile_predictions(forecast)
+         if median_pred is None:
+             logger.warning(f"Could not extract predictions for {dataset_full_name}")
+             return None
+
+         def ensure_compatible_shape(pred_arr, target_arr):
+             if pred_arr is None:
+                 return None
+             pred_arr = np.asarray(pred_arr)
+             target_arr = np.asarray(target_arr)
+             if pred_arr.ndim == 1:
+                 pred_arr = pred_arr.reshape(-1, 1)
+             if target_arr.ndim == 1:
+                 target_arr = target_arr.reshape(-1, 1)
+             if pred_arr.shape != target_arr.shape:
+                 if pred_arr.shape[0] == target_arr.shape[0]:
+                     if pred_arr.shape[1] == 1 and target_arr.shape[1] > 1:
+                         pred_arr = np.broadcast_to(pred_arr, target_arr.shape)
+                     elif pred_arr.shape[1] > 1 and target_arr.shape[1] == 1:
+                         pred_arr = pred_arr[:, :1]
+                 elif pred_arr.shape[1] == target_arr.shape[1]:
+                     min_time = min(pred_arr.shape[0], target_arr.shape[0])
+                     pred_arr = pred_arr[:min_time]
+                 else:
+                     if pred_arr.T.shape == target_arr.shape:
+                         pred_arr = pred_arr.T
+                     else:
+                         if pred_arr.size >= target_arr.shape[0]:
+                             pred_arr = pred_arr.flatten()[
+                                 : target_arr.shape[0]
+                             ].reshape(-1, 1)
+                             if target_arr.shape[1] > 1:
+                                 pred_arr = np.broadcast_to(pred_arr, target_arr.shape)
+             return pred_arr
+
+         median_pred = ensure_compatible_shape(median_pred, future_values)
+         lower_bound = ensure_compatible_shape(lower_bound, future_values)
+         upper_bound = ensure_compatible_shape(upper_bound, future_values)
+
+         title = title or f"GIFT-Eval: {dataset_full_name}"
+         frequency = parse_frequency(dataset_freq)
+         fig = plot_multivariate_timeseries(
+             history_values=history_values,
+             future_values=future_values,
+             predicted_values=median_pred,
+             lower_bound=lower_bound,
+             upper_bound=upper_bound,
+             start=start_timestamp,
+             frequency=frequency,
+             title=title,
+             show=False,
+         )
+         return fig
+     except Exception as e:
+         logger.warning(f"Failed to create plot for {dataset_full_name}: {e}")
+         return None
+
+
+ def create_plots_for_dataset(
+     forecasts: List,
+     test_data,
+     dataset_metadata,
+     max_plots: int,
+     max_context_length: int,
+ ) -> List[Tuple[object, str]]:
+     input_data_list = list(test_data.input)
+     label_data_list = list(test_data.label)
+     num_plots = min(len(forecasts), max_plots)
+     logger.info(
+         f"Creating {num_plots} plots for {getattr(dataset_metadata, 'full_name', str(dataset_metadata))}"
+     )
+
+     figures_with_names: List[Tuple[object, str]] = []
+     for i in range(num_plots):
+         try:
+             forecast = forecasts[i]
+             input_data = input_data_list[i]
+             label_data = label_data_list[i]
+             title = (
+                 f"GIFT-Eval: {dataset_metadata.full_name} - Window {i + 1}/{num_plots}"
+                 if hasattr(dataset_metadata, "full_name")
+                 else f"Window {i + 1}/{num_plots}"
+             )
+             fig = _create_plot(
+                 input_data=input_data,
+                 label_data=label_data,
+                 forecast=forecast,
+                 dataset_full_name=getattr(dataset_metadata, "full_name", "dataset"),
+                 dataset_freq=getattr(dataset_metadata, "freq", "D"),
+                 max_context_length=max_context_length,
+                 title=title,
+             )
+             if fig is not None:
+                 filename = (
+                     f"{getattr(dataset_metadata, 'freq', 'D')}_window_{i + 1:03d}.png"
+                 )
+                 figures_with_names.append((fig, filename))
+         except Exception as e:
+             logger.warning(f"Error creating plot for window {i + 1}: {e}")
+             continue
+     return figures_with_names
src/plotting/plot_timeseries.py ADDED
@@ -0,0 +1,292 @@
+ import logging
+ from typing import List, Optional, Tuple, Union
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import torch
+ import torchmetrics
+ from matplotlib.figure import Figure
+
+ from src.data.containers import BatchTimeSeriesContainer
+ from src.data.frequency import Frequency
+
+ logger = logging.getLogger(__name__)
+
+
+ def calculate_smape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
+     """Calculate Symmetric Mean Absolute Percentage Error (SMAPE)."""
+     pred_tensor = torch.from_numpy(y_pred).float()
+     true_tensor = torch.from_numpy(y_true).float()
+     return torchmetrics.SymmetricMeanAbsolutePercentageError()(
+         pred_tensor, true_tensor
+     ).item()
+
+
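A quick numeric check of the SMAPE helper (values hypothetical); torchmetrics computes the mean of 2|y_hat - y| / (|y_hat| + |y|):

```python
import numpy as np

from src.plotting.plot_timeseries import calculate_smape

y_true = np.array([100.0, 200.0])
y_pred = np.array([110.0, 180.0])
# Per-step terms: 2*10/210 ~= 0.0952 and 2*20/380 ~= 0.1053
print(calculate_smape(y_true, y_pred))  # ~0.1003, i.e. ~10% symmetric error
```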
+ def _create_date_ranges(
+     start: Optional[Union[np.datetime64, pd.Timestamp]],
+     frequency: Optional[Union[Frequency, str]],
+     history_length: int,
+     prediction_length: int,
+ ) -> Tuple[pd.DatetimeIndex, pd.DatetimeIndex]:
+     """Create date ranges for history and future periods."""
+     if start is not None and frequency is not None:
+         start_timestamp = pd.Timestamp(start)
+         pandas_freq = frequency.to_pandas_freq(for_date_range=True)
+
+         history_dates = pd.date_range(
+             start=start_timestamp, periods=history_length, freq=pandas_freq
+         )
+
+         if prediction_length > 0:
+             next_timestamp = history_dates[-1] + pd.tseries.frequencies.to_offset(
+                 pandas_freq
+             )
+             future_dates = pd.date_range(
+                 start=next_timestamp, periods=prediction_length, freq=pandas_freq
+             )
+         else:
+             future_dates = pd.DatetimeIndex([])
+     else:
+         # Fallback to default daily frequency
+         history_dates = pd.date_range(
+             end=pd.Timestamp.now(), periods=history_length, freq="D"
+         )
+
+         if prediction_length > 0:
+             future_dates = pd.date_range(
+                 start=history_dates[-1] + pd.Timedelta(days=1),
+                 periods=prediction_length,
+                 freq="D",
+             )
+         else:
+             future_dates = pd.DatetimeIndex([])
+
+     return history_dates, future_dates
+
+
+ def _plot_single_channel(
+     ax: plt.Axes,
+     channel_idx: int,
+     history_dates: pd.DatetimeIndex,
+     future_dates: pd.DatetimeIndex,
+     history_values: np.ndarray,
+     future_values: Optional[np.ndarray] = None,
+     predicted_values: Optional[np.ndarray] = None,
+     lower_bound: Optional[np.ndarray] = None,
+     upper_bound: Optional[np.ndarray] = None,
+ ) -> None:
+     """Plot a single channel's time series data."""
+     # Plot history
+     ax.plot(
+         history_dates, history_values[:, channel_idx], color="black", label="History"
+     )
+
+     # Plot ground truth future
+     if future_values is not None:
+         ax.plot(
+             future_dates,
+             future_values[:, channel_idx],
+             color="blue",
+             label="Ground Truth",
+         )
+
+     # Plot predictions
+     if predicted_values is not None:
+         ax.plot(
+             future_dates,
+             predicted_values[:, channel_idx],
+             color="orange",
+             linestyle="--",
+             label="Prediction (Median)",
+         )
+
+     # Plot uncertainty band
+     if lower_bound is not None and upper_bound is not None:
+         ax.fill_between(
+             future_dates,
+             lower_bound[:, channel_idx],
+             upper_bound[:, channel_idx],
+             color="orange",
+             alpha=0.2,
+             label="Uncertainty Band",
+         )
+
+     ax.set_title(f"Channel {channel_idx + 1}")
+     ax.grid(True, which="both", linestyle="--", linewidth=0.5)
+
+
+ def _setup_figure(num_channels: int) -> Tuple[Figure, List[plt.Axes]]:
+     """Create and configure the matplotlib figure and axes."""
+     fig, axes = plt.subplots(
+         num_channels, 1, figsize=(15, 3 * num_channels), sharex=True
+     )
+     if num_channels == 1:
+         axes = [axes]
+     return fig, axes
+
+
+ def _finalize_plot(
+     fig: Figure,
+     axes: List[plt.Axes],
+     title: Optional[str] = None,
+     smape_value: Optional[float] = None,
+     output_file: Optional[str] = None,
+     show: bool = True,
+ ) -> None:
+     """Add legend, title, and save/show the plot."""
+     # Create legend from first axis
+     handles, labels = axes[0].get_legend_handles_labels()
+     fig.legend(handles, labels, loc="upper right")
+
+     # Set title with optional SMAPE
+     if title:
+         if smape_value is not None:
+             title = f"{title} | SMAPE: {smape_value:.4f}"
+         fig.suptitle(title, fontsize=16)
+
+     # Adjust layout
+     plt.tight_layout(rect=[0, 0.03, 1, 0.95] if title else None)
+
+     # Save and/or show
+     if output_file:
+         plt.savefig(output_file, dpi=300)
+     if show:
+         plt.show()
+     else:
+         plt.close(fig)
+
+
+ def plot_multivariate_timeseries(
+     history_values: np.ndarray,
+     future_values: Optional[np.ndarray] = None,
+     predicted_values: Optional[np.ndarray] = None,
+     start: Optional[Union[np.datetime64, pd.Timestamp]] = None,
+     frequency: Optional[Union[Frequency, str]] = None,
+     title: Optional[str] = None,
+     output_file: Optional[str] = None,
+     show: bool = True,
+     lower_bound: Optional[np.ndarray] = None,
+     upper_bound: Optional[np.ndarray] = None,
+ ) -> Figure:
+     """Plot a multivariate time series with history, future, predictions, and uncertainty bands."""
+     # Calculate SMAPE if both predicted and true values are available
+     smape_value = None
+     if predicted_values is not None and future_values is not None:
+         try:
+             smape_value = calculate_smape(future_values, predicted_values)
+         except Exception as e:
+             logger.warning(f"Failed to calculate SMAPE: {str(e)}")
+
+     # Extract dimensions
+     num_channels = history_values.shape[1]
+     history_length = history_values.shape[0]
+     prediction_length = (
+         predicted_values.shape[0]
+         if predicted_values is not None
+         else (future_values.shape[0] if future_values is not None else 0)
+     )
+
+     # Create date ranges
+     history_dates, future_dates = _create_date_ranges(
+         start, frequency, history_length, prediction_length
+     )
+
+     # Setup figure
+     fig, axes = _setup_figure(num_channels)
+
+     # Plot each channel
+     for i in range(num_channels):
+         _plot_single_channel(
+             ax=axes[i],
+             channel_idx=i,
+             history_dates=history_dates,
+             future_dates=future_dates,
+             history_values=history_values,
+             future_values=future_values,
+             predicted_values=predicted_values,
+             lower_bound=lower_bound,
+             upper_bound=upper_bound,
+         )
+
+     # Finalize plot
+     _finalize_plot(fig, axes, title, smape_value, output_file, show)
+
+     return fig
+
+
+ def _extract_quantile_predictions(
+     predicted_values: np.ndarray,
+     model_quantiles: List[float],
+ ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]:
+     """Extract median, lower, and upper bound predictions from quantile output."""
+     try:
+         median_idx = model_quantiles.index(0.5)
+         lower_idx = model_quantiles.index(0.1)
+         upper_idx = model_quantiles.index(0.9)
+
+         median_preds = predicted_values[..., median_idx]
+         lower_bound = predicted_values[..., lower_idx]
+         upper_bound = predicted_values[..., upper_idx]
+
+         return median_preds, lower_bound, upper_bound
+     except (ValueError, IndexError):
+         logger.warning(
+             "Could not find 0.1, 0.5, 0.9 quantiles for plotting. Using median of available quantiles."
+         )
+         median_preds = predicted_values[..., predicted_values.shape[-1] // 2]
+         return median_preds, None, None
+
+
+ def plot_from_container(
+     batch: BatchTimeSeriesContainer,
+     sample_idx: int,
+     predicted_values: Optional[np.ndarray] = None,
+     model_quantiles: Optional[List[float]] = None,
+     title: Optional[str] = None,
+     output_file: Optional[str] = None,
+     show: bool = True,
+ ) -> Figure:
+     """Plot a single sample from a BatchTimeSeriesContainer with proper quantile handling."""
+     # Extract data for the specific sample
+     history_values = batch.history_values[sample_idx].cpu().numpy()
+     future_values = batch.future_values[sample_idx].cpu().numpy()
+
+     # Process predictions
+     if predicted_values is not None:
+         # Handle batch vs single sample predictions
+         if predicted_values.ndim >= 3 or (
+             predicted_values.ndim == 2
+             and predicted_values.shape[0] > future_values.shape[0]
+         ):
+             sample_preds = predicted_values[sample_idx]
+         else:
+             sample_preds = predicted_values
+
+         # Extract quantile information if available
+         if model_quantiles:
+             median_preds, lower_bound, upper_bound = _extract_quantile_predictions(
+                 sample_preds, model_quantiles
+             )
+         else:
+             median_preds = sample_preds
+             lower_bound = None
+             upper_bound = None
+     else:
+         median_preds = None
+         lower_bound = None
+         upper_bound = None
+
+     # Create the plot
+     return plot_multivariate_timeseries(
+         history_values=history_values,
+         future_values=future_values,
+         predicted_values=median_preds,
+         start=batch.start[sample_idx],
+         frequency=batch.frequency[sample_idx],
+         title=title,
+         output_file=output_file,
+         show=show,
+         lower_bound=lower_bound,
+         upper_bound=upper_bound,
+     )
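A runnable sketch of the main plotting entry point on synthetic data (the output filename is hypothetical); with start and frequency omitted, the helper falls back to a daily date range ending at the current time:

```python
import numpy as np

from src.plotting.plot_timeseries import plot_multivariate_timeseries

# Hypothetical 2-channel series: 100 history steps and 20 future steps
rng = np.random.default_rng(0)
history = rng.normal(size=(100, 2)).cumsum(axis=0)
future = rng.normal(size=(20, 2)).cumsum(axis=0) + history[-1]
prediction = future + rng.normal(scale=0.5, size=future.shape)

fig = plot_multivariate_timeseries(
    history_values=history,
    future_values=future,
    predicted_values=prediction,
    title="Synthetic example",
    show=False,
    output_file="example_plot.png",  # hypothetical path
)
```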
src/synthetic_generation/__init__.py ADDED
File without changes
src/synthetic_generation/abstract_classes.py ADDED
@@ -0,0 +1,97 @@
+ import zlib
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, Optional
+
+ import numpy as np
+ import torch
+
+ from src.data.containers import TimeSeriesContainer
+ from src.data.frequency import (
+     select_safe_random_frequency,
+     select_safe_start_date,
+ )
+ from src.synthetic_generation.generator_params import GeneratorParams
+
+
+ class AbstractTimeSeriesGenerator(ABC):
+     """
+     Abstract base class for synthetic time series generators.
+     """
+
+     @abstractmethod
+     def generate_time_series(self, random_seed: Optional[int] = None) -> np.ndarray:
+         """
+         Generate synthetic time series data.
+
+         Parameters
+         ----------
+         random_seed : int, optional
+             Random seed for reproducibility.
+
+         Returns
+         -------
+         np.ndarray
+             Time series values of shape (length,) for univariate or
+             (length, num_channels) for multivariate time series.
+         """
+         pass
+
+
+ class GeneratorWrapper:
+     """
+     Unified base class for all generator wrappers, using a GeneratorParams dataclass
+     for configuration. Provides parameter sampling, validation, and batch formatting
+     utilities.
+     """
+
+     def __init__(self, params: GeneratorParams):
+         """
+         Initialize the GeneratorWrapper with a GeneratorParams dataclass.
+
+         Parameters
+         ----------
+         params : GeneratorParams
+             Dataclass instance containing all generator configuration parameters.
+         """
+         self.params = params
+         self._set_random_seeds(self.params.global_seed)
+
+     def _set_random_seeds(self, seed: int) -> None:
+         # For parameter sampling, we want diversity across batches even with similar
+         # seeds, so offset the seed by a value derived from the generator class name.
+         # zlib.crc32 is used rather than the built-in hash(), whose per-process
+         # salting for strings would break run-to-run reproducibility.
+         param_seed = seed + zlib.crc32(self.__class__.__name__.encode()) % 2**31
+         self.rng = np.random.default_rng(param_seed)
+
+         # Set global numpy and torch seeds for deterministic behavior in underlying generators
+         np.random.seed(seed)
+         torch.manual_seed(seed)
+
+     def _sample_parameters(self, batch_size: int) -> Dict[str, Any]:
+         """
+         Sample per-series frequency and start date for a batch.
+
+         Frequencies are chosen to be safe for the configured total length, and
+         start dates are chosen so the full series fits in the valid date range.
+
+         Returns
+         -------
+         Dict[str, Any]
+             Dictionary with "frequency" and "start" lists of length batch_size.
+         """
+
+         # Select a suitable frequency based on the total length
+         frequency = [
+             select_safe_random_frequency(self.params.length, self.rng)
+             for _ in range(batch_size)
+         ]
+         start = [
+             select_safe_start_date(self.params.length, frequency[i], self.rng)
+             for i in range(batch_size)
+         ]
+
+         return {
+             "frequency": frequency,
+             "start": start,
+         }
+
+     @abstractmethod
+     def generate_batch(
+         self, batch_size: int, seed: Optional[int] = None, **kwargs
+     ) -> TimeSeriesContainer:
+         raise NotImplementedError("Subclasses must implement generate_batch()")
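A hedged sketch of what a concrete generator can look like under this interface; the random-walk logic and every name in it are illustrative, not part of the repository:

```python
from typing import Optional

import numpy as np

from src.synthetic_generation.abstract_classes import AbstractTimeSeriesGenerator


class RandomWalkGenerator(AbstractTimeSeriesGenerator):
    """Illustrative generator: a univariate Gaussian random walk."""

    def __init__(self, length: int = 512, scale: float = 1.0):
        self.length = length
        self.scale = scale

    def generate_time_series(self, random_seed: Optional[int] = None) -> np.ndarray:
        rng = np.random.default_rng(random_seed)
        # Cumulative sum of Gaussian steps -> shape (length,)
        return rng.normal(scale=self.scale, size=self.length).cumsum()


series = RandomWalkGenerator(length=256).generate_time_series(random_seed=7)
print(series.shape)  # (256,)
```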