# -*- coding: utf-8 -*-
# --- Standard Libraries ---
import copy
import time
import warnings
from datetime import datetime, timedelta
import os
import json
from IPython.display import Video

# --- Data Manipulation ---
import numpy as np
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar

# --- Visualization ---
import matplotlib.pyplot as plt
import seaborn as sns

# --- Machine Learning ---
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# --- PyTorch ---
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# --- Weather Data ---
# Ensure meteostat is installed: pip install meteostat
try:
    from meteostat import Stations, Hourly
except ImportError:
    print("Meteostat library not found. Please install it: pip install meteostat")
    # You might want to exit or handle this case appropriately
    # For now, we'll proceed assuming it might be installed later or data is cached
    pass

# --- Configure Warnings and Plot Style ---
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')

# --- Check for GPU Availability ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Helper Functions/Classes ---

def mean_absolute_percentage_error(y_true, y_pred):
    """Calculates MAPE given y_true and y_pred"""
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    # Avoid division by zero
    non_zero_mask = y_true != 0
    # Handle cases where all true values are zero
    if not np.any(non_zero_mask):
        return 0.0
    return np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # *** CORRECTED LINE BELOW ***
        self.val_loss_min = np.inf # Use np.inf instead of np.Inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose: # Only print counter if verbose
                 self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        try:
             torch.save(model.state_dict(), self.path)
        except Exception as e:
             self.trace_func(f"Error saving checkpoint: {e}") # Report errors during saving
        self.val_loss_min = val_loss

# Placeholder for Configs class
class Configs:
    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)

Using device: cuda

csv_path = '../data/3/PJME_hourly.csv'
if not os.path.exists(csv_path):
     print(f"Error: CSV file not found at {csv_path}")
     print("Please download the PJME hourly energy data and place it as PJME_hourly.csv in the current directory.")
     pjme = None # Set to None to prevent further errors if file is missing
else:
    pjme = (
        pd.read_csv(
            csv_path,
            index_col='Datetime',
            parse_dates=['Datetime']
        )
        .sort_index()
        .loc['2003-01-01':'2018-08-02'] # Using the same date range as provided
    )

if pjme is not None:
    # --- Feature Engineering: Holidays and Calendar Features ---
    cal = USFederalHolidayCalendar()
    fed_hols = cal.holidays(start=pjme.index.min(), end=pjme.index.max())

    # Extend July 4th holiday based on weekday
    extended_hols = set(fed_hols)
    for year in range(pjme.index.year.min(), pjme.index.year.max() + 1):
        try:
            july4 = datetime(year, 7, 4)
            wd = july4.weekday()
            if wd == 1:  # If July 4th is a Tuesday, add Monday
                extended_hols.add(july4 - timedelta(days=1))
            elif wd == 2:  # If July 4th is a Wednesday, add Thursday and Friday
                extended_hols.add(july4 + timedelta(days=1))
                extended_hols.add(july4 + timedelta(days=2))
            elif wd == 3:  # If July 4th is a Thursday, add Friday
                extended_hols.add(july4 + timedelta(days=1))
            # Add July 4th itself regardless of the day
            extended_hols.add(july4)
        except ValueError: # Handle cases like leap years if necessary, though July 4th is fixed
             pass

    all_hols = pd.DatetimeIndex(sorted(list(extended_hols))) # Convert set to list before sorting

    pjme['is_holiday'] = pjme.index.normalize().isin(all_hols)
    pjme['is_weekend'] = pjme.index.weekday >= 5
    pjme['is_dayoff'] = (pjme['is_holiday'] | pjme['is_weekend']).astype(int) # Convert boolean to int
    pjme.drop(columns=['is_holiday', 'is_weekend'], inplace=True)

    # Calendar Features
    pjme['hour']       = pjme.index.hour
    pjme['dayofweek']  = pjme.index.dayofweek # Monday=0, Sunday=6
    pjme['dayofmonth'] = pjme.index.day
    pjme['month']      = pjme.index.month
    pjme['year']       = pjme.index.year
    pjme['dayofyear']  = pjme.index.dayofyear
    pjme['weekofyear'] = pjme.index.isocalendar().week.astype(int)


    # --- Fetch and Process Weather Data ---
    # Define date range for weather data (slightly wider to ensure coverage)
    start_dt = pjme.index.min() - timedelta(days=1)
    end_dt   = pjme.index.max() + timedelta(days=1)

    # Coordinates for PJME region center (approximate)
    lat, lon = 39.95, -75.17

    # Target ICAOs based on the original script
    target_icaos = ['KPHL', 'KEWR', 'KBWI', 'KDCA'] # Philadelphia, Newark, Baltimore, Washington DC area airports

    average_weather = None
    try:
        # Find nearby stations (using a reasonable radius, e.g., 200km)
        stations_query = Stations()
        # stations_query = stations_query.nearby(lat, lon, 200000) # 200 km radius
        stations_query = stations_query.inventory('hourly', (start_dt, end_dt))
        nearby_stations_df = stations_query.fetch()

        # Filter for target ICAOs if found, otherwise use closest available
        target_stations_df = nearby_stations_df[nearby_stations_df['icao'].isin(target_icaos)]
        if target_stations_df.empty:
            print(f"Warning: None of the target ICAOs {target_icaos} found nearby or had data. Using the closest available stations.")
            # Optionally, select the top N closest stations if target ones are missing
            station_ids = nearby_stations_df.head(4).index.tolist() # Example: use top 4 closest
        else:
            station_ids = target_stations_df.index.tolist()

        if station_ids:
            print(f"Fetching weather data for stations: {station_ids}")
            # Fetch hourly data
            weather_all = Hourly(station_ids, start_dt, end_dt).fetch()

            # Select relevant weather columns and calculate average
            weather_cols = ['temp', 'dwpt', 'rhum', 'prcp', 'wspd'] # Temperature (°C), Dew Point (°C), Relative Humidity (%), Precipitation (mm), Wind Speed (km/h)
            # Ensure only selected columns exist before grouping
            valid_cols = [col for col in weather_cols if col in weather_all.columns]
            average_weather = weather_all.groupby(level='time').mean(numeric_only=True)[valid_cols].ffill().bfill() # Forward fill then backfill
            print("Weather data fetched and processed.")
        else:
            print("Warning: No suitable weather stations found.")

    except Exception as e:
        print(f"An error occurred during weather data fetching: {e}")
        print("Proceeding without weather data.")

    # --- Combine Energy and Weather Data ---
    if average_weather is not None:
        pjme_weather = pjme.join(average_weather, how='left')
        # Fill missing weather data (e.g., at the beginning/end or if fetching failed partially)
        # Using forward fill first, then backward fill
        pjme_weather[average_weather.columns] = pjme_weather[average_weather.columns].ffill().bfill()
        print("Energy and weather data joined.")
    else:
        pjme_weather = pjme.copy()
        # Add placeholder columns if weather data is missing
        weather_cols = ['temp', 'dwpt', 'rhum', 'prcp', 'wspd']
        for col in weather_cols:
            if col not in pjme_weather.columns:
                 pjme_weather[col] = 0 # Or another suitable default
        print("Proceeding with energy data only.")


    # Ensure no NaNs remain after joins and fills
    # Check for NaNs and report if any
    nan_counts = pjme_weather.isna().sum()
    if nan_counts.sum() > 0:
        print("Warning: NaNs found after processing:")
        print(nan_counts[nan_counts > 0])
        # Option 1: Drop rows with NaNs (might lose data)
        # pjme_weather.dropna(inplace=True)
        # Option 2: Fill with a specific value (like 0 or mean) - already tried ffill/bfill
        # Consider more sophisticated imputation if necessary
        print("Attempting final fill with 0 for any remaining NaNs...")
        pjme_weather.fillna(0, inplace=True)


    print(f'Combined data shape: {pjme_weather.shape}')
    # Check index monotonicity again
    pjme_weather = pjme_weather.sort_index()
    print(f'Index monotonic? {pjme_weather.index.is_monotonic_increasing}')
    print("Data Head:")
    print(pjme_weather.head())
    print("\nData Tail:")
    print(pjme_weather.tail())
    print("\nData Info:")
    pjme_weather.info()
    print("\nNaN check after processing:")
    print(pjme_weather.isna().sum())


    # --- Define Target and Features ---
    TARGET = 'PJME_MW'
    # Features = All columns except the target
    FEATURES = [col for col in pjme_weather.columns if col != TARGET]
    print(f"\nTarget variable: {TARGET}")
    print(f"Feature columns: {FEATURES}")
    print(f"Number of features: {len(FEATURES)}")

else:
    print("Execution stopped due to missing data file.")

Fetching weather data for stations: ['72405', '72406', '72408', '72502']
Weather data fetched and processed.
Energy and weather data joined.
Combined data shape: (136608, 14)
Index monotonic? True
Data Head:
                     PJME_MW  is_dayoff  hour  dayofweek  dayofmonth  month  \
2003-01-01 00:00:00  27008.0          1     0          2           1      1   
2003-01-01 01:00:00  25591.0          1     1          2           1      1   
2003-01-01 02:00:00  24235.0          1     2          2           1      1   
2003-01-01 03:00:00  23121.0          1     3          2           1      1   
2003-01-01 04:00:00  22445.0          1     4          2           1      1   

                     year  dayofyear  weekofyear   temp   dwpt   rhum  prcp  \
2003-01-01 00:00:00  2003          1           1  9.450  4.675  73.25   0.0   
2003-01-01 01:00:00  2003          1           1  9.300  4.750  74.00   0.0   
2003-01-01 02:00:00  2003          1           1  7.800  4.400  79.25   0.0   
2003-01-01 03:00:00  2003          1           1  7.775  4.200  79.00   0.0   
2003-01-01 04:00:00  2003          1           1  7.350  4.050  80.50   0.0   

                      wspd  
2003-01-01 00:00:00  7.950  
2003-01-01 01:00:00  7.400  
2003-01-01 02:00:00  8.825  
2003-01-01 03:00:00  6.950  
2003-01-01 04:00:00  6.050  

Data Tail:
                     PJME_MW  is_dayoff  hour  dayofweek  dayofmonth  month  \
2018-08-02 19:00:00  45641.0          0    19          3           2      8   
2018-08-02 20:00:00  44057.0          0    20          3           2      8   
2018-08-02 21:00:00  43256.0          0    21          3           2      8   
2018-08-02 22:00:00  41552.0          0    22          3           2      8   
2018-08-02 23:00:00  38500.0          0    23          3           2      8   

                     year  dayofyear  weekofyear    temp    dwpt   rhum  \
2018-08-02 19:00:00  2018        214          31  28.500  22.000  68.75   
2018-08-02 20:00:00  2018        214          31  27.625  22.525  75.75   
2018-08-02 21:00:00  2018        214          31  27.375  22.100  75.00   
2018-08-02 22:00:00  2018        214          31  27.375  22.175  75.00   
2018-08-02 23:00:00  2018        214          31  26.400  22.450  80.00   

                         prcp    wspd  
2018-08-02 19:00:00  0.066667  18.100  
2018-08-02 20:00:00  0.933333  17.100  
2018-08-02 21:00:00  1.100000  13.850  
2018-08-02 22:00:00  0.250000  11.525  
2018-08-02 23:00:00  1.366667  12.100  

Data Info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 136608 entries, 2003-01-01 00:00:00 to 2018-08-02 23:00:00
Data columns (total 14 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   PJME_MW     136608 non-null  float64
 1   is_dayoff   136608 non-null  int64  
 2   hour        136608 non-null  int32  
 3   dayofweek   136608 non-null  int32  
 4   dayofmonth  136608 non-null  int32  
 5   month       136608 non-null  int32  
 6   year        136608 non-null  int32  
 7   dayofyear   136608 non-null  int32  
 8   weekofyear  136608 non-null  int64  
 9   temp        136608 non-null  float64
 10  dwpt        136608 non-null  float64
 11  rhum        136608 non-null  float64
 12  prcp        136608 non-null  float64
 13  wspd        136608 non-null  float64
dtypes: float64(6), int32(6), int64(2)
memory usage: 12.5 MB

NaN check after processing:
PJME_MW       0
is_dayoff     0
hour          0
dayofweek     0
dayofmonth    0
month         0
year          0
dayofyear     0
weekofyear    0
temp          0
dwpt          0
rhum          0
prcp          0
wspd          0
dtype: int64

Target variable: PJME_MW
Feature columns: ['is_dayoff', 'hour', 'dayofweek', 'dayofmonth', 'month', 'year', 'dayofyear', 'weekofyear', 'temp', 'dwpt', 'rhum', 'prcp', 'wspd']
Number of features: 13

if pjme_weather is not None:
    total_hours = len(pjme_weather)
    # Using the same 80/10/10 split ratios
    test_split_idx = int(total_hours * 0.9)
    val_split_idx = int(total_hours * 0.8)

    train_df = pjme_weather.iloc[:val_split_idx].copy()
    val_df = pjme_weather.iloc[val_split_idx:test_split_idx].copy()
    test_df = pjme_weather.iloc[test_split_idx:].copy()

    print(f"Train: {len(train_df)} rows ({train_df.index.min()} to {train_df.index.max()})")
    print(f"Val  : {len(val_df)} rows ({val_df.index.min()} to {val_df.index.max()})")
    print(f"Test : {len(test_df)} rows ({test_df.index.min()} to {test_df.index.max()})")

    # Visualize the split
    plt.figure(figsize=(15, 5))
    plt.plot(train_df.index, train_df[TARGET], label='Train')
    plt.plot(val_df.index, val_df[TARGET], label='Validation')
    plt.plot(test_df.index, test_df[TARGET], label='Test')
    plt.title('PJME Energy Consumption - Train/Val/Test Split')
    plt.xlabel('Date')
    plt.ylabel('MW')
    plt.legend()
    plt.show()
else:
     print("Skipping split due to missing data.")

Train: 109286 rows (2003-01-01 00:00:00 to 2015-06-21 13:00:00)
Val  : 13661 rows (2015-06-21 14:00:00 to 2017-01-10 17:00:00)
Test : 13661 rows (2017-01-10 18:00:00 to 2018-08-02 23:00:00)

if pjme_weather is not None:
    # Scale features (exogenous variables)
    feature_scaler = StandardScaler()
    train_df[FEATURES] = feature_scaler.fit_transform(train_df[FEATURES])
    val_df[FEATURES] = feature_scaler.transform(val_df[FEATURES])
    test_df[FEATURES] = feature_scaler.transform(test_df[FEATURES])

    # Scale target variable separately (important for inverse transform later)
    target_scaler = StandardScaler()
    train_df[[TARGET]] = target_scaler.fit_transform(train_df[[TARGET]])
    val_df[[TARGET]] = target_scaler.transform(val_df[[TARGET]])
    test_df[[TARGET]] = target_scaler.transform(test_df[[TARGET]])

    print("Features and target variable scaled.")
    print("Train DataFrame head after scaling:")
    print(train_df.head())
else:
     print("Skipping scaling due to missing data.")

Features and target variable scaled.
Train DataFrame head after scaling:
                      PJME_MW  is_dayoff      hour  dayofweek  dayofmonth  \
2003-01-01 00:00:00 -0.830722   1.477475 -1.661665  -0.500253   -1.672229   
2003-01-01 01:00:00 -1.051358   1.477475 -1.517189  -0.500253   -1.672229   
2003-01-01 02:00:00 -1.262495   1.477475 -1.372713  -0.500253   -1.672229   
2003-01-01 03:00:00 -1.435951   1.477475 -1.228238  -0.500253   -1.672229   
2003-01-01 04:00:00 -1.541209   1.477475 -1.083762  -0.500253   -1.672229   

                        month      year  dayofyear  weekofyear      temp  \
2003-01-01 00:00:00 -1.566074 -1.593315  -1.691714   -1.665309 -0.405299   
2003-01-01 01:00:00 -1.566074 -1.593315  -1.691714   -1.665309 -0.420108   
2003-01-01 02:00:00 -1.566074 -1.593315  -1.691714   -1.665309 -0.568199   
2003-01-01 03:00:00 -1.566074 -1.593315  -1.691714   -1.665309 -0.570667   
2003-01-01 04:00:00 -1.566074 -1.593315  -1.691714   -1.665309 -0.612627   

                         dwpt      rhum      prcp      wspd  
2003-01-01 00:00:00 -0.119048  0.547299 -0.221923 -0.763367  
2003-01-01 01:00:00 -0.112001  0.587985 -0.221923 -0.840180  
2003-01-01 02:00:00 -0.144889  0.872787 -0.221923 -0.641165  
2003-01-01 03:00:00 -0.163682  0.859225 -0.221923 -0.903027  
2003-01-01 04:00:00 -0.177777  0.940597 -0.221923 -1.028720

# --- Positional Embedding ---
class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEmbedding, self).__init__()
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).float()
        pe.require_grad = False

        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(np.log(10000.0) / d_model)).exp()

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        # Keep pe shape as [max_len, d_model] for easier slicing
        # pe = pe.unsqueeze(0) # Remove batch dim here
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Input x shape: Tensor for which we need positional embedding, used to determine seq_len
        # Output shape: [seq_len, d_model] - The calling function will handle batch expansion
        seq_len = x.size(1) # Assuming x has shape [B, L, D] or similar where L is seq_len
        if seq_len > self.pe.size(0):
             raise ValueError(f"Sequence length {seq_len} exceeds maximum positional embedding length {self.pe.size(0)}")
        # Return the slice corresponding to the sequence length
        return self.pe[:seq_len, :]


# --- Time Feature Embedding (Part of DataEmbedding_inverted) ---
class TimeFeatureEmbedding(nn.Module):
    def __init__(self, d_model, embed_type='timeF', freq='h'):
        super(TimeFeatureEmbedding, self).__init__()

        # Determine input dimension based on frequency
        # Adjust based on the actual features computed in TimeSeriesDataset._compute_time_features
        # Current features: hour, dayofweek, day, dayofyear, month, weekofyear -> 6 features
        if freq == 'h':
            # Match the number of features created in the Dataset class
            d_inp = 6 # hour, dayofweek, dayofmonth, dayofyear, month, weekofyear
        # Add other frequencies if needed
        # elif freq == 't': d_inp = ...
        else:
            d_inp = 2 # Default fallback
        self.embed = nn.Linear(d_inp, d_model, bias=False)

    def forward(self, x):
        # x shape: [Batch, SeqLen, NumTimeFeatures]
        return self.embed(x) # Output: [Batch, SeqLen, d_model]

# --- Value Embedding (Part of DataEmbedding_inverted) ---
# Simple Linear layer for value embedding
class TokenEmbedding(nn.Module):
    def __init__(self, c_in, d_model):
        super(TokenEmbedding, self).__init__()
        padding = 1 if torch.__version__ >= '1.5.0' else 2
        # Use Linear layer for value embedding per feature, seems more common than Conv1d here
        # self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model,
        #                            kernel_size=3, padding=padding, padding_mode='circular', bias=False)
        self.value_embedding = nn.Linear(c_in, d_model) # Embeds features directly

        # Initialize weights
        # for m in self.modules():
        #     if isinstance(m, nn.Conv1d):
        #         nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu')

    def forward(self, x):
        # x shape: [Batch, SeqLen, Features]
        # x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) # If using Conv1d
        x = self.value_embedding(x) # Using Linear
        return x # Shape: [Batch, SeqLen, d_model]


# --- Data Embedding (Combined - Simplified version of DataEmbedding_inverted) ---
# This embedding is used for the *exogenous* variables in TimeXer
class DataEmbeddingExog(nn.Module):
    def __init__(self, c_in_exog, c_in_time, d_model, embed_type='timeF', freq='h', dropout=0.1):
        super(DataEmbeddingExog, self).__init__()

        # Embeds the exogenous variable values
        self.value_embedding = TokenEmbedding(c_in=c_in_exog, d_model=d_model)
        # Embeds the time features (like hour, dayofweek)
        self.temporal_embedding = TimeFeatureEmbedding(d_model=d_model, embed_type=embed_type, freq=freq)
        # Positional embedding (sinusoidal) - Added here for consistency
        self.position_embedding = PositionalEmbedding(d_model=d_model)

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, x_mark):
        # x: Exogenous variable values [Batch, SeqLen, NumExogVars (c_in_exog)]
        # x_mark: Time features [Batch, SeqLen, NumTimeFeatures (c_in_time)]

        # Get positional encoding slice [SeqLen, d_model]
        # Need to pass a tensor with correct seq_len dimension (dim 1)
        pos_enc_slice = self.position_embedding(x) # Pass x to get seq_len

        # Apply embeddings and add them
        # Expand pos_enc_slice to [Batch, SeqLen, d_model] for addition
        x = self.value_embedding(x) + self.temporal_embedding(x_mark) + pos_enc_slice.unsqueeze(0).expand(x.shape[0], -1, -1)

        return self.dropout(x) # Shape: [Batch, SeqLen, d_model]


# --- Attention Layers (Standard Implementation -UNCHANGED from previous version) ---
class FullAttention(nn.Module):
    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):
        super(FullAttention, self).__init__()
        self.scale = scale
        self.mask_flag = mask_flag
        self.output_attention = output_attention
        self.dropout = nn.Dropout(attention_dropout)

    def forward(self, queries, keys, values, attn_mask, tau=None, delta=None):
        B, L, H, E = queries.shape
        _, S, _, D = values.shape
        scale = self.scale or 1. / np.sqrt(E)

        scores = torch.einsum("blhe,bshe->bhls", queries, keys)

        if self.mask_flag and attn_mask is not None:
            # Ensure attn_mask is broadcastable
            if attn_mask.dim() == 2: # [L, S]
                attn_mask = attn_mask.unsqueeze(0).unsqueeze(0) # [1, 1, L, S]
            elif attn_mask.dim() == 3: # [B, L, S]
                 attn_mask = attn_mask.unsqueeze(1) # [B, 1, L, S]
            scores = scores.masked_fill(attn_mask == 0, -float('inf')) # Use float('-inf')


        A = self.dropout(torch.softmax(scores * scale, dim=-1))
        V = torch.einsum("bhls,bshe->blhe", A, values).contiguous() # Use contiguous here

        if self.output_attention:
            return (V.view(B, L, -1), A) # Reshape V back
        else:
            return (V.view(B, L, -1), None) # Reshape V back


class AttentionLayer(nn.Module):
    def __init__(self, attention, d_model, n_heads, d_keys=None, d_values=None):
        super(AttentionLayer, self).__init__()

        d_keys = d_keys or (d_model // n_heads)
        d_values = d_values or (d_model // n_heads)

        self.inner_attention = attention
        self.query_projection = nn.Linear(d_model, d_keys * n_heads)
        self.key_projection = nn.Linear(d_model, d_keys * n_heads)
        self.value_projection = nn.Linear(d_model, d_values * n_heads)
        self.out_projection = nn.Linear(d_values * n_heads, d_model)
        self.n_heads = n_heads

    def forward(self, queries, keys, values, attn_mask, tau=None, delta=None):
        B, L, _ = queries.shape
        _, S, _ = keys.shape
        H = self.n_heads

        # Project queries, keys, values
        queries = self.query_projection(queries).view(B, L, H, -1)
        keys = self.key_projection(keys).view(B, S, H, -1)
        values = self.value_projection(values).view(B, S, H, -1)

        # Apply attention mechanism
        out, attn = self.inner_attention(
            queries,
            keys,
            values,
            attn_mask,
            tau=tau, delta=delta
        )
        # Concatenate heads and apply final projection
        out = self.out_projection(out) # Shape [B, L, d_model]
        return out, attn

Video("media/videos/manim/480p15/EmbeddingProcessVisualization.mp4", embed=True, width=720, height=405)

Video("media/videos/manim/480p15/PatchingConcept.mp4", embed=True, width=720, height=405)

class FlattenHead(nn.Module):
    def __init__(self, n_vars, nf, target_window, head_dropout=0):
        super().__init__()
        self.n_vars = n_vars
        self.flatten = nn.Flatten(start_dim=-2) # Flattens d_model and patch_num dimensions
        self.linear = nn.Linear(nf, target_window)
        self.dropout = nn.Dropout(head_dropout)

    def forward(self, x):  # x: [bs, nvars, d_model, patch_num]
        # Flatten along dimensions d_model and patch_num
        # Input shape to flatten: [bs * nvars, d_model, patch_num] (if reshaped before)
        # Or handle [bs, nvars, d_model, patch_num] directly
        # Let's assume input x is [bs, nvars, d_model, patch_num] based on comment

        # Flatten d_model and patch_num dims. Input is [bs, nvars, d_model, patch_num]
        # Result shape: [bs, nvars, d_model * patch_num]
        x = self.flatten(x)

        # Apply linear layer. Input is [bs, nvars, nf (=d_model*patch_num)]
        # Output shape: [bs, nvars, target_window]
        x = self.linear(x)
        x = self.dropout(x)
        return x


class EnEmbedding(nn.Module):
    """
    Embedding for Endogenous variable(s).
    Applies patching, value embedding, positional embedding, and adds a global token.
    *** CORRECTED Positional Embedding Handling ***
    """
    def __init__(self, n_vars, d_model, patch_len, dropout):
        super(EnEmbedding, self).__init__()
        self.patch_len = patch_len
        self.n_vars = n_vars
        self.d_model = d_model

        # Linear layer to embed each patch
        self.value_embedding = nn.Linear(patch_len, d_model, bias=False)
        # Learnable global token (one per endogenous variable)
        self.glb_token = nn.Parameter(torch.randn(1, n_vars, 1, d_model))
        # Positional embedding for patches
        self.position_embedding = PositionalEmbedding(d_model) # Instantiate corrected PositionalEmbedding
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Input x shape: [batch_size, seq_len, n_vars] -> expecting n_vars=1 based on paper for univariate endogenous
        # Transpose to [batch_size, n_vars, seq_len] for unfolding
        x = x.permute(0, 2, 1)
        batch_size, n_vars_in, seq_len = x.shape
        assert n_vars_in == self.n_vars, f"Input n_vars ({n_vars_in}) doesn't match model n_vars ({self.n_vars})"

        # Patching using unfold
        x_unfolded = x.unfold(dimension=-1, size=self.patch_len, step=self.patch_len)
        num_patches = x_unfolded.shape[2] # Shape after unfold: [batch_size, n_vars, num_patches, patch_len]

        # Reshape for linear embedding: [batch_size * n_vars * num_patches, patch_len]
        x_reshaped = x_unfolded.permute(0, 1, 2, 3).reshape(-1, self.patch_len)

        # Apply value embedding
        x_embedded = self.value_embedding(x_reshaped) # Shape: [batch_size * n_vars * num_patches, d_model]

        # Reshape back to [batch_size, n_vars, num_patches, d_model]
        x_embedded = x_embedded.reshape(batch_size, n_vars_in, num_patches, self.d_model)

        # --- Get and Add Positional Embedding ---
        # Get positional encoding slice [num_patches, d_model]
        # Pass a dummy tensor with correct seq_len (num_patches) to positional embedding
        dummy_pos_input = torch.zeros(1, num_patches, self.d_model, device=x.device)
        pos_encoding_slice = self.position_embedding(dummy_pos_input) # Shape: [num_patches, d_model]

        # Expand slice and add to x_embedded
        # Expand to [batch_size, n_vars, num_patches, d_model]
        pos_embed = pos_encoding_slice.unsqueeze(0).unsqueeze(0).expand(batch_size, n_vars_in, -1, -1)
        x_patched_embedded = x_embedded + pos_embed # Add positional embedding

        # Prepare global token: repeat for batch and concatenate
        glb = self.glb_token.repeat((batch_size, 1, 1, 1)) # Shape: [batch_size, n_vars, 1, d_model]

        # Concatenate global token with patch tokens along the sequence (num_patches) dimension
        x_with_glb = torch.cat([x_patched_embedded, glb], dim=2) # Shape: [batch_size, n_vars, num_patches + 1, d_model]

        # Reshape for Transformer layers: [batch_size * n_vars, num_patches + 1, d_model]
        x_final = x_with_glb.reshape(batch_size * n_vars_in, num_patches + 1, self.d_model)

        return self.dropout(x_final), n_vars_in # Return reshaped tokens and original number of variables


class Encoder(nn.Module):
    def __init__(self, layers, norm_layer=None):
        super(Encoder, self).__init__()
        self.layers = nn.ModuleList(layers)
        self.norm = norm_layer

    def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None):
        # x: Endogenous tokens [B*n_vars_en, N+1, D]
        # cross: Exogenous tokens [B, C, D] or potentially [B, SeqLen_exog, D] if using DataEmbeddingExog output
        for layer in self.layers:
            x = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask, tau=tau, delta=delta)

        if self.norm is not None:
            x = self.norm(x)

        return x


class EncoderLayer(nn.Module):
    def __init__(self, self_attention, cross_attention, d_model, n_vars_en, c_exog, d_ff=None,
                 dropout=0.1, activation="relu"):
        super(EncoderLayer, self).__init__()
        d_ff = d_ff or 4 * d_model
        self.n_vars_en = n_vars_en # Number of endogenous variables
        self.c_exog = c_exog # Number of exogenous variables

        self.self_attention = self_attention # Attends over endogenous patches + global token
        self.cross_attention = cross_attention # Attends global token (query) to exogenous variate tokens (key/value)

        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = F.relu if activation == "relu" else F.gelu

    def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None):
        # x: Endogenous tokens (Patch + Global) [Batch * n_vars_en, N+1, D]
        # cross: Exogenous tokens [Batch, C, D] or [Batch, SeqLen_exog, D]

        # --- Self-Attention over Endogenous Tokens (Patches + Global) ---
        x_res = x # Store residual connection input
        x_attn, _ = self.self_attention(
            x, x, x,
            attn_mask=x_mask,
            tau=tau, delta=None
        )
        x = x_res + self.dropout(x_attn) # Add residual
        x = self.norm1(x)
        # x shape: [B * n_vars_en, N+1, D]

        # Store output for later FFN residual connection
        x_after_self_attn = x

        # --- Cross-Attention: Endogenous Global Token <-> Exogenous Tokens ---
        x_glb_en = x[:, -1, :] # Get current global token (output of norm1)
        x_glb_en_query = x_glb_en.unsqueeze(1) # Shape: [B*n_vars_en, 1, D]

        # Prepare exogenous 'cross' input
        batch_size_x = x.shape[0]
        batch_size_cross = cross.shape[0]

        if batch_size_x != batch_size_cross:
             if batch_size_x == batch_size_cross * self.n_vars_en:
                 cross_repeated = cross.repeat_interleave(self.n_vars_en, dim=0)
             else:
                  raise ValueError("Batch size mismatch between endogenous and exogenous tokens.")
        else:
             cross_repeated = cross

        # Apply cross-attention
        x_glb_attn, _ = self.cross_attention(
            x_glb_en_query, cross_repeated, cross_repeated,
            attn_mask=cross_mask,
            tau=tau, delta=delta
        ) # Shape: [B*n_vars_en, 1, D]

        # Add result back to original global token (residual connection for cross-attention)
        x_glb_updated = x_glb_en + self.dropout(x_glb_attn.squeeze(1)) # Remove seq dim 1
        x_glb_updated = self.norm2(x_glb_updated) # Shape: [B*n_vars_en, D]


        # --- Combine Patches and Updated Global Token ---
        # *** CORRECTION: Avoid inplace modification ***
        x_patches = x_after_self_attn[:, :-1, :] # Get patch tokens [B*n_vars_en, N, D]
        x_glb_updated_unsqueezed = x_glb_updated.unsqueeze(1) # Shape [B*n_vars_en, 1, D]
        # Create the new tensor 'x' for the FFN input
        x_ffn_input = torch.cat([x_patches, x_glb_updated_unsqueezed], dim=1) # Shape [B*n_vars_en, N+1, D]


        # --- Feed Forward Network ---
        # Apply FFN to the combined tensor
        y = x_ffn_input
        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) # [B*n_vars_en, d_ff, N+1]
        y = self.dropout(self.conv2(y).transpose(-1, 1)) # [B*n_vars_en, N+1, d_model]

        # Final residual connection and norm
        return self.norm3(x_ffn_input + y) # Add residual from *before* FFN
    

class Model(nn.Module):
    """
    TimeXer Model for Forecasting with Exogenous Variables.
    """
    def __init__(self, configs):
        super(Model, self).__init__()
        self.task_name = configs.task_name
        self.features = configs.features # 'M' (multivariate) or 'S' (univariate target) or 'MS' (multi-variate input, single target)
        self.seq_len = configs.seq_len
        self.pred_len = configs.pred_len
        self.use_norm = configs.use_norm # Instance Normalization
        self.patch_len = configs.patch_len
        self.patch_num = int(np.ceil(configs.seq_len / configs.patch_len)) # Use ceil for potential last partial patch
        self.d_model = configs.d_model

        # Determine number of endogenous and exogenous variables based on 'features' config
        if self.features == 'S': # Univariate target, no exogenous in input tensor `x_enc`
            self.n_vars_en = 1
            self.n_vars_exog = 0
            self.c_exog = configs.c_exog # Number of separate exogenous features passed potentially
        elif self.features == 'M': # Multivariate targets, all vars in `x_enc` are endogenous
            self.n_vars_en = configs.enc_in
            self.n_vars_exog = 0
            self.c_exog = configs.c_exog
        elif self.features == 'MS': # Single endogenous target, others in `x_enc` are exogenous (TimeXer's intended main use)
            self.n_vars_en = 1 # Typically 1 target variable
            self.n_vars_exog = configs.enc_in - self.n_vars_en # Rest are exogenous in the input tensor `x_enc`
            self.c_exog = self.n_vars_exog # Exogenous variables are handled by ExogEmbedding
            # Store total number of input variables (endog + exog potentially)
            self.total_vars = configs.enc_in
        else:
            raise ValueError(f"Unknown features type: {self.features}")

        # --- Embeddings ---
        # Endogenous Embedding (Patching + Global Token)
        self.en_embedding = EnEmbedding(self.n_vars_en, configs.d_model, self.patch_len, configs.dropout)

        # Exogenous Embedding (Handles time features and exogenous values)
        # c_in should be the number of exogenous features + time features if concatenated
        # Let's refine this: Exog embedding should probably take exogenous series values and time features separately.
        # Using DataEmbeddingExog which combines value, temporal, and positional embedding.
        # It expects `x` (exog values) and `x_mark` (time features).
        # Exogenous Embedding (Handles time features and exogenous values)
        if self.c_exog > 0:
             # Get number of time features from dummy data (or config)
             # Let's assume TimeSeriesDataset computes 6 time features for 'h' freq
             num_time_features = 6 # Hardcoded based on TimeSeriesDataset._compute_time_features for freq='h'
             # OR, get dynamically:
             # temp_dataset = TimeSeriesDataset(train_df[:10], TARGET, pjme_weather.columns.tolist(), configs.seq_len, configs.pred_len, configs.freq)
             # num_time_features = temp_dataset.time_features.shape[1]

             # c_in_exog: number of exogenous variables
             # c_in_time: number of time features
             self.ex_embedding = DataEmbeddingExog(self.c_exog, num_time_features, configs.d_model, configs.embed, configs.freq, configs.dropout)
        else:
             self.ex_embedding = None # No exogenous variables


        # --- Encoder ---
        self.encoder = Encoder(
            [
                EncoderLayer(
                    # Self-Attention for Endogenous
                    AttentionLayer(
                        FullAttention(mask_flag=False, factor=configs.factor, attention_dropout=configs.dropout, output_attention=False),
                        configs.d_model, configs.n_heads),
                    # Cross-Attention for Endog Global <-> Exog Variate
                    AttentionLayer(
                        FullAttention(mask_flag=False, factor=configs.factor, attention_dropout=configs.dropout, output_attention=False),
                        configs.d_model, configs.n_heads),
                    configs.d_model,
                    n_vars_en = self.n_vars_en, # Pass number of endogenous vars
                    c_exog = self.c_exog,      # Pass number of exogenous vars
                    d_ff=configs.d_ff,
                    dropout=configs.dropout,
                    activation=configs.activation
                ) for _ in range(configs.e_layers)
            ],
            norm_layer=torch.nn.LayerNorm(configs.d_model)
        )

        # --- Prediction Head ---
        # nf = output features from encoder = d_model * (patch_num + 1) # Flattened features per variable
        self.head_nf = configs.d_model * (self.patch_num + 1) # Patches + Global token
        # The head takes the output for *each* endogenous variable and predicts its future
        self.head = FlattenHead(self.n_vars_en, self.head_nf, configs.pred_len, head_dropout=configs.dropout)


    def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
        # x_enc: [Batch, SeqLen, NumFeatures] - Contains target and potentially exogenous features based on `self.features`
        # x_mark_enc: [Batch, SeqLen, NumTimeFeatures] - Time features for input sequence
        # x_dec: [Batch, PredLen (or SeqLen), NumFeatures] - Often used for decoder input, maybe just placeholder here
        # x_mark_dec: [Batch, PredLen, NumTimeFeatures] - Time features for prediction window

        # --- Instance Normalization ---
        if self.use_norm:
            # Normalize the target variable (last feature assuming 'MS' or 'S')
            target_idx = -1 # Assuming target is the last column in x_enc for 'S'/'MS'
            means = x_enc[:, :, target_idx].mean(1, keepdim=True).detach() # Mean across seq_len
            x_enc_target = x_enc[:, :, target_idx] - means
            stdev = torch.sqrt(torch.var(x_enc_target, dim=1, keepdim=True, unbiased=False) + 1e-5).detach()
            x_enc_target_norm = x_enc_target / stdev
            # Keep original means and stdev for denormalization
            means = means.squeeze(1) # Shape [B, 1] -> [B]
            stdev = stdev.squeeze(1) # Shape [B, 1] -> [B]
        else:
             x_enc_target_norm = x_enc[:, :, -1] # Use original target if no norm

        # --- Prepare Endogenous and Exogenous Data ---
        # Assuming 'MS' mode: x_enc has shape [B, L, total_vars], target is last column
        x_endog = x_enc_target_norm.unsqueeze(-1) # [B, L, 1] - Normalized target variable
        if self.c_exog > 0:
            x_exog = x_enc[:, :, :-1] # [B, L, C] - Exogenous variables (already scaled)
        else:
            x_exog = None

        # --- Embeddings ---
        # Endogenous Embedding (Patching + Global Token)
        # Input shape: [B, L, 1]
        en_embed, n_vars_processed = self.en_embedding(x_endog)
        # en_embed shape: [B * n_vars_en (1), N+1, D] = [B, N+1, D]

        # Exogenous Embedding (Value + Time + Positional)
        if self.ex_embedding is not None and x_exog is not None:
            # Input shapes: x_exog [B, L, C], x_mark_enc [B, L, NumTimeFeatures]
            ex_embed = self.ex_embedding(x_exog, x_mark_enc) # [B, L, D]
            # How to convert [B, L, D] time-step embeddings to Variate Tokens [B, C, D]?
            # Option 1: Average Pooling over sequence length
            # ex_variate_tokens = torch.mean(ex_embed, dim=1) # Shape [B, D] - Problem: Loses variate info
            # Option 2: Linear projection from [L, D] to [1, D] for each var? Seems complex.
            # Option 3: Paper implies VariateEmbed(z^(i)) -> R^D. This suggests embedding *each* exog series *independently* of time features.
            # Let's redefine Exogenous Embedding to match the paper's likely intent: Embed each series to a single vector.
            # --- Revised Exogenous Embedding Strategy ---
            # We need a VariateEmbed layer applied *before* the main forward pass, maybe in the Dataset or training loop.
            # For now, let's *simulate* Variate Tokens by averaging the ExEmbedding output. This is a simplification.
            # A better approach would be a separate embedding layer for variate tokens.
            # Let's proceed with the simpler averaging for now, acknowledging it's not exactly the paper's description.
            # A simpler ExEmbedding: Linear layer on flattened series?
            # Let's try using the TimeFeatureEmbedding output as the 'cross' input, shape [B, L, D]. CrossAttention needs Query [B,1,D], Key/Value [B,L,D]
            cross_input = ex_embed # Use time-step embeddings as key/value for cross-attn. Shape [B, L, D]

        else:
            # Handle case with no exogenous vars - cross_input should be dummy or None
            # The EncoderLayer needs a valid 'cross' input. Let's pass zeros.
            cross_input = torch.zeros(x_enc.shape[0], self.seq_len, self.d_model, device=x_enc.device)


        # --- Encoder Forward Pass ---
        # Input: en_embed [B, N+1, D], cross_input [B, L, D]
        enc_out = self.encoder(en_embed, cross_input)
        # enc_out shape: [B, N+1, D] (since n_vars_en=1)

        # --- Prediction Head ---
        # Reshape encoder output for the head
        # Head expects [bs, nvars, d_model, patch_num+1] or similar
        # Current enc_out: [B, N+1, D]
        # Reshape to [B, n_vars_en (1), D, N+1] to match FlattenHead expectation if start_dim=-2 flattens last two dims
        enc_out_reshaped = enc_out.unsqueeze(1).permute(0, 1, 3, 2) # [B, 1, D, N+1]

        # FlattenHead expects nf = d_model * (patch_num + 1)
        # Input to FlattenHead: [B, 1, D, N+1], should be flattened to [B, 1, D*(N+1)]
        # Let's adjust FlattenHead or the input shape.
        # If FlattenHead flattens last two dims: input [B, 1, D, N+1] -> output [B, 1, pred_len]
        # Check FlattenHead nf calculation vs input shape

        # Recalculating head_nf based on reshaped input [B, 1, D, N+1]
        # Flattened dim = D * (N+1)
        # Adjust head_nf if different from initial calculation
        current_head_nf = self.d_model * (self.patch_num + 1)
        # assert self.head_nf == current_head_nf, "Head NF mismatch" # Ensure consistency

        # Apply head
        dec_out = self.head(enc_out_reshaped) # Output: [B, 1, pred_len]

        # Permute to [B, pred_len, 1]
        dec_out = dec_out.permute(0, 2, 1)

        # --- De-Normalization ---
        if self.use_norm:
            dec_out = dec_out * stdev.unsqueeze(1).unsqueeze(2).repeat(1, self.pred_len, 1) # stdev shape [B] -> [B, 1, 1]
            dec_out = dec_out + means.unsqueeze(1).unsqueeze(2).repeat(1, self.pred_len, 1) # means shape [B] -> [B, 1, 1]

        return dec_out # Shape: [B, pred_len, 1]


    # forecast_multi implementation needs adjustment based on how multivariate inputs/outputs are handled.
    # For 'MS' (single target), forecast_multi might not be needed, or should behave like forecast.
    # If 'M' (multiple targets), the embeddings and head need modification.
    # Sticking to 'MS' case as implied by the project description (forecasting grid load).

    def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
        # This forward function calls the appropriate forecast method based on task and features.
        # For the current setup ('MS' features, forecasting task), we use the 'forecast' method.

        if self.task_name == 'long_term_forecast' or self.task_name == 'short_term_forecast':
            if self.features == 'MS' or self.features == 'S':
                dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
                return dec_out[:, -self.pred_len:, :]  # Ensure only pred_len steps are returned [B, L, D]
            # elif self.features == 'M':
                # dec_out = self.forecast_multi(x_enc, x_mark_enc, x_dec, x_mark_dec) # Requires forecast_multi implementation
                # return dec_out[:, -self.pred_len:, :]
            else:
                 raise ValueError(f"Features type {self.features} not implemented in forward pass.")
        else:
             # Handle other tasks like imputation or classification if needed
             return None

if pjme_weather is not None:
    # Model Hyperparameters (adjust as needed)
    # These are examples; tuning is likely required.
    configs_dict = {
        'task_name': 'short_term_forecast', # Or 'long_term_forecast'
        'features': 'MS',        # Multivariate input, Single target ('PJME_MW')
        'seq_len': 168,          # Input sequence length (e.g., 7 days of hourly data) [cite: 133]
        'pred_len': 24,          # Output prediction length (e.g., 1 day) [cite: 133]
        'patch_len': 24,         # Patch length [cite: 133]
        'enc_in': pjme_weather.shape[1], # Total number of features (target + exogenous)
        'c_exog': len(FEATURES), # Number of exogenous features
        'd_model': 128,          # Model dimension (smaller due to potential memory constraints) [cite: 292]
        'n_heads': 8,            # Number of attention heads
        'e_layers': 2,           # Number of encoder layers [cite: 291]
        'd_ff': 256,             # Feedforward dimension (adjust based on d_model)
        'dropout': 0.1,          # Dropout rate
        'activation': 'gelu',    # Activation function
        'use_norm': True,        # Use Instance Normalization
        'embed': 'timeF',        # Type of time embedding ('timeF', 'fixed', 'learned')
        'freq': 'h',             # Frequency of data ('h' for hourly)
        'factor': 3              # Attention factor (for potential future use in attention variants)
    }

    # Training Hyperparameters
    learning_rate = 0.0001       # [cite: 289]
    batch_size = 32
    num_epochs = 50 # Increased epochs, but early stopping will likely trigger
    patience = 5    # Early stopping patience

    configs = Configs(**configs_dict)
    print("Configurations set:")
    for key, value in configs_dict.items():
        print(f"- {key}: {value}")

else:
     print("Skipping configuration due to missing data.")

Configurations set:
- task_name: short_term_forecast
- features: MS
- seq_len: 168
- pred_len: 24
- patch_len: 24
- enc_in: 14
- c_exog: 13
- d_model: 128
- n_heads: 8
- e_layers: 2
- d_ff: 256
- dropout: 0.1
- activation: gelu
- use_norm: True
- embed: timeF
- freq: h
- factor: 3

class TimeSeriesDataset(Dataset):
    def __init__(self, data_df, target_col, feature_cols, sequence_length, prediction_length, freq='h'):
        self.sequence_length = sequence_length
        self.prediction_length = prediction_length
        self.target_col = target_col
        self.feature_cols = feature_cols # All columns (target + exog)
        self.freq = freq

        # Ensure target is the last column for easy slicing later if needed by norm/model
        if data_df.columns[-1] != target_col:
             cols = [col for col in data_df.columns if col != target_col] + [target_col]
             data_df = data_df[cols]
             print(f"Reordered columns. Target '{target_col}' is now last.")
             self.feature_cols = data_df.columns.tolist() # Update feature list


        self.data = data_df[self.feature_cols].values # Use all columns for x_enc
        self.target_data = data_df[[self.target_col]].values # Explicitly target
        self.index = data_df.index

        # Precompute time features
        self.time_features = self._compute_time_features(self.index, freq=self.freq)

    def _compute_time_features(self, dt_index, freq='h'):
        # Based on freq, generate time features
        # Example for hourly frequency: Hour, DayOfWeek, DayOfMonth, DayOfYear, Month, WeekOfYear
        if freq == 'h':
            features = [
                dt_index.hour.to_numpy(),
                dt_index.dayofweek.to_numpy(),
                dt_index.day.to_numpy(),
                dt_index.dayofyear.to_numpy(),
                dt_index.month.to_numpy(),
                #dt_index.weekofyear is deprecated, use isocalendar().week
                dt_index.isocalendar().week.to_numpy(dtype=np.float32) # Ensure correct dtype
            ]
        # Add more frequencies if needed (e.g., 't' for minutely)
        # elif freq == 't':
        #     features = [...]
        else:
            # Default minimal features
            features = [dt_index.dayofyear.to_numpy(), dt_index.month.to_numpy()]

        # Stack features and normalize (e.g., scale to [0, 1] or [-0.5, 0.5])
        time_features = np.vstack(features).transpose().astype(np.float32)

        # Simple normalization example (min-max to [0, 1]) - apply per feature
        min_vals = time_features.min(axis=0)
        max_vals = time_features.max(axis=0)
        range_vals = max_vals - min_vals
        range_vals[range_vals == 0] = 1.0 # Avoid division by zero for constant features
        time_features = (time_features - min_vals) / range_vals

        return time_features

    def __len__(self):
        # Total length minus sequence length and prediction length
        return len(self.data) - self.sequence_length - self.prediction_length + 1

    def __getitem__(self, idx):
        # Indices for input sequence
        seq_start = idx
        seq_end = idx + self.sequence_length

        # Indices for target sequence (prediction window)
        pred_start = seq_end
        pred_end = pred_start + self.prediction_length

        # --- Input Data ---
        # x_enc: Input sequence including target and exogenous features
        x_enc = self.data[seq_start:seq_end]
        # x_mark_enc: Time features for the input sequence
        x_mark_enc = self.time_features[seq_start:seq_end]

        # --- Target Data ---
        # y: True target values for the prediction window
        y = self.target_data[pred_start:pred_end]
        # y_mark / x_mark_dec: Time features for the prediction window
        x_mark_dec = self.time_features[pred_start:pred_end]

        # --- Decoder Input (Placeholders for Encoder-Only Model) ---
        # x_dec: Often zeros or last part of input sequence + zeros
        # Let's use zeros for simplicity
        x_dec = np.zeros((self.prediction_length, x_enc.shape[1])) # Match feature dim


        return {
            'x_enc': torch.tensor(x_enc, dtype=torch.float32),
            'x_mark_enc': torch.tensor(x_mark_enc, dtype=torch.float32),
            'x_dec': torch.tensor(x_dec, dtype=torch.float32), # Placeholder
            'x_mark_dec': torch.tensor(x_mark_dec, dtype=torch.float32),
            'y': torch.tensor(y, dtype=torch.float32) # Target values
        }

if pjme_weather is not None:
    # Create datasets
    train_dataset = TimeSeriesDataset(train_df, TARGET, pjme_weather.columns.tolist(), configs.seq_len, configs.pred_len, configs.freq)
    val_dataset = TimeSeriesDataset(val_df, TARGET, pjme_weather.columns.tolist(), configs.seq_len, configs.pred_len, configs.freq)
    test_dataset = TimeSeriesDataset(test_df, TARGET, pjme_weather.columns.tolist(), configs.seq_len, configs.pred_len, configs.freq)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0) # num_workers=0 for main process
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    print(f"Datasets created: Train={len(train_dataset)}, Val={len(val_dataset)}, Test={len(test_dataset)}")
    print(f"DataLoaders created with batch size: {batch_size}")

    # Example batch check
    try:
        batch = next(iter(train_loader))
        print("\nSample batch shapes:")
        for key, value in batch.items():
            print(f"- {key}: {value.shape}")
        print(f"Target feature index in x_enc: -1 (assumed)")
    except StopIteration:
         print("\nWarning: Train loader is empty. Check data splitting and dataset length.")
    except Exception as e:
         print(f"\nError checking batch: {e}")

else:
    print("Skipping Dataset/DataLoader creation due to missing data.")

Reordered columns. Target 'PJME_MW' is now last.
Reordered columns. Target 'PJME_MW' is now last.
Reordered columns. Target 'PJME_MW' is now last.
Datasets created: Train=109095, Val=13470, Test=13470
DataLoaders created with batch size: 32

Sample batch shapes:
- x_enc: torch.Size([32, 168, 14])
- x_mark_enc: torch.Size([32, 168, 6])
- x_dec: torch.Size([32, 24, 14])
- x_mark_dec: torch.Size([32, 24, 6])
- y: torch.Size([32, 24, 1])
Target feature index in x_enc: -1 (assumed)

if pjme_weather is not None:
    # Initialize Model, Loss, Optimizer
    model = Model(configs).to(device)
    criterion = nn.MSELoss() # L2 Loss [cite: 109]
    optimizer = optim.Adam(model.parameters(), lr=learning_rate) # [cite: 289]
    early_stopping = EarlyStopping(patience=patience, verbose=True, path='timexer_checkpoint.pt')

    print("Model initialized:")
    print(model)
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\nTotal trainable parameters: {total_params:,}")


    train_losses = []
    val_losses = []

    print("\nStarting Training...")
    start_time = time.time()

    for epoch in range(num_epochs):
        epoch_start_time = time.time()
        model.train()
        running_loss = 0.0
        batch_count = 0

        for i, batch in enumerate(train_loader):
            # Move batch to device
            x_enc = batch['x_enc'].to(device)
            x_mark_enc = batch['x_mark_enc'].to(device)
            x_dec = batch['x_dec'].to(device)
            x_mark_dec = batch['x_mark_dec'].to(device)
            y_true = batch['y'].to(device)

            optimizer.zero_grad()

            # Forward pass
            outputs = model(x_enc, x_mark_enc, x_dec, x_mark_dec)

            # Ensure output and target shapes match
            # Output: [B, pred_len, 1], Target: [B, pred_len, 1]
            loss = criterion(outputs, y_true)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            batch_count += 1

            # Optional: Print progress within epoch
            # if (i + 1) % 100 == 0:
            #     print(f'  Epoch {epoch+1}, Batch {i+1}/{len(train_loader)}, Loss: {loss.item():.6f}')

        avg_train_loss = running_loss / batch_count
        train_losses.append(avg_train_loss)

        # --- Validation ---
        model.eval()
        running_val_loss = 0.0
        val_batch_count = 0
        with torch.no_grad():
            for batch in val_loader:
                x_enc = batch['x_enc'].to(device)
                x_mark_enc = batch['x_mark_enc'].to(device)
                x_dec = batch['x_dec'].to(device)
                x_mark_dec = batch['x_mark_dec'].to(device)
                y_true = batch['y'].to(device)

                outputs = model(x_enc, x_mark_enc, x_dec, x_mark_dec)
                loss = criterion(outputs, y_true)
                running_val_loss += loss.item()
                val_batch_count += 1

        avg_val_loss = running_val_loss / val_batch_count
        val_losses.append(avg_val_loss)

        epoch_time = time.time() - epoch_start_time
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.6f}, Val Loss: {avg_val_loss:.6f}, Time: {epoch_time:.2f}s')

        # --- Early Stopping ---
        early_stopping(avg_val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping triggered.")
            break

    total_training_time = time.time() - start_time
    print(f'\nTraining Finished. Total time: {total_training_time:.2f}s')

    # Load the best model saved by early stopping
    print("Loading best model checkpoint...")
    try:
        model.load_state_dict(torch.load('timexer_checkpoint.pt'))
        print("Best model loaded successfully.")
    except FileNotFoundError:
         print("Warning: Checkpoint file not found. Using the model from the last epoch.")
    except Exception as e:
         print(f"Error loading checkpoint: {e}. Using the model from the last epoch.")


    # Plot training and validation loss
    plt.figure(figsize=(10, 5))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title('Model Loss During Training')
    plt.xlabel('Epoch')
    plt.ylabel('MSE Loss')
    plt.legend()
    plt.show()

else:
    print("Skipping Training due to missing data.")
    model = None # Ensure model is None if not trained

Model initialized:
Model(
  (en_embedding): EnEmbedding(
    (value_embedding): Linear(in_features=24, out_features=128, bias=False)
    (position_embedding): PositionalEmbedding()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ex_embedding): DataEmbeddingExog(
    (value_embedding): TokenEmbedding(
      (value_embedding): Linear(in_features=13, out_features=128, bias=True)
    )
    (temporal_embedding): TimeFeatureEmbedding(
      (embed): Linear(in_features=6, out_features=128, bias=False)
    )
    (position_embedding): PositionalEmbedding()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Encoder(
    (layers): ModuleList(
      (0-1): 2 x EncoderLayer(
        (self_attention): AttentionLayer(
          (inner_attention): FullAttention(
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (query_projection): Linear(in_features=128, out_features=128, bias=True)
          (key_projection): Linear(in_features=128, out_features=128, bias=True)
          (value_projection): Linear(in_features=128, out_features=128, bias=True)
          (out_projection): Linear(in_features=128, out_features=128, bias=True)
        )
        (cross_attention): AttentionLayer(
          (inner_attention): FullAttention(
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (query_projection): Linear(in_features=128, out_features=128, bias=True)
          (key_projection): Linear(in_features=128, out_features=128, bias=True)
          (value_projection): Linear(in_features=128, out_features=128, bias=True)
          (out_projection): Linear(in_features=128, out_features=128, bias=True)
        )
        (conv1): Conv1d(128, 256, kernel_size=(1,), stride=(1,))
        (conv2): Conv1d(256, 128, kernel_size=(1,), stride=(1,))
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (head): FlattenHead(
    (flatten): Flatten(start_dim=-2, end_dim=-1)
    (linear): Linear(in_features=1024, out_features=24, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

Total trainable parameters: 428,184

Starting Training...
Epoch 1/50, Train Loss: 0.179126, Val Loss: 0.090551, Time: 74.61s
Validation loss decreased (inf --> 0.090551).  Saving model ...
Epoch 2/50, Train Loss: 0.147890, Val Loss: 0.086291, Time: 75.00s
Validation loss decreased (0.090551 --> 0.086291).  Saving model ...
Epoch 3/50, Train Loss: 0.139661, Val Loss: 0.079674, Time: 78.05s
Validation loss decreased (0.086291 --> 0.079674).  Saving model ...
Epoch 4/50, Train Loss: 0.134318, Val Loss: 0.076228, Time: 72.99s
Validation loss decreased (0.079674 --> 0.076228).  Saving model ...
Epoch 5/50, Train Loss: 0.130756, Val Loss: 0.076344, Time: 77.78s
EarlyStopping counter: 1 out of 5
Epoch 6/50, Train Loss: 0.127994, Val Loss: 0.072837, Time: 72.98s
Validation loss decreased (0.076228 --> 0.072837).  Saving model ...
Epoch 7/50, Train Loss: 0.125897, Val Loss: 0.075359, Time: 70.22s
EarlyStopping counter: 1 out of 5
Epoch 8/50, Train Loss: 0.123730, Val Loss: 0.070576, Time: 77.31s
Validation loss decreased (0.072837 --> 0.070576).  Saving model ...
Epoch 9/50, Train Loss: 0.122007, Val Loss: 0.072582, Time: 78.92s
EarlyStopping counter: 1 out of 5
Epoch 10/50, Train Loss: 0.120174, Val Loss: 0.078044, Time: 75.69s
EarlyStopping counter: 2 out of 5
Epoch 11/50, Train Loss: 0.118237, Val Loss: 0.078026, Time: 77.04s
EarlyStopping counter: 3 out of 5
Epoch 12/50, Train Loss: 0.115938, Val Loss: 0.074494, Time: 85.69s
EarlyStopping counter: 4 out of 5
Epoch 13/50, Train Loss: 0.114572, Val Loss: 0.077988, Time: 78.00s
EarlyStopping counter: 5 out of 5
Early stopping triggered.

Training Finished. Total time: 994.42s
Loading best model checkpoint...
Best model loaded successfully.

if model is not None and pjme_weather is not None:
    print("\nStarting Evaluation on Test Set...")
    model.eval()
    predictions = []
    actuals = []

    with torch.no_grad():
        for batch in test_loader:
            x_enc = batch['x_enc'].to(device)
            x_mark_enc = batch['x_mark_enc'].to(device)
            x_dec = batch['x_dec'].to(device)
            x_mark_dec = batch['x_mark_dec'].to(device)
            y_true = batch['y'].to(device) # Scaled actuals

            outputs = model(x_enc, x_mark_enc, x_dec, x_mark_dec) # Scaled predictions

            # Inverse transform using the target scaler
            # Output shape: [B, pred_len, 1]
            # Need to reshape to 2D for scaler: [B * pred_len, 1]
            preds_inv = target_scaler.inverse_transform(outputs.cpu().numpy().reshape(-1, 1))
            actuals_inv = target_scaler.inverse_transform(y_true.cpu().numpy().reshape(-1, 1))

            predictions.extend(preds_inv.flatten().tolist())
            actuals.extend(actuals_inv.flatten().tolist())

    # Calculate metrics
    predictions = np.array(predictions)
    actuals = np.array(actuals)

    # Ensure shapes are reasonable (e.g., matching length)
    print(f"Shape of predictions array: {predictions.shape}")
    print(f"Shape of actuals array: {actuals.shape}")

    if len(predictions) == len(actuals) and len(predictions) > 0:
        rmse = np.sqrt(mean_squared_error(actuals, predictions))
        mae = mean_absolute_error(actuals, predictions)
        mape = mean_absolute_percentage_error(actuals, predictions)

        print("\nEvaluation Metrics on Test Set (Inverse Scaled):")
        print(f"RMSE: {rmse:.4f}")
        print(f"MAE:  {mae:.4f}")
        print(f"MAPE: {mape:.4f}%")

        # Store results for comparison if needed
        results = {'RMSE': rmse, 'MAE': mae, 'MAPE': mape}
    else:
        print("Error: Predictions and actuals lengths mismatch or are empty. Cannot calculate metrics.")
        results = None

else:
    print("Skipping Evaluation because the model was not trained or data was missing.")
    results = None

Starting Evaluation on Test Set...
Shape of predictions array: (323280,)
Shape of actuals array: (323280,)

Evaluation Metrics on Test Set (Inverse Scaled):
RMSE: 1941.5523
MAE:  1351.1955
MAPE: 4.2878%

if results is not None and pjme_weather is not None:
    print("\nVisualizing Predictions vs Actuals (Sample from Test Set)...")

    # We need the timestamps for the test predictions
    # The number of predictions might not exactly match the test set length
    # due to the sequence generation (len(test_set) - seq_len - pred_len + 1 batches)
    # Let's get the start index of the test predictions
    test_pred_start_idx = len(train_df) + len(val_df) + configs.seq_len
    test_pred_end_idx = test_pred_start_idx + len(predictions) # Assuming predictions cover the possible range

    if test_pred_end_idx > len(pjme_weather):
        test_pred_end_idx = len(pjme_weather)
        print(f"Adjusting prediction end index to match data length: {test_pred_end_idx}")


    if test_pred_start_idx < test_pred_end_idx and len(predictions) > 0:
        # Select the corresponding timestamps from the original dataframe
        pred_index = pjme_weather.index[test_pred_start_idx:test_pred_end_idx]

        # Ensure the length matches the number of predictions
        if len(pred_index) != len(predictions):
             print(f"Warning: Length mismatch between prediction index ({len(pred_index)}) and predictions ({len(predictions)}). Truncating index.")
             min_len = min(len(pred_index), len(predictions))
             pred_index = pred_index[:min_len]
             predictions_to_plot = predictions[:min_len]
             actuals_to_plot = actuals[:min_len]
        else:
             predictions_to_plot = predictions
             actuals_to_plot = actuals


        # Create a DataFrame for plotting
        plot_df = pd.DataFrame({
            'Actual': actuals_to_plot,
            'Predicted': predictions_to_plot
        }, index=pred_index)

        # Plot a sample (e.g., first N hours or a specific period)
        sample_size = 24 * 7 * 1 # Plot one week
        plt.figure(figsize=(18, 6))
        plt.plot(plot_df.index[:sample_size], plot_df['Actual'][:sample_size], label='Actual', marker='.', linestyle='-')
        plt.plot(plot_df.index[:sample_size], plot_df['Predicted'][:sample_size], label='Predicted', marker='.', linestyle='--')
        plt.title(f'TimeXer Forecast vs Actuals (First {sample_size} hours of Test Predictions)')
        plt.xlabel('Timestamp')
        plt.ylabel(f'{TARGET} (MW)')
        plt.legend()
        plt.grid(True)
        # Rotate x-axis labels for better readability
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    else:
        print("Cannot generate plot due to index/prediction length issues.")


elif pjme_weather is None:
     print("Skipping Visualization due to missing data.")
else:
     print("Skipping Visualization because evaluation metrics could not be calculated.")

Visualizing Predictions vs Actuals (Sample from Test Set)...
Adjusting prediction end index to match data length: 136608
Warning: Length mismatch between prediction index (13493) and predictions (323280). Truncating index.

import optuna

def objective(trial):
    """Optuna objective function for hyperparameter optimization."""

    global train_df, val_df, TARGET, FEATURES, pjme_weather # Access global dataframes/configs if needed, or pass them
    global device, target_scaler # Access device and scaler

    # --- 1. Suggest Hyperparameters ---
    # Use suggested values or fall back to defaults if needed
    try:
        cfg_learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
        cfg_d_model = trial.suggest_categorical('d_model', [64, 128, 256])
        cfg_seq_len = trial.suggest_categorical('seq_len', [24*7, 24*10, 24*14]) # 7, 10, 14 days
        cfg_patch_len = trial.suggest_categorical('patch_len', [8, 12, 16, 24])

        # Fixed hyperparameters (can be defined globally or passed as args)
        fixed_params = {
            'task_name': 'short_term_forecast',
            'features': 'MS',
            'pred_len': 24,
            'enc_in': pjme_weather.shape[1],
            'c_exog': len(FEATURES),
            'n_heads': 8,            # Keep fixed for now, ensure d_model % n_heads == 0
            'e_layers': 2,           # Keep fixed for now
            'dropout': 0.1,          # Keep fixed for now
            'activation': 'gelu',
            'use_norm': True,
            'embed': 'timeF',
            'freq': 'h',
            'factor': 3,
            # Dependent HPs
            'd_ff': cfg_d_model * 4 # Often set relative to d_model
        }

        # Check compatibility
        if cfg_d_model % fixed_params['n_heads'] != 0:
             # Skip this trial if incompatible
             print(f"Skipping trial: d_model={cfg_d_model} not divisible by n_heads={fixed_params['n_heads']}")
             raise optuna.exceptions.TrialPruned("Incompatible d_model and n_heads")

        # Combine suggested and fixed params
        current_configs_dict = {
            'learning_rate': cfg_learning_rate,
            'd_model': cfg_d_model,
            'seq_len': cfg_seq_len,
            'patch_len': cfg_patch_len,
            **fixed_params # Add fixed parameters
        }
        configs = Configs(**current_configs_dict)

        # --- 2. Recreate Datasets & DataLoaders (Essential for varying seq_len) ---
        # Make sure train_df, val_df exist and are scaled
        if 'train_df' not in globals() or train_df is None:
             print("Error: train_df not found or is None in objective function.")
             return float('inf') # Return high value if data is missing

        try:
            # Use original scaled dataframes
            train_dataset = TimeSeriesDataset(train_df, TARGET, pjme_weather.columns.tolist(), configs.seq_len, configs.pred_len, configs.freq)
            val_dataset = TimeSeriesDataset(val_df, TARGET, pjme_weather.columns.tolist(), configs.seq_len, configs.pred_len, configs.freq)

            # Use a reasonable batch size (can also be tuned)
            current_batch_size = 32
            train_loader = DataLoader(train_dataset, batch_size=current_batch_size, shuffle=True, num_workers=0)
            val_loader = DataLoader(val_dataset, batch_size=current_batch_size, shuffle=False, num_workers=0)
            # Handle potential empty dataloaders if seq_len is too large for val_df
            if len(val_loader) == 0 or len(train_loader) == 0:
                 print(f"Warning: DataLoader empty for seq_len={configs.seq_len}. Pruning trial.")
                 raise optuna.exceptions.TrialPruned("DataLoader empty for this seq_len")

        except Exception as e:
             print(f"Error creating datasets/dataloaders: {e}. Pruning trial.")
             raise optuna.exceptions.TrialPruned(f"Dataset/Dataloader error: {e}")


        # --- 3. Initialize Model, Loss, Optimizer ---
        model = Model(configs).to(device)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=configs.learning_rate)
        # Use a simpler stopping mechanism for Optuna trials or fewer epochs
        num_optuna_epochs = 15 # Train for fewer epochs per trial
        patience_optuna = 3   # Stricter patience for early stopping

        # --- 4. Training & Validation Loop (Condensed) ---
        best_val_loss = float('inf')
        stopping_counter = 0

        for epoch in range(num_optuna_epochs):
            model.train()
            train_loss_epoch = 0.0
            train_batches = 0
            for batch in train_loader:
                x_enc, x_mark_enc, x_dec, x_mark_dec, y_true = [b.to(device) for b in batch.values()]
                optimizer.zero_grad()
                outputs = model(x_enc, x_mark_enc, x_dec, x_mark_dec)
                loss = criterion(outputs, y_true)
                loss.backward()
                optimizer.step()
                train_loss_epoch += loss.item()
                train_batches += 1
            avg_train_loss = train_loss_epoch / train_batches if train_batches > 0 else float('inf')


            model.eval()
            val_loss_epoch = 0.0
            val_batches = 0
            with torch.no_grad():
                for batch in val_loader:
                     x_enc, x_mark_enc, x_dec, x_mark_dec, y_true = [b.to(device) for b in batch.values()]
                     outputs = model(x_enc, x_mark_enc, x_dec, x_mark_dec)
                     loss = criterion(outputs, y_true)
                     val_loss_epoch += loss.item()
                     val_batches += 1
            avg_val_loss = val_loss_epoch / val_batches if val_batches > 0 else float('inf')

            print(f"  Trial {trial.number} Epoch {epoch+1}/{num_optuna_epochs} | Train Loss: {avg_train_loss:.6f} | Val Loss: {avg_val_loss:.6f}")

            # Update best validation loss
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                stopping_counter = 0 # Reset counter
            else:
                stopping_counter += 1

            # --- Optuna Pruning & Early Stopping ---
            trial.report(avg_val_loss, epoch) # Report intermediate value to Optuna
            if trial.should_prune():
                print(f"  Trial {trial.number} pruned at epoch {epoch+1}.")
                raise optuna.exceptions.TrialPruned()

            # Simple early stopping within the trial
            if stopping_counter >= patience_optuna:
                 print(f"  Trial {trial.number} stopped early at epoch {epoch+1}.")
                 break

        # --- 5. Return Metric ---
        # Optuna aims to minimize this value
        return best_val_loss

    except optuna.exceptions.TrialPruned as e:
         # Important: Re-raise TrialPruned exceptions
         raise e
    except Exception as e:
        # Handle other errors like CUDA OOM, etc.
        print(f"Trial {trial.number} failed with error: {e}")
        # Return a large value or NaN to indicate failure
        return float('inf')

if pjme_weather is not None:
    # Create Optuna study
    # Consider using a pruner to stop unpromising trials early
    # MedianPruner is a common choice
    study = optuna.create_study(direction='minimize',
                                pruner=optuna.pruners.MedianPruner(n_startup_trials=5, # Allow first 5 trials to complete
                                                                   n_warmup_steps=5, # Prune based on first 5 epochs
                                                                   interval_steps=1)) # Prune every epoch after warmup

    # Start optimization
    # n_trials: Number of different hyperparameter combinations to test
    # Increase this number for a more thorough search (e.g., 50, 100)
    # Be mindful of computation time! Each trial trains a model.
    n_trials_optuna = 20 # Start with a smaller number (e.g., 20-30) to test
    try:
        study.optimize(objective, n_trials=n_trials_optuna, timeout=None) # Add timeout in seconds if needed
    except KeyboardInterrupt:
         print("Optimization stopped manually.")


    # --- 5. Get Best Results ---
    if study.best_trial:
        print("\nOptimization Finished!")
        print(f"Number of finished trials: {len(study.trials)}")
        print(f"Best trial value (min validation loss): {study.best_value:.6f}")
        print("Best trial hyperparameters:")
        for key, value in study.best_params.items():
            print(f"  {key}: {value}")

        # Store best params for retraining
        best_hyperparams = study.best_params
    else:
        print("\nOptimization did not complete successfully or no trials finished.")
        best_hyperparams = None

    # Optional: Visualize optimization history
    try:
        fig1 = optuna.visualization.plot_optimization_history(study)
        fig1.show()
        fig2 = optuna.visualization.plot_param_importances(study)
        fig2.show()
        # fig3 = optuna.visualization.plot_slice(study, params=['learning_rate', 'd_model', 'seq_len', 'patch_len'])
        # fig3.show()
    except Exception as e:
         print(f"Could not generate Optuna plots: {e}. Ensure plotly is installed ('pip install plotly').")


else:
    print("Skipping Optuna study due to missing data.")
    best_hyperparams = None

[I 2025-04-30 22:25:23,391] A new study created in memory with name: no-name-fd15d67a-1b2b-4e96-965c-f97783154be9

Reordered columns. Target 'PJME_MW' is now last.
Reordered columns. Target 'PJME_MW' is now last.
  Trial 0 Epoch 1/15 | Train Loss: 0.217030 | Val Loss: 0.105313
  Trial 0 Epoch 2/15 | Train Loss: 0.166503 | Val Loss: 0.093077
  Trial 0 Epoch 3/15 | Train Loss: 0.157144 | Val Loss: 0.087432
  Trial 0 Epoch 4/15 | Train Loss: 0.152093 | Val Loss: 0.083669
  Trial 0 Epoch 5/15 | Train Loss: 0.147551 | Val Loss: 0.082294
  Trial 0 Epoch 6/15 | Train Loss: 0.144979 | Val Loss: 0.082872
  Trial 0 Epoch 7/15 | Train Loss: 0.142083 | Val Loss: 0.079418
  Trial 0 Epoch 8/15 | Train Loss: 0.140346 | Val Loss: 0.080104
  Trial 0 Epoch 9/15 | Train Loss: 0.138132 | Val Loss: 0.077621
  Trial 0 Epoch 10/15 | Train Loss: 0.136381 | Val Loss: 0.077658
  Trial 0 Epoch 11/15 | Train Loss: 0.134699 | Val Loss: 0.074820
  Trial 0 Epoch 12/15 | Train Loss: 0.133442 | Val Loss: 0.076419
  Trial 0 Epoch 13/15 | Train Loss: 0.132338 | Val Loss: 0.077062
  Trial 0 Epoch 14/15 | Train Loss: 0.131231 | Val Loss: 0.073753

[I 2025-04-30 22:47:28,791] Trial 0 finished with value: 0.07375267870016754 and parameters: {'learning_rate': 6.341291657814797e-05, 'd_model': 64, 'seq_len': 336, 'patch_len': 24}. Best is trial 0 with value: 0.07375267870016754.

  Trial 0 Epoch 15/15 | Train Loss: 0.130404 | Val Loss: 0.075435
Reordered columns. Target 'PJME_MW' is now last.
Reordered columns. Target 'PJME_MW' is now last.
  Trial 1 Epoch 1/15 | Train Loss: 0.198532 | Val Loss: 0.097690
  Trial 1 Epoch 2/15 | Train Loss: 0.159336 | Val Loss: 0.088491
  Trial 1 Epoch 3/15 | Train Loss: 0.152304 | Val Loss: 0.087903
  Trial 1 Epoch 4/15 | Train Loss: 0.147917 | Val Loss: 0.082525
  Trial 1 Epoch 5/15 | Train Loss: 0.143585 | Val Loss: 0.084831
  Trial 1 Epoch 6/15 | Train Loss: 0.139828 | Val Loss: 0.081442
  Trial 1 Epoch 7/15 | Train Loss: 0.136908 | Val Loss: 0.079589
  Trial 1 Epoch 8/15 | Train Loss: 0.133869 | Val Loss: 0.076668
  Trial 1 Epoch 9/15 | Train Loss: 0.131673 | Val Loss: 0.078508
  Trial 1 Epoch 10/15 | Train Loss: 0.129862 | Val Loss: 0.074957
  Trial 1 Epoch 11/15 | Train Loss: 0.127300 | Val Loss: 0.076290
  Trial 1 Epoch 12/15 | Train Loss: 0.126343 | Val Loss: 0.077393

[I 2025-04-30 23:09:15,026] Trial 1 finished with value: 0.0749570621807764 and parameters: {'learning_rate': 1.720702918879371e-05, 'd_model': 256, 'seq_len': 336, 'patch_len': 8}. Best is trial 0 with value: 0.07375267870016754.

  Trial 1 Epoch 13/15 | Train Loss: 0.124359 | Val Loss: 0.084127
  Trial 1 stopped early at epoch 13.
Reordered columns. Target 'PJME_MW' is now last.
Reordered columns. Target 'PJME_MW' is now last.
  Trial 2 Epoch 1/15 | Train Loss: 0.170220 | Val Loss: 0.089017
  Trial 2 Epoch 2/15 | Train Loss: 0.140639 | Val Loss: 0.075147
  Trial 2 Epoch 3/15 | Train Loss: 0.133560 | Val Loss: 0.076382
  Trial 2 Epoch 4/15 | Train Loss: 0.128867 | Val Loss: 0.071336
  Trial 2 Epoch 5/15 | Train Loss: 0.125996 | Val Loss: 0.071917
  Trial 2 Epoch 6/15 | Train Loss: 0.122622 | Val Loss: 0.079919

[I 2025-04-30 23:18:42,101] Trial 2 finished with value: 0.07133583417560489 and parameters: {'learning_rate': 0.00015954349007674337, 'd_model': 128, 'seq_len': 168, 'patch_len': 12}. Best is trial 2 with value: 0.07133583417560489.
[I 2025-04-30 23:18:42,215] Trial 3 finished with value: inf and parameters: {'learning_rate': 0.00013597006517053505, 'd_model': 256, 'seq_len': 168, 'patch_len': 16}. Best is trial 2 with value: 0.07133583417560489.

  Trial 2 Epoch 7/15 | Train Loss: 0.119697 | Val Loss: 0.080204
  Trial 2 stopped early at epoch 7.
Reordered columns. Target 'PJME_MW' is now last.
Reordered columns. Target 'PJME_MW' is now last.
Trial 3 failed with error: mat1 and mat2 shapes cannot be multiplied (32x2816 and 3072x24)
Reordered columns. Target 'PJME_MW' is now last.
Reordered columns. Target 'PJME_MW' is now last.

[I 2025-04-30 23:18:42,323] Trial 4 finished with value: inf and parameters: {'learning_rate': 1.791413337548682e-05, 'd_model': 256, 'seq_len': 168, 'patch_len': 16}. Best is trial 2 with value: 0.07133583417560489.

Trial 4 failed with error: mat1 and mat2 shapes cannot be multiplied (32x2816 and 3072x24)
Reordered columns. Target 'PJME_MW' is now last.
Reordered columns. Target 'PJME_MW' is now last.
  Trial 5 Epoch 1/15 | Train Loss: 0.205242 | Val Loss: 0.099044
  Trial 5 Epoch 2/15 | Train Loss: 0.161388 | Val Loss: 0.092211
  Trial 5 Epoch 3/15 | Train Loss: 0.153695 | Val Loss: 0.087069
  Trial 5 Epoch 4/15 | Train Loss: 0.149091 | Val Loss: 0.080829
  Trial 5 Epoch 5/15 | Train Loss: 0.145358 | Val Loss: 0.082344
  Trial 5 Epoch 6/15 | Train Loss: 0.142136 | Val Loss: 0.081631
  Trial 5 Epoch 7/15 | Train Loss: 0.138700 | Val Loss: 0.076245
  Trial 5 Epoch 8/15 | Train Loss: 0.136049 | Val Loss: 0.076145
  Trial 5 Epoch 9/15 | Train Loss: 0.134296 | Val Loss: 0.073640
  Trial 5 Epoch 10/15 | Train Loss: 0.132466 | Val Loss: 0.078079
  Trial 5 Epoch 11/15 | Train Loss: 0.130479 | Val Loss: 0.081748

import matplotlib.pyplot as plt
from optuna.visualization.matplotlib import (
    plot_optimization_history as mplt_opt_hist,
    plot_param_importances  as mplt_param_imp,
)

# History
ax1 = mplt_opt_hist(study)
ax1.set_title("Optuna Optimization History")
plt.show()

# Importances
ax2 = mplt_param_imp(study)
# ax2.set_title("Optuna Parameter Importances")
plt.show()

[W 2025-05-01 08:15:04,804] Trial 3 is omitted in visualization because its objective value is inf or nan.
[W 2025-05-01 08:15:04,804] Trial 4 is omitted in visualization because its objective value is inf or nan.

# --- Save the best hyperparameters found by Optuna ---

# Check if the study ran successfully and found best parameters
if 'best_hyperparams' in locals() and best_hyperparams is not None:
    # Define a filename for storing the parameters
    params_filename = "timexer_best_hyperparams.json"

    # Save the dictionary to a JSON file
    try:
        with open(params_filename, 'w') as f:
            # Use indent for readability
            json.dump(best_hyperparams, f, indent=4)
        print(f"Best hyperparameters successfully saved to {params_filename}")
        print("Saved parameters:")
        # Pretty print the parameters that were saved
        print(json.dumps(best_hyperparams, indent=4))
    except Exception as e:
        print(f"Error saving best hyperparameters to {params_filename}: {e}")
elif 'study' in locals() and not study.best_trial:
     print("Optuna study finished, but no best trial found (e.g., all trials failed or were pruned). Cannot save parameters.")
else:
    print("Optuna study did not run or `best_hyperparams` variable not found. Cannot save parameters.")

Best hyperparameters successfully saved to timexer_best_hyperparams.json
Saved parameters:
{
    "learning_rate": 7.944348310121449e-05,
    "d_model": 128,
    "seq_len": 168,
    "patch_len": 12
}

# --- Load the best hyperparameters from the saved file ---

# Define the filename where parameters were saved
params_filename = "timexer_best_hyperparams.json"
loaded_best_params = None # Initialize variable

try:
    with open(params_filename, 'r') as f:
        loaded_best_params = json.load(f)
    print(f"Successfully loaded hyperparameters from {params_filename}:")
    # Pretty print the loaded parameters
    print(json.dumps(loaded_best_params, indent=4))

    # Now you can use the 'loaded_best_params' dictionary, for example:
    # best_hyperparams = loaded_best_params # Assign it back if needed for subsequent cells
    # Or use it directly in the final model configuration

except FileNotFoundError:
    print(f"Error: Could not find the parameter file {params_filename}.")
    print("Please ensure the Optuna study ran and saved the parameters, or check the filename.")
except Exception as e:
    print(f"Error loading hyperparameters from {params_filename}: {e}")

# Example of assigning it back for use in the next cell (Retraining)
if loaded_best_params is not None:
     best_hyperparams = loaded_best_params
else:
     # Handle the case where loading failed, maybe fall back to defaults or stop
     print("WARNING: Using default or potentially undefined 'best_hyperparams' as loading failed.")
     # best_hyperparams = default_params # Define default_params if needed
     pass # Or raise an error

Successfully loaded hyperparameters from timexer_best_hyperparams.json:
{
    "learning_rate": 7.944348310121449e-05,
    "d_model": 128,
    "seq_len": 168,
    "patch_len": 12
}

if best_hyperparams is not None and pjme_weather is not None:
    print("\nRetraining final model using best hyperparameters found by Optuna...")

    # --- Setup with best params ---
    final_configs_dict = {
        # Fixed parameters (repeat from objective or load from saved config)
        'task_name': 'short_term_forecast', 'features': 'MS', 'pred_len': 24,
        'enc_in': pjme_weather.shape[1], 'c_exog': len(FEATURES), 'n_heads': 8,
        'e_layers': 2, 'dropout': 0.1, 'activation': 'gelu',
        'use_norm': True, 'embed': 'timeF', 'freq': 'h', 'factor': 3,
        # Best hyperparameters from Optuna study
        **best_hyperparams,
        # Derived parameter - ensure d_ff matches best d_model
        'd_ff': best_hyperparams['d_model'] * 4
    }
    # Ensure n_heads compatibility if it wasn't tuned
    if final_configs_dict['d_model'] % final_configs_dict['n_heads'] != 0:
         print(f"Warning: Best d_model ({final_configs_dict['d_model']}) not divisible by n_heads ({final_configs_dict['n_heads']}). Adjusting n_heads or choose different params.")
         # Example: Find largest divisor <= 8
         # best_n_heads = max(h for h in [1, 2, 4, 8] if final_configs_dict['d_model'] % h == 0)
         # final_configs_dict['n_heads'] = best_n_heads
         # Or simply report error and stop
         raise ValueError("Incompatible best d_model and fixed n_heads")


    final_configs = Configs(**final_configs_dict)
    print("\nFinal Model Configuration:")
    for key, value in final_configs_dict.items():
         # Skip printing learning_rate here as it's used for optimizer, not model init
         if key != 'learning_rate':
             print(f"- {key}: {getattr(final_configs, key, 'N/A')}")

    # Recreate datasets with the best seq_len
    print(f"Recreating datasets with seq_len = {final_configs.seq_len}...")
    final_train_dataset = TimeSeriesDataset(train_df, TARGET, pjme_weather.columns.tolist(), final_configs.seq_len, final_configs.pred_len, final_configs.freq)
    final_val_dataset = TimeSeriesDataset(val_df, TARGET, pjme_weather.columns.tolist(), final_configs.seq_len, final_configs.pred_len, final_configs.freq)
    final_test_dataset = TimeSeriesDataset(test_df, TARGET, pjme_weather.columns.tolist(), final_configs.seq_len, final_configs.pred_len, final_configs.freq)

    # Use the same batch size as before, or potentially the one used in Optuna trial
    final_batch_size = 32
    final_train_loader = DataLoader(final_train_dataset, batch_size=final_batch_size, shuffle=True, num_workers=0)
    final_val_loader = DataLoader(final_val_dataset, batch_size=final_batch_size, shuffle=False, num_workers=0)
    final_test_loader = DataLoader(final_test_dataset, batch_size=final_batch_size, shuffle=False, num_workers=0)

    print(f"Final Datasets created: Train={len(final_train_dataset)}, Val={len(final_val_dataset)}, Test={len(final_test_dataset)}")

    # --- Initialize and Train Final Model ---
    final_model = Model(final_configs).to(device)
    final_criterion = nn.MSELoss()
    final_optimizer = optim.Adam(final_model.parameters(), lr=best_hyperparams['learning_rate'])

    # Use original training settings (epochs, patience)
    final_num_epochs = 50 # Original number of epochs
    final_patience = 5    # Original patience
    final_early_stopping = EarlyStopping(patience=final_patience, verbose=True, path='timexer_final_best_model.pt')

    print(f"\nStarting Final Training with {final_num_epochs} epochs and patience {final_patience}...")
    final_train_losses = []
    final_val_losses = []
    final_start_time = time.time()

    # --- Re-run the FULL Training Loop (copy/paste or refactor into a function) ---
    for epoch in range(final_num_epochs):
        epoch_start_time = time.time()
        final_model.train()
        running_loss = 0.0
        batch_count = 0
        for i, batch in enumerate(final_train_loader): 
            x_enc, x_mark_enc, x_dec, x_mark_dec, y_true = [b.to(device) for b in batch.values()]
            final_optimizer.zero_grad()
            outputs = final_model(x_enc, x_mark_enc, x_dec, x_mark_dec)
            loss = final_criterion(outputs, y_true)
            loss.backward()
            final_optimizer.step()
            running_loss += loss.item()
            batch_count += 1
        avg_train_loss = running_loss / batch_count if batch_count > 0 else float('inf')
        final_train_losses.append(avg_train_loss)

        # Validation
        final_model.eval()
        running_val_loss = 0.0
        val_batch_count = 0
        with torch.no_grad():
            for batch in final_val_loader:
                x_enc, x_mark_enc, x_dec, x_mark_dec, y_true = [b.to(device) for b in batch.values()]
                outputs = final_model(x_enc, x_mark_enc, x_dec, x_mark_dec)
                loss = final_criterion(outputs, y_true)
                running_val_loss += loss.item()
                val_batch_count += 1
        avg_val_loss = running_val_loss / val_batch_count if val_batch_count > 0 else float('inf')
        final_val_losses.append(avg_val_loss)

        epoch_time = time.time() - epoch_start_time
        print(f'Final Training Epoch {epoch+1}/{final_num_epochs}, Train Loss: {avg_train_loss:.6f}, Val Loss: {avg_val_loss:.6f}, Time: {epoch_time:.2f}s')

        # Early Stopping
        final_early_stopping(avg_val_loss, final_model)
        if final_early_stopping.early_stop:
            print("Final training early stopping triggered.")
            break
    # --- End of Training Loop ---

    total_final_training_time = time.time() - final_start_time
    print(f'\nFinal Training Finished. Total time: {total_final_training_time:.2f}s')

    # Load best model from final training
    print("Loading best model from final training...")
    try:
        final_model.load_state_dict(torch.load('timexer_final_best_model.pt'))
        print("Best final model loaded successfully.")
    except Exception as e:
         print(f"Warning: Could not load final best model checkpoint: {e}. Using model from last epoch.")


    # --- Final Evaluation on Test Set ---
    print("\nEvaluating final model on Test Set...")
    final_model.eval()
    final_predictions = []
    final_actuals = []
    with torch.no_grad():
        for batch in final_test_loader:
             x_enc, x_mark_enc, x_dec, x_mark_dec, y_true = [b.to(device) for b in batch.values()]
             outputs = final_model(x_enc, x_mark_enc, x_dec, x_mark_dec)
             preds_inv = target_scaler.inverse_transform(outputs.cpu().numpy().reshape(-1, 1))
             actuals_inv = target_scaler.inverse_transform(y_true.cpu().numpy().reshape(-1, 1))
             final_predictions.extend(preds_inv.flatten().tolist())
             final_actuals.extend(actuals_inv.flatten().tolist())

    final_predictions = np.array(final_predictions)
    final_actuals = np.array(final_actuals)

    if len(final_predictions) == len(final_actuals) and len(final_predictions) > 0:
        final_rmse = np.sqrt(mean_squared_error(final_actuals, final_predictions))
        final_mae = mean_absolute_error(final_actuals, final_predictions)
        final_mape = mean_absolute_percentage_error(final_actuals, final_predictions)

        print("\nFinal Model Evaluation Metrics on Test Set:")
        print(f"RMSE: {final_rmse:.4f}")
        print(f"MAE:  {final_mae:.4f}")
        print(f"MAPE: {final_mape:.4f}%")
    else:
         print("Error calculating final metrics.")

    # (Optional) Add final visualization code here if needed

else:
    print("\nSkipping final model retraining as best hyperparameters were not found or data is missing.")

Retraining final model using best hyperparameters found by Optuna...

Final Model Configuration:
- task_name: short_term_forecast
- features: MS
- pred_len: 24
- enc_in: 14
- c_exog: 13
- n_heads: 8
- e_layers: 2
- dropout: 0.1
- activation: gelu
- use_norm: True
- embed: timeF
- freq: h
- factor: 3
- d_model: 128
- seq_len: 168
- patch_len: 12
- d_ff: 512
Recreating datasets with seq_len = 168...
Reordered columns. Target 'PJME_MW' is now last.
Reordered columns. Target 'PJME_MW' is now last.
Reordered columns. Target 'PJME_MW' is now last.
Final Datasets created: Train=109095, Val=13470, Test=13470

Starting Final Training with 50 epochs and patience 5...
Final Training Epoch 1/50, Train Loss: 0.179199, Val Loss: 0.090186, Time: 79.89s
Validation loss decreased (inf --> 0.090186).  Saving model ...
Final Training Epoch 2/50, Train Loss: 0.148138, Val Loss: 0.084014, Time: 84.02s
Validation loss decreased (0.090186 --> 0.084014).  Saving model ...
Final Training Epoch 3/50, Train Loss: 0.138947, Val Loss: 0.085302, Time: 82.61s
EarlyStopping counter: 1 out of 5
Final Training Epoch 4/50, Train Loss: 0.133927, Val Loss: 0.080707, Time: 78.12s
Validation loss decreased (0.084014 --> 0.080707).  Saving model ...
Final Training Epoch 5/50, Train Loss: 0.130665, Val Loss: 0.079094, Time: 72.91s
Validation loss decreased (0.080707 --> 0.079094).  Saving model ...
Final Training Epoch 6/50, Train Loss: 0.128490, Val Loss: 0.069685, Time: 71.45s
Validation loss decreased (0.079094 --> 0.069685).  Saving model ...
Final Training Epoch 7/50, Train Loss: 0.126014, Val Loss: 0.074268, Time: 76.96s
EarlyStopping counter: 1 out of 5
Final Training Epoch 8/50, Train Loss: 0.124683, Val Loss: 0.071808, Time: 74.11s
EarlyStopping counter: 2 out of 5
Final Training Epoch 9/50, Train Loss: 0.122875, Val Loss: 0.077030, Time: 74.97s
EarlyStopping counter: 3 out of 5
Final Training Epoch 10/50, Train Loss: 0.121887, Val Loss: 0.074584, Time: 75.89s
EarlyStopping counter: 4 out of 5
Final Training Epoch 11/50, Train Loss: 0.120177, Val Loss: 0.072910, Time: 74.66s
EarlyStopping counter: 5 out of 5
Final training early stopping triggered.

Final Training Finished. Total time: 847.11s
Loading best model from final training...
Best final model loaded successfully.

Evaluating final model on Test Set...

Final Model Evaluation Metrics on Test Set:
RMSE: 1951.3168
MAE:  1362.5760
MAPE: 4.2847%

PJM Hourly Load Forecasting with TimeXer¶

Why Transformers supersede RNNs:¶

1. Model Methodologies and Feature Handling¶

0. Environment Setup & Imports¶

1. Data Loading & Feature Engineering¶

2. Train / Validation / Test Split¶

3. Data Scaling¶

4. Model Building Blocks¶

5. TimeXer Model Definition¶

6. Configuration & Hyperparameters¶

7. Dataset & DataLoader¶

8. Training Loop¶

9. Model Evaluation¶

10. Forecast Visualization¶

11. Hyperparameter Tuning with Optuna¶

12. Final Retraining with Best Hyperparameters¶

13. Comparison of Time Series Models for Hourly Energy Forecasting (PJM East)¶

1. Model Methodologies and Feature Handling¶

2. Strengths and Weaknesses¶

3. Discussion and User Preference¶

4. Next Steps¶