{ "cells": [ { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "from opt_einsum.paths import branch_1\n", "!apt-get update\n", "!apt-get install graphviz -y\n", "\n", "!pip install tensorflow\n", "!pip install numpy\n", "!pip install pandas\n", "\n", "!pip install keras\n", "!pip install scikit-learn\n", "!pip install matplotlib\n", "!pip install joblib\n", "!pip install pyarrow\n", "!pip install fastparquet\n", "!pip install scipy\n", "!pip install seaborn\n", "!pip install tqdm\n", "!pip install pydot\n", "!pip install tensorflow-io\n", "!pip install tensorflow-addons" ], "id": "5e0376433a89bbda" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "import tensorflow as tf\n", "from tensorflow.keras.layers import Dense, LSTM, MultiHeadAttention, Dropout, BatchNormalization, LayerNormalization, Input, Activation, Lambda, Bidirectional, Add, MaxPooling1D, SpatialDropout1D, GlobalAveragePooling1D, \\\n", " GlobalMaxPooling1D, Concatenate\n", "from tensorflow.keras import regularizers\n", "from tensorflow.keras.models import Model\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import RobustScaler\n", "from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau\n", "from tensorflow.keras.optimizers import AdamW\n", "import json\n", "from datetime import datetime\n", "import matplotlib.pyplot as plt\n", "from tensorflow.keras.utils import plot_model\n", "import tensorflow_addons as tfa\n", "import os\n", "import joblib\n", "import seaborn as sns\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, confusion_matrix\n", "\n", "folder_name = datetime.now().strftime(\"%Y-%m-%d_%H-%M\")\n", "\n", "random_state_value = None" ], "id": "4e7cf95955575047" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "def get_season(date):\n", " month = date.month\n", " day = date.day\n", " if (month == 12 and day >= 21) or (month <= 3 and day < 20):\n", " return 'Winter'\n", " elif (month == 3 and day >= 20) or (month <= 6 and day < 21):\n", " return 'Spring'\n", " elif (month == 6 and day >= 21) or (month <= 9 and day < 23):\n", " return 'Summer'\n", " elif (month == 9 and day >= 23) or (month <= 12 and day < 21):\n", " return 'Autumn'\n", " else:\n", " return 'Unknown'\n", "\n", "\n", "def get_time_period(hour):\n", " if 5 <= hour < 12:\n", " return 'Morning'\n", " elif 12 <= hour < 17:\n", " return 'Afternoon'\n", " elif 17 <= hour < 21:\n", " return 'Evening'\n", " else:\n", " return 'Night'\n", "\n", "\n", "def add_time_features(df):\n", " \"\"\"\n", " Add time-based features to the DataFrame.\n", " Works with both 'datetime' as column or index.\n", " \"\"\"\n", " # Se datetime è l'indice, lo usiamo direttamente\n", " if isinstance(df.index, pd.DatetimeIndex):\n", " datetime_col = df.index\n", " else:\n", " # Se datetime è una colonna, la convertiamo\n", " if 'datetime' in df.columns:\n", " datetime_col = pd.to_datetime(df['datetime'])\n", " else:\n", " raise ValueError(\"No datetime column or index found in DataFrame\")\n", "\n", " # Creazione delle feature temporali\n", " df['timestamp'] = datetime_col.astype(np.int64) // 10 ** 9\n", " df['year'] = datetime_col.year\n", " df['month'] = datetime_col.month\n", " df['day'] = datetime_col.day\n", " df['hour'] = datetime_col.hour\n", " df['minute'] = datetime_col.minute\n", " df['hour_sin'] = 
np.sin(datetime_col.hour * (2 * np.pi / 24))\n", " df['hour_cos'] = np.cos(datetime_col.hour * (2 * np.pi / 24))\n", " df['day_of_week'] = datetime_col.dayofweek\n", " df['day_of_year'] = datetime_col.dayofyear\n", " df['week_of_year'] = datetime_col.isocalendar().week.astype(int)\n", " df['quarter'] = datetime_col.quarter\n", " df['is_month_end'] = datetime_col.is_month_end.astype(int)\n", " df['is_quarter_end'] = datetime_col.is_quarter_end.astype(int)\n", " df['is_year_end'] = datetime_col.is_year_end.astype(int)\n", " df['month_sin'] = np.sin(datetime_col.month * (2 * np.pi / 12))\n", " df['month_cos'] = np.cos(datetime_col.month * (2 * np.pi / 12))\n", " df['day_of_year_sin'] = np.sin(datetime_col.dayofyear * (2 * np.pi / 365.25))\n", " df['day_of_year_cos'] = np.cos(datetime_col.dayofyear * (2 * np.pi / 365.25))\n", " df['season'] = datetime_col.map(get_season)\n", " df['time_period'] = datetime_col.hour.map(get_time_period)\n", "\n", " return df\n", "\n", "\n", "def add_solar_features(df):\n", " # Solar angle calculation\n", " df['solar_angle'] = np.sin(df['day_of_year'] * (2 * np.pi / 365.25)) * np.sin(df['hour'] * (2 * np.pi / 24))\n", "\n", " # Interactions between relevant features\n", " df['cloud_temp_interaction'] = df['cloudcover'] * df['temp']\n", " df['visibility_cloud_interaction'] = df['visibility'] * (100 - df['cloudcover'])\n", "\n", " # Derived features\n", " df['clear_sky_index'] = (100 - df['cloudcover']) / 100\n", " df['temp_gradient'] = df['temp'] - df['tempmin']\n", "\n", " return df\n", "\n", "\n", "def add_solar_specific_features(df):\n", " \"\"\"\n", " Aggiunge feature specifiche per la predizione della radiazione solare\n", " combinando caratteristiche astronomiche e meteorologiche\n", " \"\"\"\n", " # Caratteristiche astronomiche\n", " df['day_length'] = 12 + 3 * np.sin(2 * np.pi * (df['day_of_year'] - 81) / 365.25)\n", " df['solar_noon'] = 12 - df['hour']\n", " df['solar_elevation'] = np.sin(2 * np.pi * df['day_of_year'] / 365.25) * np.cos(2 * np.pi * df['solar_noon'] / 24)\n", "\n", " # Angolo solare teorico\n", " df['solar_angle'] = np.sin(df['hour_sin']) * np.sin(df['day_of_year_sin'])\n", "\n", " # Interazioni con condizioni atmosferiche\n", " df['cloud_elevation'] = df['cloudcover'] * df['solar_elevation']\n", " df['visibility_elevation'] = df['visibility'] * df['solar_elevation']\n", " df['uv_cloud_interaction'] = df['uvindex'] * (100 - df['cloudcover']) / 100\n", "\n", " # Indici di chiarezza e trasmissione\n", " df['clearness_index'] = (100 - df['cloudcover']) * df['visibility'] / 10000\n", " df['atmospheric_attenuation'] = (df['pressure'] / 1013.25) * (1 - (df['humidity'] / 100) * 0.6)\n", "\n", " # Radiazione teorica e attenuazione\n", " df['theoretical_radiation'] = df['solar_angle'].clip(0, 1) * 1000\n", " df['expected_radiation'] = df['theoretical_radiation'] * df['clearness_index']\n", "\n", " # Rolling features\n", " df['cloud_rolling_12h'] = df['cloudcover'].rolling(window=12).mean()\n", " df['temp_rolling_12h'] = df['temp'].rolling(window=12).mean()\n", " df['uv_rolling_12h'] = df['uvindex'].rolling(window=12).mean()\n", "\n", " # Interazioni temperatura-radiazione\n", " df['temp_radiation_potential'] = df['temp'] * df['solar_elevation']\n", "\n", " return df\n", "\n", "\n", "def add_radiation_energy_features(df):\n", " \"\"\"Adds specific features based on solarenergy and uvindex\"\"\"\n", "\n", " # Assicuriamoci che l'indice sia di tipo datetime\n", " if not isinstance(df.index, pd.DatetimeIndex):\n", " df.index = 
pd.to_datetime(df['datetime'])\n", "\n", " # Solar energy to UV ratio (independent from solarradiation)\n", " df['energy_uv_ratio'] = df['solarenergy'] / (df['uvindex'] + 1e-6)\n", "\n", " # Time aggregations\n", " # Moving averages\n", " windows = [3, 6, 12, 24] # hours\n", " for w in windows:\n", " df[f'energy_rolling_mean_{w}h'] = df['solarenergy'].rolling(window=w).mean()\n", " df[f'uv_rolling_mean_{w}h'] = df['uvindex'].rolling(window=w).mean()\n", "\n", " # Daily aggregations utilizzando datetime\n", " df['energy_daily_sum'] = df.groupby(df.index.date)['solarenergy'].transform('sum')\n", " df['uv_daily_max'] = df.groupby(df.index.date)['uvindex'].transform('max')\n", "\n", " # Changes\n", " df['energy_change'] = df['solarenergy'].diff()\n", " df['uv_change'] = df['uvindex'].diff()\n", "\n", " # Lag features\n", " lags = [1, 2, 3, 6, 12, 24] # hours\n", " for lag in lags:\n", " df[f'energy_lag_{lag}h'] = df['solarenergy'].shift(lag)\n", " df[f'uv_lag_{lag}h'] = df['uvindex'].shift(lag)\n", "\n", " # Peak indicators\n", " df['is_energy_peak'] = (df['solarenergy'] > df['energy_rolling_mean_6h'] * 1.2).astype(int)\n", " df['is_uv_peak'] = (df['uvindex'] > df['uv_rolling_mean_6h'] * 1.2).astype(int)\n", "\n", " # Aggiungiamo alcune metriche di volatilità\n", " df['energy_volatility'] = df['energy_change'].rolling(window=24).std()\n", " df['uv_volatility'] = df['uv_change'].rolling(window=24).std()\n", "\n", " # Indice di intensità solare composito\n", " df['solar_intensity_index'] = (df['solarenergy'] * df['uvindex']) / (df['cloudcover'] + 1e-6)\n", "\n", " # Interazioni\n", " df['uv_cloud_interaction'] = df['uvindex'] * (100 - df['cloudcover']) / 100\n", " df['energy_temp_interaction'] = df['solarenergy'] * df['temp']\n", "\n", " return df\n", "\n", "\n", "def add_advanced_features(df):\n", " \"\"\"\n", " Add all advanced features to the DataFrame\n", " Assumes df has a DatetimeIndex\n", " \"\"\"\n", " # Verifichiamo che abbiamo un DatetimeIndex\n", " if not isinstance(df.index, pd.DatetimeIndex):\n", " raise ValueError(\"DataFrame must have a DatetimeIndex\")\n", "\n", " # Existing features\n", " df = add_time_features(df)\n", " df = add_solar_features(df)\n", " df = add_solar_specific_features(df)\n", " df = add_radiation_energy_features(df)\n", "\n", " # Weather variable interactions\n", " df['temp_humidity'] = df['temp'] * df['humidity']\n", " df['temp_cloudcover'] = df['temp'] * df['cloudcover']\n", " df['visibility_cloudcover'] = df['visibility'] * df['cloudcover']\n", "\n", " # Derived features\n", " df['clear_sky_factor'] = (100 - df['cloudcover']) / 100\n", " df['temp_humidity_interaction'] = df['temp'] * df['humidity'] / 100\n", " df['atmospheric_transparency'] = (100 - df['cloudcover']) * (df['visibility'] / 10)\n", "\n", " # Rolling means\n", " df['temp_rolling_mean_6h'] = df['temp'].rolling(window=6).mean()\n", " df['cloudcover_rolling_mean_6h'] = df['cloudcover'].rolling(window=6).mean()\n", "\n", " # Lag features\n", " df['temp_1h_lag'] = df['temp'].shift(1)\n", " df['cloudcover_1h_lag'] = df['cloudcover'].shift(1)\n", " df['humidity_1h_lag'] = df['humidity'].shift(1)\n", "\n", " # Extreme conditions indicator\n", " df['extreme_conditions'] = ((df['temp'] > df['temp'].quantile(0.75)) &\n", " (df['humidity'] < df['humidity'].quantile(0.25))).astype(int)\n", "\n", " # One-hot encoding for categorical features\n", " df = pd.get_dummies(df, columns=['season', 'time_period'])\n", "\n", " return df\n", "\n", "\n", "def prepare_advanced_data(df):\n", " \"\"\"\n", " Prepare data 
for advanced modeling with proper datetime handling\n", " \"\"\"\n", " # Assicuriamoci che abbiamo una copia del DataFrame\n", " df = df.copy()\n", "\n", " # Verifichiamo se datetime è già l'indice\n", " if not isinstance(df.index, pd.DatetimeIndex):\n", " if 'datetime' in df.columns:\n", " df['datetime'] = pd.to_datetime(df['datetime'])\n", " df.set_index('datetime', inplace=True)\n", " else:\n", " raise ValueError(\"No datetime column or index found in DataFrame\")\n", "\n", " # Ordiniamo il DataFrame per datetime\n", " df = df.sort_index()\n", "\n", " # Apply feature engineering functions\n", " df = add_advanced_features(df)\n", "\n", " #all_columns = list(df.columns)\n", " #print(all_columns)\n", "\n", " features = {\n", " # Primary Features (strong direct correlation)\n", " 'primary_features': [\n", " 'uvindex', # Direct radiation indicator\n", " 'cloudcover', # Cloud coverage\n", " 'visibility', # Atmospheric transparency\n", " 'temp', # Temperature\n", " 'pressure', # Atmospheric pressure\n", " 'humidity', # Humidity\n", " ],\n", "\n", " # Astronomical and Temporal Features\n", " 'astronomical_features': [\n", " 'solar_elevation', # Solar elevation\n", " 'solar_angle', # Solar angle\n", " 'day_length', # Day length\n", " 'hour_sin', # Daily cycle\n", " 'hour_cos',\n", " 'day_of_year_sin', # Annual cycle\n", " 'day_of_year_cos',\n", " 'month_sin', # Monthly cycle\n", " 'month_cos',\n", " ],\n", "\n", " # Key Indices and Interactions\n", " 'key_interactions': [\n", " 'clear_sky_index', # Clear sky index\n", " 'atmospheric_attenuation', # Atmospheric attenuation\n", " 'theoretical_radiation', # Theoretical radiation\n", " 'expected_radiation', # Expected radiation\n", " 'cloud_elevation', # Cloud-elevation interaction\n", " 'visibility_elevation', # Visibility-elevation interaction\n", " 'uv_cloud_interaction', # UV-cloud interaction\n", " 'temp_radiation_potential', # Temperature-radiation potential\n", " ],\n", "\n", " # Rolling Features (temporal trends)\n", " 'rolling_features': [\n", " 'cloud_rolling_12h', # Cloud coverage moving average\n", " 'temp_rolling_12h', # Temperature moving average\n", " 'uv_rolling_12h', # UV moving average\n", " 'cloudcover_rolling_mean_6h',\n", " 'temp_rolling_mean_6h',\n", " ],\n", "\n", " # Lag Features (most recent)\n", " 'lag_features': [\n", " 'temp_1h_lag', # 1-hour temperature lag\n", " 'cloudcover_1h_lag', # 1-hour cloud coverage lag\n", " 'humidity_1h_lag', # 1-hour humidity lag\n", " 'uv_lag_1h', # 1-hour UV lag\n", " ],\n", "\n", " # Categorical Features\n", " 'categorical_features': [\n", " 'season_Spring', # Seasons\n", " 'season_Summer',\n", " 'season_Autumn',\n", " 'season_Winter',\n", " 'time_period_Morning', # Time periods\n", " 'time_period_Afternoon',\n", " 'time_period_Evening',\n", " 'time_period_Night',\n", " ]\n", " }\n", "\n", " final_features = [feature for group in features.values() for feature in group]\n", "\n", " # Handle missing values\n", " target_variables = ['solarradiation', 'solarenergy', 'uvindex']\n", " for column in final_features + target_variables:\n", " if column in df.columns:\n", " df[column] = df[column].interpolate(method='time')\n", " df.fillna(0, inplace=True)\n", "\n", " # Temporal split\n", " data_after_2010 = df[df['year'] >= 2010].copy()\n", " data_before_2010 = df[df['year'] < 2010].copy()\n", "\n", " X = data_after_2010[final_features]\n", " y = data_after_2010['solarradiation']\n", " X_to_predict = data_before_2010[final_features]\n", "\n", " # Train-test split\n", " X_train, X_test, y_train, 
y_test = train_test_split(\n", " X, y, test_size=0.2, random_state=42, shuffle=False\n", " )\n", "\n", " # Scaling\n", " feature_scaler = RobustScaler()\n", " X_train_scaled = feature_scaler.fit_transform(X_train)\n", " X_test_scaled = feature_scaler.transform(X_test)\n", " X_to_predict_scaled = feature_scaler.transform(X_to_predict)\n", "\n", " target_scaler = RobustScaler()\n", " y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))\n", " y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))\n", "\n", " # Print info about selected features\n", " print(\"\\nSelected features:\")\n", " print(f\"Number of features: {len(final_features)}\")\n", " print(\"Features list:\", final_features)\n", "\n", " return X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled, feature_scaler, target_scaler, final_features, X_to_predict_scaled\n", "\n", "\n", "def create_sequence_data(X, sequence_length=24):\n", " \"\"\"\n", " Converts data into sequences for LSTM input\n", " sequence_length represents how many previous hours to consider\n", " \"\"\"\n", " sequences = []\n", " for i in range(len(X) - sequence_length + 1):\n", " sequences.append(X[i:i + sequence_length])\n", " return np.array(sequences)\n", "\n", "\n", "def prepare_hybrid_data(df):\n", " X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled, feature_scaler, target_scaler, features, X_to_predict_scaled = prepare_advanced_data(df)\n", "\n", " # Convert data into sequences\n", " sequence_length = 24 # 24 hours of historical data\n", "\n", " X_train_seq = create_sequence_data(X_train_scaled, sequence_length)\n", " X_test_seq = create_sequence_data(X_test_scaled, sequence_length)\n", "\n", " # Adjust y by removing the first (sequence_length-1) elements\n", " y_train = y_train_scaled[sequence_length - 1:]\n", " y_test = y_test_scaled[sequence_length - 1:]\n", "\n", " X_to_predict_seq = create_sequence_data(X_to_predict_scaled, sequence_length)\n", "\n", " return X_train_seq, X_test_seq, y_train, y_test, feature_scaler, target_scaler, features, X_to_predict_seq" ], "id": "1f7b15beaf12c0eb" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "def create_residual_lstm_layer(x, units, dropout_rate, l2_reg=0.01, return_sequences=True, survival_probability=0.8):\n", " \"\"\"\n", " Creates a bidirectional LSTM layer with residual connections and regularization.\n", "\n", " Parameters:\n", " x: Input tensor\n", " units: Number of LSTM units\n", " dropout_rate: Dropout rate for regularization\n", " l2_reg: L2 regularization factor\n", " return_sequences: Whether to return sequences or just the last output\n", " survival_probability: Probability of layer survival for stochastic depth\n", " \"\"\"\n", " residual = x\n", " x = Bidirectional(LSTM(units, return_sequences=return_sequences, kernel_regularizer=regularizers.l2(l2_reg)))(x)\n", " x = LayerNormalization()(x)\n", " x = Dropout(dropout_rate)(x)\n", "\n", " if return_sequences:\n", " if int(residual.shape[-1]) != 2 * units:\n", " residual = Dense(2 * units, activation='linear')(residual)\n", " x = tfa.layers.StochasticDepth(survival_probability)([x, residual])\n", " return x\n", "\n", "\n", "def attention_block(x, units, num_heads=8, survival_probability=0.8):\n", " \"\"\"\n", " Creates a multi-head attention block with residual connections.\n", "\n", " Parameters:\n", " x: Input tensor\n", " units: Dimension of the key space\n", " num_heads: Number of attention heads\n", " survival_probability: Probability of 
layer survival for stochastic depth\n", " \"\"\"\n", " attention = MultiHeadAttention(num_heads=num_heads, key_dim=units)(x, x)\n", " x = tfa.layers.StochasticDepth(survival_probability)([x, attention])\n", " x = LayerNormalization()(x)\n", " return x\n", "\n", "\n", "def create_solarradiation_model(input_shape, folder_name, l2_lambda=0.005, min_output=0, max_output=1):\n", " \"\"\"\n", " Creates a deep learning model for solar radiation prediction using LSTM and attention mechanisms.\n", "\n", " Parameters:\n", " input_shape: Shape of input data\n", " folder_name: Directory to save model architecture visualization\n", " l2_lambda: L2 regularization factor\n", " \"\"\"\n", " inputs = Input(shape=input_shape)\n", "\n", " # Progressive hyperparameters for model architecture\n", " survival_probs = [0.9, 0.8, 0.7, 0.6] # Decreasing survival probabilities for deeper layers\n", " attention_survival_probs = [0.85, 0.75, 0.65, 0.55] # Survival probabilities for attention blocks\n", " lstm_units = [256, 128, 64, 32] # Decreasing number of units for LSTM layers\n", " dropout_rates = [0.4, 0.3, 0.2, 0.2] # Decreasing dropout rates\n", " attention_heads = [32, 24, 16, 8] # Decreasing number of attention heads\n", "\n", " lstm_blocks = 4\n", " # Main network architecture\n", " x = inputs\n", " for i in range(lstm_blocks):\n", " # LSTM layer with residual connections\n", " x = create_residual_lstm_layer(\n", " x,\n", " units=lstm_units[i],\n", " dropout_rate=dropout_rates[i],\n", " l2_reg=l2_lambda,\n", " return_sequences=True,\n", " survival_probability=survival_probs[i]\n", " )\n", " # Attention block\n", " x = attention_block(\n", " x,\n", " units=lstm_units[i],\n", " num_heads=attention_heads[i],\n", " survival_probability=attention_survival_probs[i]\n", " )\n", " if i < lstm_blocks - 1: # No pooling after last LSTM layer\n", " x = MaxPooling1D()(x)\n", "\n", " # Final LSTM layer for sequence aggregation\n", " x = create_residual_lstm_layer(\n", " x,\n", " units=32,\n", " dropout_rate=0.1,\n", " l2_reg=l2_lambda,\n", " return_sequences=False,\n", " survival_probability=0.6\n", " )\n", "\n", " # Dense layers for final prediction\n", " dense_units = [128, 64, 32]\n", " dense_dropout = [0.2, 0.1, 0.05]\n", "\n", " for units, dropout in zip(dense_units, dense_dropout):\n", " x = Dense(units, kernel_regularizer=regularizers.l2(l2_lambda))(x)\n", " x = BatchNormalization()(x)\n", " x = Activation('swish')(x)\n", " x = Dropout(dropout)(x)\n", "\n", " # Output layer with value clipping\n", " outputs = Dense(1)(x)\n", " outputs = Lambda(lambda x: tf.clip_by_value(x, min_output, max_output))(outputs)\n", "\n", " # Model compilation\n", " model = Model(inputs=inputs, outputs=outputs, name=\"SolarRadiationModel\")\n", "\n", " # Improved loss function\n", " def hybrid_focal_loss(y_true, y_pred):\n", " # MSE with focal weighting\n", " mse = tf.square(y_true - y_pred)\n", " error_ratio = tf.abs(y_true - y_pred) / (tf.abs(y_true) + 1.0)\n", " focal_weight = tf.pow(error_ratio, 2)\n", " weighted_mse = focal_weight * mse\n", "\n", " # MAE component\n", " mae = tf.abs(y_true - y_pred)\n", "\n", " return tf.reduce_mean(0.7 * weighted_mse + 0.3 * mae)\n", "\n", " # Custom metrics\n", " def rmse(y_true, y_pred):\n", " return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))\n", "\n", " def custom_mape(y_true, y_pred):\n", " epsilon = 1e-7\n", " diff = tf.abs((y_true - y_pred) / (y_true + epsilon))\n", " diff = tf.clip_by_value(diff, 0, 1)\n", " return tf.reduce_mean(diff) * 100\n", "\n", " # Learning rate schedule\n", 
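 " # NOTE: CosineDecayRestarts anneals the learning rate from 3e-4 along a cosine curve over the first 1000 optimizer steps,\n",
 " # then restarts; each cycle is twice as long (t_mul=2.0), each restart peaks 10% lower (m_mul=0.9), and the floor is\n",
 " # alpha * initial_learning_rate = 3e-9. Because the learning rate is a schedule object, the ReduceLROnPlateau callback\n",
 " # used in train_hybrid_model generally cannot override it (and may error if it ever triggers).\n",
 " # A hypothetical way to inspect the decay curve outside this function:\n",
 " #   probe = tf.keras.optimizers.schedules.CosineDecayRestarts(3e-4, 1000, t_mul=2.0, m_mul=0.9, alpha=1e-5)\n",
 " #   print([float(probe(step)) for step in (0, 250, 500, 999, 1000, 1500, 2999)])\n",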
" lr_schedule = tf.keras.optimizers.schedules.CosineDecayRestarts(\n", " initial_learning_rate=0.0003,\n", " first_decay_steps=1000,\n", " t_mul=2.0,\n", " m_mul=0.9,\n", " alpha=1e-5\n", " )\n", "\n", " # Optimizer\n", " optimizer = AdamW(\n", " learning_rate=lr_schedule,\n", " beta_1=0.9,\n", " beta_2=0.999,\n", " epsilon=1e-7,\n", " weight_decay=0.001,\n", " amsgrad=True\n", " )\n", "\n", " model.compile(\n", " optimizer=optimizer,\n", " loss=hybrid_focal_loss,\n", " metrics=['mse', 'mae', rmse, custom_mape]\n", " )\n", "\n", " model.summary()\n", "\n", " plot_model(model,\n", " to_file=f'{folder_name}_model_architecture.png',\n", " show_shapes=True,\n", " show_layer_names=True,\n", " dpi=150,\n", " show_layer_activations=True)\n", "\n", " return model\n", "\n", "\n", "def evaluate_solarradiation_predictions(y_true, y_pred, hour=None, folder_name=None):\n", " \"\"\"\n", " Comprehensive evaluation of solar radiation predictions with detailed analysis and visualizations.\n", "\n", " Parameters:\n", " -----------\n", " y_true : array-like\n", " Actual solar radiation values (W/m²)\n", " y_pred : array-like\n", " Predicted solar radiation values (W/m²)\n", " hour : array-like, optional\n", " Array of hours corresponding to predictions, for temporal analysis\n", " folder_name : str, optional\n", " Directory to save analysis plots\n", "\n", " Returns:\n", " --------\n", " dict\n", " Dictionary containing all calculated metrics\n", " \"\"\"\n", "\n", " # Data preparation\n", " y_true = np.array(y_true).ravel()\n", " y_pred = np.array(y_pred).ravel()\n", " errors = y_pred - y_true\n", "\n", " # Basic metrics calculation\n", " mae_raw = mean_absolute_error(y_true, y_pred)\n", " rmse_raw = np.sqrt(mean_squared_error(y_true, y_pred))\n", " r2_raw = r2_score(y_true, y_pred)\n", " mape = np.mean(np.abs((y_true - y_pred) / (y_true + 1e-7))) * 100\n", "\n", " # Error margin accuracy\n", " within_5_percent = np.mean(np.abs((y_pred - y_true) / (y_true + 1e-7)) <= 0.05)\n", " within_10_percent = np.mean(np.abs((y_pred - y_true) / (y_true + 1e-7)) <= 0.10)\n", " within_20_percent = np.mean(np.abs((y_pred - y_true) / (y_true + 1e-7)) <= 0.20)\n", "\n", " # Radiation level classification\n", " def get_radiation_level(value):\n", " if value <= 200:\n", " return 'Very Low'\n", " elif value <= 400:\n", " return 'Low'\n", " elif value <= 600:\n", " return 'Moderate'\n", " elif value <= 800:\n", " return 'High'\n", " elif value <= 1000:\n", " return 'Very High'\n", " else:\n", " return 'Extreme'\n", "\n", " # Calculate radiation levels\n", " y_true_levels = [get_radiation_level(v) for v in y_true]\n", " y_pred_levels = [get_radiation_level(v) for v in y_pred]\n", " level_accuracy = np.mean([t == p for t, p in zip(y_true_levels, y_pred_levels)])\n", "\n", " # Print main metrics\n", " print(\"\\nSolar Radiation Prediction Metrics:\")\n", " print(\"\\nAbsolute Metrics:\")\n", " print(f\"MAE: {mae_raw:.2f} W/m²\")\n", " print(f\"RMSE: {rmse_raw:.2f} W/m²\")\n", " print(f\"R² Score: {r2_raw:.3f}\")\n", " print(f\"MAPE: {mape:.2f}%\")\n", "\n", " print(\"\\nPercentage Accuracy:\")\n", " print(f\"Within ±5%: {within_5_percent * 100:.1f}%\")\n", " print(f\"Within ±10%: {within_10_percent * 100:.1f}%\")\n", " print(f\"Within ±20%: {within_20_percent * 100:.1f}%\")\n", "\n", " print(\"\\nLevel Accuracy:\")\n", " print(f\"Level Accuracy: {level_accuracy * 100:.1f}%\")\n", "\n", " # Confusion matrix for radiation levels\n", " cm = confusion_matrix(y_true_levels, y_pred_levels)\n", " print(\"\\nConfusion Matrix for Radiation 
Levels:\")\n", " cm_df = pd.DataFrame(\n", " cm,\n", " columns=['Very Low', 'Low', 'Moderate', 'High', 'Very High', 'Extreme'],\n", " index=['Very Low', 'Low', 'Moderate', 'High', 'Very High', 'Extreme']\n", " )\n", " print(cm_df)\n", "\n", " # Time period analysis\n", " if hour is not None:\n", " day_periods = {\n", " 'Morning (5-11)': (5, 11),\n", " 'Noon (11-13)': (11, 13),\n", " 'Afternoon (13-17)': (13, 17),\n", " 'Evening (17-21)': (17, 21),\n", " 'Night (21-5)': (21, 5)\n", " }\n", "\n", " print(\"\\nAnalysis by Time Period:\")\n", " for period, (start, end) in day_periods.items():\n", " if start < end:\n", " mask = (hour >= start) & (hour < end)\n", " else:\n", " mask = (hour >= start) | (hour < end)\n", "\n", " if np.any(mask):\n", " period_mae = mean_absolute_error(y_true[mask], y_pred[mask])\n", " period_mape = np.mean(np.abs((y_true[mask] - y_pred[mask]) / (y_true[mask] + 1e-7))) * 100\n", " print(f\"\\n{period}:\")\n", " print(f\"MAE: {period_mae:.2f} W/m²\")\n", " print(f\"MAPE: {period_mape:.2f}%\")\n", "\n", " # Visualizations\n", " if folder_name is not None:\n", " try:\n", "\n", " # Figure 1: Main analysis plots\n", " plt.figure(figsize=(20, 15))\n", "\n", " # Plot 1: Scatter plot of actual vs predicted values\n", " plt.subplot(3, 2, 1)\n", " plt.scatter(y_true, y_pred, alpha=0.5)\n", " plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)\n", " plt.xlabel('Actual Radiation (W/m²)')\n", " plt.ylabel('Predicted Radiation (W/m²)')\n", " plt.title('Actual vs Predicted Values')\n", " plt.grid(True)\n", "\n", " # Plot 2: Absolute error distribution\n", " plt.subplot(3, 2, 2)\n", " plt.hist(errors, bins=50, alpha=0.7)\n", " plt.xlabel('Prediction Error (W/m²)')\n", " plt.ylabel('Frequency')\n", " plt.title('Error Distribution')\n", " plt.grid(True)\n", "\n", " # Plot 3: Percentage error distribution\n", " plt.subplot(3, 2, 3)\n", " percentage_errors = ((y_pred - y_true) / (y_true + 1e-7)) * 100\n", " plt.hist(np.clip(percentage_errors, -100, 100), bins=50, alpha=0.7)\n", " plt.xlabel('Percentage Error (%)')\n", " plt.ylabel('Frequency')\n", " plt.title('Percentage Error Distribution')\n", " plt.grid(True)\n", "\n", " # Plot 4: Errors vs actual values\n", " plt.subplot(3, 2, 4)\n", " plt.scatter(y_true, errors, alpha=0.5)\n", " plt.axhline(y=0, color='r', linestyle='--')\n", " plt.xlabel('Actual Radiation (W/m²)')\n", " plt.ylabel('Error (W/m²)')\n", " plt.title('Errors vs Actual Values')\n", " plt.grid(True)\n", "\n", " # Plot 5: Error boxplot by radiation level\n", " plt.subplot(3, 2, 5)\n", " sns.boxplot(x=[get_radiation_level(v) for v in y_true], y=errors)\n", " plt.xticks(rotation=45)\n", " plt.xlabel('Radiation Level')\n", " plt.ylabel('Error (W/m²)')\n", " plt.title('Error Distribution by Level')\n", "\n", " # Plot 6: Confusion matrix heatmap\n", " plt.subplot(3, 2, 6)\n", " sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')\n", " plt.title('Confusion Matrix')\n", " plt.xticks(rotation=45)\n", " plt.yticks(rotation=45)\n", "\n", " plt.tight_layout()\n", " filename = f'{folder_name}_radiation_analysis.png'\n", " plt.savefig(filename, dpi=300, bbox_inches='tight')\n", " print(f\"\\nPlot saved as: {filename}\")\n", " plt.close()\n", "\n", " except Exception as e:\n", " print(f\"\\nError saving plots: {str(e)}\")\n", "\n", " # Additional error statistics\n", " print(\"\\nError Statistics:\")\n", " print(f\"Mean error: {np.mean(errors):.3f}\")\n", " print(f\"Error standard deviation: {np.std(errors):.3f}\")\n", " print(f\"Median error: 
{np.median(errors):.3f}\")\n", " print(f\"95th percentile absolute error: {np.percentile(np.abs(errors), 95):.3f}\")\n", "\n", " # Return structured metrics\n", " metrics = {\n", " 'absolute': {\n", " 'mae': mae_raw,\n", " 'rmse': rmse_raw,\n", " 'r2': r2_raw,\n", " 'mape': mape\n", " },\n", " 'percentage_accuracy': {\n", " 'within_5_percent': within_5_percent,\n", " 'within_10_percent': within_10_percent,\n", " 'within_20_percent': within_20_percent\n", " },\n", " 'categorical': {\n", " 'level_accuracy': level_accuracy\n", " },\n", " 'error_stats': {\n", " 'mean': float(np.mean(errors)),\n", " 'std': float(np.std(errors)),\n", " 'median': float(np.median(errors)),\n", " 'p95_abs': float(np.percentile(np.abs(errors), 95))\n", " }\n", " }\n", "\n", " return metrics\n", "\n", "\n", "def plot_training_history(history, folder_name=None):\n", " \"\"\"\n", " Visualize and save training loss and metrics plots\n", "\n", " Parameters:\n", " -----------\n", " history : tensorflow.keras.callbacks.History\n", " History object returned by model training\n", " folder_name : str\n", " Directory to save the plots and metrics\n", " \"\"\"\n", "\n", " try:\n", " # Create figure\n", " plt.figure(figsize=(12, 4))\n", "\n", " # Loss plot\n", " plt.subplot(1, 2, 1)\n", " plt.plot(history.history['loss'], label='Training Loss')\n", " plt.plot(history.history['val_loss'], label='Validation Loss')\n", " plt.title('Model Loss')\n", " plt.xlabel('Epoch')\n", " plt.ylabel('Loss')\n", " plt.legend()\n", " plt.grid(True)\n", "\n", " # MAE plot\n", " plt.subplot(1, 2, 2)\n", " plt.plot(history.history['mae'], label='Training MAE')\n", " plt.plot(history.history['val_mae'], label='Validation MAE')\n", " plt.title('Model MAE')\n", " plt.xlabel('Epoch')\n", " plt.ylabel('MAE')\n", " plt.legend()\n", " plt.grid(True)\n", "\n", " plt.tight_layout()\n", "\n", " if folder_name is not None:\n", " # Generate filename with timestamp\n", " filename = f'{folder_name}_training_history.png' # Rimossa parentesi extra\n", "\n", " # Save figure\n", " plt.savefig(filename, dpi=300, bbox_inches='tight')\n", " print(f\"\\nTraining history plot saved as: {filename}\")\n", "\n", " # Save numerical data in CSV format\n", " history_df = pd.DataFrame({\n", " 'epoch': range(1, len(history.history['loss']) + 1),\n", " 'training_loss': history.history['loss'],\n", " 'validation_loss': history.history['val_loss'],\n", " 'training_mae': history.history['mae'],\n", " 'validation_mae': history.history['val_mae']})\n", "\n", " if folder_name is not None:\n", " csv_filename = f'{folder_name}_training_history.csv' # Rimossa parentesi extra\n", " history_df.to_csv(csv_filename, index=False)\n", " print(f\"Training history data saved as: {csv_filename}\")\n", "\n", " # Calculate and save final statistics\n", " final_stats = {\n", " 'final_training_loss': history.history['loss'][-1],\n", " 'final_validation_loss': history.history['val_loss'][-1],\n", " 'final_training_mae': history.history['mae'][-1],\n", " 'final_validation_mae': history.history['val_mae'][-1],\n", " 'best_validation_loss': min(history.history['val_loss']),\n", " 'best_validation_mae': min(history.history['val_mae']),\n", " 'epochs': len(history.history['loss']),\n", " }\n", "\n", " if folder_name is not None:\n", " # Save statistics in JSON format\n", " stats_filename = f'{folder_name}_training_stats.json' # Rimossa parentesi extra\n", " with open(stats_filename, 'w') as f:\n", " json.dump(final_stats, f, indent=4)\n", " print(f\"Final statistics saved as: {stats_filename}\")\n", "\n", " 
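# NOTE: the keys read in this function ('loss', 'val_loss', 'mae', 'val_mae') exist because the model is compiled\n",
 " # with metrics=['mse', 'mae', rmse, custom_mape]; if that metrics list changes, update the keys used here and in\n",
 " # the CSV/JSON export above. A hypothetical sanity check before indexing: print(sorted(history.history.keys()))\n",
 " 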
# Print main statistics\n", " print(\"\\nFinal Training Statistics:\")\n", " print(f\"Final Loss (train/val): {final_stats['final_training_loss']:.4f}/{final_stats['final_validation_loss']:.4f}\")\n", " print(f\"Final MAE (train/val): {final_stats['final_training_mae']:.4f}/{final_stats['final_validation_mae']:.4f}\")\n", " print(f\"Best validation loss: {final_stats['best_validation_loss']:.4f}\")\n", " print(f\"Best validation MAE: {final_stats['best_validation_mae']:.4f}\")\n", "\n", " plt.show()\n", "\n", " except Exception as e:\n", " print(f\"\\nError during plot creation or saving: {str(e)}\")\n", "\n", "\n", "def train_hybrid_model(model, X_train, y_train, X_test, y_test, epochs=100, batch_size=32, folder_name='solarradiation'):\n", " \"\"\"\n", " Advanced training function for the hybrid solar radiation model\n", " \"\"\"\n", "\n", " def calculate_metrics(y_true, y_pred):\n", " \"\"\"Helper function to calculate metrics safely\"\"\"\n", " y_true = np.array(y_true).flatten()\n", " y_pred = np.array(y_pred).flatten()\n", "\n", " # Count out of range predictions\n", " out_of_range = np.sum((y_pred < 0) | (y_pred > 1500))\n", "\n", " # Calculate MAPE with clipping to avoid extreme values\n", " diff = np.abs((y_true - y_pred) / (y_true + 1e-7))\n", " diff = np.clip(diff, 0, 1) # Clip to maximum 100% error\n", " mape = np.mean(diff) * 100\n", "\n", " # Calculate accuracy within 10%\n", " within_10_percent = np.mean(diff <= 0.10) * 100\n", "\n", " # Calculate MAE and RMSE\n", " mae = np.mean(np.abs(y_true - y_pred))\n", " rmse = np.sqrt(np.mean(np.square(y_true - y_pred)))\n", "\n", " return out_of_range, mape, within_10_percent, mae, rmse\n", "\n", " callbacks = [\n", " EarlyStopping(\n", " monitor='val_loss',\n", " patience=15,\n", " restore_best_weights=True,\n", " mode='min',\n", " verbose=1,\n", " min_delta=1e-4\n", " ),\n", " ReduceLROnPlateau(\n", " monitor='val_loss',\n", " factor=0.2,\n", " patience=5,\n", " verbose=1,\n", " mode='min',\n", " min_delta=1e-4,\n", " cooldown=3,\n", " min_lr=1e-7\n", " ),\n", " tf.keras.callbacks.ModelCheckpoint(\n", " filepath=f'{folder_name}_best_model.h5',\n", " monitor='val_loss',\n", " save_best_only=True,\n", " mode='min',\n", " save_weights_only=False\n", " ),\n", " tf.keras.callbacks.TensorBoard(\n", " log_dir=f'./{folder_name}_logs',\n", " histogram_freq=1,\n", " write_graph=True,\n", " update_freq='epoch'\n", " ),\n", " tf.keras.callbacks.LambdaCallback(\n", " on_epoch_end=lambda epoch, logs: (\n", " print(f\"\\nEpoch {epoch + 1} Detailed Metrics:\") and\n", " (lambda: (\n", " y_pred := model.predict(X_test, verbose=0),\n", " metrics := calculate_metrics(y_test, y_pred),\n", " print(f\"Out of range: {metrics[0]} predictions\"),\n", " print(f\"MAPE: {metrics[1]:.2f}%\"),\n", " print(f\"Within ±10%: {metrics[2]:.2f}%\"),\n", " print(f\"MAE: {metrics[3]:.2f}\"),\n", " print(f\"RMSE: {metrics[4]:.2f}\")\n", " ))()\n", " ) if epoch % 5 == 0 else None\n", " )\n", " ]\n", "\n", " try:\n", " history = model.fit(\n", " X_train, y_train,\n", " validation_data=(X_test, y_test),\n", " epochs=epochs,\n", " batch_size=batch_size,\n", " callbacks=callbacks,\n", " verbose=1,\n", " shuffle=False\n", " )\n", "\n", " print(\"\\nTraining completed successfully!\")\n", "\n", " # Final evaluation\n", " final_pred = model.predict(X_test, verbose=0)\n", " metrics = calculate_metrics(y_test, final_pred)\n", "\n", " print(\"\\nFinal Model Performance:\")\n", " print(f\"Out of range predictions: {metrics[0]} ({metrics[0] / len(y_test) * 100:.2f}%)\")\n", " 
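# NOTE: y_test and final_pred are still in RobustScaler space here, so the MAE/RMSE reported below (and the 0-1500\n",
 " # out-of-range check in calculate_metrics) are in scaled units rather than W/m². A hypothetical conversion, assuming\n",
 " # the fitted target scaler is available in this scope:\n",
 " #   final_pred_wm2 = target_scaler.inverse_transform(np.asarray(final_pred).reshape(-1, 1))\n",
 " #   y_test_wm2 = target_scaler.inverse_transform(np.asarray(y_test).reshape(-1, 1))\n",
 " 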
print(f\"MAPE: {metrics[1]:.2f}%\")\n", " print(f\"Predictions within ±10%: {metrics[2]:.2f}%\")\n", " print(f\"MAE: {metrics[3]:.2f}\")\n", " print(f\"RMSE: {metrics[4]:.2f}\")\n", "\n", " plot_training_history(history, folder_name=folder_name)\n", "\n", " return history\n", "\n", " except Exception as e:\n", " print(f\"\\nError during training: {str(e)}\")\n", " raise\n", "\n", " finally:\n", " tf.keras.backend.clear_session()\n", "\n", "\n", "def integrate_predictions(df, predictions, sequence_length=24):\n", " \"\"\"\n", " Integrates solar radiation predictions into the original dataset for pre-2010 data.\n", "\n", " Parameters:\n", " -----------\n", " df : pandas.DataFrame\n", " Original dataset\n", " predictions : numpy.ndarray\n", " Array of solar radiation predictions\n", " sequence_length : int\n", " Sequence length used for predictions\n", "\n", " Returns:\n", " --------\n", " pandas.DataFrame\n", " Updated dataset with solar radiation predictions\n", " \"\"\"\n", " # Convert datetime to datetime format if not already\n", " df['datetime'] = pd.to_datetime(df['datetime'])\n", "\n", " # Identify pre-2010 rows\n", " mask_pre_2010 = df['datetime'].dt.year < 2010\n", "\n", " # Create temporary DataFrame with predictions\n", " dates_pre_2010 = df[mask_pre_2010]['datetime'].iloc[sequence_length - 1:]\n", " predictions_df = pd.DataFrame({\n", " 'datetime': dates_pre_2010,\n", " 'solarradiation_predicted': predictions.flatten()})\n", "\n", " # Merge with original dataset\n", " df = df.merge(predictions_df, on='datetime', how='left')\n", "\n", " # Update solar radiation column where missing\n", " df['solarradiation'] = df['solarradiation'].fillna(df['solarradiation_predicted'])\n", "\n", " # Remove temporary column\n", " df = df.drop('solarradiation_predicted', axis=1)\n", "\n", " print(f\"Added {len(predictions)} predictions to dataset\")\n", " print(f\"Rows with solar radiation after integration: {df['solarradiation'].notna().sum()}\")\n", "\n", " return df" ], "id": "93bc105a8dad086d" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "df = pd.read_parquet('../../sources/weather_data_uvindex.parquet')\n", "\n", "# Data preparation\n", "print(\"\\n1. 
Preparing data...\")\n", "X_train_seq, X_test_seq, y_train, y_test, feature_scaler, target_scaler, features, X_to_predict_seq = prepare_hybrid_data(df)\n", "\n", "print(f\"Training data shape: {X_train_seq.shape}\")\n", "print(f\"Test data shape: {X_test_seq.shape}\")\n", "\n", "# Save or load scalers and features\n", "feature_scaler_path = f'{folder_name}_feature_scaler.joblib'\n", "target_scaler_path = f'{folder_name}_target_scaler.joblib'\n", "features_path = f'{folder_name}_features.json'\n", "model_path = f'{folder_name}_best_model.h5'\n", "history_path = f'{folder_name}_training_history.json'\n", "\n", "if os.path.exists(feature_scaler_path):\n", " print(f\"Loading existing feature scaler from: {feature_scaler_path}\")\n", " feature_scaler = joblib.load(feature_scaler_path)\n", "else:\n", " print(f\"Saving feature scaler to: {feature_scaler_path}\")\n", " joblib.dump(feature_scaler, feature_scaler_path)\n", "\n", "if os.path.exists(target_scaler_path):\n", " print(f\"Loading existing target scaler from: {target_scaler_path}\")\n", " target_scaler = joblib.load(target_scaler_path)\n", "else:\n", " print(f\"Saving target scaler to: {target_scaler_path}\")\n", " joblib.dump(target_scaler, target_scaler_path)\n", "\n", "if os.path.exists(features_path):\n", " print(f\"Loading existing features from: {features_path}\")\n", " with open(features_path, 'r') as f:\n", " features = json.load(f)\n", "else:\n", " print(f\"Saving features to: {features_path}\")\n", " with open(features_path, 'w') as f:\n", " json.dump(features, f)\n", "\n", "# Data quality verification\n", "if np.isnan(X_train_seq).any() or np.isnan(y_train).any():\n", " raise ValueError(\"Found NaN values in training data\")" ], "id": "8f346ed2fea89ac5" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "# Model creation\n", "print(\"\\n2. Creating model...\")\n", "input_shape = (X_train_seq.shape[1], X_train_seq.shape[2])\n", "\n", "MAX_RADIATION = 885\n", "\n", "min_val_scaled = target_scaler.transform([[0]])[0][0]\n", "max_val_scaled = target_scaler.transform([[MAX_RADIATION]])[0][0]\n", "\n", "model = create_solarradiation_model(input_shape=input_shape, folder_name=folder_name, min_output=min_val_scaled, max_output=max_val_scaled)\n", "\n", "print(\"\\n3. Starting training...\")\n", "history = train_hybrid_model(\n", " model=model,\n", " X_train=X_train_seq,\n", " y_train=y_train,\n", " X_test=X_test_seq,\n", " y_test=y_test,\n", " epochs=100,\n", " batch_size=192,\n", " folder_name=folder_name\n", ")" ], "id": "38a2ad01e7fd683c" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "print(\"\\n4. Generating predictions...\")\n", "predictions = model.predict(X_test_seq)\n", "# Keep predictions inside the scaled radiation range (matches the clipping applied by the output Lambda layer)\n", "predictions = np.clip(predictions, min_val_scaled, max_val_scaled)\n", "\n", "predictions_original = target_scaler.inverse_transform(predictions)\n", "y_test_original = target_scaler.inverse_transform(y_test)\n", "\n", "print(\"\\n5. Evaluating model...\")\n", "metrics = evaluate_solarradiation_predictions(y_test_original, predictions_original, folder_name=folder_name)\n", "\n", "# Create results dictionary\n", "training_results = {\n", " 'model_params': {\n", " 'input_shape': input_shape,\n", " 'n_features': len(features),\n", " 'sequence_length': X_train_seq.shape[1]\n", " },\n", " 'training_params': {\n", " 'batch_size': 192,\n", " 'total_epochs': len(history.history['loss']),\n", " 'best_epoch': np.argmin(history.history['val_loss']) + 1\n", " },\n", " 'performance_metrics': {\n", " 'final_loss': float(history.history['val_loss'][-1]),\n", " 'final_mae': float(history.history['val_mae'][-1]),\n", " 'best_val_loss': float(min(history.history['val_loss'])),\n", " 'out_of_range_predictions': int(np.sum((predictions_original < 0) | (predictions_original > MAX_RADIATION)))\n", " }\n", "}" ], "id": "d9d0c8e71043aa2e" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "print(\"\\n6. Predicting missing data...\")\n", "to_predict_predictions = model.predict(X_to_predict_seq)\n", "# Bring the predictions back to W/m² before merging them with the observed data\n", "to_predict_predictions = target_scaler.inverse_transform(to_predict_predictions)\n", "to_predict_predictions = np.clip(to_predict_predictions, 0, MAX_RADIATION)\n", "\n", "print(\"\\n7. Integrating predictions into original dataset...\")\n", "df_updated = integrate_predictions(df.copy(), to_predict_predictions)\n", "\n", "df_updated.to_parquet('../../sources/weather_data_solarradiation.parquet')\n", "\n", "# Add prediction statistics to training_results\n", "training_results['prediction_stats'] = {\n", " 'n_predictions_added': len(to_predict_predictions),\n", " 'mean_predicted_solarradiation': float(to_predict_predictions.mean()),\n", " 'min_predicted_solarradiation': float(to_predict_predictions.min()),\n", " 'max_predicted_solarradiation': float(to_predict_predictions.max()),\n", "}\n", "\n", "print(\"\\nTraining completed successfully!\")" ], "id": "de61df609e8053a2" }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [ "def plot_error_analysis(y_true, y_pred, folder_name=None):\n", " \"\"\"\n", " Function to visualize prediction error analysis\n", "\n", " Parameters:\n", " -----------\n", " y_true : array-like\n", " Actual values\n", " y_pred : array-like\n", " Predicted values\n", " folder_name : str, optional\n", " Directory to save plots. 
If None, plots are only displayed\n", "\n", " Generates:\n", " ----------\n", " - Error distribution histogram\n", " - Actual vs Predicted scatter plot\n", " - Errors vs Actual Values scatter plot\n", " - Comprehensive error statistics\n", " \"\"\"\n", "\n", " # Convert to 1D numpy arrays if needed\n", " if isinstance(y_true, pd.Series):\n", " y_true = y_true.values\n", " if isinstance(y_pred, pd.Series):\n", " y_pred = y_pred.values\n", "\n", " y_true = y_true.ravel()\n", " y_pred = y_pred.ravel()\n", "\n", " # Calculate errors\n", " errors = y_pred - y_true\n", "\n", " # Create main figure\n", " fig = plt.figure(figsize=(15, 5))\n", "\n", " # Plot 1: Error Distribution\n", " plt.subplot(1, 3, 1)\n", " plt.hist(errors, bins=50, alpha=0.7)\n", " plt.title('Prediction Error Distribution')\n", " plt.xlabel('Error')\n", " plt.ylabel('Frequency')\n", "\n", " # Plot 2: Actual vs Predicted\n", " plt.subplot(1, 3, 2)\n", " plt.scatter(y_true, y_pred, alpha=0.5)\n", " plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)\n", " plt.title('Actual vs Predicted Values')\n", " plt.xlabel('Actual Values')\n", " plt.ylabel('Predicted Values')\n", "\n", " # Plot 3: Errors vs Actual Values\n", " plt.subplot(1, 3, 3)\n", " plt.scatter(y_true, errors, alpha=0.5)\n", " plt.axhline(y=0, color='r', linestyle='--')\n", " plt.title('Errors vs Actual Values')\n", " plt.xlabel('Actual Values')\n", " plt.ylabel('Error')\n", "\n", " plt.tight_layout()\n", "\n", " # Save plot if directory is specified\n", " if folder_name is not None:\n", " try:\n", " # Create directory if it doesn't exist\n", " filename = f'{folder_name}_error_analysis.png'\n", "\n", " # Save figure\n", " plt.savefig(filename, dpi=300, bbox_inches='tight')\n", " print(f\"\\nPlot saved as: {filename}\")\n", " except Exception as e:\n", " print(f\"\\nError saving plot: {str(e)}\")\n", "\n", " plt.show()\n", "\n", " # Print error statistics\n", " print(\"\\nError Statistics:\")\n", " print(f\"MAE: {np.mean(np.abs(errors)):.4f}\")\n", " print(f\"MSE: {np.mean(errors ** 2):.4f}\")\n", " print(f\"RMSE: {np.sqrt(np.mean(errors ** 2)):.4f}\")\n", " print(f\"Mean error: {np.mean(errors):.4f}\")\n", " print(f\"Error std: {np.std(errors):.4f}\")\n", "\n", " # Calculate percentage of errors within thresholds\n", " thresholds = [0.5, 1.0, 1.5, 2.0]\n", " for threshold in thresholds:\n", " within_threshold = np.mean(np.abs(errors) <= threshold) * 100\n", " print(f\"Predictions within ±{threshold}: {within_threshold:.1f}%\")\n", "\n", "\n", "# Example usage\n", "plot_error_analysis(y_test, predictions, folder_name=folder_name)" ], "id": "8a924adc70eb2f30" } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0rc1" } }, "nbformat": 4, "nbformat_minor": 5 }