diff --git a/.dvc/config b/.dvc/config
index e69de29..ae5faaa 100644
--- a/.dvc/config
+++ b/.dvc/config
@@ -0,0 +1,6 @@
+[core]
+ autostage = true
+ remote = storage
+['remote "storage"']
+ url = s3://olive-oil-dataset
+ region = eu-west-1
diff --git a/.idea/.gitignore b/.idea/.gitignore
old mode 100644
new mode 100755
diff --git a/.idea/.name b/.idea/.name
old mode 100644
new mode 100755
index 365adbf..de4f641
--- a/.idea/.name
+++ b/.idea/.name
@@ -1 +1 @@
-weather_data.parquet
\ No newline at end of file
+Tesi Pegaso
\ No newline at end of file
diff --git a/.idea/TesiPegaso.iml b/.idea/TesiPegaso.iml
old mode 100644
new mode 100755
diff --git a/.idea/csv-editor.xml b/.idea/csv-editor.xml
new file mode 100644
index 0000000..00bb426
--- /dev/null
+++ b/.idea/csv-editor.xml
@@ -0,0 +1,16 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
old mode 100644
new mode 100755
diff --git a/.idea/misc.xml b/.idea/misc.xml
old mode 100644
new mode 100755
diff --git a/.idea/modules.xml b/.idea/modules.xml
old mode 100644
new mode 100755
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
old mode 100644
new mode 100755
diff --git a/elaborato_tesi_1_6.pdf b/elaborato_tesi_1_6.pdf
old mode 100644
new mode 100755
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000..f848d91
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1 @@
+/sources
diff --git a/src/README.md b/src/README.md
old mode 100644
new mode 100755
index 6040964..ec863e6
--- a/src/README.md
+++ b/src/README.md
@@ -1 +1,4 @@
python -m olive_oil_train_dataset.create_train_dataset --random-seed 42 --num-simulations 100000 --batch-size 10000 --max-workers 7
+
+
+python -m weather.uv_index.uv_index_model
diff --git a/src/__init__.py b/src/__init__.py
old mode 100644
new mode 100755
diff --git a/src/__pycache__/__init__.cpython-39.pyc b/src/__pycache__/__init__.cpython-39.pyc
old mode 100644
new mode 100755
diff --git a/src/dashboard/__pycache__/environmental_simulator.cpython-39.pyc b/src/dashboard/__pycache__/environmental_simulator.cpython-39.pyc
old mode 100644
new mode 100755
diff --git a/src/dashboard/environmental_simulator.py b/src/dashboard/environmental_simulator.py
old mode 100644
new mode 100755
diff --git a/src/models/olive_oli/olive_oil.ipynb b/src/models/olive_oli/olive_oil.ipynb
index f75fec7..fd89e32 100644
--- a/src/models/olive_oli/olive_oil.ipynb
+++ b/src/models/olive_oli/olive_oil.ipynb
@@ -315,36 +315,19 @@
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
- "import seaborn as sns\n",
- "from sklearn.model_selection import train_test_split\n",
- "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
- "from tensorflow.keras.layers import Input, Dense, Dropout, Bidirectional, LSTM, LayerNormalization, Add, Activation, BatchNormalization, MultiHeadAttention, MaxPooling1D, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, \\\n",
- " Concatenate, ZeroPadding1D, Lambda, AveragePooling1D, concatenate\n",
- "from tensorflow.keras.layers import Dense, LSTM, Conv1D, Input, concatenate, Dropout, BatchNormalization, GlobalAveragePooling1D, Bidirectional, TimeDistributed, Attention, MultiHeadAttention\n",
+ "from sklearn.preprocessing import StandardScaler\n",
"import tensorflow_addons as tfa\n",
- "from tensorflow.keras.models import Model\n",
- "from tensorflow.keras.regularizers import l2\n",
- "from tensorflow.keras.optimizers import Adam\n",
- "from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint\n",
"from datetime import datetime\n",
"import os\n",
- "import json\n",
"import joblib\n",
"import re\n",
- "import pyarrow as pa\n",
- "import pyarrow.parquet as pq\n",
- "from tqdm import tqdm\n",
- "from concurrent.futures import ProcessPoolExecutor, as_completed\n",
- "from functools import partial\n",
- "import psutil\n",
- "import multiprocessing\n",
- "from typing import List, Dict\n",
+ "from typing import List\n",
"\n",
- "random_state_value = 42\n",
+ "random_state_value = None\n",
"execute_name = datetime.now().strftime(\"%Y-%m-%d_%H-%M\")\n",
"\n",
"base_project_dir = './'\n",
- "data_dir = '../sources/'\n",
+ "data_dir = '../../sources/'\n",
"models_project_dir = base_project_dir\n",
"\n",
"os.makedirs(base_project_dir, exist_ok=True)\n",
@@ -823,16 +806,18 @@
"\n",
" # Split dei dati (usando indici casuali per una migliore distribuzione)\n",
" indices = np.random.permutation(len(X_temporal))\n",
- " #train_idx = int(len(indices) * 0.7)\n",
- " #val_idx = int(len(indices) * 0.85)\n",
"\n",
- " train_idx = int(len(indices) * 0.65) # 65% training\n",
- " val_idx = int(len(indices) * 0.85) # 20% validation\n",
+ " #train_idx = int(len(indices) * 0.7) # 70% training\n",
+ " #val_idx = int(len(indices) * 0.85) # 15% validation\n",
" # Il resto rimane 15% test\n",
"\n",
- " # Oppure versione con 25% validation:\n",
- " #train_idx = int(len(indices) * 0.60) # 60% training\n",
- " #val_idx = int(len(indices) * 0.85) # 25% validation\n",
+ " train_idx = int(len(indices) * 0.65) # 65% training\n",
+ " val_idx = int(len(indices) * 0.85) # 20% validation\n",
+ " # Il resto rimane 15% test\n",
+ "\n",
+ " #train_idx = int(len(indices) * 0.60) # 60% training\n",
+ " #val_idx = int(len(indices) * 0.85) # 25% validation\n",
+ " # Il resto rimane 15% test\n",
"\n",
" train_indices = indices[:train_idx]\n",
" val_indices = indices[train_idx:val_idx]\n",
diff --git a/src/models/solarenergy/solarenergy_model.ipynb b/src/models/solarenergy/solarenergy_model.ipynb
index b91baef..5545abb 100755
--- a/src/models/solarenergy/solarenergy_model.ipynb
+++ b/src/models/solarenergy/solarenergy_model.ipynb
@@ -5,7 +5,6 @@
"id": "8adcbe0819b88578",
"metadata": {},
"source": [
- "'''\n",
"from opt_einsum.paths import branch_1\n",
"!apt-get update\n",
"!apt-get install graphviz -y\n",
@@ -25,8 +24,7 @@
"!pip install tqdm\n",
"!pip install pydot\n",
"!pip install tensorflow-io\n",
- "!pip install tensorflow-addons\n",
- "'''"
+ "!pip install tensorflow-addons\n"
],
"outputs": [],
"execution_count": null
@@ -43,14 +41,12 @@
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
- "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
- "from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.preprocessing import RobustScaler\n",
"from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau\n",
"from tensorflow.keras.optimizers import AdamW\n",
"import json\n",
"from datetime import datetime\n",
"import matplotlib.pyplot as plt\n",
- "from sklearn.metrics import confusion_matrix\n",
"from tensorflow.keras.utils import plot_model\n",
"import tensorflow_addons as tfa\n",
"\n",
@@ -119,200 +115,148 @@
"\n",
"\n",
"def add_solar_features(df):\n",
- " # Calcolo dell'angolo solare\n",
- " df['solar_angle'] = np.sin(df['day_of_year'] * (2 * np.pi / 365.25)) * np.sin(df['hour'] * (2 * np.pi / 24))\n",
+ " # Features based only on radiation and other available variables\n",
+ " df['solar_elevation'] = np.sin(df['day_of_year'] * (2 * np.pi / 365.25)) * np.sin(df['hour'] * (2 * np.pi / 24))\n",
"\n",
- " # Interazioni tra features rilevanti\n",
- " df['cloud_temp_interaction'] = df['cloudcover'] * df['temp']\n",
- " df['visibility_cloud_interaction'] = df['visibility'] * (100 - df['cloudcover'])\n",
+ " # Radiation-specific features\n",
+ " df['radiation_clearsky'] = df['solarradiation'] * (100 - df['cloudcover']) / 100\n",
"\n",
- " # Feature derivate\n",
+ " # Temperature impact on theoretical efficiency\n",
+ " df['temp_efficiency_factor'] = 1 - 0.004 * (df['temp'] - 25) # Typical temperature coefficient\n",
+ "\n",
+ " # Combined features\n",
+ " df['cloud_impact'] = df['cloudcover'] * df['solarradiation']\n",
+ " df['visibility_radiation'] = df['visibility'] * df['solarradiation']\n",
" df['clear_sky_index'] = (100 - df['cloudcover']) / 100\n",
- " df['temp_gradient'] = df['temp'] - df['tempmin']\n",
+ " df['temp_effect'] = df['temp'] - df['tempmin']\n",
"\n",
" return df\n",
"\n",
- "\n",
"def add_solar_specific_features(df):\n",
- " # Angolo solare e durata del giorno\n",
+ " # Solar position and theoretical calculations\n",
" df['day_length'] = 12 + 3 * np.sin(2 * np.pi * (df['day_of_year'] - 81) / 365.25)\n",
- " df['solar_noon'] = 12 - df['hour']\n",
- " df['solar_elevation'] = np.sin(2 * np.pi * df['day_of_year'] / 365.25) * np.cos(2 * np.pi * df['solar_noon'] / 24)\n",
+ " df['solar_noon_distance'] = np.abs(12 - df['hour'])\n",
+ " df['solar_potential'] = df['clear_sky_index'] * np.cos(df['solar_noon_distance'] * np.pi / 12)\n",
"\n",
- " # Interazioni\n",
- " df['cloud_elevation'] = df['cloudcover'] * df['solar_elevation']\n",
- " df['visibility_elevation'] = df['visibility'] * df['solar_elevation']\n",
+ " # Rolling features for radiation\n",
+ " windows = [3, 6, 12]\n",
+ " for w in windows:\n",
+ " df[f'radiation_rolling_{w}h'] = df['solarradiation'].rolling(window=w).mean()\n",
"\n",
- " # Rolling features con finestre più ampie\n",
- " df['cloud_rolling_12h'] = df['cloudcover'].rolling(window=12).mean()\n",
- " df['temp_rolling_12h'] = df['temp'].rolling(window=12).mean()\n",
+ " # Theoretical radiation features\n",
+ " df['theoretical_radiation'] = df['solarradiation'] / (df['clear_sky_index'] + 1e-6)\n",
"\n",
" return df\n",
"\n",
- "\n",
"def add_radiation_energy_features(df):\n",
- " \"\"\"Aggiunge feature specifiche basate su solarenergy e uvindex\"\"\"\n",
- "\n",
- " # Rapporto tra energia solare e UV (indipendente da solarradiation)\n",
- " df['energy_uv_ratio'] = df['solarenergy'] / (df['uvindex'] + 1e-6)\n",
- "\n",
- " # Aggregazioni temporali\n",
- " # Medie mobili\n",
- " windows = [3, 6, 12, 24] # ore\n",
+ " # Features based only on radiation\n",
+ " windows = [3, 6, 12]\n",
" for w in windows:\n",
- " df[f'energy_rolling_mean_{w}h'] = df['solarenergy'].rolling(window=w).mean()\n",
- " df[f'uv_rolling_mean_{w}h'] = df['uvindex'].rolling(window=w).mean()\n",
+ " # Radiation features\n",
+ " df[f'radiation_rolling_mean_{w}h'] = df['solarradiation'].rolling(window=w).mean()\n",
+ " df[f'radiation_rolling_std_{w}h'] = df['solarradiation'].rolling(window=w).std()\n",
"\n",
- " # Aggregazioni giornaliere\n",
- " df['energy_daily_sum'] = df.groupby(df.index.date)['solarenergy'].transform('sum')\n",
- " df['uv_daily_max'] = df.groupby(df.index.date)['uvindex'].transform('max')\n",
+ " # Daily aggregations for radiation\n",
+ " df['radiation_daily_sum'] = df.groupby(df.index.date)['solarradiation'].transform('sum')\n",
+ " df['radiation_daily_max'] = df.groupby(df.index.date)['solarradiation'].transform('max')\n",
"\n",
- " # Variazioni\n",
- " df['energy_change'] = df['solarenergy'].diff()\n",
- " df['uv_change'] = df['uvindex'].diff()\n",
- "\n",
- " # Lag features\n",
- " lags = [1, 2, 3, 6, 12, 24] # ore\n",
+ " # Lag features for radiation\n",
+ " lags = [1, 2, 3, 6]\n",
" for lag in lags:\n",
- " df[f'energy_lag_{lag}h'] = df['solarenergy'].shift(lag)\n",
- " df[f'uv_lag_{lag}h'] = df['uvindex'].shift(lag)\n",
- "\n",
- " # Indicatori di picco\n",
- " df['is_energy_peak'] = (df['solarenergy'] > df['energy_rolling_mean_6h'] * 1.2).astype(int)\n",
- " df['is_uv_peak'] = (df['uvindex'] > df['uv_rolling_mean_6h'] * 1.2).astype(int)\n",
+ " df[f'radiation_lag_{lag}h'] = df['solarradiation'].shift(lag)\n",
"\n",
" return df\n",
"\n",
- "\n",
"def add_advanced_features(df):\n",
- " # Features esistenti\n",
" df = add_time_features(df)\n",
" df = add_solar_features(df)\n",
" df = add_solar_specific_features(df)\n",
" df = add_radiation_energy_features(df)\n",
"\n",
- " if not isinstance(df.index, pd.DatetimeIndex):\n",
- " df.index = pd.to_datetime(df.index)\n",
- "\n",
- " # One-hot encoding per le feature categoriche\n",
" df = pd.get_dummies(df, columns=['season', 'time_period'])\n",
"\n",
- " # Interazioni tra variabili meteorologiche\n",
- " df['temp_humidity'] = df['temp'] * df['humidity']\n",
- " df['temp_cloudcover'] = df['temp'] * df['cloudcover']\n",
- " df['visibility_cloudcover'] = df['visibility'] * df['cloudcover']\n",
- "\n",
- " # Features derivate per la radiazione solare\n",
- " df['clear_sky_factor'] = (100 - df['cloudcover']) / 100\n",
- " df['day_length'] = np.sin(df['day_of_year_sin']) * 12 + 12 # approssimazione della durata del giorno\n",
- "\n",
- " # Lag features\n",
- " df['temp_1h_lag'] = df['temp'].shift(1)\n",
- " df['cloudcover_1h_lag'] = df['cloudcover'].shift(1)\n",
- " df['humidity_1h_lag'] = df['humidity'].shift(1)\n",
- "\n",
- " # Rolling means\n",
- " df['temp_rolling_mean_6h'] = df['temp'].rolling(window=6).mean()\n",
- " df['cloudcover_rolling_mean_6h'] = df['cloudcover'].rolling(window=6).mean()\n",
- "\n",
" df['temp_humidity_interaction'] = df['temp'] * df['humidity'] / 100\n",
+ " df['radiation_temp_interaction'] = df['solarradiation'] * df['temp_efficiency_factor']\n",
+ " df['radiation_cloud_interaction'] = df['solarradiation'] * (1 - df['cloudcover']/100)\n",
"\n",
- " # Indicatore di condizioni estreme\n",
- " df['extreme_conditions'] = ((df['temp'] > df['temp'].quantile(0.75)) & (df['humidity'] < df['humidity'].quantile(0.25))).astype(int)\n",
- "\n",
- " # Feature composite per la trasparenza atmosferica\n",
- " df['atmospheric_transparency'] = (100 - df['cloudcover']) * (df['visibility'] / 10)\n",
- "\n",
- " # Indicatori temporali più granulari per mezze stagioni\n",
- " df['is_transition_season'] = ((df['season_Spring'] | df['season_Autumn'])).astype(int)\n",
- "\n",
- " # Interazione tra angolo solare e copertura nuvolosa normalizzata\n",
- " df['solar_cloud_effect'] = df['solar_elevation'] * (100 - df['cloudcover']) / 100\n",
- "\n",
- " # Indicatore di stabilità atmosferica\n",
- " df['pressure_stability'] = df.groupby(df.index.date if isinstance(df.index, pd.DatetimeIndex)\n",
- " else df.index.to_series().dt.date)['pressure'].transform(\n",
- " lambda x: x.std()\n",
- " ).fillna(0)\n",
+ " # Theoretical maximum based on clear sky conditions\n",
+ " df['theoretical_max_radiation'] = df['solarradiation'] / (df['clear_sky_index'] + 1e-6)\n",
"\n",
" return df\n",
"\n",
"\n",
"def prepare_advanced_data(df):\n",
- " # Applicazione delle funzioni di feature engineering\n",
+ " # Apply feature engineering functions\n",
" df = add_advanced_features(df)\n",
"\n",
" target_variables = ['solarradiation', 'solarenergy', 'uvindex']\n",
"\n",
- " # Feature selection aggiornata (senza dipendenze da solarradiation)\n",
" selected_features = [\n",
- " # Features meteorologiche base\n",
- " 'temp', 'humidity', 'cloudcover', 'visibility', 'pressure',\n",
+ " # Weather features\n",
+ " 'temp', 'humidity', 'cloudcover', 'visibility',\n",
+ " 'temp_effect',\n",
"\n",
- " # Features solari\n",
- " 'zenith_angle', 'air_mass', 'atmospheric_transmission',\n",
- " 'cloud_transmission', 'theoretical_radiation',\n",
+ " # Solar radiation features\n",
+ " 'solarradiation',\n",
+ " 'radiation_clearsky',\n",
+ " 'radiation_rolling_mean_3h',\n",
+ " 'radiation_rolling_mean_6h',\n",
+ " 'radiation_daily_sum',\n",
+ " 'radiation_daily_max',\n",
+ " 'radiation_lag_1h',\n",
+ " 'radiation_lag_3h',\n",
"\n",
- " # Features temporali\n",
- " 'hour_sin', 'hour_cos', 'month_sin', 'month_cos',\n",
+ " # Temperature efficiency\n",
+ " 'temp_efficiency_factor',\n",
+ "\n",
+ " # Time features\n",
+ " 'hour_sin', 'hour_cos',\n",
+ " 'month_sin', 'month_cos',\n",
" 'day_of_year_sin', 'day_of_year_cos',\n",
"\n",
- " # Features atmosferiche\n",
- " 'clear_sky_index', 'humidity_factor', 'atmospheric_clarity',\n",
- " 'vapor_pressure',\n",
+ " # Solar position and potential\n",
+ " 'solar_elevation',\n",
+ " 'solar_potential',\n",
+ " 'day_length',\n",
+ " 'solar_noon_distance',\n",
"\n",
- " # Feature energia solare e UV\n",
- " 'energy_uv_ratio',\n",
+ " # Clear sky and theoretical features\n",
+ " 'clear_sky_index',\n",
+ " 'theoretical_max_radiation',\n",
"\n",
- " # Medie mobili\n",
- " 'energy_rolling_mean_3h', 'energy_rolling_mean_6h',\n",
- " 'uv_rolling_mean_3h', 'uv_rolling_mean_6h',\n",
- "\n",
- " # Aggregazioni giornaliere\n",
- " 'energy_daily_sum', 'uv_daily_max',\n",
- "\n",
- " # Lag features principali\n",
- " 'energy_lag_1h', 'energy_lag_3h', 'energy_lag_6h',\n",
- " 'uv_lag_1h', 'uv_lag_3h',\n",
- "\n",
- " # Indicatori di picco e volatilità\n",
- " 'is_energy_peak', 'is_uv_peak',\n",
- " 'energy_volatility', 'uv_volatility',\n",
- "\n",
- " # Indici compositi\n",
- " 'solar_intensity_index',\n",
- "\n",
- " # Interazioni\n",
- " 'uv_cloud_interaction',\n",
- " 'energy_temp_interaction'\n",
+ " # Interaction features\n",
+ " 'radiation_temp_interaction',\n",
+ " 'radiation_cloud_interaction',\n",
+ " 'temp_humidity_interaction',\n",
+ " 'visibility_radiation'\n",
" ]\n",
"\n",
- " # Aggiungi colonne one-hot\n",
+ " # Add one-hot columns\n",
" categorical_columns = [col for col in df.columns if col.startswith(('season_', 'time_period_'))]\n",
" final_features = selected_features + categorical_columns\n",
"\n",
- " # Preparazione del dataset\n",
+ " # Dataset preparation\n",
" df = df.sort_values('datetime')\n",
" df.set_index('datetime', inplace=True)\n",
"\n",
- " # Gestione valori mancanti\n",
+ " # Handle missing values\n",
" for column in final_features + target_variables:\n",
" df[column] = df[column].interpolate(method='time')\n",
" df.fillna(0, inplace=True)\n",
"\n",
- " # Split temporale\n",
+ " # Temporal split\n",
" data_after_2010 = df[df['year'] >= 2010].copy()\n",
" data_before_2010 = df[df['year'] < 2010].copy()\n",
"\n",
" X = data_after_2010[final_features]\n",
- " y = data_after_2010['solarradiation']\n",
+ " y = data_after_2010['solarenergy']\n",
" X_to_predict = data_before_2010[final_features]\n",
"\n",
" # Train-test split\n",
" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=random_state_value)\n",
"\n",
" # Scaling\n",
- " scaler = StandardScaler()\n",
+ " scaler = RobustScaler()\n",
" X_train_scaled = scaler.fit_transform(X_train)\n",
" X_test_scaled = scaler.transform(X_test)\n",
" X_to_predict_scaled = scaler.transform(X_to_predict)\n",
@@ -322,8 +266,8 @@
"\n",
"def create_sequence_data(X, sequence_length=24):\n",
" \"\"\"\n",
- " Converte i dati in sequenze per l'input LSTM\n",
- " sequence_length rappresenta quante ore precedenti considerare\n",
+ " Converts data into sequences for LSTM input\n",
+ " sequence_length represents how many previous hours to consider\n",
" \"\"\"\n",
" sequences = []\n",
" for i in range(len(X) - sequence_length + 1):\n",
@@ -334,13 +278,13 @@
"def prepare_hybrid_data(df):\n",
" X_train_scaled, X_test_scaled, y_train, y_test, scaler, features, X_to_predict_scaled = prepare_advanced_data(df)\n",
"\n",
- " # Convertiamo i dati in sequenze\n",
- " sequence_length = 24 # 24 ore di dati storici\n",
+ " # Convert data into sequences\n",
+ " sequence_length = 24 # 24 hours of historical data\n",
"\n",
" X_train_seq = create_sequence_data(X_train_scaled, sequence_length)\n",
" X_test_seq = create_sequence_data(X_test_scaled, sequence_length)\n",
"\n",
- " # Adattiamo le y rimuovendo i primi (sequence_length-1) elementi\n",
+ " # Adjust y by removing the first (sequence_length-1) elements\n",
" y_train = y_train[sequence_length - 1:]\n",
" y_test = y_test[sequence_length - 1:]\n",
"\n",
@@ -357,42 +301,63 @@
"metadata": {},
"source": [
"def create_residual_lstm_layer(x, units, dropout_rate, l2_reg=0.01, return_sequences=True, survival_probability=0.8):\n",
+ " \"\"\"\n",
+ " Creates a residual LSTM layer with bidirectional processing, normalization, and dropout\n",
+ " Parameters:\n",
+ " x: input tensor\n",
+ " units: number of LSTM units\n",
+ " dropout_rate: dropout probability\n",
+ " l2_reg: L2 regularization factor\n",
+ " return_sequences: whether to return sequences or just the final output\n",
+ " survival_probability: probability of layer survival for stochastic depth\n",
+ " \"\"\"\n",
" residual = x\n",
" x = Bidirectional(LSTM(units, return_sequences=return_sequences, kernel_regularizer=regularizers.l2(l2_reg)))(x)\n",
" x = LayerNormalization()(x)\n",
" x = Dropout(dropout_rate)(x)\n",
"\n",
" if return_sequences:\n",
+ " # Adjust residual dimensions if needed\n",
" if int(residual.shape[-1]) != 2 * units:\n",
" residual = Dense(2 * units, activation='linear')(residual)\n",
" x = tfa.layers.StochasticDepth(survival_probability)([x, residual])\n",
" return x\n",
"\n",
"def attention_block(x, units, num_heads=8, survival_probability=0.8):\n",
+ " \"\"\"\n",
+ " Creates an attention block with multi-head attention and layer normalization\n",
+ " Parameters:\n",
+ " x: input tensor\n",
+ " units: dimensionality of the attention layer\n",
+ " num_heads: number of attention heads\n",
+ " survival_probability: probability of layer survival for stochastic depth\n",
+ " \"\"\"\n",
" attention = MultiHeadAttention(num_heads=num_heads, key_dim=units)(x, x)\n",
" x = tfa.layers.StochasticDepth(survival_probability)([x, attention])\n",
" x = LayerNormalization()(x)\n",
" return x\n",
"\n",
"def create_solarradiation_model(input_shape, folder_name, l2_lambda=0.005):\n",
+ " \"\"\"\n",
+ " Creates a deep learning model for solar radiation prediction\n",
+ " Parameters:\n",
+ " input_shape: shape of input data\n",
+ " folder_name: directory to save model architecture visualization\n",
+ " l2_lambda: L2 regularization factor\n",
+ " \"\"\"\n",
" inputs = Input(shape=input_shape)\n",
"\n",
- " # Progressive survival probabilities\n",
- " survival_probs = [0.9, 0.8, 0.7]\n",
- " attention_survival_probs = [0.85, 0.75, 0.65]\n",
+ " # Define progressive hyperparameters for model depth\n",
+ " survival_probs = [0.9, 0.8, 0.7] # Decreasing survival probabilities\n",
+ " attention_survival_probs = [0.85, 0.75, 0.65] # Decreasing attention survival probabilities\n",
+ " lstm_units = [256, 128, 64] # Decreasing LSTM units\n",
+ " dropout_rates = [0.4, 0.3, 0.2] # Decreasing dropout rates\n",
+ " attention_heads = [32, 24, 16] # Decreasing attention heads\n",
"\n",
- " # Progressive units for LSTM layers\n",
- " lstm_units = [256, 128, 64]\n",
- "\n",
- " # Progressive dropout rates\n",
- " dropout_rates = [0.4, 0.3, 0.2]\n",
- "\n",
- " # Number of attention heads for each block\n",
- " attention_heads = [32, 24, 16]\n",
- "\n",
- " # LSTM layers with attention blocks\n",
+ " # Build LSTM layers with attention blocks\n",
" x = inputs\n",
" for i in range(3):\n",
+ " # Add residual LSTM layer\n",
" x = create_residual_lstm_layer(\n",
" x,\n",
" units=lstm_units[i],\n",
@@ -401,6 +366,7 @@
" return_sequences=True,\n",
" survival_probability=survival_probs[i]\n",
" )\n",
+ " # Add attention block\n",
" x = attention_block(\n",
" x,\n",
" units=lstm_units[i],\n",
@@ -410,17 +376,17 @@
" if i < 2: # No pooling after last LSTM layer\n",
" x = MaxPooling1D()(x)\n",
"\n",
- " # Final LSTM layer without return sequences\n",
+ " # Final LSTM layer for sequence aggregation\n",
" x = create_residual_lstm_layer(\n",
" x,\n",
" units=32,\n",
" dropout_rate=0.1,\n",
" l2_reg=l2_lambda,\n",
" return_sequences=False,\n",
- " survival_probability=0.6 # Lowest survival probability for final layer\n",
+ " survival_probability=0.6\n",
" )\n",
"\n",
- " # Dense layers with progressive narrowing\n",
+ " # Dense layers for final prediction\n",
" dense_units = [64, 32]\n",
" dense_dropout = [0.2, 0.1]\n",
"\n",
@@ -430,12 +396,13 @@
" x = Activation('swish')(x)\n",
" x = Dropout(dropout)(x)\n",
"\n",
- " # Output layer\n",
+ " # Output layer with value clipping\n",
" outputs = Dense(1)(x)\n",
" outputs = Lambda(lambda x: tf.clip_by_value(x, 0, 1500))(outputs)\n",
"\n",
" model = Model(inputs=inputs, outputs=outputs, name=\"SolarRadiationModel\")\n",
"\n",
+ " # Configure optimizer with weight decay\n",
" optimizer = AdamW(\n",
" learning_rate=0.0003,\n",
" beta_1=0.9,\n",
@@ -444,20 +411,24 @@
" weight_decay=0.001\n",
" )\n",
"\n",
- " # Custom metrics\n",
+ " # Custom evaluation metrics\n",
" def rmse(y_true, y_pred):\n",
+ " \"\"\"Root Mean Square Error\"\"\"\n",
" return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))\n",
"\n",
" def mape(y_true, y_pred):\n",
+ " \"\"\"Mean Absolute Percentage Error\"\"\"\n",
" epsilon = 1e-7\n",
" return tf.reduce_mean(tf.abs((y_true - y_pred) / (y_true + epsilon))) * 100\n",
"\n",
- " # Hybrid loss combining MSE and MAE\n",
+ " # Combined loss function\n",
" def hybrid_loss(y_true, y_pred):\n",
+ " \"\"\"Weighted combination of MSE and MAE\"\"\"\n",
" mse = tf.reduce_mean(tf.square(y_true - y_pred))\n",
" mae = tf.reduce_mean(tf.abs(y_true - y_pred))\n",
" return 0.7 * mse + 0.3 * mae\n",
"\n",
+ " # Compile model with custom loss and metrics\n",
" model.compile(\n",
" optimizer=optimizer,\n",
" loss=hybrid_loss,\n",
@@ -469,6 +440,7 @@
" )\n",
" model.summary()\n",
"\n",
+ " # Save model architecture visualization\n",
" plot_model(model,\n",
" to_file=f'{folder_name}_model_architecture.png',\n",
" show_shapes=True,\n",
@@ -479,92 +451,98 @@
" return model\n",
"\n",
"\n",
- "def evaluate_solarradiation_predictions(y_true, y_pred, folder_name=None):\n",
+ "def evaluate_solarenergy_predictions(y_true, y_pred, hour=None, folder_name=None):\n",
" \"\"\"\n",
- " Valutazione specifica per la radiazione solare con metriche appropriate\n",
+ " Comprehensive evaluation of solar energy predictions with detailed analysis and visualizations\n",
"\n",
" Parameters:\n",
" -----------\n",
" y_true : array-like\n",
- " Valori reali della radiazione solare (W/m²)\n",
+ " Actual solar energy values (kWh)\n",
" y_pred : array-like\n",
- " Valori predetti della radiazione solare (W/m²)\n",
+ " Predicted solar energy values (kWh)\n",
+ " hour : array-like, optional\n",
+ " Array of hours corresponding to predictions, for temporal analysis\n",
" folder_name : str, optional\n",
- " Cartella dove salvare eventuali plot di analisi\n",
+ " Folder to save analysis plots\n",
"\n",
" Returns:\n",
" --------\n",
" dict\n",
- " Dizionario contenente tutte le metriche calcolate\n",
+ " Dictionary containing all calculated metrics\n",
" \"\"\"\n",
" import os\n",
" from datetime import datetime\n",
" import numpy as np\n",
" import pandas as pd\n",
" import matplotlib.pyplot as plt\n",
- " from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
+ " import seaborn as sns\n",
+ " from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, confusion_matrix\n",
"\n",
+ " # Data conversion and preparation\n",
" y_true = np.array(y_true).ravel()\n",
" y_pred = np.array(y_pred).ravel()\n",
+ " errors = y_pred - y_true\n",
"\n",
- " # Calcolo metriche sui valori raw\n",
+ " # Basic metrics\n",
" mae_raw = mean_absolute_error(y_true, y_pred)\n",
" rmse_raw = np.sqrt(mean_squared_error(y_true, y_pred))\n",
" r2_raw = r2_score(y_true, y_pred)\n",
- "\n",
- " # Calcolo MAPE (Mean Absolute Percentage Error)\n",
" mape = np.mean(np.abs((y_true - y_pred) / (y_true + 1e-7))) * 100\n",
"\n",
- " # Calcolo accuratezza per diversi margini di errore percentuale\n",
+ " # Accuracy for error margins\n",
" within_5_percent = np.mean(np.abs((y_pred - y_true) / (y_true + 1e-7)) <= 0.05)\n",
" within_10_percent = np.mean(np.abs((y_pred - y_true) / (y_true + 1e-7)) <= 0.10)\n",
" within_20_percent = np.mean(np.abs((y_pred - y_true) / (y_true + 1e-7)) <= 0.20)\n",
"\n",
- " # Definizione dei livelli di radiazione solare (W/m²)\n",
- " def get_radiation_level(value):\n",
- " if value <= 200:\n",
+ " # Define solar energy levels\n",
+ " def get_energy_level(value):\n",
+ " if value <= 0.5:\n",
" return 'Very Low'\n",
- " elif value <= 400:\n",
+ " elif value <= 2.0:\n",
" return 'Low'\n",
- " elif value <= 600:\n",
+ " elif value <= 4.0:\n",
" return 'Moderate'\n",
- " elif value <= 800:\n",
+ " elif value <= 6.0:\n",
" return 'High'\n",
- " elif value <= 1000:\n",
+ " elif value <= 8.0:\n",
" return 'Very High'\n",
" else:\n",
" return 'Extreme'\n",
"\n",
- " # Calcola livelli di radiazione\n",
- " y_true_levels = [get_radiation_level(v) for v in y_true]\n",
- " y_pred_levels = [get_radiation_level(v) for v in y_pred]\n",
- "\n",
- " # Calcola accuracy dei livelli\n",
+ " # Calculate levels\n",
+ " y_true_levels = [get_energy_level(v) for v in y_true]\n",
+ " y_pred_levels = [get_energy_level(v) for v in y_pred]\n",
" level_accuracy = np.mean([t == p for t, p in zip(y_true_levels, y_pred_levels)])\n",
"\n",
- " print(\"\\nSolar Radiation Prediction Metrics:\")\n",
+ " # Print main metrics\n",
+ " print(\"\\nSolar Energy Prediction Metrics:\")\n",
" print(\"\\nAbsolute Metrics:\")\n",
- " print(f\"MAE: {mae_raw:.2f} W/m²\")\n",
- " print(f\"RMSE: {rmse_raw:.2f} W/m²\")\n",
+ " print(f\"MAE: {mae_raw:.4f} kWh\")\n",
+ " print(f\"RMSE: {rmse_raw:.4f} kWh\")\n",
" print(f\"R² Score: {r2_raw:.3f}\")\n",
" print(f\"MAPE: {mape:.2f}%\")\n",
"\n",
- " print(\"\\nPercentage-based Accuracy:\")\n",
- " print(f\"Within ±5%: {within_5_percent:.3f}\")\n",
- " print(f\"Within ±10%: {within_10_percent:.3f}\")\n",
- " print(f\"Within ±20%: {within_20_percent:.3f}\")\n",
+ " print(\"\\nPercentage Accuracy:\")\n",
+ " print(f\"Within ±5%: {within_5_percent*100:.1f}%\")\n",
+ " print(f\"Within ±10%: {within_10_percent*100:.1f}%\")\n",
+ " print(f\"Within ±20%: {within_20_percent*100:.1f}%\")\n",
"\n",
- " print(\"\\nRadiation Level Accuracy:\")\n",
- " print(f\"Level Accuracy: {level_accuracy:.3f}\")\n",
+ " print(\"\\nLevel Accuracy:\")\n",
+ " print(f\"Level Accuracy: {level_accuracy*100:.1f}%\")\n",
"\n",
- " print(\"\\nRadiation Level Confusion Matrix:\")\n",
- " print(pd.crosstab(\n",
- " pd.Series(y_true_levels, name='Actual'),\n",
- " pd.Series(y_pred_levels, name='Predicted')\n",
- " ))\n",
+ " # Confusion matrix for levels\n",
+ " cm = confusion_matrix(y_true_levels, y_pred_levels)\n",
+ " print(\"\\nConfusion Matrix for Levels:\")\n",
+ " cm_df = pd.DataFrame(\n",
+ " cm,\n",
+ " columns=['Very Low', 'Low', 'Moderate', 'High', 'Very High', 'Extreme'],\n",
+ " index=['Very Low', 'Low', 'Moderate', 'High', 'Very High', 'Extreme']\n",
+ " )\n",
+ " print(cm_df)\n",
"\n",
- " # Analisi degli errori per diverse fasce orarie\n",
- " if 'hour' in locals():\n",
+ " # Analysis by time periods\n",
+ " if hour is not None:\n",
" day_periods = {\n",
" 'Morning (5-11)': (5, 11),\n",
" 'Noon (11-13)': (11, 13),\n",
@@ -573,81 +551,118 @@
" 'Night (21-5)': (21, 5)\n",
" }\n",
"\n",
- " print(\"\\nError Analysis by Time of Day:\")\n",
+ " print(\"\\nAnalysis by Time Period:\")\n",
" for period, (start, end) in day_periods.items():\n",
" if start < end:\n",
" mask = (hour >= start) & (hour < end)\n",
- " else: # Per gestire il periodo notturno\n",
+ " else:\n",
" mask = (hour >= start) | (hour < end)\n",
"\n",
- " period_mae = mean_absolute_error(y_true[mask], y_pred[mask])\n",
- " period_mape = np.mean(np.abs((y_true[mask] - y_pred[mask]) / (y_true[mask] + 1e-7))) * 100\n",
- " print(f\"\\n{period}:\")\n",
- " print(f\"MAE: {period_mae:.2f} W/m²\")\n",
- " print(f\"MAPE: {period_mape:.2f}%\")\n",
+ " if np.any(mask):\n",
+ " period_mae = mean_absolute_error(y_true[mask], y_pred[mask])\n",
+ " period_mape = np.mean(np.abs((y_true[mask] - y_pred[mask]) / (y_true[mask] + 1e-7))) * 100\n",
+ " period_mean = np.mean(y_true[mask])\n",
+ " print(f\"\\n{period}:\")\n",
+ " print(f\"MAE: {period_mae:.4f} kWh\")\n",
+ " print(f\"MAPE: {period_mape:.2f}%\")\n",
+ " print(f\"Mean Energy: {period_mean:.4f} kWh\")\n",
"\n",
- " # Se specificata una cartella, salva i plot di analisi\n",
+ " # Visualizations\n",
" if folder_name is not None:\n",
" try:\n",
" os.makedirs(folder_name, exist_ok=True)\n",
" timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"\n",
- " plt.figure(figsize=(15, 10))\n",
+ " # Figure 1: Main analysis\n",
+ " plt.figure(figsize=(20, 15))\n",
"\n",
" # Plot 1: Scatter plot\n",
- " plt.subplot(2, 2, 1)\n",
+ " plt.subplot(3, 2, 1)\n",
" plt.scatter(y_true, y_pred, alpha=0.5)\n",
- " plt.plot([0, max(y_true.max(), y_pred.max())],\n",
- " [0, max(y_true.max(), y_pred.max())],\n",
- " 'r--', lw=2)\n",
- " plt.xlabel('Actual Solar Radiation (W/m²)')\n",
- " plt.ylabel('Predicted Solar Radiation (W/m²)')\n",
- " plt.title('Predicted vs Actual Values')\n",
+ " plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)\n",
+ " plt.xlabel('Actual Energy (kWh)')\n",
+ " plt.ylabel('Predicted Energy (kWh)')\n",
+ " plt.title('Actual vs Predicted Values')\n",
" plt.grid(True)\n",
"\n",
- " # Plot 2: Distribuzione errori assoluti\n",
- " plt.subplot(2, 2, 2)\n",
- " plt.hist(y_pred - y_true, bins=50, alpha=0.7)\n",
- " plt.xlabel('Prediction Error (W/m²)')\n",
+ " # Plot 2: Absolute errors distribution\n",
+ " plt.subplot(3, 2, 2)\n",
+ " plt.hist(errors, bins=50, alpha=0.7)\n",
+ " plt.xlabel('Prediction Error (kWh)')\n",
" plt.ylabel('Frequency')\n",
- " plt.title('Distribution of Absolute Errors')\n",
+ " plt.title('Error Distribution')\n",
" plt.grid(True)\n",
"\n",
- " # Plot 3: Distribuzione errori percentuali\n",
- " plt.subplot(2, 2, 3)\n",
+ " # Plot 3: Percentage errors distribution\n",
+ " plt.subplot(3, 2, 3)\n",
" percentage_errors = ((y_pred - y_true) / (y_true + 1e-7)) * 100\n",
" plt.hist(np.clip(percentage_errors, -100, 100), bins=50, alpha=0.7)\n",
" plt.xlabel('Percentage Error (%)')\n",
" plt.ylabel('Frequency')\n",
- " plt.title('Distribution of Percentage Errors')\n",
+ " plt.title('Percentage Error Distribution')\n",
" plt.grid(True)\n",
"\n",
- " # Plot 4: Box plot degli errori per livello di radiazione\n",
- " plt.subplot(2, 2, 4)\n",
- " errors_by_level = pd.DataFrame({\n",
- " 'Level': y_true_levels,\n",
- " 'Error': y_pred - y_true\n",
- " })\n",
- " errors_by_level.boxplot(column='Error', by='Level', figsize=(10, 6))\n",
- " plt.xlabel('Actual Radiation Level')\n",
- " plt.ylabel('Prediction Error (W/m²)')\n",
- " plt.title('Error Distribution by Radiation Level')\n",
- " plt.xticks(rotation=45)\n",
+ " # Plot 4: Errors vs Actual Values\n",
+ " plt.subplot(3, 2, 4)\n",
+ " plt.scatter(y_true, errors, alpha=0.5)\n",
+ " plt.axhline(y=0, color='r', linestyle='--')\n",
+ " plt.xlabel('Actual Energy (kWh)')\n",
+ " plt.ylabel('Error (kWh)')\n",
+ " plt.title('Errors vs Actual Values')\n",
" plt.grid(True)\n",
"\n",
+ " # Plot 5: Box plot errors by level\n",
+ " plt.subplot(3, 2, 5)\n",
+ " sns.boxplot(x=[get_energy_level(v) for v in y_true], y=errors)\n",
+ " plt.xticks(rotation=45)\n",
+ " plt.xlabel('Energy Level')\n",
+ " plt.ylabel('Error (kWh)')\n",
+ " plt.title('Error Distribution by Level')\n",
+ "\n",
+ " # Plot 6: Confusion matrix\n",
+ " plt.subplot(3, 2, 6)\n",
+ " sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')\n",
+ " plt.title('Confusion Matrix')\n",
+ " plt.xticks(rotation=45)\n",
+ " plt.yticks(rotation=45)\n",
+ "\n",
" plt.tight_layout()\n",
- "\n",
- " # Salva il plot\n",
- " filename = os.path.join(folder_name, f'solar_radiation_analysis_{timestamp}.png')\n",
+ " filename = os.path.join(folder_name, f'solar_energy_analysis_{timestamp}.png')\n",
" plt.savefig(filename, dpi=300, bbox_inches='tight')\n",
- " print(f\"\\nPlot di analisi salvato come: {filename}\")\n",
+ " print(f\"\\nPlot saved as: {filename}\")\n",
+ " plt.close()\n",
"\n",
- " plt.show()\n",
+ " # Additional plot for temporal analysis if hour is available\n",
+ " if hour is not None:\n",
+ " plt.figure(figsize=(15, 8))\n",
+ " plt.scatter(hour, errors, alpha=0.5)\n",
+ " plt.axhline(y=0, color='r', linestyle='--')\n",
+ " plt.xlabel('Hour of Day')\n",
+ " plt.ylabel('Error (kWh)')\n",
+ " plt.title('Error Distribution by Hour of Day')\n",
+ " plt.grid(True)\n",
+ "\n",
+ " filename = os.path.join(folder_name, f'hourly_error_analysis_{timestamp}.png')\n",
+ " plt.savefig(filename, dpi=300, bbox_inches='tight')\n",
+ " plt.close()\n",
"\n",
" except Exception as e:\n",
- " print(f\"\\nErrore nel salvare i plot: {str(e)}\")\n",
+ " print(f\"\\nError saving plots: {str(e)}\")\n",
"\n",
- " # Restituisci tutte le metriche in un dizionario\n",
+ " # Additional metrics\n",
+ " print(\"\\nError Statistics:\")\n",
+ " print(f\"Mean errors: {np.mean(errors):.4f} kWh\")\n",
+ " print(f\"Standard deviation of errors: {np.std(errors):.4f} kWh\")\n",
+ " print(f\"Median error: {np.median(errors):.4f} kWh\")\n",
+ " print(f\"95th percentile absolute error: {np.percentile(np.abs(errors), 95):.4f} kWh\")\n",
+ "\n",
+ " print(\"\\nProduction Statistics:\")\n",
+ " print(f\"Mean actual energy: {np.mean(y_true):.4f} kWh\")\n",
+ " print(f\"Mean predicted energy: {np.mean(y_pred):.4f} kWh\")\n",
+ " print(f\"Maximum actual energy: {np.max(y_true):.4f} kWh\")\n",
+ " print(f\"Maximum predicted energy: {np.max(y_pred):.4f} kWh\")\n",
+ "\n",
+ " # Return metrics in structured format\n",
" metrics = {\n",
" 'absolute': {\n",
" 'mae': mae_raw,\n",
@@ -662,6 +677,18 @@
" },\n",
" 'categorical': {\n",
" 'level_accuracy': level_accuracy\n",
+ " },\n",
+ " 'error_stats': {\n",
+ " 'mean': float(np.mean(errors)),\n",
+ " 'std': float(np.std(errors)),\n",
+ " 'median': float(np.median(errors)),\n",
+ " 'p95_abs': float(np.percentile(np.abs(errors), 95))\n",
+ " },\n",
+ " 'production_stats': {\n",
+ " 'mean_true': float(np.mean(y_true)),\n",
+ " 'mean_pred': float(np.mean(y_pred)),\n",
+ " 'max_true': float(np.max(y_true)),\n",
+ " 'max_pred': float(np.max(y_pred))\n",
" }\n",
" }\n",
"\n",
@@ -670,22 +697,22 @@
"\n",
"def plot_training_history(history, folder_name=None):\n",
" \"\"\"\n",
- " Visualizza e salva i plot della loss e delle metriche durante il training\n",
+ " Plot the training/validation loss and MAE curves recorded during training and, if a folder is given, save them\n",
"\n",
" Parameters:\n",
" -----------\n",
" history : tensorflow.keras.callbacks.History\n",
- " L'oggetto history restituito dal training del modello\n",
+ " History object returned by model training\n",
" folder_name : str\n",
- " Cartella dove salvare il plot\n",
+ " Folder to save the plot\n",
" \"\"\"\n",
" import os\n",
"\n",
" try:\n",
- " # Crea la figura\n",
+ " # Create figure\n",
" plt.figure(figsize=(12, 4))\n",
"\n",
- " # Plot della Loss\n",
+ " # Loss Plot\n",
" plt.subplot(1, 2, 1)\n",
" plt.plot(history.history['loss'], label='Training Loss')\n",
" plt.plot(history.history['val_loss'], label='Validation Loss')\n",
@@ -695,7 +722,7 @@
" plt.legend()\n",
" plt.grid(True)\n",
"\n",
- " # Plot del MAE\n",
+ " # MAE Plot\n",
" plt.subplot(1, 2, 2)\n",
" plt.plot(history.history['mae'], label='Training MAE')\n",
" plt.plot(history.history['val_mae'], label='Validation MAE')\n",
@@ -709,14 +736,14 @@
"\n",
" if folder_name is not None:\n",
" os.makedirs(folder_name, exist_ok=True)\n",
- " # Genera il nome del file con timestamp\n",
+ " # Build the output path (fixed filename, no timestamp)\n",
" filename = os.path.join(folder_name, 'training_history.png')\n",
"\n",
- " # Salva la figura\n",
+ " # Save figure\n",
" plt.savefig(filename, dpi=300, bbox_inches='tight')\n",
- " print(f\"\\nPlot della training history salvato come: {filename}\")\n",
+ " print(f\"\\nTraining history plot saved as: {filename}\")\n",
"\n",
- " # Salva anche i dati numerici in formato CSV\n",
+ " # Save numerical data in CSV format\n",
" history_df = pd.DataFrame({\n",
" 'epoch': range(1, len(history.history['loss']) + 1),\n",
" 'training_loss': history.history['loss'],\n",
@@ -728,9 +755,9 @@
" if folder_name is not None:\n",
" csv_filename = os.path.join(folder_name, 'training_history.csv')\n",
" history_df.to_csv(csv_filename, index=False)\n",
- " print(f\"Dati della training history salvati come: {csv_filename}\")\n",
+ " print(f\"Training history data saved as: {csv_filename}\")\n",
"\n",
- " # Calcola e salva le statistiche finali\n",
+ " # Calculate and save final statistics\n",
" final_stats = {\n",
" 'final_training_loss': history.history['loss'][-1],\n",
" 'final_validation_loss': history.history['val_loss'][-1],\n",
@@ -742,54 +769,54 @@
" }\n",
"\n",
" if folder_name is not None:\n",
- " # Salva le statistiche in formato JSON\n",
+ " # Save statistics in JSON format\n",
" stats_filename = os.path.join(folder_name, 'training_stats.json')\n",
" with open(stats_filename, 'w') as f:\n",
" json.dump(final_stats, f, indent=4)\n",
- " print(f\"Statistiche finali salvate come: {stats_filename}\")\n",
+ " print(f\"Final statistics saved as: {stats_filename}\")\n",
"\n",
- " # Stampa le statistiche principali\n",
- " print(\"\\nStatistiche finali del training:\")\n",
- " print(f\"Loss finale (train/val): {final_stats['final_training_loss']:.4f}/{final_stats['final_validation_loss']:.4f}\")\n",
- " print(f\"MAE finale (train/val): {final_stats['final_training_mae']:.4f}/{final_stats['final_validation_mae']:.4f}\")\n",
- " print(f\"Miglior validation loss: {final_stats['best_validation_loss']:.4f}\")\n",
- " print(f\"Miglior validation MAE: {final_stats['best_validation_mae']:.4f}\")\n",
+ " # Print main statistics\n",
+ " print(\"\\nFinal Training Statistics:\")\n",
+ " print(f\"Final Loss (train/val): {final_stats['final_training_loss']:.4f}/{final_stats['final_validation_loss']:.4f}\")\n",
+ " print(f\"Final MAE (train/val): {final_stats['final_training_mae']:.4f}/{final_stats['final_validation_mae']:.4f}\")\n",
+ " print(f\"Best validation loss: {final_stats['best_validation_loss']:.4f}\")\n",
+ " print(f\"Best validation MAE: {final_stats['best_validation_mae']:.4f}\")\n",
"\n",
" plt.show()\n",
"\n",
" except Exception as e:\n",
- " print(f\"\\nErrore durante la creazione o il salvataggio dei plot: {str(e)}\")\n",
+ " print(f\"\\nError creating or saving plots: {str(e)}\")\n",
"\n",
"\n",
"def train_hybrid_model(model, X_train, y_train, X_test, y_test, epochs=100, batch_size=32, folder_name='solarradiation_index'):\n",
" \"\"\"\n",
- " Funzione di training avanzata per il modello ibrido UV index con monitoraggio dettagliato\n",
- " e gestione del training.\n",
+ " Advanced training function for the hybrid solar energy index model with detailed monitoring\n",
+ " and training management.\n",
"\n",
" Parameters:\n",
" -----------\n",
" model : keras.Model\n",
- " Il modello ibrido compilato\n",
+ " The compiled hybrid model\n",
" X_train : numpy.ndarray\n",
- " Dati di training\n",
+ " Training data\n",
" y_train : numpy.ndarray\n",
- " Target di training\n",
+ " Training targets\n",
" X_test : numpy.ndarray\n",
- " Dati di validation\n",
+ " Validation data\n",
" y_test : numpy.ndarray\n",
- " Target di validation\n",
+ " Validation targets\n",
" epochs : int, optional\n",
- " Numero massimo di epoche di training\n",
+ " Maximum number of training epochs\n",
" batch_size : int, optional\n",
- " Dimensione del batch\n",
+ " Batch size\n",
"\n",
" Returns:\n",
" --------\n",
" history : keras.callbacks.History\n",
- " Storia del training con tutte le metriche\n",
+ " Training history with all metrics\n",
" \"\"\"\n",
"\n",
- " # Callbacks avanzati per il training\n",
+ " # Advanced training callbacks\n",
" callbacks = [\n",
" # Early Stopping\n",
" EarlyStopping(\n",
@@ -800,7 +827,7 @@
" verbose=1,\n",
" min_delta=1e-4\n",
" ),\n",
- " # ReduceLROnPlateau per MAE\n",
+ " # ReduceLROnPlateau for MAE\n",
" ReduceLROnPlateau(\n",
" monitor='mae',\n",
" factor=0.2,\n",
@@ -811,7 +838,7 @@
" cooldown=3,\n",
" min_lr=1e-7\n",
" ),\n",
- " # ReduceLROnPlateau per loss\n",
+ " # ReduceLROnPlateau for loss\n",
" ReduceLROnPlateau(\n",
" monitor='val_loss',\n",
" factor=0.2,\n",
@@ -838,15 +865,15 @@
" update_freq='epoch',\n",
" profile_batch=0\n",
" ),\n",
- " # Lambda Callback per monitoraggio radiazione solare\n",
+ " # Lambda callback: every 20 epochs, print the out-of-range prediction count, MAPE and share within ±10% on the test set\n",
" tf.keras.callbacks.LambdaCallback(\n",
" on_epoch_end=lambda epoch, logs: (\n",
" lambda y_pred: print(\n",
" f\"\\nEpoch {epoch + 1}:\"\n",
- " f\"\\nPredizioni fuori range (0-1500 W/m²): \"\n",
+ " f\"\\nPredictions out of range (0-1500 W/m²): \"\n",
" f\"{np.sum((y_pred < 0) | (y_pred > 1500))}\"\n",
" f\"\\nMAPE: {np.mean(np.abs((y_test - y_pred) / (y_test + 1e-7))) * 100:.2f}%\"\n",
- " f\"\\nPredizioni entro ±10%: \"\n",
+ " f\"\\nPredictions within ±10%: \"\n",
" f\"{np.mean(np.abs((y_pred - y_test) / (y_test + 1e-7)) <= 0.10) * 100:.2f}%\"\n",
" )\n",
" )(model.predict(X_test)) if epoch % 20 == 0 else None\n",
@@ -865,142 +892,104 @@
" validation_freq=1,\n",
" )\n",
"\n",
- " # Analisi post-training\n",
- " print(\"\\nTraining completato con successo!\")\n",
+ " # Post-training analysis\n",
+ " print(\"\\nTraining completed successfully!\")\n",
"\n",
- " # Valutazione finale sul test set\n",
+ " # Final evaluation on test set\n",
" test_loss, test_mae, test_mse = model.evaluate(X_test, y_test, verbose=0)\n",
- " print(f\"\\nMetriche finali sul test set:\")\n",
+ " print(f\"\\nFinal metrics on test set:\")\n",
" print(f\"Loss: {test_loss:.4f}\")\n",
" print(f\"MAE: {test_mae:.4f}\")\n",
" print(f\"MSE: {test_mse:.4f}\")\n",
"\n",
- " # Analisi delle predizioni\n",
+ " # Prediction analysis\n",
" predictions = model.predict(X_test)\n",
" out_of_range = np.sum((predictions < 0) | (predictions > 11))\n",
- " print(f\"\\nPredizioni fuori range: {out_of_range} ({out_of_range / len(predictions) * 100:.2f}%)\")\n",
+ " print(f\"\\nPredictions out of range: {out_of_range} ({out_of_range / len(predictions) * 100:.2f}%)\")\n",
"\n",
" plot_training_history(history, folder_name=folder_name)\n",
"\n",
" return history\n",
"\n",
" except Exception as e:\n",
- " print(f\"\\nErrore durante il training: {str(e)}\")\n",
+ " print(f\"\\nError during training: {str(e)}\")\n",
" raise\n",
"\n",
" finally:\n",
- " # Pulizia della memoria\n",
+ " # Memory cleanup\n",
" tf.keras.backend.clear_session()\n",
"\n",
"\n",
- "def calculate_class_weights(y_train, n_classes=12):\n",
- " \"\"\"\n",
- " Calcola i pesi delle classi per bilanciare il dataset UV index.\n",
- " \n",
- " Parameters:\n",
- " -----------\n",
- " y_train : numpy.ndarray\n",
- " Array dei valori UV di training\n",
- " n_classes : int, optional\n",
- " Numero di classi possibili (0-11 per UV index, quindi 12 classi)\n",
- " \n",
- " Returns:\n",
- " --------\n",
- " dict:\n",
- " Dizionario con i pesi per ogni classe\n",
- " \"\"\"\n",
- " # Arrotonda i valori UV al più vicino intero e converti in intero\n",
- " y_discrete = np.clip(np.round(y_train), 0, 11).astype(int)\n",
- "\n",
- " # Calcola la frequenza di ogni classe\n",
- " unique, counts = np.unique(y_discrete, return_counts=True)\n",
- " total_samples = len(y_discrete)\n",
- "\n",
- " # Calcola i pesi inversamente proporzionali alla frequenza\n",
- " weights = {}\n",
- " for i in range(n_classes):\n",
- " if i in unique:\n",
- " # Se la classe è presente, calcola il peso\n",
- " weight = total_samples / (len(unique) * counts[unique == i][0])\n",
- " else:\n",
- " # Se la classe non è presente, assegna un peso neutro\n",
- " weight = 1.0\n",
- " weights[i] = weight\n",
- "\n",
- " return weights\n",
- "\n",
- "\n",
"def integrate_predictions(df, predictions, sequence_length=24):\n",
" \"\"\"\n",
- " Integra le predizioni dell'UV index nel dataset originale per i dati precedenti al 2010.\n",
- " \n",
+ " Fill missing 'solarenergy' values in rows dated before 2010 with the model's predictions.\n",
+ "\n",
" Parameters:\n",
" -----------\n",
" df : pandas.DataFrame\n",
- " Dataset originale\n",
+ " Original dataset\n",
" predictions : numpy.ndarray\n",
- " Array delle predizioni UV index\n",
+ " Array of solar energy index predictions\n",
" sequence_length : int\n",
- " Lunghezza della sequenza usata per le predizioni\n",
- " \n",
+ " Sequence length used for predictions\n",
+ "\n",
" Returns:\n",
" --------\n",
" pandas.DataFrame\n",
- " Dataset aggiornato con le predizioni UV index\n",
+ " Updated dataset with solar energy index predictions\n",
" \"\"\"\n",
- " # Converti datetime in formato datetime se non lo è già\n",
+ " # Convert datetime to datetime format if not already\n",
" df['datetime'] = pd.to_datetime(df['datetime'])\n",
"\n",
- " # Identifica le righe precedenti al 2010\n",
+ " # Identify rows before 2010\n",
" mask_pre_2010 = df['datetime'].dt.year < 2010\n",
"\n",
- " # Crea un DataFrame temporaneo con le predizioni\n",
+ " # Create temporary DataFrame with predictions\n",
" dates_pre_2010 = df[mask_pre_2010]['datetime'].iloc[sequence_length - 1:]\n",
" predictions_df = pd.DataFrame({\n",
" 'datetime': dates_pre_2010,\n",
- " 'uvindex_predicted': predictions.flatten()\n",
+ " 'solarenergy_predicted': predictions.flatten()\n",
" })\n",
"\n",
- " # Merge con il dataset originale\n",
+ " # Merge with original dataset\n",
" df = df.merge(predictions_df, on='datetime', how='left')\n",
"\n",
- " # Aggiorna la colonna uvindex dove manca\n",
- " df['uvindex'] = df['uvindex'].fillna(df['uvindex_predicted'])\n",
+ " # Update solarenergy column where missing\n",
+ " df['solarenergy'] = df['solarenergy'].fillna(df['solarenergy_predicted'])\n",
"\n",
- " # Rimuovi la colonna temporanea\n",
- " df = df.drop('uvindex_predicted', axis=1)\n",
+ " # Remove temporary column\n",
+ " df = df.drop('solarenergy_predicted', axis=1)\n",
"\n",
- " print(f\"Aggiunte {len(predictions)} predizioni al dataset\")\n",
- " print(f\"Righe con UV index dopo l'integrazione: {df['uvindex'].notna().sum()}\")\n",
+ " print(f\"Added {len(predictions)} predictions to dataset\")\n",
+ " print(f\"Rows with solar energy index after integration: {df['solarenergy'].notna().sum()}\")\n",
"\n",
" return df\n",
"\n",
"\n",
- "def train_uvindex_bounded_model(df):\n",
+ "def train_solarenergy_bounded_model(df):\n",
" \"\"\"\n",
- " Training del modello con vincoli specifici per UV index\n",
+ " Train the solar energy model end to end: prepare data, fit, evaluate, and fill missing values in the dataset\n",
" \"\"\"\n",
- " print(\"Inizializzazione del training del modello UV index...\")\n",
+ " print(\"Initializing solar energy index model training...\")\n",
"\n",
" try:\n",
- "\n",
- " # Preparazione dei dati\n",
- " print(\"\\n1. Preparazione dei dati...\")\n",
+ " # Data preparation\n",
+ " print(\"\\n1. Preparing data...\")\n",
" X_train_seq, X_test_seq, y_train, y_test, scaler, features, X_to_predict_seq = prepare_hybrid_data(df)\n",
"\n",
- " print(f\"Shape dei dati di training: {X_train_seq.shape}\")\n",
- " print(f\"Shape dei dati di test: {X_test_seq.shape}\")\n",
+ " print(f\"Training data shape: {X_train_seq.shape}\")\n",
+ " print(f\"Test data shape: {X_test_seq.shape}\")\n",
"\n",
- " # Verifica della qualità dei dati\n",
+ " # Data quality verification\n",
" if np.isnan(X_train_seq).any() or np.isnan(y_train).any():\n",
- " raise ValueError(\"Trovati valori NaN nei dati di training\")\n",
+ " raise ValueError(\"Found NaN values in training data\")\n",
"\n",
- " # Creazione del modello\n",
- " print(\"\\n2. Creazione del modello...\")\n",
+ " # Model creation\n",
+ " print(\"\\n2. Creating model...\")\n",
" input_shape = (X_train_seq.shape[1], X_train_seq.shape[2])\n",
" model = create_solarradiation_model(input_shape, folder_name)\n",
"\n",
- " print(\"\\n4. Avvio del training...\")\n",
+ " print(\"\\n4. Starting training...\")\n",
" history = train_hybrid_model(\n",
" model=model,\n",
" X_train=X_train_seq,\n",
@@ -1012,14 +1001,14 @@
" folder_name=folder_name\n",
" )\n",
"\n",
- " print(\"\\n5. Generazione delle predizioni...\")\n",
+ " print(\"\\n5. Generating predictions...\")\n",
" predictions = model.predict(X_test_seq)\n",
" predictions = np.clip(predictions, 0, 11)\n",
"\n",
- " print(\"\\n6. Valutazione del modello...\")\n",
- " metrics = evaluate_solarradiation_predictions(y_test, predictions, folder_name=folder_name)\n",
+ " print(\"\\n6. Evaluating model...\")\n",
+ " metrics = evaluate_solarenergy_predictions(y_test, predictions, folder_name=folder_name)\n",
"\n",
- " # Creazione del dizionario dei risultati\n",
+ " # Create results dictionary\n",
" training_results = {\n",
" 'model_params': {\n",
" 'input_shape': input_shape,\n",
@@ -1030,7 +1019,6 @@
" 'batch_size': 32,\n",
" 'total_epochs': len(history.history['loss']),\n",
" 'best_epoch': np.argmin(history.history['val_loss']) + 1,\n",
- " #'class_weights': {str(k): float(v) for k, v in class_weights.items()}\n",
" },\n",
" 'performance_metrics': {\n",
" 'final_loss': float(history.history['val_loss'][-1]),\n",
@@ -1040,33 +1028,33 @@
" }\n",
" }\n",
"\n",
- " print(\"\\n7. Predizione dei dati mancanti risultati...\")\n",
+ " print(\"\\n7. Predicting missing data results...\")\n",
" to_predict_predictions = model.predict(X_to_predict_seq)\n",
" to_predict_predictions = np.clip(to_predict_predictions, 0, 11)\n",
"\n",
- " print(\"\\n8. Integrazione delle predizioni nel dataset originale...\")\n",
+ " print(\"\\n8. Integrating predictions into original dataset...\")\n",
" df_updated = integrate_predictions(df.copy(), to_predict_predictions)\n",
"\n",
- " df_updated.to_parquet('./data/weather_data_uvindex.parquet')\n",
+ " df_updated.to_parquet('../../sources/weather_data_complete.parquet')\n",
"\n",
- " # Aggiungi statistiche sulle predizioni al training_results\n",
+ " # Add prediction statistics to training_results\n",
" training_results['prediction_stats'] = {\n",
" 'n_predictions_added': len(to_predict_predictions),\n",
- " 'mean_predicted_uv': float(to_predict_predictions.mean()),\n",
- " 'min_predicted_uv': float(to_predict_predictions.min()),\n",
- " 'max_predicted_uv': float(to_predict_predictions.max()),\n",
+ " 'mean_predicted_solarenergy': float(to_predict_predictions.mean()),\n",
+ " 'min_predicted_solarenergy': float(to_predict_predictions.min()),\n",
+ " 'max_predicted_solarenergy': float(to_predict_predictions.max()),\n",
" }\n",
"\n",
- " print(\"\\nTraining completato con successo!\")\n",
+ " print(\"\\nTraining completed successfully!\")\n",
"\n",
" return model, scaler, features, history, predictions, y_test, metrics, training_results\n",
"\n",
" except Exception as e:\n",
- " print(f\"\\nErrore durante il training: {str(e)}\")\n",
+ " print(f\"\\nError during training: {str(e)}\")\n",
" raise\n",
"\n",
" finally:\n",
- " # Pulizia della memoria\n",
+ " # Memory cleanup\n",
" tf.keras.backend.clear_session()"
],
"outputs": [],
@@ -1077,93 +1065,9 @@
"id": "initial_id",
"metadata": {},
"source": [
- "df = pd.read_parquet('../data/weather_data.parquet')\n",
+ "df = pd.read_parquet('../../sources/weather_data_solarradiation.parquet')\n",
"\n",
- "# Esegui il training\n",
- "model, scaler, features, history, predictions, y_test, metrics, training_results = train_uvindex_bounded_model(df)"
- ],
- "outputs": [],
- "execution_count": null
- },
- {
- "cell_type": "code",
- "id": "637891db-8d55-4232-a56e-9759dbcc8c2f",
- "metadata": {},
- "source": [
- "def analyze_solarradiation_prediction_quality(y_true, y_pred):\n",
- " \"\"\"\n",
- " Analisi dettagliata della qualità delle predizioni UV\n",
- " \"\"\"\n",
- " # Converti in numpy array e appiattisci\n",
- " y_true = np.array(y_true).ravel()\n",
- " y_pred = np.array(y_pred).ravel()\n",
- "\n",
- " # Arrotonda le predizioni al più vicino 0.5\n",
- " y_pred_rounded = np.round(y_pred * 2) / 2\n",
- "\n",
- " # Calcola diverse metriche di accuratezza usando array numpy\n",
- " exact_match = np.mean(np.abs(y_pred_rounded - y_true) < 1e-6) * 100 # uso di tolleranza per confronti float\n",
- " within_half = np.mean(np.abs(y_pred_rounded - y_true) <= 0.5) * 100\n",
- " within_one = np.mean(np.abs(y_pred_rounded - y_true) <= 1.0) * 100\n",
- "\n",
- " # Analisi per livello di rischio UV\n",
- " def get_solarradiation_risk_level(values):\n",
- " # Vettorizzazione della funzione per array numpy\n",
- " levels = np.zeros_like(values, dtype=str)\n",
- " levels[values <= 2] = 'Basso'\n",
- " levels[(values > 2) & (values <= 5)] = 'Moderato'\n",
- " levels[(values > 5) & (values <= 7)] = 'Alto'\n",
- " levels[(values > 7) & (values <= 10)] = 'Molto Alto'\n",
- " levels[values > 10] = 'Estremo'\n",
- " return levels\n",
- "\n",
- " y_true_risk = get_solarradiation_risk_level(y_true)\n",
- " y_pred_risk = get_solarradiation_risk_level(y_pred_rounded)\n",
- "\n",
- " risk_accuracy = np.mean(y_true_risk == y_pred_risk) * 100\n",
- "\n",
- " print(\"Analisi Precisione Predizioni UV Index:\")\n",
- " print(f\"Precisione esatta: {exact_match:.1f}%\")\n",
- " print(f\"Precisione entro 0.5 punti: {within_half:.1f}%\")\n",
- " print(f\"Precisione entro 1.0 punti: {within_one:.1f}%\")\n",
- " print(f\"Precisione livello di rischio: {risk_accuracy:.1f}%\")\n",
- "\n",
- " # Distribuzione degli errori per livello UV\n",
- " solarradiation_ranges = [(0, 2), (2, 5), (5, 7), (7, 10), (10, 11)]\n",
- " labels = ['Basso', 'Moderato', 'Alto', 'Molto Alto', 'Estremo']\n",
- "\n",
- " print(\"\\nAnalisi errori per livello UV:\")\n",
- " for (low, high), label in zip(solarradiation_ranges, labels):\n",
- " mask = (y_true >= low) & (y_true < high)\n",
- " if np.sum(mask) > 0:\n",
- " mae_range = np.mean(np.abs(y_pred[mask] - y_true[mask]))\n",
- " n_samples = np.sum(mask)\n",
- " print(f\"MAE per UV {label} ({low}-{high}): {mae_range:.3f} (n={n_samples})\")\n",
- "\n",
- " # Analisi aggiuntiva della distribuzione degli errori\n",
- " errors = y_pred - y_true\n",
- " print(\"\\nStatistiche degli errori:\")\n",
- " print(f\"Media errori: {np.mean(errors):.3f}\")\n",
- " print(f\"Deviazione standard errori: {np.std(errors):.3f}\")\n",
- " print(f\"Errore mediano: {np.median(errors):.3f}\")\n",
- " print(f\"95° percentile errore assoluto: {np.percentile(np.abs(errors), 95):.3f}\")\n",
- "\n",
- " return {\n",
- " 'exact_match': exact_match,\n",
- " 'within_half': within_half,\n",
- " 'within_one': within_one,\n",
- " 'risk_accuracy': risk_accuracy,\n",
- " 'error_stats': {\n",
- " 'mean': float(np.mean(errors)),\n",
- " 'std': float(np.std(errors)),\n",
- " 'median': float(np.median(errors)),\n",
- " 'p95_abs': float(np.percentile(np.abs(errors), 95))\n",
- " }\n",
- " }\n",
- "\n",
- "\n",
- "# Per utilizzare l'analisi:\n",
- "metrics = analyze_solarradiation_prediction_quality(y_test, predictions)"
+ "model, scaler, features, history, predictions, y_test, metrics, training_results = train_solarenergy_bounded_model(df)"
],
"outputs": [],
"execution_count": null
@@ -1175,21 +1079,21 @@
"source": [
"def plot_error_analysis(y_true, y_pred, folder_name=None):\n",
" \"\"\"\n",
- " Funzione per visualizzare l'analisi degli errori di predizione\n",
+ " Function to visualize prediction error analysis\n",
"\n",
" Parameters:\n",
" -----------\n",
" y_true : array-like\n",
- " Valori reali\n",
+ " Actual values\n",
" y_pred : array-like\n",
- " Valori predetti\n",
+ " Predicted values\n",
" folder_name : str, optional\n",
- " Cartella dove salvare i plot. Se None, i plot non vengono salvati.\n",
+ " Folder to save plots. If None, plots won't be saved.\n",
" \"\"\"\n",
" import os\n",
" from datetime import datetime\n",
"\n",
- " # Converti in array numpy 1D se necessario\n",
+ " # Convert to 1D numpy arrays if necessary\n",
" if isinstance(y_true, pd.Series):\n",
" y_true = y_true.values\n",
" if isinstance(y_pred, pd.Series):\n",
@@ -1198,282 +1102,74 @@
" y_true = y_true.ravel()\n",
" y_pred = y_pred.ravel()\n",
"\n",
- " # Calcola gli errori\n",
+ " # Calculate errors\n",
" errors = y_pred - y_true\n",
"\n",
- " # Crea la figura principale\n",
+ " # Create main figure\n",
" fig = plt.figure(figsize=(15, 5))\n",
"\n",
- " # Plot 1: Distribuzione degli errori\n",
+ " # Plot 1: Error Distribution\n",
" plt.subplot(1, 3, 1)\n",
" plt.hist(errors, bins=50, alpha=0.7)\n",
- " plt.title('Distribuzione degli Errori di Predizione')\n",
- " plt.xlabel('Errore')\n",
- " plt.ylabel('Frequenza')\n",
+ " plt.title('Prediction Error Distribution')\n",
+ " plt.xlabel('Error')\n",
+ " plt.ylabel('Frequency')\n",
"\n",
" # Plot 2: Actual vs Predicted\n",
" plt.subplot(1, 3, 2)\n",
" plt.scatter(y_true, y_pred, alpha=0.5)\n",
" plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)\n",
- " plt.title('Valori Reali vs Predetti')\n",
- " plt.xlabel('Valori Reali')\n",
- " plt.ylabel('Valori Predetti')\n",
+ " plt.title('Actual vs Predicted Values')\n",
+ " plt.xlabel('Actual Values')\n",
+ " plt.ylabel('Predicted Values')\n",
"\n",
- " # Plot 3: Errori vs Valori Reali\n",
+ " # Plot 3: Errors vs Actual Values\n",
" plt.subplot(1, 3, 3)\n",
" plt.scatter(y_true, errors, alpha=0.5)\n",
" plt.axhline(y=0, color='r', linestyle='--')\n",
- " plt.title('Errori vs Valori Reali')\n",
- " plt.xlabel('Valori Reali')\n",
- " plt.ylabel('Errore')\n",
+ " plt.title('Errors vs Actual Values')\n",
+ " plt.xlabel('Actual Values')\n",
+ " plt.ylabel('Error')\n",
"\n",
" plt.tight_layout()\n",
"\n",
- " # Salva il plot se è specificata una cartella\n",
+ " # Save plot if folder is specified\n",
" if folder_name is not None:\n",
" try:\n",
- " # Crea la cartella se non esiste\n",
+ " # Create folder if it doesn't exist\n",
" os.makedirs(folder_name, exist_ok=True)\n",
"\n",
- " # Genera il nome del file con timestamp\n",
+ " # Generate filename with timestamp\n",
" timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
" filename = os.path.join(folder_name, f'error_analysis_{timestamp}.png')\n",
"\n",
- " # Salva la figura\n",
+ " # Save figure\n",
" plt.savefig(filename, dpi=300, bbox_inches='tight')\n",
- " print(f\"\\nPlot salvato come: {filename}\")\n",
+ " print(f\"\\nPlot saved as: {filename}\")\n",
" except Exception as e:\n",
- " print(f\"\\nErrore nel salvare il plot: {str(e)}\")\n",
+ " print(f\"\\nError saving plot: {str(e)}\")\n",
"\n",
" plt.show()\n",
"\n",
- " # Stampa statistiche degli errori\n",
- " print(\"\\nStatistiche degli errori:\")\n",
+ " # Print error statistics\n",
+ " print(\"\\nError Statistics:\")\n",
" print(f\"MAE: {np.mean(np.abs(errors)):.4f}\")\n",
" print(f\"MSE: {np.mean(errors ** 2):.4f}\")\n",
" print(f\"RMSE: {np.sqrt(np.mean(errors ** 2)):.4f}\")\n",
- " print(f\"Media errori: {np.mean(errors):.4f}\")\n",
- " print(f\"Std errori: {np.std(errors):.4f}\")\n",
+ " print(f\"Mean errors: {np.mean(errors):.4f}\")\n",
+ " print(f\"Std errors: {np.std(errors):.4f}\")\n",
"\n",
- " # Calcola percentuali di errori entro certe soglie\n",
+ " # Calculate percentage of errors within thresholds\n",
" thresholds = [0.5, 1.0, 1.5, 2.0]\n",
" for threshold in thresholds:\n",
" within_threshold = np.mean(np.abs(errors) <= threshold) * 100\n",
- " print(f\"Predizioni entro ±{threshold}: {within_threshold:.1f}%\")\n",
+ " print(f\"Predictions within ±{threshold}: {within_threshold:.1f}%\")\n",
"\n",
"\n",
"plot_error_analysis(y_test, predictions, folder_name=folder_name)"
],
"outputs": [],
"execution_count": null
- },
- {
- "cell_type": "code",
- "id": "03bb9564-e518-4662-b3ee-4cfa96cdf696",
- "metadata": {},
- "source": [
- "def plot_advanced_prediction_analysis(y_true, y_pred, folder_name=None):\n",
- " \"\"\"\n",
- " Funzione per visualizzare l'analisi degli errori di predizione e la precisione\n",
- "\n",
- " Parameters:\n",
- " -----------\n",
- " y_true : array-like\n",
- " Valori reali\n",
- " y_pred : array-like\n",
- " Valori predetti\n",
- " folder_name : str, optional\n",
- " Cartella dove salvare i plot. Se None, i plot non vengono salvati.\n",
- " \"\"\"\n",
- " import os\n",
- " from datetime import datetime\n",
- " import seaborn as sns\n",
- "\n",
- " # Converti in array numpy 1D se necessario\n",
- " if isinstance(y_true, pd.Series):\n",
- " y_true = y_true.values\n",
- " if isinstance(y_pred, pd.Series):\n",
- " y_pred = y_pred.values\n",
- "\n",
- " y_true = y_true.ravel()\n",
- " y_pred = y_pred.ravel()\n",
- "\n",
- " # Calcola gli errori\n",
- " errors = y_pred - y_true\n",
- "\n",
- " # Calcola accuracy per diversi livelli di tolleranza\n",
- " exact_accuracy = np.mean(np.abs(errors) < 0.1) * 100\n",
- " accuracy_05 = np.mean(np.abs(errors) <= 0.5) * 100\n",
- " accuracy_10 = np.mean(np.abs(errors) <= 1.0) * 100\n",
- "\n",
- " def get_risk_level(uv):\n",
- " if uv < 2:\n",
- " return 'Basso'\n",
- " elif uv < 5:\n",
- " return 'Moderato'\n",
- " elif uv < 7:\n",
- " return 'Alto'\n",
- " elif uv < 10:\n",
- " return 'Molto Alto'\n",
- " else:\n",
- " return 'Estremo'\n",
- "\n",
- " y_true_risk = [get_risk_level(x) for x in y_true]\n",
- " y_pred_risk = [get_risk_level(x) for x in y_pred]\n",
- " risk_accuracy = np.mean(np.array(y_true_risk) == np.array(y_pred_risk)) * 100\n",
- "\n",
- " # Crea la figura principale\n",
- " fig = plt.figure(figsize=(20, 10))\n",
- "\n",
- " # Plot 1: Distribuzione degli errori\n",
- " plt.subplot(2, 2, 1)\n",
- " plt.hist(errors, bins=50, alpha=0.7)\n",
- " plt.title('Distribuzione degli Errori di Predizione')\n",
- " plt.xlabel('Errore')\n",
- " plt.ylabel('Frequenza')\n",
- "\n",
- " # Plot 2: Actual vs Predicted\n",
- " plt.subplot(2, 2, 2)\n",
- " plt.scatter(y_true, y_pred, alpha=0.5)\n",
- " plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)\n",
- " plt.title('Valori Reali vs Predetti')\n",
- " plt.xlabel('Valori Reali')\n",
- " plt.ylabel('Valori Predetti')\n",
- "\n",
- " # Plot 3: Errori vs Valori Reali\n",
- " plt.subplot(2, 2, 3)\n",
- " plt.scatter(y_true, errors, alpha=0.5)\n",
- " plt.axhline(y=0, color='r', linestyle='--')\n",
- " plt.title('Errori vs Valori Reali')\n",
- " plt.xlabel('Valori Reali')\n",
- " plt.ylabel('Errore')\n",
- "\n",
- " # Plot 4: Precisione per intervallo di UV\n",
- " plt.subplot(2, 2, 4)\n",
- "\n",
- " solarradiation_ranges = [(0, 2), (2, 5), (5, 7), (7, 10), (10, 11)]\n",
- " range_labels = ['Basso\\n(0-2)', 'Moderato\\n(2-5)', 'Alto\\n(5-7)', 'Molto Alto\\n(7-10)', 'Estremo\\n(10-11)']\n",
- "\n",
- " accuracies = []\n",
- " counts = []\n",
- " mae_per_range = []\n",
- "\n",
- " for (low, high) in solarradiation_ranges:\n",
- " mask = (y_true >= low) & (y_true < high)\n",
- " if mask.any():\n",
- " mae = np.mean(np.abs(y_pred[mask] - y_true[mask]))\n",
- " mae_per_range.append(mae)\n",
- " count = np.sum(mask)\n",
- " counts.append(count)\n",
- " accuracy = np.mean(np.abs(y_pred[mask] - y_true[mask]) <= 0.5) * 100\n",
- " accuracies.append(accuracy)\n",
- "\n",
- " # Crea il grafico a barre con doppio asse y\n",
- " ax = plt.gca()\n",
- " bars = plt.bar(range_labels, accuracies, alpha=0.6, color='skyblue')\n",
- " plt.ylabel('Precisione (%)')\n",
- " plt.title('Precisione e MAE per Range UV')\n",
- "\n",
- " for bar in bars:\n",
- " height = bar.get_height()\n",
- " plt.text(bar.get_x() + bar.get_width() / 2., height,\n",
- " f'{height:.1f}%\\n(n={counts[bars.index(bar)]})',\n",
- " ha='center', va='bottom')\n",
- "\n",
- " ax2 = ax.twinx()\n",
- " line = ax2.plot(range_labels, mae_per_range, 'r-', marker='o', label='MAE')\n",
- " ax2.set_ylabel('MAE', color='red')\n",
- "\n",
- " for i, mae in enumerate(mae_per_range):\n",
- " ax2.text(i, mae, f'MAE: {mae:.3f}', color='red', ha='center', va='bottom')\n",
- "\n",
- " plt.xticks(rotation=45)\n",
- " plt.tight_layout()\n",
- "\n",
- " # Salva la figura principale se è specificata una cartella\n",
- " if folder_name is not None:\n",
- " try:\n",
- " # Crea la cartella se non esiste\n",
- " os.makedirs(folder_name, exist_ok=True)\n",
- "\n",
- " # Genera il timestamp\n",
- " timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
- "\n",
- " # Salva la figura principale\n",
- " main_plot_filename = os.path.join(folder_name, f'advanced_analysis_{timestamp}.png')\n",
- " plt.savefig(main_plot_filename, dpi=300, bbox_inches='tight')\n",
- " print(f\"\\nPlot principale salvato come: {main_plot_filename}\")\n",
- "\n",
- " # Crea e salva la matrice di confusione come plot separato\n",
- " plt.figure(figsize=(10, 8))\n",
- " cm = confusion_matrix(y_true_risk, y_pred_risk)\n",
- " risk_levels = ['Basso', 'Moderato', 'Alto', 'Molto Alto', 'Estremo']\n",
- " cm_df = pd.DataFrame(cm, columns=risk_levels, index=risk_levels)\n",
- "\n",
- " sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')\n",
- " plt.title('Matrice di Confusione per Livelli di Rischio UV')\n",
- " plt.tight_layout()\n",
- "\n",
- " conf_matrix_filename = os.path.join(folder_name, f'confusion_matrix_{timestamp}.png')\n",
- " plt.savefig(conf_matrix_filename, dpi=300, bbox_inches='tight')\n",
- " print(f\"Matrice di confusione salvata come: {conf_matrix_filename}\")\n",
- "\n",
- " except Exception as e:\n",
- " print(f\"\\nErrore nel salvare i plot: {str(e)}\")\n",
- "\n",
- " plt.show()\n",
- "\n",
- " # Stampa delle statistiche e analisi\n",
- " cm = confusion_matrix(y_true_risk, y_pred_risk)\n",
- " risk_levels = ['Basso', 'Moderato', 'Alto', 'Molto Alto', 'Estremo']\n",
- " cm_df = pd.DataFrame(cm, columns=risk_levels, index=risk_levels)\n",
- "\n",
- " print(\"\\nMatrice di Confusione per Livelli di Rischio UV:\")\n",
- " print(cm_df)\n",
- "\n",
- " print(\"\\nAnalisi Precisione Predizioni UV Index:\")\n",
- " print(f\"Precisione esatta (±0.1): {exact_accuracy:.1f}%\")\n",
- " print(f\"Precisione entro 0.5 punti: {accuracy_05:.1f}%\")\n",
- " print(f\"Precisione entro 1.0 punti: {accuracy_10:.1f}%\")\n",
- " print(f\"Precisione livello di rischio: {risk_accuracy:.1f}%\")\n",
- "\n",
- " print(\"\\nAnalisi errori per livello UV:\")\n",
- " solarradiation_ranges = [(0, 2, 'Basso'), (2, 5, 'Moderato'), (5, 7, 'Alto'),\n",
- " (7, 10, 'Molto Alto'), (10, 11, 'Estremo')]\n",
- "\n",
- " for low, high, label in solarradiation_ranges:\n",
- " mask = (y_true >= low) & (y_true < high)\n",
- " if mask.any():\n",
- " mae = np.mean(np.abs(errors[mask]))\n",
- " n_samples = np.sum(mask)\n",
- " print(f\"MAE per UV {label} ({low}-{high}): {mae:.3f} (n={n_samples})\")\n",
- "\n",
- " print(\"\\nStatistiche degli errori:\")\n",
- " print(f\"Media errori: {np.mean(errors):.3f}\")\n",
- " print(f\"Deviazione standard errori: {np.std(errors):.3f}\")\n",
- " print(f\"Errore mediano: {np.median(errors):.3f}\")\n",
- " print(f\"95° percentile errore assoluto: {np.percentile(np.abs(errors), 95):.3f}\")\n",
- "\n",
- " print(\"\\nDistribuzione degli errori:\")\n",
- " thresholds = [0.5, 1.0, 1.5, 2.0]\n",
- " for threshold in thresholds:\n",
- " within_threshold = np.mean(np.abs(errors) <= threshold) * 100\n",
- " print(f\"Predizioni entro ±{threshold}: {within_threshold:.1f}%\")\n",
- "\n",
- "\n",
- "# Usa la funzione\n",
- "plot_advanced_prediction_analysis(y_test, predictions, folder_name=folder_name)"
- ],
- "outputs": [],
- "execution_count": null
- },
- {
- "cell_type": "code",
- "id": "fe898941-2338-4157-b624-680bc2c517d8",
- "metadata": {},
- "source": [],
- "outputs": [],
- "execution_count": null
}
],
"metadata": {
diff --git a/src/models/solarradiation/solarradiation_model.ipynb b/src/models/solarradiation/solarradiation_model.ipynb
index 00ed9bc..bbf91d7 100755
--- a/src/models/solarradiation/solarradiation_model.ipynb
+++ b/src/models/solarradiation/solarradiation_model.ipynb
@@ -2,23 +2,9 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
"id": "8adcbe0819b88578",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'\\nfrom opt_einsum.paths import branch_1\\n!apt-get update\\n!apt-get install graphviz -y\\n\\n!pip install tensorflow\\n!pip install numpy\\n!pip install pandas\\n\\n!pip install keras\\n!pip install scikit-learn\\n!pip install matplotlib\\n!pip install joblib\\n!pip install pyarrow\\n!pip install fastparquet\\n!pip install scipy\\n!pip install seaborn\\n!pip install tqdm\\n!pip install pydot\\n!pip install tensorflow-io\\n'"
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "'''\n",
"from opt_einsum.paths import branch_1\n",
"!apt-get update\n",
"!apt-get install graphviz -y\n",
@@ -38,27 +24,15 @@
"!pip install tqdm\n",
"!pip install pydot\n",
"!pip install tensorflow-io\n",
- "'''"
- ]
+ "!pip install tensorflow-addons"
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "code",
- "execution_count": 2,
"id": "7a813e3cbca057b7",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2024-11-10 22:44:08.491015: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
- "2024-11-10 22:44:08.491086: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
- "2024-11-10 22:44:08.491139: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
- "2024-11-10 22:44:08.502469: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
- "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
- ]
- }
- ],
"source": [
"import tensorflow as tf\n",
"from tensorflow.keras.layers import Dense, LSTM, MultiHeadAttention, Dropout, BatchNormalization, LayerNormalization, Input, Activation, Lambda, Bidirectional, Add, MaxPooling1D\n",
@@ -67,25 +41,26 @@
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
- "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
- "from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.preprocessing import RobustScaler\n",
"from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau\n",
"from tensorflow.keras.optimizers import AdamW\n",
"import json\n",
"from datetime import datetime\n",
"import matplotlib.pyplot as plt\n",
- "from sklearn.metrics import confusion_matrix\n",
"from tensorflow.keras.utils import plot_model\n",
+ "import tensorflow_addons as tfa\n",
"\n",
- "folder_name = datetime.now().strftime(\"%Y-%m-%d_%H-%M\")"
- ]
+ "folder_name = datetime.now().strftime(\"%Y-%m-%d_%H-%M\")\n",
+ "\n",
+ "random_state_value = None"
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "code",
- "execution_count": 3,
"id": "b3f525e19f78a1da",
"metadata": {},
- "outputs": [],
"source": [
"def get_season(date):\n",
" month = date.month\n",
@@ -140,14 +115,14 @@
"\n",
"\n",
"def add_solar_features(df):\n",
- " # Calcolo dell'angolo solare\n",
+ " # Solar angle calculation\n",
" df['solar_angle'] = np.sin(df['day_of_year'] * (2 * np.pi / 365.25)) * np.sin(df['hour'] * (2 * np.pi / 24))\n",
"\n",
- " # Interazioni tra features rilevanti\n",
+ " # Interactions between relevant features\n",
" df['cloud_temp_interaction'] = df['cloudcover'] * df['temp']\n",
" df['visibility_cloud_interaction'] = df['visibility'] * (100 - df['cloudcover'])\n",
"\n",
- " # Feature derivate\n",
+ " # Derived features\n",
" df['clear_sky_index'] = (100 - df['cloudcover']) / 100\n",
" df['temp_gradient'] = df['temp'] - df['tempmin']\n",
"\n",
@@ -155,42 +130,77 @@
"\n",
"\n",
"def add_solar_specific_features(df):\n",
- " # Angolo solare e durata del giorno\n",
+ " # Solar angle and day length\n",
" df['day_length'] = 12 + 3 * np.sin(2 * np.pi * (df['day_of_year'] - 81) / 365.25)\n",
" df['solar_noon'] = 12 - df['hour']\n",
" df['solar_elevation'] = np.sin(2 * np.pi * df['day_of_year'] / 365.25) * np.cos(2 * np.pi * df['solar_noon'] / 24)\n",
"\n",
- " # Interazioni\n",
+ " # Interactions\n",
" df['cloud_elevation'] = df['cloudcover'] * df['solar_elevation']\n",
" df['visibility_elevation'] = df['visibility'] * df['solar_elevation']\n",
"\n",
- " # Rolling features con finestre più ampie\n",
+ " # Rolling features with wider windows\n",
" df['cloud_rolling_12h'] = df['cloudcover'].rolling(window=12).mean()\n",
" df['temp_rolling_12h'] = df['temp'].rolling(window=12).mean()\n",
"\n",
" return df\n",
"\n",
"\n",
+ "def add_radiation_energy_features(df):\n",
+ " \"\"\"Adds specific features based on solarenergy and uvindex\"\"\"\n",
+ "\n",
+ " # Solar energy to UV ratio (independent from solarradiation)\n",
+ " df['energy_uv_ratio'] = df['solarenergy'] / (df['uvindex'] + 1e-6)\n",
+ "\n",
+ " # Time aggregations\n",
+ " # Moving averages\n",
+ " windows = [3, 6, 12, 24] # hours\n",
+ " for w in windows:\n",
+ " df[f'energy_rolling_mean_{w}h'] = df['solarenergy'].rolling(window=w).mean()\n",
+ " df[f'uv_rolling_mean_{w}h'] = df['uvindex'].rolling(window=w).mean()\n",
+ "\n",
+ " # Daily aggregations\n",
+ " df['energy_daily_sum'] = df.groupby(df.index.date)['solarenergy'].transform('sum')\n",
+ " df['uv_daily_max'] = df.groupby(df.index.date)['uvindex'].transform('max')\n",
+ "\n",
+ " # Changes\n",
+ " df['energy_change'] = df['solarenergy'].diff()\n",
+ " df['uv_change'] = df['uvindex'].diff()\n",
+ "\n",
+ " # Lag features\n",
+ " lags = [1, 2, 3, 6, 12, 24] # hours\n",
+ " for lag in lags:\n",
+ " df[f'energy_lag_{lag}h'] = df['solarenergy'].shift(lag)\n",
+ " df[f'uv_lag_{lag}h'] = df['uvindex'].shift(lag)\n",
+ "\n",
+ " # Peak indicators\n",
+ " df['is_energy_peak'] = (df['solarenergy'] > df['energy_rolling_mean_6h'] * 1.2).astype(int)\n",
+ " df['is_uv_peak'] = (df['uvindex'] > df['uv_rolling_mean_6h'] * 1.2).astype(int)\n",
+ "\n",
+ " return df\n",
+ "\n",
+ "\n",
"def add_advanced_features(df):\n",
- " # Features esistenti\n",
+ " # Existing features\n",
" df = add_time_features(df)\n",
" df = add_solar_features(df)\n",
" df = add_solar_specific_features(df)\n",
+ " df = add_radiation_energy_features(df)\n",
"\n",
" if not isinstance(df.index, pd.DatetimeIndex):\n",
" df.index = pd.to_datetime(df.index)\n",
"\n",
- " # One-hot encoding per le feature categoriche\n",
+ " # One-hot encoding for categorical features\n",
" df = pd.get_dummies(df, columns=['season', 'time_period'])\n",
"\n",
- " # Interazioni tra variabili meteorologiche\n",
+ " # Weather variable interactions\n",
" df['temp_humidity'] = df['temp'] * df['humidity']\n",
" df['temp_cloudcover'] = df['temp'] * df['cloudcover']\n",
" df['visibility_cloudcover'] = df['visibility'] * df['cloudcover']\n",
"\n",
- " # Features derivate per la radiazione solare\n",
+ " # Derived features for solar radiation\n",
" df['clear_sky_factor'] = (100 - df['cloudcover']) / 100\n",
- " df['day_length'] = np.sin(df['day_of_year_sin']) * 12 + 12 # approssimazione della durata del giorno\n",
+ " df['day_length'] = np.sin(df['day_of_year_sin']) * 12 + 12 # day length approximation\n",
"\n",
" # Lag features\n",
" df['temp_1h_lag'] = df['temp'].shift(1)\n",
@@ -203,19 +213,19 @@
"\n",
" df['temp_humidity_interaction'] = df['temp'] * df['humidity'] / 100\n",
"\n",
- " # Indicatore di condizioni estreme\n",
+ " # Extreme conditions indicator\n",
" df['extreme_conditions'] = ((df['temp'] > df['temp'].quantile(0.75)) & (df['humidity'] < df['humidity'].quantile(0.25))).astype(int)\n",
"\n",
- " # Feature composite per la trasparenza atmosferica\n",
+ " # Composite feature for atmospheric transparency\n",
" df['atmospheric_transparency'] = (100 - df['cloudcover']) * (df['visibility'] / 10)\n",
"\n",
- " # Indicatori temporali più granulari per mezze stagioni\n",
+ " # Indicator for transition seasons (spring and autumn)\n",
" df['is_transition_season'] = ((df['season_Spring'] | df['season_Autumn'])).astype(int)\n",
"\n",
- " # Interazione tra angolo solare e copertura nuvolosa normalizzata\n",
+ " # Interaction between solar angle and normalized cloud cover\n",
" df['solar_cloud_effect'] = df['solar_elevation'] * (100 - df['cloudcover']) / 100\n",
"\n",
- " # Indicatore di stabilità atmosferica\n",
+ " # Atmospheric stability indicator\n",
" df['pressure_stability'] = df.groupby(df.index.date if isinstance(df.index, pd.DatetimeIndex)\n",
" else df.index.to_series().dt.date)['pressure'].transform(\n",
" lambda x: x.std()\n",
@@ -225,70 +235,82 @@
"\n",
"\n",
"def prepare_advanced_data(df):\n",
- " # Applicazione delle funzioni di feature engineering\n",
+ " # Apply feature engineering functions\n",
" df = add_advanced_features(df)\n",
"\n",
" target_variables = ['solarradiation', 'solarenergy', 'uvindex']\n",
"\n",
- " # Selezione delle feature più rilevanti per solarradiation\n",
+ " # Updated feature selection (without solarradiation dependencies)\n",
" selected_features = [\n",
- " # Features meteorologiche base\n",
+ " # Base meteorological features\n",
" 'temp', 'humidity', 'cloudcover', 'visibility', 'pressure',\n",
"\n",
- " # Features temporali cicliche\n",
+ " # Solar features\n",
+ " 'zenith_angle', 'air_mass', 'atmospheric_transmission',\n",
+ " 'cloud_transmission', 'theoretical_radiation',\n",
+ "\n",
+ " # Time features\n",
" 'hour_sin', 'hour_cos', 'month_sin', 'month_cos',\n",
" 'day_of_year_sin', 'day_of_year_cos',\n",
"\n",
- " # Features solari\n",
- " 'solar_angle', 'solar_elevation', 'day_length',\n",
- " 'clear_sky_index', 'solar_noon',\n",
+ " # Atmospheric features\n",
+ " 'clear_sky_index', 'humidity_factor', 'atmospheric_clarity',\n",
+ " 'vapor_pressure',\n",
"\n",
- " # Interazioni\n",
- " 'cloud_temp_interaction', 'visibility_cloud_interaction',\n",
- " 'cloud_elevation', 'visibility_elevation',\n",
+ " # Solar energy and UV features\n",
+ " 'energy_uv_ratio',\n",
"\n",
- " # Rolling features\n",
- " 'cloud_rolling_12h', 'temp_rolling_12h',\n",
- " 'temp_rolling_mean_6h', 'cloudcover_rolling_mean_6h',\n",
+ " # Moving averages\n",
+ " 'energy_rolling_mean_3h', 'energy_rolling_mean_6h',\n",
+ " 'uv_rolling_mean_3h', 'uv_rolling_mean_6h',\n",
"\n",
- " # Features categoriche\n",
- " 'season', 'time_period'\n",
+ " # Daily aggregations\n",
+ " 'energy_daily_sum', 'uv_daily_max',\n",
+ "\n",
+ " # Main lag features\n",
+ " 'energy_lag_1h', 'energy_lag_3h', 'energy_lag_6h',\n",
+ " 'uv_lag_1h', 'uv_lag_3h',\n",
+ "\n",
+ " # Peak and volatility indicators\n",
+ " 'is_energy_peak', 'is_uv_peak',\n",
+ " 'energy_volatility', 'uv_volatility',\n",
+ "\n",
+ " # Composite indices\n",
+ " 'solar_intensity_index',\n",
+ "\n",
+ " # Interactions\n",
+ " 'uv_cloud_interaction',\n",
+ " 'energy_temp_interaction'\n",
" ]\n",
"\n",
- " # Aggiorna la lista delle feature con le colonne one-hot\n",
+ " # Add one-hot columns\n",
" categorical_columns = [col for col in df.columns if col.startswith(('season_', 'time_period_'))]\n",
- " final_features = [f for f in selected_features if f not in ['season', 'time_period']] + categorical_columns\n",
+ " final_features = selected_features + categorical_columns\n",
"\n",
+ " # Dataset preparation\n",
" df = df.sort_values('datetime')\n",
" df.set_index('datetime', inplace=True)\n",
"\n",
- " columns_to_interpolate = final_features + target_variables\n",
- " for column in columns_to_interpolate:\n",
+ " # Handle missing values\n",
+ " for column in final_features + target_variables:\n",
" df[column] = df[column].interpolate(method='time')\n",
- "\n",
- " # Rimuovi eventuali valori mancanti residui\n",
" df.fillna(0, inplace=True)\n",
"\n",
+ " # Temporal split\n",
" data_after_2010 = df[df['year'] >= 2010].copy()\n",
" data_before_2010 = df[df['year'] < 2010].copy()\n",
"\n",
- " print(\"\\nNumero di record dopo 2010:\", len(data_after_2010))\n",
- " print(\"Numero di record prima 2010:\", len(data_before_2010))\n",
- "\n",
" X = data_after_2010[final_features]\n",
- "\n",
- " y = data_after_2010['uvindex']\n",
- "\n",
+ " y = data_after_2010['solarradiation']\n",
" X_to_predict = data_before_2010[final_features]\n",
"\n",
- " # Split dei dati\n",
- " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)\n",
+ " # Train-test split\n",
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=random_state_value)\n",
"\n",
- " # Scaling delle feature\n",
- " scaler = StandardScaler()\n",
+ " # Scaling\n",
+ " scaler = RobustScaler()\n",
" X_train_scaled = scaler.fit_transform(X_train)\n",
" X_test_scaled = scaler.transform(X_test)\n",
- "\n",
" X_to_predict_scaled = scaler.transform(X_to_predict)\n",
"\n",
" return X_train_scaled, X_test_scaled, y_train, y_test, scaler, final_features, X_to_predict_scaled\n",
@@ -296,8 +318,8 @@
"\n",
"def create_sequence_data(X, sequence_length=24):\n",
" \"\"\"\n",
- " Converte i dati in sequenze per l'input LSTM\n",
- " sequence_length rappresenta quante ore precedenti considerare\n",
+ " Converts data into sequences for LSTM input\n",
+ " sequence_length represents how many previous hours to consider\n",
" \"\"\"\n",
" sequences = []\n",
" for i in range(len(X) - sequence_length + 1):\n",
@@ -306,300 +328,417 @@
"\n",
"\n",
"def prepare_hybrid_data(df):\n",
- "\n",
" X_train_scaled, X_test_scaled, y_train, y_test, scaler, features, X_to_predict_scaled = prepare_advanced_data(df)\n",
"\n",
- " # Convertiamo i dati in sequenze\n",
- " sequence_length = 24 # 24 ore di dati storici\n",
+ " # Convert data into sequences\n",
+ " sequence_length = 24 # 24 hours of historical data\n",
"\n",
" X_train_seq = create_sequence_data(X_train_scaled, sequence_length)\n",
" X_test_seq = create_sequence_data(X_test_scaled, sequence_length)\n",
"\n",
- " # Adattiamo le y rimuovendo i primi (sequence_length-1) elementi\n",
+ " # Adjust y by removing the first (sequence_length-1) elements\n",
" y_train = y_train[sequence_length - 1:]\n",
" y_test = y_test[sequence_length - 1:]\n",
"\n",
" X_to_predict_seq = create_sequence_data(X_to_predict_scaled, sequence_length)\n",
"\n",
" return X_train_seq, X_test_seq, y_train, y_test, scaler, features, X_to_predict_seq"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "code",
- "execution_count": 8,
"id": "9dff3259-b376-4cfc-89d8-ab2ea18aaa5e",
"metadata": {},
- "outputs": [],
"source": [
- "def create_residual_lstm_layer(x, units, dropout_rate, l2_reg=0.01, return_sequences=True):\n",
+ "def create_residual_lstm_layer(x, units, dropout_rate, l2_reg=0.01, return_sequences=True, survival_probability=0.8):\n",
+ " \"\"\"\n",
+ " Creates a bidirectional LSTM layer with residual connections and regularization.\n",
+ "\n",
+ " Parameters:\n",
+ " x: Input tensor\n",
+ " units: Number of LSTM units\n",
+ " dropout_rate: Dropout rate for regularization\n",
+ " l2_reg: L2 regularization factor\n",
+ " return_sequences: Whether to return sequences or just the last output\n",
+ " survival_probability: Probability of layer survival for stochastic depth\n",
+ " \"\"\"\n",
" residual = x\n",
" x = Bidirectional(LSTM(units, return_sequences=return_sequences, kernel_regularizer=regularizers.l2(l2_reg)))(x)\n",
" x = LayerNormalization()(x)\n",
" x = Dropout(dropout_rate)(x)\n",
- " # Adjust residual dimension and handle return_sequences\n",
+ "\n",
" if return_sequences:\n",
" if int(residual.shape[-1]) != 2 * units:\n",
" residual = Dense(2 * units, activation='linear')(residual)\n",
- " x = Add()([x, residual])\n",
+ " x = tfa.layers.StochasticDepth(survival_probability)([x, residual])\n",
" return x\n",
"\n",
+ "def attention_block(x, units, num_heads=8, survival_probability=0.8):\n",
+ " \"\"\"\n",
+ " Creates a multi-head attention block with residual connections.\n",
"\n",
- "def attention_block(x, units, num_heads=8):\n",
+ " Parameters:\n",
+ " x: Input tensor\n",
+ " units: Size of each attention head for query and key (passed as key_dim)\n",
+ " num_heads: Number of attention heads\n",
+ " survival_probability: Probability of layer survival for stochastic depth\n",
+ " \"\"\"\n",
" attention = MultiHeadAttention(num_heads=num_heads, key_dim=units)(x, x)\n",
- " x = Add()([x, attention])\n",
+ " x = tfa.layers.StochasticDepth(survival_probability)([x, attention])\n",
" x = LayerNormalization()(x)\n",
" return x\n",
"\n",
+ "def create_solarradiation_model(input_shape, folder_name, l2_lambda=0.005):\n",
+ " \"\"\"\n",
+ " Creates a deep learning model for solar radiation prediction using LSTM and attention mechanisms.\n",
"\n",
- "def create_solarradiation_index_model(input_shape, folder_name, l2_lambda=0.005):\n",
+ " Parameters:\n",
+ " input_shape: Shape of input data\n",
+ " folder_name: Directory to save model architecture visualization\n",
+ " l2_lambda: L2 regularization factor\n",
+ " \"\"\"\n",
" inputs = Input(shape=input_shape)\n",
"\n",
- " # Primi due layer LSTM con sequenze\n",
- " x = create_residual_lstm_layer(inputs, 128, 0.4, l2_lambda, return_sequences=True)\n",
- " x = create_residual_lstm_layer(x, 64, 0.3, l2_lambda, return_sequences=True)\n",
- " x = create_residual_lstm_layer(x, 32, 0.2, l2_lambda, return_sequences=True)\n",
+ " # Progressive hyperparameters for model architecture\n",
+ " survival_probs = [0.9, 0.8, 0.7] # Decreasing survival probabilities for deeper layers\n",
+ " attention_survival_probs = [0.85, 0.75, 0.65] # Survival probabilities for attention blocks\n",
+ " lstm_units = [256, 128, 64] # Decreasing number of units for LSTM layers\n",
+ " dropout_rates = [0.4, 0.3, 0.2] # Decreasing dropout rates\n",
+ " attention_heads = [32, 24, 16] # Decreasing number of attention heads\n",
"\n",
- " # Attention e MaxPooling mentre abbiamo ancora la sequenza\n",
- " x = attention_block(x, 32, num_heads=16)\n",
- " x = MaxPooling1D()(x)\n",
+ " # Main network architecture\n",
+ " x = inputs\n",
+ " for i in range(3):\n",
+ " # LSTM layer with residual connections\n",
+ " x = create_residual_lstm_layer(\n",
+ " x,\n",
+ " units=lstm_units[i],\n",
+ " dropout_rate=dropout_rates[i],\n",
+ " l2_reg=l2_lambda,\n",
+ " return_sequences=True,\n",
+ " survival_probability=survival_probs[i]\n",
+ " )\n",
+ " # Attention block\n",
+ " x = attention_block(\n",
+ " x,\n",
+ " units=lstm_units[i],\n",
+ " num_heads=attention_heads[i],\n",
+ " survival_probability=attention_survival_probs[i]\n",
+ " )\n",
+ " if i < 2: # No pooling after last LSTM layer\n",
+ " x = MaxPooling1D()(x)\n",
"\n",
- " # Ultimo layer LSTM senza sequenze\n",
- " x = create_residual_lstm_layer(x, 32, 0.1, l2_lambda, return_sequences=False)\n",
- "\n",
- " # Dense layers\n",
- " x = Dense(32, kernel_regularizer=regularizers.l2(l2_lambda))(x)\n",
- " x = BatchNormalization()(x)\n",
- " x = Activation('swish')(x)\n",
- " x = Dropout(0.1)(x)\n",
- "\n",
- " x = Dense(16, kernel_regularizer=regularizers.l2(l2_lambda))(x)\n",
- " x = BatchNormalization()(x)\n",
- " x = Activation('swish')(x)\n",
- " x = Dropout(0.1)(x)\n",
- "\n",
- " outputs = Dense(1)(x)\n",
- " outputs = Lambda(lambda x: tf.clip_by_value(x, 0, 11))(outputs)\n",
- "\n",
- " model = Model(inputs=inputs, outputs=outputs, name=\"UvModel\")\n",
- "\n",
- " optimizer = AdamW(\n",
- " learning_rate=0.0005,\n",
- " beta_1=0.9,\n",
- " beta_2=0.999,\n",
- " epsilon=1e-07\n",
+ " # Final LSTM layer for sequence aggregation\n",
+ " x = create_residual_lstm_layer(\n",
+ " x,\n",
+ " units=32,\n",
+ " dropout_rate=0.1,\n",
+ " l2_reg=l2_lambda,\n",
+ " return_sequences=False,\n",
+ " survival_probability=0.6\n",
" )\n",
"\n",
+ " # Dense layers for final prediction\n",
+ " dense_units = [64, 32]\n",
+ " dense_dropout = [0.2, 0.1]\n",
+ "\n",
+ " for units, dropout in zip(dense_units, dense_dropout):\n",
+ " x = Dense(units, kernel_regularizer=regularizers.l2(l2_lambda))(x)\n",
+ " x = BatchNormalization()(x)\n",
+ " x = Activation('swish')(x)\n",
+ " x = Dropout(dropout)(x)\n",
+ "\n",
+ " # Output layer with value clipping\n",
+ " outputs = Dense(1)(x)\n",
+ " outputs = Lambda(lambda x: tf.clip_by_value(x, 0, 1500))(outputs)\n",
+ "\n",
+ " # Model compilation\n",
+ " model = Model(inputs=inputs, outputs=outputs, name=\"SolarRadiationModel\")\n",
+ "\n",
+ " # Optimizer configuration\n",
+ " optimizer = AdamW(\n",
+ " learning_rate=0.0003,\n",
+ " beta_1=0.9,\n",
+ " beta_2=0.999,\n",
+ " epsilon=1e-07,\n",
+ " weight_decay=0.001\n",
+ " )\n",
+ "\n",
+ " # Custom evaluation metrics\n",
+ " def rmse(y_true, y_pred):\n",
+ " \"\"\"Root Mean Squared Error\"\"\"\n",
+ " return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))\n",
+ "\n",
+ " def mape(y_true, y_pred):\n",
+ " \"\"\"Mean Absolute Percentage Error\"\"\"\n",
+ " epsilon = 1e-7\n",
+ " return tf.reduce_mean(tf.abs((y_true - y_pred) / (y_true + epsilon))) * 100\n",
+ "\n",
+ " def hybrid_loss(y_true, y_pred):\n",
+ " \"\"\"Combined loss function: 70% MSE + 30% MAE\"\"\"\n",
+ " mse = tf.reduce_mean(tf.square(y_true - y_pred))\n",
+ " mae = tf.reduce_mean(tf.abs(y_true - y_pred))\n",
+ " return 0.7 * mse + 0.3 * mae\n",
+ "\n",
+ " # Model compilation with custom metrics\n",
" model.compile(\n",
" optimizer=optimizer,\n",
- " loss='huber',\n",
- " metrics=['mae', 'mse']\n",
+ " loss=hybrid_loss,\n",
+ " metrics=[\n",
+ " 'mae',\n",
+ " rmse,\n",
+ " mape\n",
+ " ]\n",
" )\n",
" model.summary()\n",
"\n",
+ " # Save model architecture visualization\n",
" plot_model(model,\n",
" to_file=f'{folder_name}_model_architecture.png',\n",
- " show_shapes=True, # Mostra le dimensioni dei tensori\n",
- " show_layer_names=True, # Mostra i nomi dei layer\n",
- " dpi=96, # Risoluzione dell'immagine\n",
+ " show_shapes=True,\n",
+ " show_layer_names=True,\n",
+ " dpi=150,\n",
" show_layer_activations=True)\n",
"\n",
" return model\n",
"\n",
"\n",
- "def evaluate_solarradiation_predictions(y_true, y_pred, folder_name=None):\n",
+ "def evaluate_solarradiation_predictions(y_true, y_pred, hour=None, folder_name=None):\n",
" \"\"\"\n",
- " Valutazione specifica per UV index con metriche sia raw che categoriche\n",
+ " Comprehensive evaluation of solar radiation predictions with detailed analysis and visualizations.\n",
"\n",
" Parameters:\n",
" -----------\n",
" y_true : array-like\n",
- " Valori reali dell'UV index\n",
+ " Actual solar radiation values (W/m²)\n",
" y_pred : array-like\n",
- " Valori predetti dell'UV index\n",
+ " Predicted solar radiation values (W/m²)\n",
+ " hour : array-like, optional\n",
+ " Array of hours corresponding to predictions, for temporal analysis\n",
" folder_name : str, optional\n",
- " Cartella dove salvare eventuali plot di analisi\n",
+ " Directory to save analysis plots\n",
"\n",
" Returns:\n",
" --------\n",
" dict\n",
- " Dizionario contenente tutte le metriche calcolate\n",
+ " Dictionary containing all calculated metrics\n",
" \"\"\"\n",
" import os\n",
" from datetime import datetime\n",
+ " import numpy as np\n",
+ " import pandas as pd\n",
+ " import matplotlib.pyplot as plt\n",
+ " import seaborn as sns\n",
+ " from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, confusion_matrix\n",
"\n",
+ " # Data preparation\n",
" y_true = np.array(y_true).ravel()\n",
" y_pred = np.array(y_pred).ravel()\n",
+ " errors = y_pred - y_true\n",
"\n",
- " # Calcolo metriche sui valori raw\n",
+ " # Basic metrics calculation\n",
" mae_raw = mean_absolute_error(y_true, y_pred)\n",
" rmse_raw = np.sqrt(mean_squared_error(y_true, y_pred))\n",
" r2_raw = r2_score(y_true, y_pred)\n",
+ " mape = np.mean(np.abs((y_true - y_pred) / (y_true + 1e-7))) * 100\n",
"\n",
- " # Arrotonda le predizioni al più vicino intero\n",
- " y_pred_rounded = np.round(y_pred)\n",
- " y_pred_clipped = np.clip(y_pred_rounded, 0, 11)\n",
+ " # Error margin accuracy\n",
+ " within_5_percent = np.mean(np.abs((y_pred - y_true) / (y_true + 1e-7)) <= 0.05)\n",
+ " within_10_percent = np.mean(np.abs((y_pred - y_true) / (y_true + 1e-7)) <= 0.10)\n",
+ " within_20_percent = np.mean(np.abs((y_pred - y_true) / (y_true + 1e-7)) <= 0.20)\n",
"\n",
- " # Calcolo metriche sui valori arrotondati\n",
- " mae_rounded = mean_absolute_error(y_true, y_pred_clipped)\n",
- " rmse_rounded = np.sqrt(mean_squared_error(y_true, y_pred_clipped))\n",
- " r2_rounded = r2_score(y_true, y_pred_clipped)\n",
- "\n",
- " # Calcolo accuratezza per diversi margini di errore (sia raw che rounded)\n",
- " # Raw\n",
- " within_05_raw = np.mean(np.abs(y_pred - y_true) <= 0.5)\n",
- " within_1_raw = np.mean(np.abs(y_pred - y_true) <= 1.0)\n",
- " within_2_raw = np.mean(np.abs(y_pred - y_true) <= 2.0)\n",
- "\n",
- " # Rounded\n",
- " exact_accuracy = np.mean(y_pred_clipped == y_true)\n",
- " one_off_accuracy = np.mean(np.abs(y_pred_clipped - y_true) <= 1)\n",
- " two_off_accuracy = np.mean(np.abs(y_pred_clipped - y_true) <= 2)\n",
- "\n",
- " print(\"\\nUV Index Prediction Metrics:\")\n",
- " print(\"\\nRaw Predictions:\")\n",
- " print(f\"MAE: {mae_raw:.3f}\")\n",
- " print(f\"RMSE: {rmse_raw:.3f}\")\n",
- " print(f\"R² Score: {r2_raw:.3f}\")\n",
- " print(f\"Within ±0.5: {within_05_raw:.3f}\")\n",
- " print(f\"Within ±1.0: {within_1_raw:.3f}\")\n",
- " print(f\"Within ±2.0: {within_2_raw:.3f}\")\n",
- "\n",
- " print(\"\\nRounded Predictions:\")\n",
- " print(f\"MAE: {mae_rounded:.3f}\")\n",
- " print(f\"RMSE: {rmse_rounded:.3f}\")\n",
- " print(f\"R² Score: {r2_rounded:.3f}\")\n",
- " print(f\"Exact Match: {exact_accuracy:.3f}\")\n",
- " print(f\"±1 Accuracy: {one_off_accuracy:.3f}\")\n",
- " print(f\"±2 Accuracy: {two_off_accuracy:.3f}\")\n",
- "\n",
- " # Analisi dei livelli UV\n",
- " def get_solarradiation_level(value):\n",
- " if value <= 2:\n",
+ " # Radiation level classification\n",
+ " def get_radiation_level(value):\n",
+ " if value <= 200:\n",
+ " return 'Very Low'\n",
+ " elif value <= 400:\n",
" return 'Low'\n",
- " elif value <= 5:\n",
+ " elif value <= 600:\n",
" return 'Moderate'\n",
- " elif value <= 7:\n",
+ " elif value <= 800:\n",
" return 'High'\n",
- " elif value <= 10:\n",
+ " elif value <= 1000:\n",
" return 'Very High'\n",
" else:\n",
" return 'Extreme'\n",
"\n",
- " # Calcola livelli UV sia per raw che rounded\n",
- " y_true_levels = [get_solarradiation_level(v) for v in y_true]\n",
- " y_pred_levels_raw = [get_solarradiation_level(v) for v in y_pred]\n",
- " y_pred_levels_rounded = [get_solarradiation_level(v) for v in y_pred_clipped]\n",
+ " # Calculate radiation levels\n",
+ " y_true_levels = [get_radiation_level(v) for v in y_true]\n",
+ " y_pred_levels = [get_radiation_level(v) for v in y_pred]\n",
+ " level_accuracy = np.mean([t == p for t, p in zip(y_true_levels, y_pred_levels)])\n",
"\n",
- " # Calcola accuracy dei livelli\n",
- " level_accuracy_raw = np.mean([t == p for t, p in zip(y_true_levels, y_pred_levels_raw)])\n",
- " level_accuracy_rounded = np.mean([t == p for t, p in zip(y_true_levels, y_pred_levels_rounded)])\n",
+ " # Print main metrics\n",
+ " print(\"\\nSolar Radiation Prediction Metrics:\")\n",
+ " print(\"\\nAbsolute Metrics:\")\n",
+ " print(f\"MAE: {mae_raw:.2f} W/m²\")\n",
+ " print(f\"RMSE: {rmse_raw:.2f} W/m²\")\n",
+ " print(f\"R² Score: {r2_raw:.3f}\")\n",
+ " print(f\"MAPE: {mape:.2f}%\")\n",
"\n",
- " print(\"\\nUV Level Accuracy:\")\n",
- " print(f\"Raw predictions: {level_accuracy_raw:.3f}\")\n",
- " print(f\"Rounded predictions: {level_accuracy_rounded:.3f}\")\n",
+ " print(\"\\nPercentage Accuracy:\")\n",
+ " print(f\"Within ±5%: {within_5_percent*100:.1f}%\")\n",
+ " print(f\"Within ±10%: {within_10_percent*100:.1f}%\")\n",
+ " print(f\"Within ±20%: {within_20_percent*100:.1f}%\")\n",
"\n",
- " print(\"\\nUV Level Confusion Matrix (Raw Predictions):\")\n",
- " print(pd.crosstab(\n",
- " pd.Series(y_true_levels, name='Actual'),\n",
- " pd.Series(y_pred_levels_raw, name='Predicted')\n",
- " ))\n",
+ " print(\"\\nLevel Accuracy:\")\n",
+ " print(f\"Level Accuracy: {level_accuracy*100:.1f}%\")\n",
"\n",
- " print(\"\\nUV Level Confusion Matrix (Rounded Predictions):\")\n",
- " print(pd.crosstab(\n",
- " pd.Series(y_true_levels, name='Actual'),\n",
- " pd.Series(y_pred_levels_rounded, name='Predicted')\n",
- " ))\n",
+ " # Confusion matrix for radiation levels. Labels are passed explicitly so rows/columns\n",
+ " # follow the Very Low -> Extreme order and all six levels appear even if some are absent.\n",
+ " level_names = ['Very Low', 'Low', 'Moderate', 'High', 'Very High', 'Extreme']\n",
+ " cm = confusion_matrix(y_true_levels, y_pred_levels, labels=level_names)\n",
+ " print(\"\\nConfusion Matrix for Radiation Levels:\")\n",
+ " cm_df = pd.DataFrame(\n",
+ " cm, columns=level_names, index=level_names\n",
+ " )\n",
+ " print(cm_df)\n",
"\n",
- " # Se specificata una cartella, salva i plot di analisi\n",
+ " # Time period analysis\n",
+ " if hour is not None:\n",
+ " day_periods = {\n",
+ " 'Morning (5-11)': (5, 11),\n",
+ " 'Noon (11-13)': (11, 13),\n",
+ " 'Afternoon (13-17)': (13, 17),\n",
+ " 'Evening (17-21)': (17, 21),\n",
+ " 'Night (21-5)': (21, 5)\n",
+ " }\n",
+ "\n",
+ " print(\"\\nAnalysis by Time Period:\")\n",
+ " for period, (start, end) in day_periods.items():\n",
+ " if start < end:\n",
+ " mask = (hour >= start) & (hour < end)\n",
+ " else:\n",
+ " mask = (hour >= start) | (hour < end)\n",
+ "\n",
+ " if np.any(mask):\n",
+ " period_mae = mean_absolute_error(y_true[mask], y_pred[mask])\n",
+ " period_mape = np.mean(np.abs((y_true[mask] - y_pred[mask]) / (y_true[mask] + 1e-7))) * 100\n",
+ " print(f\"\\n{period}:\")\n",
+ " print(f\"MAE: {period_mae:.2f} W/m²\")\n",
+ " print(f\"MAPE: {period_mape:.2f}%\")\n",
+ "\n",
+ " # Visualizations\n",
" if folder_name is not None:\n",
" try:\n",
" os.makedirs(folder_name, exist_ok=True)\n",
" timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"\n",
- " # Plot di confronto tra raw e rounded predictions\n",
- " plt.figure(figsize=(15, 5))\n",
+ " # Figure 1: Main analysis plots\n",
+ " plt.figure(figsize=(20, 15))\n",
"\n",
- " # Plot 1: Scatter plot confronto\n",
- " plt.subplot(1, 3, 1)\n",
- " plt.scatter(y_true, y_pred, alpha=0.5, label='Raw')\n",
- " plt.scatter(y_true, y_pred_clipped, alpha=0.5, label='Rounded')\n",
- " plt.plot([0, 11], [0, 11], 'r--', lw=2)\n",
- " plt.xlabel('Actual UV Index')\n",
- " plt.ylabel('Predicted UV Index')\n",
- " plt.title('Raw vs Rounded Predictions')\n",
- " plt.legend()\n",
+ " # Plot 1: Scatter plot of actual vs predicted values\n",
+ " plt.subplot(3, 2, 1)\n",
+ " plt.scatter(y_true, y_pred, alpha=0.5)\n",
+ " plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)\n",
+ " plt.xlabel('Actual Radiation (W/m²)')\n",
+ " plt.ylabel('Predicted Radiation (W/m²)')\n",
+ " plt.title('Actual vs Predicted Values')\n",
" plt.grid(True)\n",
"\n",
- " # Plot 2: Distribuzione errori raw\n",
- " plt.subplot(1, 3, 2)\n",
- " plt.hist(y_pred - y_true, bins=50, alpha=0.7)\n",
- " plt.xlabel('Prediction Error (Raw)')\n",
+ " # Plot 2: Absolute error distribution\n",
+ " plt.subplot(3, 2, 2)\n",
+ " plt.hist(errors, bins=50, alpha=0.7)\n",
+ " plt.xlabel('Prediction Error (W/m²)')\n",
" plt.ylabel('Frequency')\n",
- " plt.title('Distribution of Raw Errors')\n",
+ " plt.title('Error Distribution')\n",
" plt.grid(True)\n",
"\n",
- " # Plot 3: Distribuzione errori rounded\n",
- " plt.subplot(1, 3, 3)\n",
- " plt.hist(y_pred_clipped - y_true, bins=50, alpha=0.7)\n",
- " plt.xlabel('Prediction Error (Rounded)')\n",
+ " # Plot 3: Percentage error distribution\n",
+ " plt.subplot(3, 2, 3)\n",
+ " percentage_errors = ((y_pred - y_true) / (y_true + 1e-7)) * 100\n",
+ " plt.hist(np.clip(percentage_errors, -100, 100), bins=50, alpha=0.7)\n",
+ " plt.xlabel('Percentage Error (%)')\n",
" plt.ylabel('Frequency')\n",
- " plt.title('Distribution of Rounded Errors')\n",
+ " plt.title('Percentage Error Distribution')\n",
" plt.grid(True)\n",
"\n",
+ " # Plot 4: Errors vs actual values\n",
+ " plt.subplot(3, 2, 4)\n",
+ " plt.scatter(y_true, errors, alpha=0.5)\n",
+ " plt.axhline(y=0, color='r', linestyle='--')\n",
+ " plt.xlabel('Actual Radiation (W/m²)')\n",
+ " plt.ylabel('Error (W/m²)')\n",
+ " plt.title('Errors vs Actual Values')\n",
+ " plt.grid(True)\n",
+ "\n",
+ " # Plot 5: Error boxplot by radiation level\n",
+ " plt.subplot(3, 2, 5)\n",
+ " sns.boxplot(x=[get_radiation_level(v) for v in y_true], y=errors)\n",
+ " plt.xticks(rotation=45)\n",
+ " plt.xlabel('Radiation Level')\n",
+ " plt.ylabel('Error (W/m²)')\n",
+ " plt.title('Error Distribution by Level')\n",
+ "\n",
+ " # Plot 6: Confusion matrix heatmap\n",
+ " plt.subplot(3, 2, 6)\n",
+ " sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')\n",
+ " plt.title('Confusion Matrix')\n",
+ " plt.xticks(rotation=45)\n",
+ " plt.yticks(rotation=45)\n",
+ "\n",
" plt.tight_layout()\n",
- "\n",
- " # Salva il plot\n",
- " filename = os.path.join(folder_name, f'solarradiation_prediction_analysis_{timestamp}.png')\n",
+ " filename = os.path.join(folder_name, f'radiation_analysis_{timestamp}.png')\n",
" plt.savefig(filename, dpi=300, bbox_inches='tight')\n",
- " print(f\"\\nPlot di analisi salvato come: {filename}\")\n",
- "\n",
- " plt.show()\n",
+ " print(f\"\\nPlot saved as: {filename}\")\n",
+ " plt.close()\n",
"\n",
" except Exception as e:\n",
- " print(f\"\\nErrore nel salvare i plot: {str(e)}\")\n",
+ " print(f\"\\nError saving plots: {str(e)}\")\n",
"\n",
- " # Restituisci tutte le metriche in un dizionario\n",
+ " # Additional error statistics\n",
+ " print(\"\\nError Statistics:\")\n",
+ " print(f\"Mean error: {np.mean(errors):.3f}\")\n",
+ " print(f\"Error standard deviation: {np.std(errors):.3f}\")\n",
+ " print(f\"Median error: {np.median(errors):.3f}\")\n",
+ " print(f\"95th percentile absolute error: {np.percentile(np.abs(errors), 95):.3f}\")\n",
+ "\n",
+ " # Return structured metrics\n",
" metrics = {\n",
- " 'raw': {\n",
+ " 'absolute': {\n",
" 'mae': mae_raw,\n",
" 'rmse': rmse_raw,\n",
" 'r2': r2_raw,\n",
- " 'within_05': within_05_raw,\n",
- " 'within_1': within_1_raw,\n",
- " 'within_2': within_2_raw,\n",
- " 'level_accuracy': level_accuracy_raw\n",
+ " 'mape': mape\n",
" },\n",
- " 'rounded': {\n",
- " 'mae': mae_rounded,\n",
- " 'rmse': rmse_rounded,\n",
- " 'r2': r2_rounded,\n",
- " 'exact_match': exact_accuracy,\n",
- " 'one_off': one_off_accuracy,\n",
- " 'two_off': two_off_accuracy,\n",
- " 'level_accuracy': level_accuracy_rounded\n",
+ " 'percentage_accuracy': {\n",
+ " 'within_5_percent': within_5_percent,\n",
+ " 'within_10_percent': within_10_percent,\n",
+ " 'within_20_percent': within_20_percent\n",
+ " },\n",
+ " 'categorical': {\n",
+ " 'level_accuracy': level_accuracy\n",
+ " },\n",
+ " 'error_stats': {\n",
+ " 'mean': float(np.mean(errors)),\n",
+ " 'std': float(np.std(errors)),\n",
+ " 'median': float(np.median(errors)),\n",
+ " 'p95_abs': float(np.percentile(np.abs(errors), 95))\n",
" }\n",
" }\n",
"\n",
" return metrics\n",
"\n",
+ "\n",
"def plot_training_history(history, folder_name=None):\n",
" \"\"\"\n",
- " Visualizza e salva i plot della loss e delle metriche durante il training\n",
+ " Visualize and save training loss and metrics plots\n",
"\n",
" Parameters:\n",
" -----------\n",
" history : tensorflow.keras.callbacks.History\n",
- " L'oggetto history restituito dal training del modello\n",
+ " History object returned by model training\n",
" folder_name : str\n",
- " Cartella dove salvare il plot\n",
+ " Directory to save the plots and metrics\n",
" \"\"\"\n",
" import os\n",
"\n",
" try:\n",
- " # Crea la figura\n",
+ " # Create figure\n",
" plt.figure(figsize=(12, 4))\n",
"\n",
- " # Plot della Loss\n",
+ " # Loss plot\n",
" plt.subplot(1, 2, 1)\n",
" plt.plot(history.history['loss'], label='Training Loss')\n",
" plt.plot(history.history['val_loss'], label='Validation Loss')\n",
@@ -609,7 +748,7 @@
" plt.legend()\n",
" plt.grid(True)\n",
"\n",
- " # Plot del MAE\n",
+ " # MAE plot\n",
" plt.subplot(1, 2, 2)\n",
" plt.plot(history.history['mae'], label='Training MAE')\n",
" plt.plot(history.history['val_mae'], label='Validation MAE')\n",
@@ -623,14 +762,14 @@
"\n",
" if folder_name is not None:\n",
" os.makedirs(folder_name, exist_ok=True)\n",
- " # Genera il nome del file con timestamp\n",
+ " # Generate filename with timestamp\n",
" filename = os.path.join(folder_name, 'training_history.png')\n",
"\n",
- " # Salva la figura\n",
+ " # Save figure\n",
" plt.savefig(filename, dpi=300, bbox_inches='tight')\n",
- " print(f\"\\nPlot della training history salvato come: {filename}\")\n",
+ " print(f\"\\nTraining history plot saved as: {filename}\")\n",
"\n",
- " # Salva anche i dati numerici in formato CSV\n",
+ " # Save numerical data in CSV format\n",
" history_df = pd.DataFrame({\n",
" 'epoch': range(1, len(history.history['loss']) + 1),\n",
" 'training_loss': history.history['loss'],\n",
@@ -642,9 +781,9 @@
" if folder_name is not None:\n",
" csv_filename = os.path.join(folder_name, 'training_history.csv')\n",
" history_df.to_csv(csv_filename, index=False)\n",
- " print(f\"Dati della training history salvati come: {csv_filename}\")\n",
+ " print(f\"Training history data saved as: {csv_filename}\")\n",
"\n",
- " # Calcola e salva le statistiche finali\n",
+ " # Calculate and save final statistics\n",
" final_stats = {\n",
" 'final_training_loss': history.history['loss'][-1],\n",
" 'final_validation_loss': history.history['val_loss'][-1],\n",
@@ -656,101 +795,116 @@
" }\n",
"\n",
" if folder_name is not None:\n",
- " # Salva le statistiche in formato JSON\n",
+ " # Save statistics in JSON format\n",
" stats_filename = os.path.join(folder_name, 'training_stats.json')\n",
" with open(stats_filename, 'w') as f:\n",
" json.dump(final_stats, f, indent=4)\n",
- " print(f\"Statistiche finali salvate come: {stats_filename}\")\n",
+ " print(f\"Final statistics saved as: {stats_filename}\")\n",
"\n",
- " # Stampa le statistiche principali\n",
- " print(\"\\nStatistiche finali del training:\")\n",
- " print(f\"Loss finale (train/val): {final_stats['final_training_loss']:.4f}/{final_stats['final_validation_loss']:.4f}\")\n",
- " print(f\"MAE finale (train/val): {final_stats['final_training_mae']:.4f}/{final_stats['final_validation_mae']:.4f}\")\n",
- " print(f\"Miglior validation loss: {final_stats['best_validation_loss']:.4f}\")\n",
- " print(f\"Miglior validation MAE: {final_stats['best_validation_mae']:.4f}\")\n",
+ " # Print main statistics\n",
+ " print(\"\\nFinal Training Statistics:\")\n",
+ " print(f\"Final Loss (train/val): {final_stats['final_training_loss']:.4f}/{final_stats['final_validation_loss']:.4f}\")\n",
+ " print(f\"Final MAE (train/val): {final_stats['final_training_mae']:.4f}/{final_stats['final_validation_mae']:.4f}\")\n",
+ " print(f\"Best validation loss: {final_stats['best_validation_loss']:.4f}\")\n",
+ " print(f\"Best validation MAE: {final_stats['best_validation_mae']:.4f}\")\n",
"\n",
" plt.show()\n",
"\n",
" except Exception as e:\n",
- " print(f\"\\nErrore durante la creazione o il salvataggio dei plot: {str(e)}\")\n",
+ " print(f\"\\nError during plot creation or saving: {str(e)}\")\n",
"\n",
"\n",
- "def train_hybrid_model(model, X_train, y_train, X_test, y_test, epochs=100, batch_size=32, folder_name='solarradiation_index'):\n",
+ "def train_hybrid_model(model, X_train, y_train, X_test, y_test, epochs=100, batch_size=32, folder_name='solarradiation'):\n",
" \"\"\"\n",
- " Funzione di training avanzata per il modello ibrido UV index con monitoraggio dettagliato\n",
- " e gestione del training.\n",
+ " Advanced training function for the hybrid solar radiation model with detailed monitoring\n",
+ " and training management.\n",
"\n",
" Parameters:\n",
" -----------\n",
" model : keras.Model\n",
- " Il modello ibrido compilato\n",
+ " Compiled hybrid model\n",
" X_train : numpy.ndarray\n",
- " Dati di training\n",
+ " Training data\n",
" y_train : numpy.ndarray\n",
- " Target di training\n",
+ " Training targets\n",
" X_test : numpy.ndarray\n",
- " Dati di validation\n",
+ " Validation data\n",
" y_test : numpy.ndarray\n",
- " Target di validation\n",
+ " Validation targets\n",
" epochs : int, optional\n",
- " Numero massimo di epoche di training\n",
+ " Maximum number of training epochs\n",
" batch_size : int, optional\n",
- " Dimensione del batch\n",
+ " Batch size\n",
+ " folder_name : str, optional\n",
+ " Directory for saving model artifacts\n",
"\n",
" Returns:\n",
" --------\n",
" history : keras.callbacks.History\n",
- " Storia del training con tutte le metriche\n",
+ " Training history with all metrics\n",
" \"\"\"\n",
"\n",
- " # Callbacks avanzati per il training\n",
+ " # Advanced training callbacks\n",
" callbacks = [\n",
- " # Early Stopping avanzato\n",
+ " # Early Stopping\n",
" EarlyStopping(\n",
- " monitor='mae',\n",
+ " monitor='val_loss',\n",
" patience=15,\n",
" restore_best_weights=True,\n",
" mode='min',\n",
" verbose=1,\n",
- " min_delta=1e-6\n",
+ " min_delta=1e-4\n",
" ),\n",
+ " # ReduceLROnPlateau for MAE\n",
" ReduceLROnPlateau(\n",
" monitor='mae',\n",
- " factor=0.05,\n",
- " patience=3,\n",
+ " factor=0.2,\n",
+ " patience=5,\n",
" verbose=1,\n",
" mode='min',\n",
- " min_delta=1e-6,\n",
- " cooldown=2,\n",
+ " min_delta=1e-4,\n",
+ " cooldown=3,\n",
" min_lr=1e-7\n",
" ),\n",
+ " # ReduceLROnPlateau for loss\n",
" ReduceLROnPlateau(\n",
" monitor='val_loss',\n",
" factor=0.2,\n",
- " patience=2,\n",
+ " patience=3,\n",
" verbose=1,\n",
" mode='min',\n",
- " min_delta=1e-6,\n",
- " cooldown=1,\n",
+ " min_delta=1e-4,\n",
+ " cooldown=2,\n",
" min_lr=1e-7\n",
" ),\n",
+ " # Model Checkpoint\n",
" tf.keras.callbacks.ModelCheckpoint(\n",
- " filepath=f'{folder_name}_best_solarradiation_model.h5',\n",
- " monitor='mae',\n",
+ " filepath=f'{folder_name}_best_model.h5',\n",
+ " monitor='val_loss',\n",
" save_best_only=True,\n",
- " mode='min'\n",
+ " mode='min',\n",
+ " save_weights_only=False\n",
" ),\n",
+ " # TensorBoard\n",
" tf.keras.callbacks.TensorBoard(\n",
" log_dir=f'./logs_{folder_name}',\n",
" histogram_freq=1,\n",
" write_graph=True,\n",
- " update_freq='epoch'\n",
+ " update_freq='epoch',\n",
+ " profile_batch=0\n",
" ),\n",
+ " # Lambda Callback for solar radiation monitoring\n",
" tf.keras.callbacks.LambdaCallback(\n",
- " on_epoch_end=lambda epoch, logs: print(\n",
- " f\"\\nEpoch {epoch + 1}: Predizioni fuori range: \"\n",
- " f\"{np.sum((model.predict(X_test) < 0) | (model.predict(X_test) > 11))}\"\n",
- " ) if epoch % 20 == 0 else None\n",
+ " on_epoch_end=lambda epoch, logs: (\n",
+ " lambda y_true, y_pred: print(\n",
+ " f\"\\nEpoch {epoch + 1}:\"\n",
+ " f\"\\nPredictions out of range (0-1500 W/m²): \"\n",
+ " f\"{np.sum((y_pred < 0) | (y_pred > 1500))}\"\n",
+ " f\"\\nMAPE: {np.mean(np.abs((y_true - y_pred) / (y_true + 1e-7))) * 100:.2f}%\"\n",
+ " f\"\\nPredictions within ±10%: \"\n",
+ " f\"{np.mean(np.abs((y_pred - y_true) / (y_true + 1e-7)) <= 0.10) * 100:.2f}%\"\n",
+ " )\n",
+ " )(np.ravel(y_test), np.ravel(model.predict(X_test))) if epoch % 20 == 0 else None\n",
" )\n",
" ]\n",
"\n",
@@ -766,142 +920,104 @@
" validation_freq=1,\n",
" )\n",
"\n",
- " # Analisi post-training\n",
- " print(\"\\nTraining completato con successo!\")\n",
+ " # Post-training analysis\n",
+ " print(\"\\nTraining completed successfully!\")\n",
"\n",
- " # Valutazione finale sul test set\n",
- " test_loss, test_mae, test_mse = model.evaluate(X_test, y_test, verbose=0)\n",
- " print(f\"\\nMetriche finali sul test set:\")\n",
+ " # Final evaluation on test set: loss first, then the compiled metrics (mae, rmse, mape)\n",
+ " test_loss, test_mae, test_rmse, test_mape = model.evaluate(X_test, y_test, verbose=0)\n",
+ " print(f\"\\nFinal metrics on test set:\")\n",
" print(f\"Loss: {test_loss:.4f}\")\n",
" print(f\"MAE: {test_mae:.4f}\")\n",
- " print(f\"MSE: {test_mse:.4f}\")\n",
+ " print(f\"RMSE: {test_rmse:.4f}, MAPE: {test_mape:.2f}%\")\n",
"\n",
- " # Analisi delle predizioni\n",
+ " # Prediction analysis\n",
" predictions = model.predict(X_test)\n",
- " out_of_range = np.sum((predictions < 0) | (predictions > 11))\n",
- " print(f\"\\nPredizioni fuori range: {out_of_range} ({out_of_range / len(predictions) * 100:.2f}%)\")\n",
+ " out_of_range = np.sum((predictions < 0) | (predictions > 1500))\n",
+ " print(f\"\\nOut of range predictions: {out_of_range} ({out_of_range / len(predictions) * 100:.2f}%)\")\n",
"\n",
" plot_training_history(history, folder_name=folder_name)\n",
"\n",
" return history\n",
"\n",
" except Exception as e:\n",
- " print(f\"\\nErrore durante il training: {str(e)}\")\n",
+ " print(f\"\\nError during training: {str(e)}\")\n",
" raise\n",
"\n",
" finally:\n",
- " # Pulizia della memoria\n",
+ " # Memory cleanup\n",
" tf.keras.backend.clear_session()\n",
"\n",
"\n",
- "def calculate_class_weights(y_train, n_classes=12):\n",
- " \"\"\"\n",
- " Calcola i pesi delle classi per bilanciare il dataset UV index.\n",
- " \n",
- " Parameters:\n",
- " -----------\n",
- " y_train : numpy.ndarray\n",
- " Array dei valori UV di training\n",
- " n_classes : int, optional\n",
- " Numero di classi possibili (0-11 per UV index, quindi 12 classi)\n",
- " \n",
- " Returns:\n",
- " --------\n",
- " dict:\n",
- " Dizionario con i pesi per ogni classe\n",
- " \"\"\"\n",
- " # Arrotonda i valori UV al più vicino intero e converti in intero\n",
- " y_discrete = np.clip(np.round(y_train), 0, 11).astype(int)\n",
- "\n",
- " # Calcola la frequenza di ogni classe\n",
- " unique, counts = np.unique(y_discrete, return_counts=True)\n",
- " total_samples = len(y_discrete)\n",
- "\n",
- " # Calcola i pesi inversamente proporzionali alla frequenza\n",
- " weights = {}\n",
- " for i in range(n_classes):\n",
- " if i in unique:\n",
- " # Se la classe è presente, calcola il peso\n",
- " weight = total_samples / (len(unique) * counts[unique == i][0])\n",
- " else:\n",
- " # Se la classe non è presente, assegna un peso neutro\n",
- " weight = 1.0\n",
- " weights[i] = weight\n",
- "\n",
- " return weights\n",
- "\n",
- "\n",
"def integrate_predictions(df, predictions, sequence_length=24):\n",
" \"\"\"\n",
- " Integra le predizioni dell'UV index nel dataset originale per i dati precedenti al 2010.\n",
- " \n",
+ " Integrates solar radiation predictions into the original dataset for pre-2010 data.\n",
+ "\n",
" Parameters:\n",
" -----------\n",
" df : pandas.DataFrame\n",
- " Dataset originale\n",
+ " Original dataset\n",
" predictions : numpy.ndarray\n",
- " Array delle predizioni UV index\n",
+ " Array of solar radiation predictions\n",
" sequence_length : int\n",
- " Lunghezza della sequenza usata per le predizioni\n",
- " \n",
+ " Sequence length used for predictions\n",
+ "\n",
" Returns:\n",
" --------\n",
" pandas.DataFrame\n",
- " Dataset aggiornato con le predizioni UV index\n",
+ " Updated dataset with solar radiation predictions\n",
" \"\"\"\n",
- " # Converti datetime in formato datetime se non lo è già\n",
+ " # Convert datetime to datetime format if not already\n",
" df['datetime'] = pd.to_datetime(df['datetime'])\n",
"\n",
- " # Identifica le righe precedenti al 2010\n",
+ " # Identify pre-2010 rows\n",
" mask_pre_2010 = df['datetime'].dt.year < 2010\n",
"\n",
- " # Crea un DataFrame temporaneo con le predizioni\n",
+ " # Create temporary DataFrame with predictions\n",
" dates_pre_2010 = df[mask_pre_2010]['datetime'].iloc[sequence_length - 1:]\n",
" predictions_df = pd.DataFrame({\n",
" 'datetime': dates_pre_2010,\n",
- " 'uvindex_predicted': predictions.flatten()\n",
+ " 'solarradiation_predicted': predictions.flatten()\n",
" })\n",
"\n",
- " # Merge con il dataset originale\n",
+ " # Merge with original dataset\n",
" df = df.merge(predictions_df, on='datetime', how='left')\n",
"\n",
- " # Aggiorna la colonna uvindex dove manca\n",
- " df['uvindex'] = df['uvindex'].fillna(df['uvindex_predicted'])\n",
+ " # Update solar radiation column where missing\n",
+ " df['solarradiation'] = df['solarradiation'].fillna(df['solarradiation_predicted'])\n",
"\n",
- " # Rimuovi la colonna temporanea\n",
- " df = df.drop('uvindex_predicted', axis=1)\n",
+ " # Remove temporary column\n",
+ " df = df.drop('solarradiation_predicted', axis=1)\n",
"\n",
- " print(f\"Aggiunte {len(predictions)} predizioni al dataset\")\n",
- " print(f\"Righe con UV index dopo l'integrazione: {df['uvindex'].notna().sum()}\")\n",
+ " print(f\"Added {len(predictions)} predictions to dataset\")\n",
+ " print(f\"Rows with solar radiation after integration: {df['solarradiation'].notna().sum()}\")\n",
"\n",
" return df\n",
"\n",
"\n",
- "def train_uvindex_bounded_model(df):\n",
+ "def train_solarradiation_bounded_model(df):\n",
" \"\"\"\n",
- " Training del modello con vincoli specifici per UV index\n",
+ " Training of the model with specific constraints for solar radiation\n",
" \"\"\"\n",
- " print(\"Inizializzazione del training del modello UV index...\")\n",
+ " print(\"Initializing solar radiation model training...\")\n",
"\n",
" try:\n",
- "\n",
- " # Preparazione dei dati\n",
- " print(\"\\n1. Preparazione dei dati...\")\n",
+ " # Data preparation\n",
+ " print(\"\\n1. Preparing data...\")\n",
" X_train_seq, X_test_seq, y_train, y_test, scaler, features, X_to_predict_seq = prepare_hybrid_data(df)\n",
"\n",
- " print(f\"Shape dei dati di training: {X_train_seq.shape}\")\n",
- " print(f\"Shape dei dati di test: {X_test_seq.shape}\")\n",
+ " print(f\"Training data shape: {X_train_seq.shape}\")\n",
+ " print(f\"Test data shape: {X_test_seq.shape}\")\n",
"\n",
- " # Verifica della qualità dei dati\n",
+ " # Data quality verification\n",
" if np.isnan(X_train_seq).any() or np.isnan(y_train).any():\n",
- " raise ValueError(\"Trovati valori NaN nei dati di training\")\n",
+ " raise ValueError(\"Found NaN values in training data\")\n",
"\n",
- " # Creazione del modello\n",
- " print(\"\\n2. Creazione del modello...\")\n",
+ " # Model creation\n",
+ " print(\"\\n2. Creating model...\")\n",
" input_shape = (X_train_seq.shape[1], X_train_seq.shape[2])\n",
- " model = create_solarradiation_index_model(input_shape, folder_name)\n",
+ " model = create_solarradiation_model(input_shape, folder_name)\n",
"\n",
- " print(\"\\n4. Avvio del training...\")\n",
+ " print(\"\\n4. Starting training...\")\n",
" history = train_hybrid_model(\n",
" model=model,\n",
" X_train=X_train_seq,\n",
@@ -913,14 +1029,14 @@
" folder_name=folder_name\n",
" )\n",
"\n",
- " print(\"\\n5. Generazione delle predizioni...\")\n",
+ " print(\"\\n5. Generating predictions...\")\n",
" predictions = model.predict(X_test_seq)\n",
- " predictions = np.clip(predictions, 0, 11)\n",
+ " predictions = np.clip(predictions, 0, 1500) # same 0-1500 range as the model's output clipping\n",
"\n",
- " print(\"\\n6. Valutazione del modello...\")\n",
+ " print(\"\\n6. Evaluating model...\")\n",
" metrics = evaluate_solarradiation_predictions(y_test, predictions, folder_name=folder_name)\n",
"\n",
- " # Creazione del dizionario dei risultati\n",
+ " # Create results dictionary\n",
" training_results = {\n",
" 'model_params': {\n",
" 'input_shape': input_shape,\n",
@@ -930,8 +1046,7 @@
" 'training_params': {\n",
" 'batch_size': 32,\n",
" 'total_epochs': len(history.history['loss']),\n",
- " 'best_epoch': np.argmin(history.history['val_loss']) + 1,\n",
- " #'class_weights': {str(k): float(v) for k, v in class_weights.items()}\n",
+ " 'best_epoch': np.argmin(history.history['val_loss']) + 1\n",
" },\n",
" 'performance_metrics': {\n",
" 'final_loss': float(history.history['val_loss'][-1]),\n",
@@ -941,576 +1056,79 @@
" }\n",
" }\n",
"\n",
- " print(\"\\n7. Predizione dei dati mancanti risultati...\")\n",
+ " print(\"\\n7. Predicting missing data...\")\n",
" to_predict_predictions = model.predict(X_to_predict_seq)\n",
" to_predict_predictions = np.clip(to_predict_predictions, 0, 11)\n",
"\n",
- " print(\"\\n8. Integrazione delle predizioni nel dataset originale...\")\n",
+ " print(\"\\n8. Integrating predictions into original dataset...\")\n",
" df_updated = integrate_predictions(df.copy(), to_predict_predictions)\n",
"\n",
- " df_updated.to_parquet('./data/weather_data_uvindex.parquet')\n",
+ " df_updated.to_parquet('../../sources/weather_data_solarradiation.parquet')\n",
"\n",
- " # Aggiungi statistiche sulle predizioni al training_results\n",
+ " # Add prediction statistics to training_results\n",
" training_results['prediction_stats'] = {\n",
" 'n_predictions_added': len(to_predict_predictions),\n",
- " 'mean_predicted_uv': float(to_predict_predictions.mean()),\n",
- " 'min_predicted_uv': float(to_predict_predictions.min()),\n",
- " 'max_predicted_uv': float(to_predict_predictions.max()),\n",
+ " 'mean_predicted_solarradiation': float(to_predict_predictions.mean()),\n",
+ " 'min_predicted_solarradiation': float(to_predict_predictions.min()),\n",
+ " 'max_predicted_solarradiation': float(to_predict_predictions.max()),\n",
" }\n",
"\n",
- " print(\"\\nTraining completato con successo!\")\n",
+ " print(\"\\nTraining completed successfully!\")\n",
"\n",
" return model, scaler, features, history, predictions, y_test, metrics, training_results\n",
"\n",
" except Exception as e:\n",
- " print(f\"\\nErrore durante il training: {str(e)}\")\n",
+ " print(f\"\\nError during training: {str(e)}\")\n",
" raise\n",
"\n",
" finally:\n",
- " # Pulizia della memoria\n",
+ " # Memory cleanup\n",
" tf.keras.backend.clear_session()"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "code",
- "execution_count": 9,
"id": "initial_id",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Inizializzazione del training del modello UV index...\n",
- "\n",
- "1. Preparazione dei dati...\n",
- "\n",
- "Numero di record dopo 2010: 129777\n",
- "Numero di record prima 2010: 227902\n",
- "Shape dei dati di training: (64865, 24, 33)\n",
- "Shape dei dati di test: (64866, 24, 33)\n",
- "\n",
- "2. Creazione del modello...\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2024-11-10 22:48:30.467084: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22456 MB memory: -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:e1:00.0, compute capability: 8.6\n",
- "2024-11-10 22:48:30.995563: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Model: \"UvModel\"\n",
- "__________________________________________________________________________________________________\n",
- " Layer (type) Output Shape Param # Connected to \n",
- "==================================================================================================\n",
- " input_1 (InputLayer) [(None, 24, 33)] 0 [] \n",
- " \n",
- " bidirectional (Bidirection (None, 24, 256) 165888 ['input_1[0][0]'] \n",
- " al) \n",
- " \n",
- " layer_normalization (Layer (None, 24, 256) 512 ['bidirectional[0][0]'] \n",
- " Normalization) \n",
- " \n",
- " dropout (Dropout) (None, 24, 256) 0 ['layer_normalization[0][0]'] \n",
- " \n",
- " dense (Dense) (None, 24, 256) 8704 ['input_1[0][0]'] \n",
- " \n",
- " add (Add) (None, 24, 256) 0 ['dropout[0][0]', \n",
- " 'dense[0][0]'] \n",
- " \n",
- " bidirectional_1 (Bidirecti (None, 24, 128) 164352 ['add[0][0]'] \n",
- " onal) \n",
- " \n",
- " layer_normalization_1 (Lay (None, 24, 128) 256 ['bidirectional_1[0][0]'] \n",
- " erNormalization) \n",
- " \n",
- " dropout_1 (Dropout) (None, 24, 128) 0 ['layer_normalization_1[0][0]'\n",
- " ] \n",
- " \n",
- " dense_1 (Dense) (None, 24, 128) 32896 ['add[0][0]'] \n",
- " \n",
- " add_1 (Add) (None, 24, 128) 0 ['dropout_1[0][0]', \n",
- " 'dense_1[0][0]'] \n",
- " \n",
- " bidirectional_2 (Bidirecti (None, 24, 64) 41216 ['add_1[0][0]'] \n",
- " onal) \n",
- " \n",
- " layer_normalization_2 (Lay (None, 24, 64) 128 ['bidirectional_2[0][0]'] \n",
- " erNormalization) \n",
- " \n",
- " dropout_2 (Dropout) (None, 24, 64) 0 ['layer_normalization_2[0][0]'\n",
- " ] \n",
- " \n",
- " dense_2 (Dense) (None, 24, 64) 8256 ['add_1[0][0]'] \n",
- " \n",
- " add_2 (Add) (None, 24, 64) 0 ['dropout_2[0][0]', \n",
- " 'dense_2[0][0]'] \n",
- " \n",
- " multi_head_attention (Mult (None, 24, 64) 132672 ['add_2[0][0]', \n",
- " iHeadAttention) 'add_2[0][0]'] \n",
- " \n",
- " add_3 (Add) (None, 24, 64) 0 ['add_2[0][0]', \n",
- " 'multi_head_attention[0][0]']\n",
- " \n",
- " layer_normalization_3 (Lay (None, 24, 64) 128 ['add_3[0][0]'] \n",
- " erNormalization) \n",
- " \n",
- " max_pooling1d (MaxPooling1 (None, 12, 64) 0 ['layer_normalization_3[0][0]'\n",
- " D) ] \n",
- " \n",
- " bidirectional_3 (Bidirecti (None, 64) 24832 ['max_pooling1d[0][0]'] \n",
- " onal) \n",
- " \n",
- " layer_normalization_4 (Lay (None, 64) 128 ['bidirectional_3[0][0]'] \n",
- " erNormalization) \n",
- " \n",
- " dropout_3 (Dropout) (None, 64) 0 ['layer_normalization_4[0][0]'\n",
- " ] \n",
- " \n",
- " dense_3 (Dense) (None, 32) 2080 ['dropout_3[0][0]'] \n",
- " \n",
- " batch_normalization (Batch (None, 32) 128 ['dense_3[0][0]'] \n",
- " Normalization) \n",
- " \n",
- " activation (Activation) (None, 32) 0 ['batch_normalization[0][0]'] \n",
- " \n",
- " dropout_4 (Dropout) (None, 32) 0 ['activation[0][0]'] \n",
- " \n",
- " dense_4 (Dense) (None, 16) 528 ['dropout_4[0][0]'] \n",
- " \n",
- " batch_normalization_1 (Bat (None, 16) 64 ['dense_4[0][0]'] \n",
- " chNormalization) \n",
- " \n",
- " activation_1 (Activation) (None, 16) 0 ['batch_normalization_1[0][0]'\n",
- " ] \n",
- " \n",
- " dropout_5 (Dropout) (None, 16) 0 ['activation_1[0][0]'] \n",
- " \n",
- " dense_5 (Dense) (None, 1) 17 ['dropout_5[0][0]'] \n",
- " \n",
- " lambda (Lambda) (None, 1) 0 ['dense_5[0][0]'] \n",
- " \n",
- "==================================================================================================\n",
- "Total params: 582785 (2.22 MB)\n",
- "Trainable params: 582689 (2.22 MB)\n",
- "Non-trainable params: 96 (384.00 Byte)\n",
- "__________________________________________________________________________________________________\n",
- "\n",
- "4. Avvio del training...\n",
- "Epoch 1/100\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2024-11-10 22:48:44.916189: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8905\n",
- "2024-11-10 22:48:47.364274: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fb501567b00 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:\n",
- "2024-11-10 22:48:47.364334: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): NVIDIA GeForce RTX 3090, Compute Capability 8.6\n",
- "2024-11-10 22:48:47.375943: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
- "2024-11-10 22:48:47.576847: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "507/507 [==============================] - ETA: 0s - loss: 3.0211 - mae: 0.8732 - mse: 2.6901"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py:3079: UserWarning: You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.\n",
- " saving_api.save_model(\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2028/2028 [==============================] - 25s 11ms/step\n",
- "2028/2028 [==============================] - 22s 11ms/step\n",
- "\n",
- "Epoch 1: Predizioni fuori range: 0\n",
- "507/507 [==============================] - 95s 149ms/step - loss: 3.0211 - mae: 0.8732 - mse: 2.6901 - val_loss: 1.1114 - val_mae: 0.5132 - val_mse: 0.8839 - lr: 5.0000e-04\n",
- "Epoch 2/100\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.8217 - mae: 0.5744 - mse: 1.1119 - val_loss: 0.5794 - val_mae: 0.5285 - val_mse: 0.8990 - lr: 5.0000e-04\n",
- "Epoch 3/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.5241 - mae: 0.5406 - mse: 0.9968 - val_loss: 0.4411 - val_mae: 0.4995 - val_mse: 0.8515 - lr: 5.0000e-04\n",
- "Epoch 4/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.4490 - mae: 0.5363 - mse: 0.9738 - val_loss: 0.3752 - val_mae: 0.4606 - val_mse: 0.7784 - lr: 5.0000e-04\n",
- "Epoch 5/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.4127 - mae: 0.5218 - mse: 0.9290 - val_loss: 0.3470 - val_mae: 0.4698 - val_mse: 0.7271 - lr: 5.0000e-04\n",
- "Epoch 6/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3900 - mae: 0.5160 - mse: 0.9202 - val_loss: 0.3096 - val_mae: 0.4316 - val_mse: 0.6812 - lr: 5.0000e-04\n",
- "Epoch 7/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3934 - mae: 0.5173 - mse: 0.9176 - val_loss: 0.3931 - val_mae: 0.5097 - val_mse: 0.9083 - lr: 5.0000e-04\n",
- "Epoch 8/100\n",
- "507/507 [==============================] - ETA: 0s - loss: 0.3765 - mae: 0.5054 - mse: 0.8823\n",
- "Epoch 8: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.\n",
- "507/507 [==============================] - 22s 44ms/step - loss: 0.3765 - mae: 0.5054 - mse: 0.8823 - val_loss: 0.3501 - val_mae: 0.4814 - val_mse: 0.7773 - lr: 5.0000e-04\n",
- "Epoch 9/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3486 - mae: 0.4816 - mse: 0.8109 - val_loss: 0.2983 - val_mae: 0.4224 - val_mse: 0.6545 - lr: 1.0000e-04\n",
- "Epoch 10/100\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.3425 - mae: 0.4789 - mse: 0.8051 - val_loss: 0.2930 - val_mae: 0.4210 - val_mse: 0.6593 - lr: 1.0000e-04\n",
- "Epoch 11/100\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.3360 - mae: 0.4739 - mse: 0.7934 - val_loss: 0.2883 - val_mae: 0.4160 - val_mse: 0.6435 - lr: 1.0000e-04\n",
- "Epoch 12/100\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.3341 - mae: 0.4755 - mse: 0.7938 - val_loss: 0.2981 - val_mae: 0.4305 - val_mse: 0.6719 - lr: 1.0000e-04\n",
- "Epoch 13/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3315 - mae: 0.4741 - mse: 0.7918 - val_loss: 0.2859 - val_mae: 0.4170 - val_mse: 0.6472 - lr: 1.0000e-04\n",
- "Epoch 14/100\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.3280 - mae: 0.4706 - mse: 0.7851 - val_loss: 0.2841 - val_mae: 0.4196 - val_mse: 0.6415 - lr: 1.0000e-04\n",
- "Epoch 15/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3253 - mae: 0.4691 - mse: 0.7793 - val_loss: 0.2800 - val_mae: 0.4129 - val_mse: 0.6375 - lr: 1.0000e-04\n",
- "Epoch 16/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3238 - mae: 0.4697 - mse: 0.7778 - val_loss: 0.2866 - val_mae: 0.4247 - val_mse: 0.6558 - lr: 1.0000e-04\n",
- "Epoch 17/100\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.3220 - mae: 0.4683 - mse: 0.7760 - val_loss: 0.2784 - val_mae: 0.4179 - val_mse: 0.6353 - lr: 1.0000e-04\n",
- "Epoch 18/100\n",
- "507/507 [==============================] - 23s 44ms/step - loss: 0.3181 - mae: 0.4659 - mse: 0.7723 - val_loss: 0.2793 - val_mae: 0.4205 - val_mse: 0.6472 - lr: 1.0000e-04\n",
- "Epoch 19/100\n",
- "506/507 [============================>.] - ETA: 0s - loss: 0.3162 - mae: 0.4650 - mse: 0.7704\n",
- "Epoch 19: ReduceLROnPlateau reducing learning rate to 2.0000000949949027e-05.\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3161 - mae: 0.4648 - mse: 0.7699 - val_loss: 0.2797 - val_mae: 0.4190 - val_mse: 0.6547 - lr: 1.0000e-04\n",
- "Epoch 20/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3123 - mae: 0.4613 - mse: 0.7628 - val_loss: 0.2668 - val_mae: 0.4065 - val_mse: 0.6173 - lr: 2.0000e-05\n",
- "Epoch 21/100\n",
- "2028/2028 [==============================] - 23s 11ms/step\n",
- "2028/2028 [==============================] - 22s 11ms/step\n",
- "\n",
- "Epoch 21: Predizioni fuori range: 0\n",
- "507/507 [==============================] - 70s 139ms/step - loss: 0.3098 - mae: 0.4588 - mse: 0.7522 - val_loss: 0.2658 - val_mae: 0.4054 - val_mse: 0.6173 - lr: 2.0000e-05\n",
- "Epoch 22/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3093 - mae: 0.4590 - mse: 0.7529 - val_loss: 0.2652 - val_mae: 0.4046 - val_mse: 0.6218 - lr: 2.0000e-05\n",
- "Epoch 23/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3086 - mae: 0.4580 - mse: 0.7535 - val_loss: 0.2653 - val_mae: 0.4047 - val_mse: 0.6177 - lr: 2.0000e-05\n",
- "Epoch 24/100\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.3077 - mae: 0.4577 - mse: 0.7500 - val_loss: 0.2638 - val_mae: 0.4048 - val_mse: 0.6161 - lr: 2.0000e-05\n",
- "Epoch 25/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3056 - mae: 0.4556 - mse: 0.7444 - val_loss: 0.2637 - val_mae: 0.4049 - val_mse: 0.6151 - lr: 2.0000e-05\n",
- "Epoch 26/100\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.3043 - mae: 0.4543 - mse: 0.7443 - val_loss: 0.2630 - val_mae: 0.4033 - val_mse: 0.6194 - lr: 2.0000e-05\n",
- "Epoch 27/100\n",
- "507/507 [==============================] - 22s 44ms/step - loss: 0.3055 - mae: 0.4561 - mse: 0.7485 - val_loss: 0.2623 - val_mae: 0.4022 - val_mse: 0.6164 - lr: 2.0000e-05\n",
- "Epoch 28/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3042 - mae: 0.4555 - mse: 0.7458 - val_loss: 0.2616 - val_mae: 0.4005 - val_mse: 0.6202 - lr: 2.0000e-05\n",
- "Epoch 29/100\n",
- "507/507 [==============================] - ETA: 0s - loss: 0.3047 - mae: 0.4561 - mse: 0.7492\n",
- "Epoch 29: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.3047 - mae: 0.4561 - mse: 0.7492 - val_loss: 0.2613 - val_mae: 0.4038 - val_mse: 0.6127 - lr: 1.0000e-06\n",
- "Epoch 30/100\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.3041 - mae: 0.4560 - mse: 0.7475 - val_loss: 0.2610 - val_mae: 0.4028 - val_mse: 0.6124 - lr: 1.0000e-06\n",
- "Epoch 31/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3016 - mae: 0.4529 - mse: 0.7384 - val_loss: 0.2611 - val_mae: 0.4027 - val_mse: 0.6122 - lr: 1.0000e-06\n",
- "Epoch 32/100\n",
- "506/507 [============================>.] - ETA: 0s - loss: 0.3021 - mae: 0.4538 - mse: 0.7404\n",
- "Epoch 32: ReduceLROnPlateau reducing learning rate to 2.000000222324161e-07.\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3020 - mae: 0.4537 - mse: 0.7400 - val_loss: 0.2611 - val_mae: 0.4027 - val_mse: 0.6123 - lr: 1.0000e-06\n",
- "Epoch 33/100\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.3004 - mae: 0.4514 - mse: 0.7353 - val_loss: 0.2610 - val_mae: 0.4024 - val_mse: 0.6128 - lr: 2.0000e-07\n",
- "Epoch 34/100\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.3014 - mae: 0.4520 - mse: 0.7420 - val_loss: 0.2609 - val_mae: 0.4023 - val_mse: 0.6129 - lr: 2.0000e-07\n",
- "Epoch 35/100\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.3020 - mae: 0.4532 - mse: 0.7402 - val_loss: 0.2609 - val_mae: 0.4022 - val_mse: 0.6132 - lr: 2.0000e-07\n",
- "Epoch 36/100\n",
- "507/507 [==============================] - ETA: 0s - loss: 0.3030 - mae: 0.4545 - mse: 0.7438\n",
- "Epoch 36: ReduceLROnPlateau reducing learning rate to 1e-07.\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3030 - mae: 0.4545 - mse: 0.7438 - val_loss: 0.2609 - val_mae: 0.4023 - val_mse: 0.6127 - lr: 1.0000e-07\n",
- "Epoch 37/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3020 - mae: 0.4538 - mse: 0.7388 - val_loss: 0.2609 - val_mae: 0.4022 - val_mse: 0.6129 - lr: 1.0000e-07\n",
- "Epoch 38/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3036 - mae: 0.4552 - mse: 0.7454 - val_loss: 0.2609 - val_mae: 0.4022 - val_mse: 0.6127 - lr: 1.0000e-07\n",
- "Epoch 39/100\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.3005 - mae: 0.4514 - mse: 0.7366 - val_loss: 0.2608 - val_mae: 0.4021 - val_mse: 0.6128 - lr: 1.0000e-07\n",
- "Epoch 40/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3021 - mae: 0.4536 - mse: 0.7401 - val_loss: 0.2609 - val_mae: 0.4021 - val_mse: 0.6128 - lr: 1.0000e-07\n",
- "Epoch 41/100\n",
- "2028/2028 [==============================] - 22s 11ms/step\n",
- "2028/2028 [==============================] - 23s 11ms/step\n",
- "\n",
- "Epoch 41: Predizioni fuori range: 0\n",
- "507/507 [==============================] - 69s 137ms/step - loss: 0.3031 - mae: 0.4542 - mse: 0.7422 - val_loss: 0.2608 - val_mae: 0.4021 - val_mse: 0.6129 - lr: 1.0000e-07\n",
- "Epoch 42/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3007 - mae: 0.4519 - mse: 0.7347 - val_loss: 0.2609 - val_mae: 0.4022 - val_mse: 0.6128 - lr: 1.0000e-07\n",
- "Epoch 43/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3020 - mae: 0.4528 - mse: 0.7403 - val_loss: 0.2608 - val_mae: 0.4021 - val_mse: 0.6129 - lr: 1.0000e-07\n",
- "Epoch 44/100\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.3017 - mae: 0.4527 - mse: 0.7425 - val_loss: 0.2609 - val_mae: 0.4020 - val_mse: 0.6128 - lr: 1.0000e-07\n",
- "Epoch 45/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3022 - mae: 0.4539 - mse: 0.7417 - val_loss: 0.2608 - val_mae: 0.4019 - val_mse: 0.6131 - lr: 1.0000e-07\n",
- "Epoch 46/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3028 - mae: 0.4541 - mse: 0.7424 - val_loss: 0.2608 - val_mae: 0.4019 - val_mse: 0.6131 - lr: 1.0000e-07\n",
- "Epoch 47/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3028 - mae: 0.4541 - mse: 0.7436 - val_loss: 0.2608 - val_mae: 0.4020 - val_mse: 0.6129 - lr: 1.0000e-07\n",
- "Epoch 48/100\n",
- "507/507 [==============================] - 22s 44ms/step - loss: 0.3014 - mae: 0.4533 - mse: 0.7365 - val_loss: 0.2608 - val_mae: 0.4019 - val_mse: 0.6128 - lr: 1.0000e-07\n",
- "Epoch 49/100\n",
- "507/507 [==============================] - 23s 46ms/step - loss: 0.3010 - mae: 0.4516 - mse: 0.7383 - val_loss: 0.2608 - val_mae: 0.4021 - val_mse: 0.6125 - lr: 1.0000e-07\n",
- "Epoch 50/100\n",
- "507/507 [==============================] - 22s 44ms/step - loss: 0.3035 - mae: 0.4550 - mse: 0.7466 - val_loss: 0.2607 - val_mae: 0.4019 - val_mse: 0.6128 - lr: 1.0000e-07\n",
- "Epoch 51/100\n",
- "507/507 [==============================] - 22s 44ms/step - loss: 0.3031 - mae: 0.4544 - mse: 0.7443 - val_loss: 0.2608 - val_mae: 0.4020 - val_mse: 0.6127 - lr: 1.0000e-07\n",
- "Epoch 52/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3030 - mae: 0.4540 - mse: 0.7462 - val_loss: 0.2609 - val_mae: 0.4021 - val_mse: 0.6124 - lr: 1.0000e-07\n",
- "Epoch 53/100\n",
- "507/507 [==============================] - 23s 45ms/step - loss: 0.3019 - mae: 0.4529 - mse: 0.7407 - val_loss: 0.2608 - val_mae: 0.4020 - val_mse: 0.6127 - lr: 1.0000e-07\n",
- "Epoch 54/100\n",
- "506/507 [============================>.] - ETA: 0s - loss: 0.3029 - mae: 0.4538 - mse: 0.7433Restoring model weights from the end of the best epoch: 39.\n",
- "507/507 [==============================] - 22s 44ms/step - loss: 0.3028 - mae: 0.4537 - mse: 0.7430 - val_loss: 0.2608 - val_mae: 0.4020 - val_mse: 0.6126 - lr: 1.0000e-07\n",
- "Epoch 54: early stopping\n",
- "\n",
- "Training completato con successo!\n",
- "\n",
- "Metriche finali sul test set:\n",
- "Loss: 0.2608\n",
- "MAE: 0.4021\n",
- "MSE: 0.6128\n",
- "2028/2028 [==============================] - 22s 11ms/step\n",
- "\n",
- "Predizioni fuori range: 0 (0.00%)\n"
- ]
- },
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "5. Generazione delle predizioni...\n",
- "2028/2028 [==============================] - 23s 11ms/step\n",
- "\n",
- "6. Valutazione del modello...\n",
- "\n",
- "UV Index Prediction Metrics:\n",
- "MAE: 0.384\n",
- "RMSE: 0.809\n",
- "R² Score: 0.911\n",
- "Exact Match Accuracy: 0.331\n",
- "±1 Accuracy: 0.458\n",
- "±2 Accuracy: 0.566\n",
- "\n",
- "UV Level Confusion Matrix:\n",
- "Predicted High Low Moderate Very High\n",
- "Actual \n",
- "High 3134 11 1252 481\n",
- "Low 129 43324 1888 19\n",
- "Moderate 1208 1663 7320 152\n",
- "Very High 1188 1 78 3018\n",
- "\n",
- "7. Visualizzazione risultati...\n"
- ]
- },
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "8. Predizione dei dati mancanti risultati...\n",
- "7122/7122 [==============================] - 78s 11ms/step\n",
- "\n",
- "9. Integrazione delle predizioni nel dataset originale...\n",
- "Aggiunte 227879 predizioni al dataset\n",
- "Righe con UV index dopo l'integrazione: 357615\n",
- "\n",
- "Training completato con successo!\n"
- ]
- }
- ],
"source": [
- "df = pd.read_parquet('../data/weather_data.parquet')\n",
+ "df = pd.read_parquet('../../sources/weather_data_uvindex.parquet')\n",
"\n",
- "# Esegui il training\n",
- "model, scaler, features, history, predictions, y_test, metrics, training_results = train_uvindex_bounded_model(df)"
- ]
+ "model, scaler, features, history, predictions, y_test, metrics, training_results = train_solarradiation_bounded_model(df)"
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "code",
- "execution_count": 10,
- "id": "637891db-8d55-4232-a56e-9759dbcc8c2f",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analisi Precisione Predizioni UV Index:\n",
- "Precisione esatta: 61.3%\n",
- "Precisione entro 0.5 punti: 79.4%\n",
- "Precisione entro 1.0 punti: 90.4%\n",
- "Precisione livello di rischio: 88.0%\n",
- "\n",
- "Analisi errori per livello UV:\n",
- "MAE per UV Basso (0-2): 0.138 (n=41346)\n",
- "MAE per UV Moderato (2-5): 0.906 (n=11522)\n",
- "MAE per UV Alto (5-7): 0.877 (n=5468)\n",
- "MAE per UV Molto Alto (7-10): 0.758 (n=6278)\n",
- "MAE per UV Estremo (10-11): 1.528 (n=252)\n",
- "\n",
- "Statistiche degli errori:\n",
- "Media errori: 0.006\n",
- "Deviazione standard errori: 0.783\n",
- "Errore mediano: 0.000\n",
- "95° percentile errore assoluto: 1.723\n"
- ]
- }
- ],
- "source": [
- "def analyze_solarradiation_prediction_quality(y_true, y_pred):\n",
- " \"\"\"\n",
- " Analisi dettagliata della qualità delle predizioni UV\n",
- " \"\"\"\n",
- " # Converti in numpy array e appiattisci\n",
- " y_true = np.array(y_true).ravel()\n",
- " y_pred = np.array(y_pred).ravel()\n",
- "\n",
- " # Arrotonda le predizioni al più vicino 0.5\n",
- " y_pred_rounded = np.round(y_pred * 2) / 2\n",
- "\n",
- " # Calcola diverse metriche di accuratezza usando array numpy\n",
- " exact_match = np.mean(np.abs(y_pred_rounded - y_true) < 1e-6) * 100 # uso di tolleranza per confronti float\n",
- " within_half = np.mean(np.abs(y_pred_rounded - y_true) <= 0.5) * 100\n",
- " within_one = np.mean(np.abs(y_pred_rounded - y_true) <= 1.0) * 100\n",
- "\n",
- " # Analisi per livello di rischio UV\n",
- " def get_solarradiation_risk_level(values):\n",
- " # Vettorizzazione della funzione per array numpy\n",
- " levels = np.zeros_like(values, dtype=str)\n",
- " levels[values <= 2] = 'Basso'\n",
- " levels[(values > 2) & (values <= 5)] = 'Moderato'\n",
- " levels[(values > 5) & (values <= 7)] = 'Alto'\n",
- " levels[(values > 7) & (values <= 10)] = 'Molto Alto'\n",
- " levels[values > 10] = 'Estremo'\n",
- " return levels\n",
- "\n",
- " y_true_risk = get_solarradiation_risk_level(y_true)\n",
- " y_pred_risk = get_solarradiation_risk_level(y_pred_rounded)\n",
- "\n",
- " risk_accuracy = np.mean(y_true_risk == y_pred_risk) * 100\n",
- "\n",
- " print(\"Analisi Precisione Predizioni UV Index:\")\n",
- " print(f\"Precisione esatta: {exact_match:.1f}%\")\n",
- " print(f\"Precisione entro 0.5 punti: {within_half:.1f}%\")\n",
- " print(f\"Precisione entro 1.0 punti: {within_one:.1f}%\")\n",
- " print(f\"Precisione livello di rischio: {risk_accuracy:.1f}%\")\n",
- "\n",
- " # Distribuzione degli errori per livello UV\n",
- " solarradiation_ranges = [(0, 2), (2, 5), (5, 7), (7, 10), (10, 11)]\n",
- " labels = ['Basso', 'Moderato', 'Alto', 'Molto Alto', 'Estremo']\n",
- "\n",
- " print(\"\\nAnalisi errori per livello UV:\")\n",
- " for (low, high), label in zip(solarradiation_ranges, labels):\n",
- " mask = (y_true >= low) & (y_true < high)\n",
- " if np.sum(mask) > 0:\n",
- " mae_range = np.mean(np.abs(y_pred[mask] - y_true[mask]))\n",
- " n_samples = np.sum(mask)\n",
- " print(f\"MAE per UV {label} ({low}-{high}): {mae_range:.3f} (n={n_samples})\")\n",
- "\n",
- " # Analisi aggiuntiva della distribuzione degli errori\n",
- " errors = y_pred - y_true\n",
- " print(\"\\nStatistiche degli errori:\")\n",
- " print(f\"Media errori: {np.mean(errors):.3f}\")\n",
- " print(f\"Deviazione standard errori: {np.std(errors):.3f}\")\n",
- " print(f\"Errore mediano: {np.median(errors):.3f}\")\n",
- " print(f\"95° percentile errore assoluto: {np.percentile(np.abs(errors), 95):.3f}\")\n",
- "\n",
- " return {\n",
- " 'exact_match': exact_match,\n",
- " 'within_half': within_half,\n",
- " 'within_one': within_one,\n",
- " 'risk_accuracy': risk_accuracy,\n",
- " 'error_stats': {\n",
- " 'mean': float(np.mean(errors)),\n",
- " 'std': float(np.std(errors)),\n",
- " 'median': float(np.median(errors)),\n",
- " 'p95_abs': float(np.percentile(np.abs(errors), 95))\n",
- " }\n",
- " }\n",
- "\n",
- "\n",
- "# Per utilizzare l'analisi:\n",
- "metrics = analyze_solarradiation_prediction_quality(y_test, predictions)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
"id": "08fd4208-0afb-4bf1-bdef-b10b4065fe55",
"metadata": {},
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Statistiche degli errori:\n",
- "MAE: 0.4021\n",
- "MSE: 0.6128\n",
- "RMSE: 0.7828\n",
- "Media errori: 0.0062\n",
- "Std errori: 0.7828\n",
- "Predizioni entro ±0.5: 71.1%\n",
- "Predizioni entro ±1.0: 86.1%\n",
- "Predizioni entro ±1.5: 93.2%\n",
- "Predizioni entro ±2.0: 96.5%\n"
- ]
- }
- ],
"source": [
"def plot_error_analysis(y_true, y_pred, folder_name=None):\n",
" \"\"\"\n",
- " Funzione per visualizzare l'analisi degli errori di predizione\n",
+ " Function to visualize prediction error analysis\n",
"\n",
" Parameters:\n",
" -----------\n",
" y_true : array-like\n",
- " Valori reali\n",
+ " Actual values\n",
" y_pred : array-like\n",
- " Valori predetti\n",
+ " Predicted values\n",
" folder_name : str, optional\n",
- " Cartella dove salvare i plot. Se None, i plot non vengono salvati.\n",
+ " Directory to save plots. If None, plots are only displayed\n",
+ "\n",
+ " Generates:\n",
+ " ----------\n",
+ " - Error distribution histogram\n",
+ " - Actual vs Predicted scatter plot\n",
+ " - Errors vs Actual Values scatter plot\n",
+ " - Comprehensive error statistics\n",
" \"\"\"\n",
" import os\n",
" from datetime import datetime\n",
"\n",
- " # Converti in array numpy 1D se necessario\n",
+ " # Convert to 1D numpy arrays if needed\n",
" if isinstance(y_true, pd.Series):\n",
" y_true = y_true.values\n",
" if isinstance(y_pred, pd.Series):\n",
@@ -1519,330 +1137,75 @@
" y_true = y_true.ravel()\n",
" y_pred = y_pred.ravel()\n",
"\n",
- " # Calcola gli errori\n",
+ " # Calculate errors\n",
" errors = y_pred - y_true\n",
"\n",
- " # Crea la figura principale\n",
+ " # Create main figure\n",
" fig = plt.figure(figsize=(15, 5))\n",
"\n",
- " # Plot 1: Distribuzione degli errori\n",
+ " # Plot 1: Error Distribution\n",
" plt.subplot(1, 3, 1)\n",
" plt.hist(errors, bins=50, alpha=0.7)\n",
- " plt.title('Distribuzione degli Errori di Predizione')\n",
- " plt.xlabel('Errore')\n",
- " plt.ylabel('Frequenza')\n",
+ " plt.title('Prediction Error Distribution')\n",
+ " plt.xlabel('Error')\n",
+ " plt.ylabel('Frequency')\n",
"\n",
" # Plot 2: Actual vs Predicted\n",
" plt.subplot(1, 3, 2)\n",
" plt.scatter(y_true, y_pred, alpha=0.5)\n",
" plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)\n",
- " plt.title('Valori Reali vs Predetti')\n",
- " plt.xlabel('Valori Reali')\n",
- " plt.ylabel('Valori Predetti')\n",
+ " plt.title('Actual vs Predicted Values')\n",
+ " plt.xlabel('Actual Values')\n",
+ " plt.ylabel('Predicted Values')\n",
"\n",
- " # Plot 3: Errori vs Valori Reali\n",
+ " # Plot 3: Errors vs Actual Values\n",
" plt.subplot(1, 3, 3)\n",
" plt.scatter(y_true, errors, alpha=0.5)\n",
" plt.axhline(y=0, color='r', linestyle='--')\n",
- " plt.title('Errori vs Valori Reali')\n",
- " plt.xlabel('Valori Reali')\n",
- " plt.ylabel('Errore')\n",
+ " plt.title('Errors vs Actual Values')\n",
+ " plt.xlabel('Actual Values')\n",
+ " plt.ylabel('Error')\n",
"\n",
" plt.tight_layout()\n",
"\n",
- " # Salva il plot se è specificata una cartella\n",
+ " # Save plot if directory is specified\n",
" if folder_name is not None:\n",
" try:\n",
- " # Crea la cartella se non esiste\n",
+ " # Create directory if it doesn't exist\n",
" os.makedirs(folder_name, exist_ok=True)\n",
"\n",
- " # Genera il nome del file con timestamp\n",
+ " # Generate filename with timestamp\n",
" timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
" filename = os.path.join(folder_name, f'error_analysis_{timestamp}.png')\n",
"\n",
- " # Salva la figura\n",
+ " # Save figure\n",
" plt.savefig(filename, dpi=300, bbox_inches='tight')\n",
- " print(f\"\\nPlot salvato come: {filename}\")\n",
+ " print(f\"\\nPlot saved as: {filename}\")\n",
" except Exception as e:\n",
- " print(f\"\\nErrore nel salvare il plot: {str(e)}\")\n",
+ " print(f\"\\nError saving plot: {str(e)}\")\n",
"\n",
" plt.show()\n",
"\n",
- " # Stampa statistiche degli errori\n",
- " print(\"\\nStatistiche degli errori:\")\n",
+ " # Print error statistics\n",
+ " print(\"\\nError Statistics:\")\n",
" print(f\"MAE: {np.mean(np.abs(errors)):.4f}\")\n",
" print(f\"MSE: {np.mean(errors ** 2):.4f}\")\n",
" print(f\"RMSE: {np.sqrt(np.mean(errors ** 2)):.4f}\")\n",
- " print(f\"Media errori: {np.mean(errors):.4f}\")\n",
- " print(f\"Std errori: {np.std(errors):.4f}\")\n",
+ " print(f\"Mean error: {np.mean(errors):.4f}\")\n",
+ " print(f\"Error std: {np.std(errors):.4f}\")\n",
"\n",
- " # Calcola percentuali di errori entro certe soglie\n",
+ " # Calculate percentage of errors within thresholds\n",
" thresholds = [0.5, 1.0, 1.5, 2.0]\n",
" for threshold in thresholds:\n",
" within_threshold = np.mean(np.abs(errors) <= threshold) * 100\n",
- " print(f\"Predizioni entro ±{threshold}: {within_threshold:.1f}%\")\n",
+ " print(f\"Predictions within ±{threshold}: {within_threshold:.1f}%\")\n",
"\n",
"\n",
+ "# Example usage\n",
"plot_error_analysis(y_test, predictions, folder_name=folder_name)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "03bb9564-e518-4662-b3ee-4cfa96cdf696",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Matrice di Confusione per Livelli di Rischio UV:\n",
- " Basso Moderato Alto Molto Alto Estremo\n",
- "Basso 2931 19 0 2128 390\n",
- "Moderato 41 40299 0 995 11\n",
- "Alto 13 0 0 0 239\n",
- "Molto Alto 863 2454 0 8063 142\n",
- "Estremo 1878 1 0 169 4230\n",
- "\n",
- "Analisi Precisione Predizioni UV Index:\n",
- "Precisione esatta (±0.1): 55.1%\n",
- "Precisione entro 0.5 punti: 71.1%\n",
- "Precisione entro 1.0 punti: 86.1%\n",
- "Precisione livello di rischio: 85.6%\n",
- "\n",
- "Analisi errori per livello UV:\n",
- "MAE per UV Basso (0-2): 0.138 (n=41346)\n",
- "MAE per UV Moderato (2-5): 0.906 (n=11522)\n",
- "MAE per UV Alto (5-7): 0.877 (n=5468)\n",
- "MAE per UV Molto Alto (7-10): 0.758 (n=6278)\n",
- "MAE per UV Estremo (10-11): 1.528 (n=252)\n",
- "\n",
- "Statistiche degli errori:\n",
- "Media errori: 0.006\n",
- "Deviazione standard errori: 0.783\n",
- "Errore mediano: 0.000\n",
- "95° percentile errore assoluto: 1.723\n",
- "\n",
- "Distribuzione degli errori:\n",
- "Predizioni entro ±0.5: 71.1%\n",
- "Predizioni entro ±1.0: 86.1%\n",
- "Predizioni entro ±1.5: 93.2%\n",
- "Predizioni entro ±2.0: 96.5%\n"
- ]
- }
],
- "source": [
- "def plot_advanced_prediction_analysis(y_true, y_pred, folder_name=None):\n",
- " \"\"\"\n",
- " Funzione per visualizzare l'analisi degli errori di predizione e la precisione\n",
- "\n",
- " Parameters:\n",
- " -----------\n",
- " y_true : array-like\n",
- " Valori reali\n",
- " y_pred : array-like\n",
- " Valori predetti\n",
- " folder_name : str, optional\n",
- " Cartella dove salvare i plot. Se None, i plot non vengono salvati.\n",
- " \"\"\"\n",
- " import os\n",
- " from datetime import datetime\n",
- " import seaborn as sns\n",
- "\n",
- " # Converti in array numpy 1D se necessario\n",
- " if isinstance(y_true, pd.Series):\n",
- " y_true = y_true.values\n",
- " if isinstance(y_pred, pd.Series):\n",
- " y_pred = y_pred.values\n",
- "\n",
- " y_true = y_true.ravel()\n",
- " y_pred = y_pred.ravel()\n",
- "\n",
- " # Calcola gli errori\n",
- " errors = y_pred - y_true\n",
- "\n",
- " # Calcola accuracy per diversi livelli di tolleranza\n",
- " exact_accuracy = np.mean(np.abs(errors) < 0.1) * 100\n",
- " accuracy_05 = np.mean(np.abs(errors) <= 0.5) * 100\n",
- " accuracy_10 = np.mean(np.abs(errors) <= 1.0) * 100\n",
- "\n",
- " def get_risk_level(uv):\n",
- " if uv < 2:\n",
- " return 'Basso'\n",
- " elif uv < 5:\n",
- " return 'Moderato'\n",
- " elif uv < 7:\n",
- " return 'Alto'\n",
- " elif uv < 10:\n",
- " return 'Molto Alto'\n",
- " else:\n",
- " return 'Estremo'\n",
- "\n",
- " y_true_risk = [get_risk_level(x) for x in y_true]\n",
- " y_pred_risk = [get_risk_level(x) for x in y_pred]\n",
- " risk_accuracy = np.mean(np.array(y_true_risk) == np.array(y_pred_risk)) * 100\n",
- "\n",
- " # Crea la figura principale\n",
- " fig = plt.figure(figsize=(20, 10))\n",
- "\n",
- " # Plot 1: Distribuzione degli errori\n",
- " plt.subplot(2, 2, 1)\n",
- " plt.hist(errors, bins=50, alpha=0.7)\n",
- " plt.title('Distribuzione degli Errori di Predizione')\n",
- " plt.xlabel('Errore')\n",
- " plt.ylabel('Frequenza')\n",
- "\n",
- " # Plot 2: Actual vs Predicted\n",
- " plt.subplot(2, 2, 2)\n",
- " plt.scatter(y_true, y_pred, alpha=0.5)\n",
- " plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)\n",
- " plt.title('Valori Reali vs Predetti')\n",
- " plt.xlabel('Valori Reali')\n",
- " plt.ylabel('Valori Predetti')\n",
- "\n",
- " # Plot 3: Errori vs Valori Reali\n",
- " plt.subplot(2, 2, 3)\n",
- " plt.scatter(y_true, errors, alpha=0.5)\n",
- " plt.axhline(y=0, color='r', linestyle='--')\n",
- " plt.title('Errori vs Valori Reali')\n",
- " plt.xlabel('Valori Reali')\n",
- " plt.ylabel('Errore')\n",
- "\n",
- " # Plot 4: Precisione per intervallo di UV\n",
- " plt.subplot(2, 2, 4)\n",
- "\n",
- " solarradiation_ranges = [(0, 2), (2, 5), (5, 7), (7, 10), (10, 11)]\n",
- " range_labels = ['Basso\\n(0-2)', 'Moderato\\n(2-5)', 'Alto\\n(5-7)', 'Molto Alto\\n(7-10)', 'Estremo\\n(10-11)']\n",
- "\n",
- " accuracies = []\n",
- " counts = []\n",
- " mae_per_range = []\n",
- "\n",
- " for (low, high) in solarradiation_ranges:\n",
- " mask = (y_true >= low) & (y_true < high)\n",
- " if mask.any():\n",
- " mae = np.mean(np.abs(y_pred[mask] - y_true[mask]))\n",
- " mae_per_range.append(mae)\n",
- " count = np.sum(mask)\n",
- " counts.append(count)\n",
- " accuracy = np.mean(np.abs(y_pred[mask] - y_true[mask]) <= 0.5) * 100\n",
- " accuracies.append(accuracy)\n",
- "\n",
- " # Crea il grafico a barre con doppio asse y\n",
- " ax = plt.gca()\n",
- " bars = plt.bar(range_labels, accuracies, alpha=0.6, color='skyblue')\n",
- " plt.ylabel('Precisione (%)')\n",
- " plt.title('Precisione e MAE per Range UV')\n",
- "\n",
- " for bar in bars:\n",
- " height = bar.get_height()\n",
- " plt.text(bar.get_x() + bar.get_width() / 2., height,\n",
- " f'{height:.1f}%\\n(n={counts[bars.index(bar)]})',\n",
- " ha='center', va='bottom')\n",
- "\n",
- " ax2 = ax.twinx()\n",
- " line = ax2.plot(range_labels, mae_per_range, 'r-', marker='o', label='MAE')\n",
- " ax2.set_ylabel('MAE', color='red')\n",
- "\n",
- " for i, mae in enumerate(mae_per_range):\n",
- " ax2.text(i, mae, f'MAE: {mae:.3f}', color='red', ha='center', va='bottom')\n",
- "\n",
- " plt.xticks(rotation=45)\n",
- " plt.tight_layout()\n",
- "\n",
- " # Salva la figura principale se è specificata una cartella\n",
- " if folder_name is not None:\n",
- " try:\n",
- " # Crea la cartella se non esiste\n",
- " os.makedirs(folder_name, exist_ok=True)\n",
- "\n",
- " # Genera il timestamp\n",
- " timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
- "\n",
- " # Salva la figura principale\n",
- " main_plot_filename = os.path.join(folder_name, f'advanced_analysis_{timestamp}.png')\n",
- " plt.savefig(main_plot_filename, dpi=300, bbox_inches='tight')\n",
- " print(f\"\\nPlot principale salvato come: {main_plot_filename}\")\n",
- "\n",
- " # Crea e salva la matrice di confusione come plot separato\n",
- " plt.figure(figsize=(10, 8))\n",
- " cm = confusion_matrix(y_true_risk, y_pred_risk)\n",
- " risk_levels = ['Basso', 'Moderato', 'Alto', 'Molto Alto', 'Estremo']\n",
- " cm_df = pd.DataFrame(cm, columns=risk_levels, index=risk_levels)\n",
- "\n",
- " sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')\n",
- " plt.title('Matrice di Confusione per Livelli di Rischio UV')\n",
- " plt.tight_layout()\n",
- "\n",
- " conf_matrix_filename = os.path.join(folder_name, f'confusion_matrix_{timestamp}.png')\n",
- " plt.savefig(conf_matrix_filename, dpi=300, bbox_inches='tight')\n",
- " print(f\"Matrice di confusione salvata come: {conf_matrix_filename}\")\n",
- "\n",
- " except Exception as e:\n",
- " print(f\"\\nErrore nel salvare i plot: {str(e)}\")\n",
- "\n",
- " plt.show()\n",
- "\n",
- " # Stampa delle statistiche e analisi\n",
- " cm = confusion_matrix(y_true_risk, y_pred_risk)\n",
- " risk_levels = ['Basso', 'Moderato', 'Alto', 'Molto Alto', 'Estremo']\n",
- " cm_df = pd.DataFrame(cm, columns=risk_levels, index=risk_levels)\n",
- "\n",
- " print(\"\\nMatrice di Confusione per Livelli di Rischio UV:\")\n",
- " print(cm_df)\n",
- "\n",
- " print(\"\\nAnalisi Precisione Predizioni UV Index:\")\n",
- " print(f\"Precisione esatta (±0.1): {exact_accuracy:.1f}%\")\n",
- " print(f\"Precisione entro 0.5 punti: {accuracy_05:.1f}%\")\n",
- " print(f\"Precisione entro 1.0 punti: {accuracy_10:.1f}%\")\n",
- " print(f\"Precisione livello di rischio: {risk_accuracy:.1f}%\")\n",
- "\n",
- " print(\"\\nAnalisi errori per livello UV:\")\n",
- " solarradiation_ranges = [(0, 2, 'Basso'), (2, 5, 'Moderato'), (5, 7, 'Alto'),\n",
- " (7, 10, 'Molto Alto'), (10, 11, 'Estremo')]\n",
- "\n",
- " for low, high, label in solarradiation_ranges:\n",
- " mask = (y_true >= low) & (y_true < high)\n",
- " if mask.any():\n",
- " mae = np.mean(np.abs(errors[mask]))\n",
- " n_samples = np.sum(mask)\n",
- " print(f\"MAE per UV {label} ({low}-{high}): {mae:.3f} (n={n_samples})\")\n",
- "\n",
- " print(\"\\nStatistiche degli errori:\")\n",
- " print(f\"Media errori: {np.mean(errors):.3f}\")\n",
- " print(f\"Deviazione standard errori: {np.std(errors):.3f}\")\n",
- " print(f\"Errore mediano: {np.median(errors):.3f}\")\n",
- " print(f\"95° percentile errore assoluto: {np.percentile(np.abs(errors), 95):.3f}\")\n",
- "\n",
- " print(\"\\nDistribuzione degli errori:\")\n",
- " thresholds = [0.5, 1.0, 1.5, 2.0]\n",
- " for threshold in thresholds:\n",
- " within_threshold = np.mean(np.abs(errors) <= threshold) * 100\n",
- " print(f\"Predizioni entro ±{threshold}: {within_threshold:.1f}%\")\n",
- "\n",
- "\n",
- "# Usa la funzione\n",
- "plot_advanced_prediction_analysis(y_test, predictions, folder_name=folder_name)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "fe898941-2338-4157-b624-680bc2c517d8",
- "metadata": {},
"outputs": [],
- "source": []
+ "execution_count": null
}
],
"metadata": {
diff --git a/src/models/uv_index/uv_index_model.ipynb b/src/models/uv_index/uv_index_model.ipynb
index 19537e7..3f8101e 100755
--- a/src/models/uv_index/uv_index_model.ipynb
+++ b/src/models/uv_index/uv_index_model.ipynb
@@ -2,23 +2,14 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
"id": "8adcbe0819b88578",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'\\nfrom opt_einsum.paths import branch_1\\n!apt-get update\\n!apt-get install graphviz -y\\n\\n!pip install tensorflow\\n!pip install numpy\\n!pip install pandas\\n\\n!pip install keras\\n!pip install scikit-learn\\n!pip install matplotlib\\n!pip install joblib\\n!pip install pyarrow\\n!pip install fastparquet\\n!pip install scipy\\n!pip install seaborn\\n!pip install tqdm\\n!pip install pydot\\n!pip install tensorflow-io\\n'"
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-11-19T22:43:39.592603Z",
+ "start_time": "2024-11-19T22:43:05.800433Z"
}
- ],
+ },
"source": [
- "'''\n",
"from opt_einsum.paths import branch_1\n",
"!apt-get update\n",
"!apt-get install graphviz -y\n",
@@ -38,47 +29,211 @@
"!pip install tqdm\n",
"!pip install pydot\n",
"!pip install tensorflow-io\n",
- "'''"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "7a813e3cbca057b7",
- "metadata": {},
+ "!pip install tensorflow-addons"
+ ],
"outputs": [
{
- "name": "stderr",
+ "name": "stdout",
"output_type": "stream",
"text": [
- "2024-11-10 22:44:08.491015: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
- "2024-11-10 22:44:08.491086: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
- "2024-11-10 22:44:08.491139: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
- "2024-11-10 22:44:08.502469: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
- "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
+ "Requirement already satisfied: tensorflow in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (2.16.2)\r\n",
+ "Requirement already satisfied: absl-py>=1.0.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (2.1.0)\r\n",
+ "Requirement already satisfied: astunparse>=1.6.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (1.6.3)\r\n",
+ "Requirement already satisfied: flatbuffers>=23.5.26 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (24.3.25)\r\n",
+ "Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (0.4.0)\r\n",
+ "Requirement already satisfied: google-pasta>=0.1.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (0.2.0)\r\n",
+ "Requirement already satisfied: h5py>=3.10.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (3.11.0)\r\n",
+ "Requirement already satisfied: libclang>=13.0.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (18.1.1)\r\n",
+ "Requirement already satisfied: ml-dtypes~=0.3.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (0.3.2)\r\n",
+ "Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (3.3.0)\r\n",
+ "Requirement already satisfied: packaging in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (24.1)\r\n",
+ "Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (4.25.4)\r\n",
+ "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (2.32.3)\r\n",
+ "Requirement already satisfied: setuptools in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (72.1.0)\r\n",
+ "Requirement already satisfied: six>=1.12.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (1.16.0)\r\n",
+ "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (2.4.0)\r\n",
+ "Requirement already satisfied: typing-extensions>=3.6.6 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (4.11.0)\r\n",
+ "Requirement already satisfied: wrapt>=1.11.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (1.14.1)\r\n",
+ "Requirement already satisfied: grpcio<2.0,>=1.24.3 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (1.66.1)\r\n",
+ "Collecting tensorboard<2.17,>=2.16 (from tensorflow)\r\n",
+ " Downloading tensorboard-2.16.2-py3-none-any.whl.metadata (1.6 kB)\r\n",
+ "Collecting keras>=3.0.0 (from tensorflow)\r\n",
+ " Downloading keras-3.6.0-py3-none-any.whl.metadata (5.8 kB)\r\n",
+ "Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (0.37.1)\r\n",
+ "Requirement already satisfied: numpy<2.0.0,>=1.23.5 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow) (1.23.5)\r\n",
+ "Requirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from astunparse>=1.6.0->tensorflow) (0.43.0)\r\n",
+ "Requirement already satisfied: rich in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from keras>=3.0.0->tensorflow) (13.7.1)\r\n",
+ "Requirement already satisfied: namex in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from keras>=3.0.0->tensorflow) (0.0.7)\r\n",
+ "Requirement already satisfied: optree in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from keras>=3.0.0->tensorflow) (0.12.1)\r\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorflow) (3.3.2)\r\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorflow) (3.7)\r\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorflow) (2.2.2)\r\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from requests<3,>=2.21.0->tensorflow) (2024.8.30)\r\n",
+ "Requirement already satisfied: markdown>=2.6.8 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorboard<2.17,>=2.16->tensorflow) (3.7)\r\n",
+ "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorboard<2.17,>=2.16->tensorflow) (0.7.2)\r\n",
+ "Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorboard<2.17,>=2.16->tensorflow) (3.0.4)\r\n",
+ "Requirement already satisfied: MarkupSafe>=2.1.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from werkzeug>=1.0.1->tensorboard<2.17,>=2.16->tensorflow) (2.1.3)\r\n",
+ "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from rich->keras>=3.0.0->tensorflow) (2.2.0)\r\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from rich->keras>=3.0.0->tensorflow) (2.15.1)\r\n",
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich->keras>=3.0.0->tensorflow) (0.1.0)\r\n",
+ "Downloading keras-3.6.0-py3-none-any.whl (1.2 MB)\r\n",
+ "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m1.2/1.2 MB\u001B[0m \u001B[31m9.8 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\r\n",
+ "\u001B[?25hDownloading tensorboard-2.16.2-py3-none-any.whl (5.5 MB)\r\n",
+ "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m5.5/5.5 MB\u001B[0m \u001B[31m20.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\r\n",
+ "\u001B[?25hInstalling collected packages: tensorboard, keras\r\n",
+ " Attempting uninstall: tensorboard\r\n",
+ " Found existing installation: tensorboard 2.12.3\r\n",
+ " Uninstalling tensorboard-2.12.3:\r\n",
+ " Successfully uninstalled tensorboard-2.12.3\r\n",
+ " Attempting uninstall: keras\r\n",
+ " Found existing installation: keras 2.12.0\r\n",
+ " Uninstalling keras-2.12.0:\r\n",
+ " Successfully uninstalled keras-2.12.0\r\n",
+ "\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\r\n",
+ "tensorflow-macos 2.12.0 requires keras<2.13,>=2.12.0, but you have keras 3.6.0 which is incompatible.\r\n",
+ "tensorflow-macos 2.12.0 requires tensorboard<2.13,>=2.12, but you have tensorboard 2.16.2 which is incompatible.\u001B[0m\u001B[31m\r\n",
+ "\u001B[0mSuccessfully installed keras-3.6.0 tensorboard-2.16.2\r\n",
+ "Requirement already satisfied: numpy in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (1.23.5)\r\n",
+ "Requirement already satisfied: pandas in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (2.2.2)\r\n",
+ "Requirement already satisfied: numpy>=1.22.4 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from pandas) (1.23.5)\r\n",
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from pandas) (2.9.0.post0)\r\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from pandas) (2024.1)\r\n",
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from pandas) (2023.3)\r\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\r\n",
+ "Requirement already satisfied: keras in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (3.6.0)\r\n",
+ "Requirement already satisfied: absl-py in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from keras) (2.1.0)\r\n",
+ "Requirement already satisfied: numpy in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from keras) (1.23.5)\r\n",
+ "Requirement already satisfied: rich in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from keras) (13.7.1)\r\n",
+ "Requirement already satisfied: namex in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from keras) (0.0.7)\r\n",
+ "Requirement already satisfied: h5py in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from keras) (3.11.0)\r\n",
+ "Requirement already satisfied: optree in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from keras) (0.12.1)\r\n",
+ "Requirement already satisfied: ml-dtypes in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from keras) (0.3.2)\r\n",
+ "Requirement already satisfied: packaging in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from keras) (24.1)\r\n",
+ "Requirement already satisfied: typing-extensions>=4.5.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from optree->keras) (4.11.0)\r\n",
+ "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from rich->keras) (2.2.0)\r\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from rich->keras) (2.15.1)\r\n",
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich->keras) (0.1.0)\r\n",
+ "Requirement already satisfied: scikit-learn in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (1.5.2)\r\n",
+ "Requirement already satisfied: numpy>=1.19.5 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from scikit-learn) (1.23.5)\r\n",
+ "Requirement already satisfied: scipy>=1.6.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from scikit-learn) (1.10.0)\r\n",
+ "Requirement already satisfied: joblib>=1.2.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from scikit-learn) (1.4.2)\r\n",
+ "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from scikit-learn) (3.5.0)\r\n",
+ "Requirement already satisfied: matplotlib in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (3.9.2)\r\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib) (1.2.0)\r\n",
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib) (0.11.0)\r\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib) (4.51.0)\r\n",
+ "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib) (1.4.4)\r\n",
+ "Requirement already satisfied: numpy>=1.23 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib) (1.23.5)\r\n",
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib) (24.1)\r\n",
+ "Requirement already satisfied: pillow>=8 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib) (10.4.0)\r\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib) (3.1.2)\r\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib) (2.9.0.post0)\r\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\r\n",
+ "Requirement already satisfied: joblib in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (1.4.2)\r\n",
+ "Requirement already satisfied: pyarrow in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (16.1.0)\r\n",
+ "Requirement already satisfied: numpy>=1.16.6 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from pyarrow) (1.23.5)\r\n",
+ "Requirement already satisfied: fastparquet in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (2024.2.0)\r\n",
+ "Requirement already satisfied: pandas>=1.5.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from fastparquet) (2.2.2)\r\n",
+ "Requirement already satisfied: numpy>=1.20.3 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from fastparquet) (1.23.5)\r\n",
+ "Requirement already satisfied: cramjam>=2.3 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from fastparquet) (2.7.0)\r\n",
+ "Requirement already satisfied: fsspec in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from fastparquet) (2024.6.1)\r\n",
+ "Requirement already satisfied: packaging in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from fastparquet) (24.1)\r\n",
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from pandas>=1.5.0->fastparquet) (2.9.0.post0)\r\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from pandas>=1.5.0->fastparquet) (2024.1)\r\n",
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from pandas>=1.5.0->fastparquet) (2023.3)\r\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas>=1.5.0->fastparquet) (1.16.0)\r\n",
+ "Requirement already satisfied: scipy in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (1.10.0)\r\n",
+ "Requirement already satisfied: numpy<1.27.0,>=1.19.5 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from scipy) (1.23.5)\r\n",
+ "Requirement already satisfied: seaborn in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (0.13.2)\r\n",
+ "Requirement already satisfied: numpy!=1.24.0,>=1.20 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from seaborn) (1.23.5)\r\n",
+ "Requirement already satisfied: pandas>=1.2 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from seaborn) (2.2.2)\r\n",
+ "Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from seaborn) (3.9.2)\r\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.2.0)\r\n",
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.11.0)\r\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.51.0)\r\n",
+ "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.4)\r\n",
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (24.1)\r\n",
+ "Requirement already satisfied: pillow>=8 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (10.4.0)\r\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.1.2)\r\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0)\r\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from pandas>=1.2->seaborn) (2024.1)\r\n",
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from pandas>=1.2->seaborn) (2023.3)\r\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.16.0)\r\n",
+ "Requirement already satisfied: tqdm in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (4.66.5)\r\n",
+ "Collecting pydot\r\n",
+ " Downloading pydot-3.0.2-py3-none-any.whl.metadata (10 kB)\r\n",
+ "Requirement already satisfied: pyparsing>=3.0.9 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from pydot) (3.1.2)\r\n",
+ "Downloading pydot-3.0.2-py3-none-any.whl (35 kB)\r\n",
+ "Installing collected packages: pydot\r\n",
+ "Successfully installed pydot-3.0.2\r\n",
+ "Collecting tensorflow-io\r\n",
+ " Downloading tensorflow_io-0.37.1-cp310-cp310-macosx_10_14_x86_64.whl.metadata (14 kB)\r\n",
+ "Requirement already satisfied: tensorflow-io-gcs-filesystem==0.37.1 in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow-io) (0.37.1)\r\n",
+ "Downloading tensorflow_io-0.37.1-cp310-cp310-macosx_10_14_x86_64.whl (22.1 MB)\r\n",
+ "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m22.1/22.1 MB\u001B[0m \u001B[31m6.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\r\n",
+ "\u001B[?25hInstalling collected packages: tensorflow-io\r\n",
+ "Successfully installed tensorflow-io-0.37.1\r\n",
+ "Collecting tensorflow-addons\r\n",
+ " Downloading tensorflow_addons-0.23.0-cp310-cp310-macosx_10_14_x86_64.whl.metadata (1.7 kB)\r\n",
+ "Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)\r\n",
+ " Downloading typeguard-2.13.3-py3-none-any.whl.metadata (3.6 kB)\r\n",
+ "Requirement already satisfied: packaging in /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages (from tensorflow-addons) (24.1)\r\n",
+ "Downloading tensorflow_addons-0.23.0-cp310-cp310-macosx_10_14_x86_64.whl (14.1 MB)\r\n",
+ "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m14.1/14.1 MB\u001B[0m \u001B[31m7.0 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n",
+ "\u001B[?25hDownloading typeguard-2.13.3-py3-none-any.whl (17 kB)\r\n",
+ "Installing collected packages: typeguard, tensorflow-addons\r\n",
+ "Successfully installed tensorflow-addons-0.23.0 typeguard-2.13.3\r\n"
]
}
],
+ "execution_count": 3
+ },
+ {
+ "cell_type": "code",
+ "id": "7a813e3cbca057b7",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-11-19T22:44:45.975065Z",
+ "start_time": "2024-11-19T22:44:45.853184Z"
+ }
+ },
"source": [
"import tensorflow as tf\n",
- "from tensorflow.keras.layers import Dense, LSTM, MultiHeadAttention, Dropout, BatchNormalization, LayerNormalization, Input, Activation, Lambda, Bidirectional, Add, MaxPooling1D\n",
+ "from tensorflow.keras.layers import Dense, LSTM, MultiHeadAttention, Dropout, BatchNormalization, LayerNormalization, Input, Activation, Lambda, Bidirectional, Add, MaxPooling1D, Conv1D, GlobalAveragePooling1D\n",
"from tensorflow.keras import regularizers\n",
"from tensorflow.keras.models import Model\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
- "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
- "from sklearn.preprocessing import StandardScaler\n",
- "from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau\n",
+ "from sklearn.preprocessing import RobustScaler\n",
+ "from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint\n",
"from tensorflow.keras.optimizers import AdamW\n",
"import json\n",
"from datetime import datetime\n",
"import matplotlib.pyplot as plt\n",
- "from sklearn.metrics import confusion_matrix\n",
"from tensorflow.keras.utils import plot_model\n",
+ "import tensorflow_addons as tfa\n",
"\n",
- "folder_name = datetime.now().strftime(\"%Y-%m-%d_%H-%M\")"
- ]
+ "\n",
+ "folder_name = datetime.now().strftime(\"%Y-%m-%d_%H-%M\")\n",
+ "random_state_value = None"
+ ],
+ "outputs": [
+ {
+ "ename": "NotFoundError",
+ "evalue": "dlopen(/usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages/tensorflow-plugins/libmetal_plugin.dylib, 0x0006): Symbol not found: __ZN10tensorflow16TensorShapeProtoC1ERKS0_\n Referenced from: /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages/tensorflow-plugins/libmetal_plugin.dylib\n Expected in: /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so",
+ "output_type": "error",
+ "traceback": [
+ "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+ "\u001B[0;31mNotFoundError\u001B[0m Traceback (most recent call last)",
+ "Cell \u001B[0;32mIn[7], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mtensorflow\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m \u001B[38;5;21;01mtf\u001B[39;00m\n\u001B[1;32m 2\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mkeras\u001B[39;00m\n\u001B[1;32m 3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mkeras\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mlayers\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Dense, LSTM, MultiHeadAttention, Dropout, BatchNormalization, LayerNormalization, Input, Activation, Lambda, Bidirectional, Add, MaxPooling1D, Conv1D, GlobalAveragePooling1D\n",
+ "File \u001B[0;32m/usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages/tensorflow/__init__.py:432\u001B[0m\n\u001B[1;32m 430\u001B[0m _plugin_dir \u001B[38;5;241m=\u001B[39m _os\u001B[38;5;241m.\u001B[39mpath\u001B[38;5;241m.\u001B[39mjoin(_s, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtensorflow-plugins\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 431\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m _os\u001B[38;5;241m.\u001B[39mpath\u001B[38;5;241m.\u001B[39mexists(_plugin_dir):\n\u001B[0;32m--> 432\u001B[0m \u001B[43m_ll\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mload_library\u001B[49m\u001B[43m(\u001B[49m\u001B[43m_plugin_dir\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 433\u001B[0m \u001B[38;5;66;03m# Load Pluggable Device Library\u001B[39;00m\n\u001B[1;32m 434\u001B[0m _ll\u001B[38;5;241m.\u001B[39mload_pluggable_device_library(_plugin_dir)\n",
+ "File \u001B[0;32m/usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages/tensorflow/python/framework/load_library.py:151\u001B[0m, in \u001B[0;36mload_library\u001B[0;34m(library_location)\u001B[0m\n\u001B[1;32m 148\u001B[0m kernel_libraries \u001B[38;5;241m=\u001B[39m [library_location]\n\u001B[1;32m 150\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m lib \u001B[38;5;129;01min\u001B[39;00m kernel_libraries:\n\u001B[0;32m--> 151\u001B[0m \u001B[43mpy_tf\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mTF_LoadLibrary\u001B[49m\u001B[43m(\u001B[49m\u001B[43mlib\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 153\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 154\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mOSError\u001B[39;00m(\n\u001B[1;32m 155\u001B[0m errno\u001B[38;5;241m.\u001B[39mENOENT,\n\u001B[1;32m 156\u001B[0m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mThe file or folder to load kernel libraries from does not exist.\u001B[39m\u001B[38;5;124m'\u001B[39m,\n\u001B[1;32m 157\u001B[0m library_location)\n",
+ "\u001B[0;31mNotFoundError\u001B[0m: dlopen(/usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages/tensorflow-plugins/libmetal_plugin.dylib, 0x0006): Symbol not found: __ZN10tensorflow16TensorShapeProtoC1ERKS0_\n Referenced from: /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages/tensorflow-plugins/libmetal_plugin.dylib\n Expected in: /usr/local/anaconda3/envs/ml_env/lib/python3.10/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so"
+ ]
+ }
+ ],
+ "execution_count": 7
},
{
"cell_type": "code",
@@ -140,14 +295,14 @@
"\n",
"\n",
"def add_solar_features(df):\n",
- " # Calcolo dell'angolo solare\n",
+ " # Calculate solar angle\n",
" df['solar_angle'] = np.sin(df['day_of_year'] * (2 * np.pi / 365.25)) * np.sin(df['hour'] * (2 * np.pi / 24))\n",
"\n",
- " # Interazioni tra features rilevanti\n",
+ " # Interactions between relevant features\n",
" df['cloud_temp_interaction'] = df['cloudcover'] * df['temp']\n",
" df['visibility_cloud_interaction'] = df['visibility'] * (100 - df['cloudcover'])\n",
"\n",
- " # Feature derivate\n",
+ " # Derived features\n",
" df['clear_sky_index'] = (100 - df['cloudcover']) / 100\n",
" df['temp_gradient'] = df['temp'] - df['tempmin']\n",
"\n",
@@ -155,42 +310,72 @@
"\n",
"\n",
"def add_solar_specific_features(df):\n",
- " # Angolo solare e durata del giorno\n",
+ " # Solar angle and day length calculations\n",
" df['day_length'] = 12 + 3 * np.sin(2 * np.pi * (df['day_of_year'] - 81) / 365.25)\n",
" df['solar_noon'] = 12 - df['hour']\n",
" df['solar_elevation'] = np.sin(2 * np.pi * df['day_of_year'] / 365.25) * np.cos(2 * np.pi * df['solar_noon'] / 24)\n",
"\n",
- " # Interazioni\n",
+ " # Feature interactions\n",
" df['cloud_elevation'] = df['cloudcover'] * df['solar_elevation']\n",
" df['visibility_elevation'] = df['visibility'] * df['solar_elevation']\n",
"\n",
- " # Rolling features con finestre più ampie\n",
+ " # Extended window rolling features\n",
" df['cloud_rolling_12h'] = df['cloudcover'].rolling(window=12).mean()\n",
" df['temp_rolling_12h'] = df['temp'].rolling(window=12).mean()\n",
"\n",
" return df\n",
"\n",
"\n",
+ "def add_uv_specific_features(df):\n",
+ "    \"\"\"Add UV-specific feature columns to df in place and return df.\n",
+ "\n",
+ "    The seasonal factor accepts a raw 'season' column or its one-hot season_* columns.\"\"\"\n",
+ "    lat_rad = np.radians(df['latitude'].iloc[0])  # assuming constant latitude for the dataset\n",
+ "    # np.degrees replaces the non-existent .degrees attribute; hour * 15 is an angle in\n",
+ "    # degrees, so it is converted to radians before being passed to np.cos\n",
+ "    df['solar_zenith'] = 90 - np.degrees(np.arcsin(\n",
+ "        np.sin(lat_rad) * np.sin(df['solar_elevation']) +\n",
+ "        np.cos(lat_rad) * np.cos(df['solar_elevation']) * np.cos(np.radians(df['hour'] * 15))\n",
+ "    ))\n",
+ "    # UV peak hours indicator (10:00-16:00)\n",
+ "    df['is_uv_peak_hours'] = ((df['hour'] >= 10) & (df['hour'] <= 16)).astype(int)\n",
+ "    # Atmospheric attenuation factor\n",
+ "    df['atmospheric_attenuation'] = (100 - df['cloudcover']) * (df['visibility'] / 100) * (1 - df['humidity'] / 200)\n",
+ "    # Seasonal UV factor; works both before and after 'season' has been one-hot encoded\n",
+ "    def in_season(name):\n",
+ "        return df['season'] == name if 'season' in df.columns else df[f'season_{name}'].astype(bool)\n",
+ "    df['uv_seasonal_factor'] = np.where(in_season('Summer'), 1.0,\n",
+ "                                        np.where(in_season('Spring'), 0.7,\n",
+ "                                                 np.where(in_season('Autumn'), 0.5, 0.3)))\n",
+ "    # Solar elevation and atmospheric transparency interaction\n",
+ "    df['solar_clarity_index'] = df['solar_elevation'] * df['atmospheric_attenuation'] / 100\n",
+ "    # UV-specific rolling features (first two rows of the 3-row window are NaN)\n",
+ "    df['clarity_rolling_3h'] = df['atmospheric_attenuation'].rolling(window=3).mean()\n",
+ "    df['temp_uv_interaction'] = df['temp'] * df['solar_clarity_index']\n",
+ "    return df\n",
+ "\n",
+ "\n",
"def add_advanced_features(df):\n",
- " # Features esistenti\n",
+ " # Apply existing feature engineering\n",
" df = add_time_features(df)\n",
" df = add_solar_features(df)\n",
" df = add_solar_specific_features(df)\n",
+ " df = add_uv_specific_features(df)\n",
"\n",
" if not isinstance(df.index, pd.DatetimeIndex):\n",
" df.index = pd.to_datetime(df.index)\n",
"\n",
- " # One-hot encoding per le feature categoriche\n",
+ " # One-hot encoding for categorical features\n",
" df = pd.get_dummies(df, columns=['season', 'time_period'])\n",
"\n",
- " # Interazioni tra variabili meteorologiche\n",
+ " # Weather variable interactions\n",
" df['temp_humidity'] = df['temp'] * df['humidity']\n",
" df['temp_cloudcover'] = df['temp'] * df['cloudcover']\n",
" df['visibility_cloudcover'] = df['visibility'] * df['cloudcover']\n",
"\n",
- " # Features derivate per la radiazione solare\n",
+ " # Solar radiation derived features\n",
" df['clear_sky_factor'] = (100 - df['cloudcover']) / 100\n",
- " df['day_length'] = np.sin(df['day_of_year_sin']) * 12 + 12 # approssimazione della durata del giorno\n",
+ " df['day_length'] = np.sin(df['day_of_year_sin']) * 12 + 12 # day length approximation\n",
"\n",
" # Lag features\n",
" df['temp_1h_lag'] = df['temp'].shift(1)\n",
@@ -203,19 +388,20 @@
"\n",
" df['temp_humidity_interaction'] = df['temp'] * df['humidity'] / 100\n",
"\n",
- " # Indicatore di condizioni estreme\n",
- " df['extreme_conditions'] = ((df['temp'] > df['temp'].quantile(0.75)) & (df['humidity'] < df['humidity'].quantile(0.25))).astype(int)\n",
+ " # Extreme conditions indicator\n",
+ " df['extreme_conditions'] = ((df['temp'] > df['temp'].quantile(0.75)) &\n",
+ " (df['humidity'] < df['humidity'].quantile(0.25))).astype(int)\n",
"\n",
- " # Feature composite per la trasparenza atmosferica\n",
+ " # Atmospheric transparency composite feature\n",
" df['atmospheric_transparency'] = (100 - df['cloudcover']) * (df['visibility'] / 10)\n",
"\n",
- " # Indicatori temporali più granulari per mezze stagioni\n",
+ " # Transitional seasons indicator\n",
" df['is_transition_season'] = ((df['season_Spring'] | df['season_Autumn'])).astype(int)\n",
"\n",
- " # Interazione tra angolo solare e copertura nuvolosa normalizzata\n",
+ " # Solar angle and normalized cloud cover interaction\n",
" df['solar_cloud_effect'] = df['solar_elevation'] * (100 - df['cloudcover']) / 100\n",
"\n",
- " # Indicatore di stabilità atmosferica\n",
+ " # Atmospheric stability indicator\n",
" df['pressure_stability'] = df.groupby(df.index.date if isinstance(df.index, pd.DatetimeIndex)\n",
" else df.index.to_series().dt.date)['pressure'].transform(\n",
" lambda x: x.std()\n",
@@ -225,82 +411,130 @@
"\n",
"\n",
"def prepare_advanced_data(df):\n",
- " # Applicazione delle funzioni di feature engineering\n",
+ " \"\"\"\n",
+ " Prepares data for UV index prediction model with advanced feature engineering\n",
+ " and optimized preprocessing.\n",
+ "\n",
+ " Args:\n",
+ " df: DataFrame with meteorological data\n",
+ "\n",
+ " Returns:\n",
+ " tuple: (X_train_scaled, X_test_scaled, y_train, y_test, scaler, final_features, X_to_predict_scaled)\n",
+ " \"\"\"\n",
+ " # Apply feature engineering functions\n",
" df = add_advanced_features(df)\n",
"\n",
- " target_variables = ['solarradiation', 'solarenergy', 'uvindex']\n",
+ " # Optimized feature selection for UV index\n",
+ " selected_features = {\n",
+ " # Primary meteorological features (high correlation with UV)\n",
+ " 'atmospheric': [\n",
+ " 'temp', 'humidity', 'cloudcover', 'visibility',\n",
+ " 'clear_sky_index', 'atmospheric_transparency'\n",
+ " ],\n",
"\n",
- " # Selezione delle feature più rilevanti per UV index\n",
- " selected_features = [\n",
- " # Features meteorologiche base\n",
- " 'temp', 'humidity', 'cloudcover', 'visibility', 'pressure',\n",
+ " # Essential cyclic temporal features\n",
+ " 'temporal': [\n",
+ " 'hour_sin', 'hour_cos',\n",
+ " 'day_of_year_sin', 'day_of_year_cos'\n",
+ " ],\n",
"\n",
- " # Features temporali cicliche\n",
- " 'hour_sin', 'hour_cos', 'month_sin', 'month_cos',\n",
- " 'day_of_year_sin', 'day_of_year_cos',\n",
+ " # Critical solar features for UV\n",
+ " 'solar': [\n",
+ " 'solar_angle', 'solar_elevation',\n",
+ " 'day_length', 'solar_noon',\n",
+ " 'solar_cloud_effect'\n",
+ " ],\n",
"\n",
- " # Features solari\n",
- " 'solar_angle', 'solar_elevation', 'day_length',\n",
- " 'clear_sky_index', 'solar_noon',\n",
+ " # Key interactions\n",
+ " 'interactions': [\n",
+ " 'cloud_temp_interaction',\n",
+ " 'visibility_cloud_interaction',\n",
+ " 'temp_humidity_interaction',\n",
+ " 'solar_clarity_index'\n",
+ " ],\n",
"\n",
- " # Interazioni\n",
- " 'cloud_temp_interaction', 'visibility_cloud_interaction',\n",
- " 'cloud_elevation', 'visibility_elevation',\n",
+ " # Significant rolling features\n",
+ " 'rolling': [\n",
+ " 'cloud_rolling_12h',\n",
+ " 'temp_rolling_mean_6h',\n",
+ " 'atmospheric_stability'\n",
+ " ]\n",
+ " }\n",
"\n",
- " # Rolling features\n",
- " 'cloud_rolling_12h', 'temp_rolling_12h',\n",
- " 'temp_rolling_mean_6h', 'cloudcover_rolling_mean_6h',\n",
+ " # Flatten feature list\n",
+ " base_features = [item for sublist in selected_features.values() for item in sublist]\n",
"\n",
- " # Features categoriche\n",
- " 'season', 'time_period'\n",
- " ]\n",
- "\n",
- " # Aggiorna la lista delle feature con le colonne one-hot\n",
+ " # Add categorical features (one-hot encoded)\n",
" categorical_columns = [col for col in df.columns if col.startswith(('season_', 'time_period_'))]\n",
- " final_features = [f for f in selected_features if f not in ['season', 'time_period']] + categorical_columns\n",
+ " final_features = base_features + categorical_columns\n",
"\n",
+ " # Temporal preprocessing\n",
" df = df.sort_values('datetime')\n",
" df.set_index('datetime', inplace=True)\n",
"\n",
- " columns_to_interpolate = final_features + target_variables\n",
- " for column in columns_to_interpolate:\n",
- " df[column] = df[column].interpolate(method='time')\n",
+ " # Advanced interpolation for missing values\n",
+ " for column in final_features:\n",
+ " if column in df.columns:\n",
+ " if df[column].isnull().any():\n",
+ " if column in selected_features['rolling']:\n",
+ " df[column] = df[column].fillna(method='ffill').fillna(method='bfill')\n",
+ " else:\n",
+ " df[column] = df[column].interpolate(method='time', limit_direction='both')\n",
"\n",
- " # Rimuovi eventuali valori mancanti residui\n",
- " df.fillna(0, inplace=True)\n",
+ " # Temporal data split\n",
+ " data_after_2010 = df[df.index.year >= 2010].copy()\n",
+ " data_before_2010 = df[df.index.year < 2010].copy()\n",
"\n",
- " data_after_2010 = df[df['year'] >= 2010].copy()\n",
- " data_before_2010 = df[df['year'] < 2010].copy()\n",
- "\n",
- " print(\"\\nNumero di record dopo 2010:\", len(data_after_2010))\n",
- " print(\"Numero di record prima 2010:\", len(data_before_2010))\n",
+ " print(f\"\\nTemporal distribution of data:\")\n",
+ " print(f\"Records after 2010: {len(data_after_2010):,}\")\n",
+ " print(f\"Records before 2010: {len(data_before_2010):,}\")\n",
"\n",
+ " # Feature and target preparation\n",
" X = data_after_2010[final_features]\n",
- "\n",
- " #print(X.head())\n",
- " #print(X.columns)\n",
- "\n",
" y = data_after_2010['uvindex']\n",
- "\n",
" X_to_predict = data_before_2010[final_features]\n",
"\n",
- " # Split dei dati\n",
- " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)\n",
+ " # Data validation\n",
+ " if X.isnull().any().any() or y.isnull().any():\n",
+ " print(\"\\nWarning: Found missing values after preprocessing\")\n",
+ " print(\"Features with missing values:\", X.columns[X.isnull().any()].tolist())\n",
+ " X = X.fillna(X.mean())\n",
+ " y = y.fillna(y.mean())\n",
"\n",
- " # Scaling delle feature\n",
- " scaler = StandardScaler()\n",
+ " # Stratified data split\n",
+ " X_train, X_test, y_train, y_test = train_test_split(\n",
+ " X, y,\n",
+ " test_size=0.5,\n",
+ " random_state=random_state_value,\n",
+ " stratify=pd.qcut(y, q=5, duplicates='drop', labels=False)\n",
+ " )\n",
+ "\n",
+ " # Robust feature scaling\n",
+ " scaler = RobustScaler()\n",
" X_train_scaled = scaler.fit_transform(X_train)\n",
" X_test_scaled = scaler.transform(X_test)\n",
- "\n",
" X_to_predict_scaled = scaler.transform(X_to_predict)\n",
"\n",
- " return X_train_scaled, X_test_scaled, y_train, y_test, scaler, final_features, X_to_predict_scaled\n",
+ " # Final validation\n",
+ " assert not np.isnan(X_train_scaled).any(), \"Found NaN in X_train_scaled\"\n",
+ " assert not np.isnan(X_test_scaled).any(), \"Found NaN in X_test_scaled\"\n",
+ " assert not np.isnan(X_to_predict_scaled).any(), \"Found NaN in X_to_predict_scaled\"\n",
+ "\n",
+ " # Print feature information\n",
+ " print(\"\\nNumber of features used:\", len(final_features))\n",
+ " print(\"\\nFeature categories:\")\n",
+ " for category, features in selected_features.items():\n",
+ " print(f\"{category}: {len(features)} features\")\n",
+ " print(f\"Categorical: {len(categorical_columns)} features\")\n",
+ "\n",
+ " return (X_train_scaled, X_test_scaled, y_train, y_test,\n",
+ " scaler, final_features, X_to_predict_scaled)\n",
"\n",
"\n",
"def create_sequence_data(X, sequence_length=24):\n",
" \"\"\"\n",
- " Converte i dati in sequenze per l'input LSTM\n",
- " sequence_length rappresenta quante ore precedenti considerare\n",
+ " Converts data into sequences for LSTM input\n",
+ " sequence_length represents how many previous hours to consider\n",
" \"\"\"\n",
" sequences = []\n",
" for i in range(len(X) - sequence_length + 1):\n",
@@ -309,16 +543,16 @@
"\n",
"\n",
"def prepare_hybrid_data(df):\n",
- " # Utilizziamo la preparazione dati esistente\n",
+ " # Use existing data preparation\n",
" X_train_scaled, X_test_scaled, y_train, y_test, scaler, features, X_to_predict_scaled = prepare_advanced_data(df)\n",
"\n",
- " # Convertiamo i dati in sequenze\n",
- " sequence_length = 24 # 24 ore di dati storici\n",
+ " # Convert data to sequences\n",
+ " sequence_length = 24 # 24 hours of historical data\n",
"\n",
" X_train_seq = create_sequence_data(X_train_scaled, sequence_length)\n",
" X_test_seq = create_sequence_data(X_test_scaled, sequence_length)\n",
"\n",
- " # Adattiamo le y rimuovendo i primi (sequence_length-1) elementi\n",
+ " # Adjust y by removing the first (sequence_length-1) elements\n",
" y_train = y_train[sequence_length - 1:]\n",
" y_test = y_test[sequence_length - 1:]\n",
"\n",
@@ -334,77 +568,173 @@
"metadata": {},
"outputs": [],
"source": [
- "# Funzioni per costruire il modello LSTM avanzato\n",
- "def create_residual_lstm_layer(x, units, dropout_rate, l2_reg=0.01, return_sequences=True):\n",
+ "def create_residual_lstm_layer(x, units, dropout_rate, l2_reg=0.01,\n",
+ " survival_probability=0.8, return_sequences=True):\n",
+ " \"\"\"LSTM layer with stochastic depth\"\"\"\n",
" residual = x\n",
- " x = Bidirectional(LSTM(units, return_sequences=return_sequences, kernel_regularizer=regularizers.l2(l2_reg)))(x)\n",
+ "\n",
+ " # Main path\n",
+ " x = Bidirectional(LSTM(units, return_sequences=return_sequences,\n",
+ " kernel_regularizer=regularizers.l2(l2_reg)))(x)\n",
" x = LayerNormalization()(x)\n",
" x = Dropout(dropout_rate)(x)\n",
- " # Adjust residual dimension and handle return_sequences\n",
+ "\n",
+ " # Adjust residual dimension if needed\n",
" if return_sequences:\n",
" if int(residual.shape[-1]) != 2 * units:\n",
- " residual = Dense(2 * units, activation='linear')(residual)\n",
- " x = Add()([x, residual])\n",
+ " residual = Conv1D(2 * units, 1)(residual)\n",
+ "\n",
+ " x = tfa.layers.StochasticDepth(survival_probability)([x, residual])\n",
+ "\n",
" return x\n",
"\n",
"\n",
- "def attention_block(x, units, num_heads=8):\n",
+ "def attention_block(x, units, num_heads=8, survival_probability=0.8):\n",
+ " \"\"\"\n",
+ " Attention block with stochastic depth.\n",
+ "\n",
+ " Args:\n",
+ " x: input tensor\n",
+ " units: number of units for the attention\n",
+ " num_heads: number of attention heads\n",
+ " survival_probability: probability of the layer being kept during training\n",
+ " \"\"\"\n",
+ " # Compute self-attention\n",
" attention = MultiHeadAttention(num_heads=num_heads, key_dim=units)(x, x)\n",
- " x = Add()([x, attention])\n",
+ "\n",
+ " # Apply stochastic depth to the attention path\n",
+ " x = tfa.layers.StochasticDepth(survival_probability)([attention, x])\n",
" x = LayerNormalization()(x)\n",
+ "\n",
+ " # Optional: FFN block with stochastic depth\n",
+ " ffn = Dense(units * 4, activation='swish')(x)\n",
+ " ffn = Dense(units)(ffn)\n",
+ " x = tfa.layers.StochasticDepth(survival_probability)([ffn, x])\n",
+ " x = LayerNormalization()(x)\n",
+ "\n",
" return x\n",
"\n",
"\n",
"def create_uv_index_model(input_shape, folder_name, l2_lambda=0.005):\n",
" inputs = Input(shape=input_shape)\n",
"\n",
- " # Primi due layer LSTM con sequenze\n",
- " x = create_residual_lstm_layer(inputs, 128, 0.4, l2_lambda, return_sequences=True)\n",
- " x = create_residual_lstm_layer(x, 64, 0.3, l2_lambda, return_sequences=True)\n",
- " x = create_residual_lstm_layer(x, 32, 0.2, l2_lambda, return_sequences=True)\n",
+ " # Progressive survival probabilities\n",
+ " survival_probs = [0.9, 0.8, 0.7]\n",
+ " attention_survival_probs = [0.85, 0.75, 0.65]\n",
"\n",
- " # Attention e MaxPooling mentre abbiamo ancora la sequenza\n",
- " x = attention_block(x, 32, num_heads=16)\n",
- " x = MaxPooling1D()(x)\n",
+ " # First LSTM block with stochastic depth\n",
+ " x = create_residual_lstm_layer(\n",
+ " inputs, 128, dropout_rate=0.4,\n",
+ " l2_reg=l2_lambda,\n",
+ " survival_probability=survival_probs[0],\n",
+ " return_sequences=True\n",
+ " )\n",
+ " x = attention_block(x, 128, num_heads=8,\n",
+ " survival_probability=attention_survival_probs[0])\n",
"\n",
- " # Ultimo layer LSTM senza sequenze\n",
- " x = create_residual_lstm_layer(x, 32, 0.1, l2_lambda, return_sequences=False)\n",
+ " # Second LSTM block with stochastic depth\n",
+ " x = create_residual_lstm_layer(\n",
+ " x, 64, dropout_rate=0.3,\n",
+ " l2_reg=l2_lambda,\n",
+ " survival_probability=survival_probs[1],\n",
+ " return_sequences=True\n",
+ " )\n",
+ " x = attention_block(x, 64, num_heads=8,\n",
+ " survival_probability=attention_survival_probs[1])\n",
"\n",
- " # Dense layers\n",
- " x = Dense(32, kernel_regularizer=regularizers.l2(l2_lambda))(x)\n",
+ " # Third LSTM block with stochastic depth\n",
+ " x = create_residual_lstm_layer(\n",
+ " x, 32, dropout_rate=0.2,\n",
+ " l2_reg=l2_lambda,\n",
+ " survival_probability=survival_probs[2],\n",
+ " return_sequences=True\n",
+ " )\n",
+ " x = attention_block(x, 32, num_heads=8,\n",
+ " survival_probability=attention_survival_probs[2])\n",
+ "\n",
+ " # Global attention with stochastic depth\n",
+ " temporal_attention = MultiHeadAttention(num_heads=16, key_dim=32)(x, x)\n",
+ " x = tfa.layers.StochasticDepth(survival_probability=0.8)([temporal_attention, x])\n",
+ " x = LayerNormalization()(x)\n",
+ "\n",
+ " # Pooling\n",
+ " x = GlobalAveragePooling1D()(x)\n",
+ "\n",
+ " # Dense layers with stochastic depth\n",
+ " dense = Dense(32, activation='swish',\n",
+ " kernel_regularizer=regularizers.l2(l2_lambda))(x)\n",
+ " dense = BatchNormalization()(dense)\n",
+ " skip = Dense(32, activation='linear')(x)\n",
+ " x = tfa.layers.StochasticDepth(survival_probability=0.9)([dense, skip])\n",
+ "\n",
+ " # Final dense layer\n",
+ " x = Dense(16, activation='swish',\n",
+ " kernel_regularizer=regularizers.l2(l2_lambda))(x)\n",
" x = BatchNormalization()(x)\n",
- " x = Activation('swish')(x)\n",
- " x = Dropout(0.1)(x)\n",
- "\n",
- " x = Dense(16, kernel_regularizer=regularizers.l2(l2_lambda))(x)\n",
- " x = BatchNormalization()(x)\n",
- " x = Activation('swish')(x)\n",
- " x = Dropout(0.1)(x)\n",
"\n",
+ " # Output layer\n",
" outputs = Dense(1)(x)\n",
" outputs = Lambda(lambda x: tf.clip_by_value(x, 0, 11))(outputs)\n",
"\n",
" model = Model(inputs=inputs, outputs=outputs, name=\"UvModel\")\n",
"\n",
- " optimizer = AdamW(\n",
- " learning_rate=0.0005,\n",
- " beta_1=0.9,\n",
- " beta_2=0.999,\n",
- " epsilon=1e-07\n",
+ " # Cosine decay with warmup\n",
+ " initial_learning_rate = 0.001\n",
+ " decay_steps = 10000\n",
+ "\n",
+ " cosine_decay = tf.keras.optimizers.schedules.CosineDecayRestarts(\n",
+ " initial_learning_rate,\n",
+ " decay_steps,\n",
+ " t_mul=2.0,\n",
+ " m_mul=0.9,\n",
+ " alpha=0.1\n",
" )\n",
"\n",
+ " optimizer = AdamW(\n",
+ " learning_rate=cosine_decay,\n",
+ " weight_decay=0.01\n",
+ " )\n",
+ "\n",
+ " # UV-specific loss\n",
+ " def uv_aware_loss(y_true, y_pred):\n",
+ " huber_loss = tf.keras.losses.Huber()(y_true, y_pred)\n",
+ "\n",
+ " # Higher weight for UV peak hours\n",
+ " time_of_day = tf.cast(tf.math.floormod(tf.range(tf.shape(y_true)[0]), 24),\n",
+ " tf.float32)\n",
+ " peak_hours_weight = tf.where(\n",
+ " tf.logical_and(time_of_day >= 10, time_of_day <= 16),\n",
+ " 1.5,\n",
+ " 1.0\n",
+ " )\n",
+ "\n",
+ " # Higher weight for high UV values\n",
+ " high_uv_weight = tf.where(\n",
+ " y_true >= 8,\n",
+ " 1.3,\n",
+ " 1.0\n",
+ " )\n",
+ "\n",
+ " return huber_loss * peak_hours_weight * high_uv_weight\n",
+ "\n",
" model.compile(\n",
" optimizer=optimizer,\n",
- " loss='huber',\n",
- " metrics=['mae', 'mse']\n",
+ " loss=uv_aware_loss,\n",
+ " metrics=[\n",
+ " 'mae',\n",
+ " 'mse',\n",
+ " tf.keras.metrics.RootMeanSquaredError(),\n",
+ " tf.keras.metrics.MeanAbsolutePercentageError()\n",
+ " ]\n",
" )\n",
+ "\n",
" model.summary()\n",
"\n",
" plot_model(model,\n",
" to_file=f'{folder_name}_model_architecture.png',\n",
- " show_shapes=True, # Mostra le dimensioni dei tensori\n",
- " show_layer_names=True, # Mostra i nomi dei layer\n",
- " dpi=96, # Risoluzione dell'immagine\n",
+ " show_shapes=True,\n",
+ " show_layer_names=True,\n",
+ " dpi=150,\n",
" show_layer_activations=True)\n",
"\n",
" return model\n",
@@ -412,198 +742,221 @@
"\n",
"def evaluate_uv_predictions(y_true, y_pred, folder_name=None):\n",
" \"\"\"\n",
- " Valutazione specifica per UV index con metriche sia raw che categoriche\n",
+ " Comprehensive evaluation of UV index predictions with detailed analysis and visualizations.\n",
"\n",
" Parameters:\n",
" -----------\n",
" y_true : array-like\n",
- " Valori reali dell'UV index\n",
+ " Actual UV index values\n",
" y_pred : array-like\n",
- " Valori predetti dell'UV index\n",
+ " Predicted UV index values\n",
" folder_name : str, optional\n",
- " Cartella dove salvare eventuali plot di analisi\n",
+ " Folder to save analysis plots; when None no plots are produced\n",
"\n",
" Returns:\n",
" --------\n",
" dict\n",
- " Dizionario contenente tutte le metriche calcolate\n",
+ " Metrics keyed by 'raw', 'rounded', 'range_analysis', 'confusion_matrix' and 'plot_paths'\n",
" \"\"\"\n",
" import os\n",
" from datetime import datetime\n",
+ " import seaborn as sns\n",
+ " from sklearn.metrics import confusion_matrix, mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
+ " # Data preprocessing\n",
" y_true = np.array(y_true).ravel()\n",
" y_pred = np.array(y_pred).ravel()\n",
"\n",
- " # Calcolo metriche sui valori raw\n",
- " mae_raw = mean_absolute_error(y_true, y_pred)\n",
- " rmse_raw = np.sqrt(mean_squared_error(y_true, y_pred))\n",
- " r2_raw = r2_score(y_true, y_pred)\n",
- "\n",
- " # Arrotonda le predizioni al più vicino intero\n",
- " y_pred_rounded = np.round(y_pred)\n",
+ " # Rounding and clipping predictions\n",
+ " y_pred_rounded = np.round(y_pred * 2) / 2 # Round to nearest 0.5\n",
" y_pred_clipped = np.clip(y_pred_rounded, 0, 11)\n",
"\n",
- " # Calcolo metriche sui valori arrotondati\n",
- " mae_rounded = mean_absolute_error(y_true, y_pred_clipped)\n",
- " rmse_rounded = np.sqrt(mean_squared_error(y_true, y_pred_clipped))\n",
- " r2_rounded = r2_score(y_true, y_pred_clipped)\n",
+ " # Calculate errors\n",
+ " errors = y_pred - y_true\n",
+ " errors_rounded = y_pred_clipped - y_true\n",
"\n",
- " # Calcolo accuratezza per diversi margini di errore (sia raw che rounded)\n",
- " # Raw\n",
- " within_05_raw = np.mean(np.abs(y_pred - y_true) <= 0.5)\n",
- " within_1_raw = np.mean(np.abs(y_pred - y_true) <= 1.0)\n",
- " within_2_raw = np.mean(np.abs(y_pred - y_true) <= 2.0)\n",
+ " # Function to determine UV risk level\n",
+ " def get_uv_risk_level(values):\n",
+ " levels = np.full_like(values, 'Low', dtype=object)\n",
+ " levels[(values > 2) & (values <= 5)] = 'Moderate'\n",
+ " levels[(values > 5) & (values <= 7)] = 'High'\n",
+ " levels[(values > 7) & (values <= 10)] = 'Very High'\n",
+ " levels[values > 10] = 'Extreme'\n",
+ " return levels\n",
"\n",
- " # Rounded\n",
- " exact_accuracy = np.mean(y_pred_clipped == y_true)\n",
- " one_off_accuracy = np.mean(np.abs(y_pred_clipped - y_true) <= 1)\n",
- " two_off_accuracy = np.mean(np.abs(y_pred_clipped - y_true) <= 2)\n",
+ " # Calculate basic metrics\n",
+ " metrics = {\n",
+ " 'raw': {\n",
+ " 'mae': mean_absolute_error(y_true, y_pred),\n",
+ " 'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),\n",
+ " 'r2': r2_score(y_true, y_pred),\n",
+ " 'mean_error': np.mean(errors),\n",
+ " 'std_error': np.std(errors),\n",
+ " 'median_error': np.median(errors),\n",
+ " 'p95_abs_error': np.percentile(np.abs(errors), 95)\n",
+ " },\n",
+ " 'rounded': {\n",
+ " 'mae': mean_absolute_error(y_true, y_pred_clipped),\n",
+ " 'rmse': np.sqrt(mean_squared_error(y_true, y_pred_clipped)),\n",
+ " 'r2': r2_score(y_true, y_pred_clipped)\n",
+ " }\n",
+ " }\n",
"\n",
- " print(\"\\nUV Index Prediction Metrics:\")\n",
- " print(\"\\nRaw Predictions:\")\n",
- " print(f\"MAE: {mae_raw:.3f}\")\n",
- " print(f\"RMSE: {rmse_raw:.3f}\")\n",
- " print(f\"R² Score: {r2_raw:.3f}\")\n",
- " print(f\"Within ±0.5: {within_05_raw:.3f}\")\n",
- " print(f\"Within ±1.0: {within_1_raw:.3f}\")\n",
- " print(f\"Within ±2.0: {within_2_raw:.3f}\")\n",
+ " # Calculate accuracies for different margins\n",
+ " for data_type, errors_data in [('raw', errors), ('rounded', errors_rounded)]:\n",
+ " metrics[data_type].update({\n",
+ " 'within_05': np.mean(np.abs(errors_data) <= 0.5) * 100,\n",
+ " 'within_1': np.mean(np.abs(errors_data) <= 1.0) * 100,\n",
+ " 'within_15': np.mean(np.abs(errors_data) <= 1.5) * 100,\n",
+ " 'within_2': np.mean(np.abs(errors_data) <= 2.0) * 100\n",
+ " })\n",
"\n",
- " print(\"\\nRounded Predictions:\")\n",
- " print(f\"MAE: {mae_rounded:.3f}\")\n",
- " print(f\"RMSE: {rmse_rounded:.3f}\")\n",
- " print(f\"R² Score: {r2_rounded:.3f}\")\n",
- " print(f\"Exact Match: {exact_accuracy:.3f}\")\n",
- " print(f\"±1 Accuracy: {one_off_accuracy:.3f}\")\n",
- " print(f\"±2 Accuracy: {two_off_accuracy:.3f}\")\n",
+ " # Analysis by UV risk level\n",
+ " y_true_risk = get_uv_risk_level(y_true)\n",
+ " y_pred_risk = get_uv_risk_level(y_pred_clipped)\n",
"\n",
- " # Analisi dei livelli UV\n",
- " def get_uv_level(value):\n",
- " if value <= 2:\n",
- " return 'Low'\n",
- " elif value <= 5:\n",
- " return 'Moderate'\n",
- " elif value <= 7:\n",
- " return 'High'\n",
- " elif value <= 10:\n",
- " return 'Very High'\n",
- " else:\n",
- " return 'Extreme'\n",
+ " # Calculate confusion matrix (explicit labels keep the rows/columns in risk order and always 5x5)\n",
+ " risk_levels = ['Low', 'Moderate', 'High', 'Very High', 'Extreme']\n",
+ " cm = confusion_matrix(y_true_risk, y_pred_risk, labels=risk_levels)\n",
+ " cm_df = pd.DataFrame(cm, columns=risk_levels, index=risk_levels)\n",
"\n",
- " # Calcola livelli UV sia per raw che rounded\n",
- " y_true_levels = [get_uv_level(v) for v in y_true]\n",
- " y_pred_levels_raw = [get_uv_level(v) for v in y_pred]\n",
- " y_pred_levels_rounded = [get_uv_level(v) for v in y_pred_clipped]\n",
+ " # Analysis by UV range\n",
+ " uv_ranges = [\n",
+ " (0, 2, 'Low'),\n",
+ " (2, 5, 'Moderate'),\n",
+ " (5, 7, 'High'),\n",
+ " (7, 10, 'Very High'),\n",
+ " (10, 11, 'Extreme')\n",
+ " ]\n",
"\n",
- " # Calcola accuracy dei livelli\n",
- " level_accuracy_raw = np.mean([t == p for t, p in zip(y_true_levels, y_pred_levels_raw)])\n",
- " level_accuracy_rounded = np.mean([t == p for t, p in zip(y_true_levels, y_pred_levels_rounded)])\n",
+ " range_analysis = {}\n",
+ " for low, high, label in uv_ranges:\n",
+ " mask = y_true_risk == label # same bins as the confusion matrix, so boundary values (e.g. UV 11) are not dropped\n",
+ " if mask.any():\n",
+ " range_analysis[label] = {\n",
+ " 'mae': mean_absolute_error(y_true[mask], y_pred[mask]),\n",
+ " 'count': int(np.sum(mask)),\n",
+ " 'accuracy_within_05': np.mean(np.abs(errors[mask]) <= 0.5) * 100,\n",
+ " 'accuracy_within_1': np.mean(np.abs(errors[mask]) <= 1.0) * 100\n",
+ " }\n",
"\n",
- " print(\"\\nUV Level Accuracy:\")\n",
- " print(f\"Raw predictions: {level_accuracy_raw:.3f}\")\n",
- " print(f\"Rounded predictions: {level_accuracy_rounded:.3f}\")\n",
- "\n",
- " print(\"\\nUV Level Confusion Matrix (Raw Predictions):\")\n",
- " print(pd.crosstab(\n",
- " pd.Series(y_true_levels, name='Actual'),\n",
- " pd.Series(y_pred_levels_raw, name='Predicted')\n",
- " ))\n",
- "\n",
- " print(\"\\nUV Level Confusion Matrix (Rounded Predictions):\")\n",
- " print(pd.crosstab(\n",
- " pd.Series(y_true_levels, name='Actual'),\n",
- " pd.Series(y_pred_levels_rounded, name='Predicted')\n",
- " ))\n",
- "\n",
- " # Se specificata una cartella, salva i plot di analisi\n",
+ " main_plot_path = conf_matrix_path = None # Visualizations (paths default to None so the returned dict never reads an undefined name)\n",
" if folder_name is not None:\n",
" try:\n",
" os.makedirs(folder_name, exist_ok=True)\n",
" timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"\n",
- " # Plot di confronto tra raw e rounded predictions\n",
- " plt.figure(figsize=(15, 5))\n",
+ " # Main figure with 4 subplots\n",
+ " fig = plt.figure(figsize=(20, 15))\n",
"\n",
- " # Plot 1: Scatter plot confronto\n",
- " plt.subplot(1, 3, 1)\n",
- " plt.scatter(y_true, y_pred, alpha=0.5, label='Raw')\n",
- " plt.scatter(y_true, y_pred_clipped, alpha=0.5, label='Rounded')\n",
+ " # 1. Error distribution\n",
+ " plt.subplot(2, 2, 1)\n",
+ " plt.hist(errors, bins=50, alpha=0.7)\n",
+ " plt.title('Prediction Error Distribution')\n",
+ " plt.xlabel('Error')\n",
+ " plt.ylabel('Frequency')\n",
+ "\n",
+ " # 2. Actual vs Predicted scatter plot\n",
+ " plt.subplot(2, 2, 2)\n",
+ " plt.scatter(y_true, y_pred, alpha=0.5)\n",
" plt.plot([0, 11], [0, 11], 'r--', lw=2)\n",
- " plt.xlabel('Actual UV Index')\n",
- " plt.ylabel('Predicted UV Index')\n",
- " plt.title('Raw vs Rounded Predictions')\n",
- " plt.legend()\n",
- " plt.grid(True)\n",
+ " plt.title('Actual vs Predicted Values')\n",
+ " plt.xlabel('Actual Values')\n",
+ " plt.ylabel('Predicted Values')\n",
"\n",
- " # Plot 2: Distribuzione errori raw\n",
- " plt.subplot(1, 3, 2)\n",
- " plt.hist(y_pred - y_true, bins=50, alpha=0.7)\n",
- " plt.xlabel('Prediction Error (Raw)')\n",
- " plt.ylabel('Frequency')\n",
- " plt.title('Distribution of Raw Errors')\n",
- " plt.grid(True)\n",
+ " # 3. Errors vs Actual Values\n",
+ " plt.subplot(2, 2, 3)\n",
+ " plt.scatter(y_true, errors, alpha=0.5)\n",
+ " plt.axhline(y=0, color='r', linestyle='--')\n",
+ " plt.title('Errors vs Actual Values')\n",
+ " plt.xlabel('Actual Values')\n",
+ " plt.ylabel('Error')\n",
"\n",
- " # Plot 3: Distribuzione errori rounded\n",
- " plt.subplot(1, 3, 3)\n",
- " plt.hist(y_pred_clipped - y_true, bins=50, alpha=0.7)\n",
- " plt.xlabel('Prediction Error (Rounded)')\n",
- " plt.ylabel('Frequency')\n",
- " plt.title('Distribution of Rounded Errors')\n",
- " plt.grid(True)\n",
+ " # 4. Accuracy and MAE by range\n",
+ " ax = plt.subplot(2, 2, 4)\n",
+ " # Plot only the ranges that contain samples so labels and values stay aligned\n",
+ " plotted = [r for r in uv_ranges if r[2] in range_analysis]\n",
+ " x_labels = [f\"{label}\\n({low}-{high})\" for low, high, label in plotted]\n",
+ " accuracies = [range_analysis[label]['accuracy_within_05'] for _, _, label in plotted]\n",
+ " mae_values = [range_analysis[label]['mae'] for _, _, label in plotted]\n",
+ "\n",
+ " bars = plt.bar(x_labels, accuracies, alpha=0.6)\n",
+ " plt.ylabel('Accuracy within ±0.5 (%)')\n",
+ " plt.title('Accuracy and MAE by UV Range')\n",
+ "\n",
+ " # Add MAE as line\n",
+ " ax2 = ax.twinx()\n",
+ " ax2.plot(x_labels, mae_values, 'r-o', label='MAE')\n",
+ " ax2.set_ylabel('MAE', color='red')\n",
"\n",
" plt.tight_layout()\n",
"\n",
- " # Salva il plot\n",
- " filename = os.path.join(folder_name, f'uv_prediction_analysis_{timestamp}.png')\n",
- " plt.savefig(filename, dpi=300, bbox_inches='tight')\n",
- " print(f\"\\nPlot di analisi salvato come: {filename}\")\n",
+ " # Save main figure\n",
+ " main_plot_path = os.path.join(folder_name, f'uv_analysis_{timestamp}.png')\n",
+ " plt.savefig(main_plot_path, dpi=300, bbox_inches='tight')\n",
"\n",
- " plt.show()\n",
+ " # Confusion matrix as separate plot\n",
+ " plt.figure(figsize=(10, 8))\n",
+ " sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')\n",
+ " plt.title('Confusion Matrix for UV Risk Levels')\n",
+ "\n",
+ " conf_matrix_path = os.path.join(folder_name, f'confusion_matrix_{timestamp}.png')\n",
+ " plt.savefig(conf_matrix_path, dpi=300, bbox_inches='tight')\n",
+ "\n",
+ " plt.close('all')\n",
"\n",
" except Exception as e:\n",
- " print(f\"\\nErrore nel salvare i plot: {str(e)}\")\n",
+ " print(f\"\\nError saving plots: {str(e)}\")\n",
"\n",
- " # Restituisci tutte le metriche in un dizionario\n",
- " metrics = {\n",
- " 'raw': {\n",
- " 'mae': mae_raw,\n",
- " 'rmse': rmse_raw,\n",
- " 'r2': r2_raw,\n",
- " 'within_05': within_05_raw,\n",
- " 'within_1': within_1_raw,\n",
- " 'within_2': within_2_raw,\n",
- " 'level_accuracy': level_accuracy_raw\n",
- " },\n",
- " 'rounded': {\n",
- " 'mae': mae_rounded,\n",
- " 'rmse': rmse_rounded,\n",
- " 'r2': r2_rounded,\n",
- " 'exact_match': exact_accuracy,\n",
- " 'one_off': one_off_accuracy,\n",
- " 'two_off': two_off_accuracy,\n",
- " 'level_accuracy': level_accuracy_rounded\n",
+ " # Print detailed report\n",
+ " print(\"\\nUV Index Prediction Analysis:\")\n",
+ " print(\"\\nRaw Metrics:\")\n",
+ " for key, value in metrics['raw'].items():\n",
+ " print(f\"{key}: {value:.3f}\")\n",
+ "\n",
+ " print(\"\\nRounded Metrics:\")\n",
+ " for key, value in metrics['rounded'].items():\n",
+ " print(f\"{key}: {value:.3f}\")\n",
+ "\n",
+ " print(\"\\nAnalysis by UV Range:\")\n",
+ " for label, stats in range_analysis.items():\n",
+ " print(f\"\\n{label}:\")\n",
+ " for key, value in stats.items():\n",
+ " print(f\" {key}: {value:.3f}\")\n",
+ "\n",
+ " print(\"\\nConfusion Matrix:\")\n",
+ " print(cm_df)\n",
+ "\n",
+ " # Add range analysis and confusion matrix to metrics dictionary\n",
+ " metrics.update({\n",
+ " 'range_analysis': range_analysis,\n",
+ " 'confusion_matrix': cm_df.to_dict(),\n",
+ " 'plot_paths': {\n",
+ " 'main_analysis': main_plot_path,\n",
+ " 'confusion_matrix': conf_matrix_path\n",
" }\n",
- " }\n",
+ " })\n",
"\n",
" return metrics\n",
"\n",
"def plot_training_history(history, folder_name=None):\n",
" \"\"\"\n",
- " Visualizza e salva i plot della loss e delle metriche durante il training\n",
+ " Visualize and save the loss and metrics plots during training\n",
"\n",
" Parameters:\n",
" -----------\n",
" history : tensorflow.keras.callbacks.History\n",
- " L'oggetto history restituito dal training del modello\n",
+ " The history object returned by model training\n",
" folder_name : str\n",
- " Cartella dove salvare il plot\n",
+ " Folder where to save the plot\n",
" \"\"\"\n",
" import os\n",
"\n",
" try:\n",
- " # Crea la figura\n",
+ " # Create the figure\n",
" plt.figure(figsize=(12, 4))\n",
"\n",
- " # Plot della Loss\n",
+ " # Loss Plot\n",
" plt.subplot(1, 2, 1)\n",
" plt.plot(history.history['loss'], label='Training Loss')\n",
" plt.plot(history.history['val_loss'], label='Validation Loss')\n",
@@ -613,7 +966,7 @@
" plt.legend()\n",
" plt.grid(True)\n",
"\n",
- " # Plot del MAE\n",
+ " # MAE Plot\n",
" plt.subplot(1, 2, 2)\n",
" plt.plot(history.history['mae'], label='Training MAE')\n",
" plt.plot(history.history['val_mae'], label='Validation MAE')\n",
@@ -627,14 +980,14 @@
"\n",
" if folder_name is not None:\n",
" os.makedirs(folder_name, exist_ok=True)\n",
- " # Genera il nome del file con timestamp\n",
+ " # Build the output path (fixed filename, no timestamp)\n",
" filename = os.path.join(folder_name, 'training_history.png')\n",
"\n",
- " # Salva la figura\n",
+ " # Save the figure\n",
" plt.savefig(filename, dpi=300, bbox_inches='tight')\n",
- " print(f\"\\nPlot della training history salvato come: {filename}\")\n",
+ " print(f\"\\nTraining history plot saved as: {filename}\")\n",
"\n",
- " # Salva anche i dati numerici in formato CSV\n",
+ " # Also save numerical data in CSV format\n",
" history_df = pd.DataFrame({\n",
" 'epoch': range(1, len(history.history['loss']) + 1),\n",
" 'training_loss': history.history['loss'],\n",
@@ -646,9 +999,9 @@
" if folder_name is not None:\n",
" csv_filename = os.path.join(folder_name, 'training_history.csv')\n",
" history_df.to_csv(csv_filename, index=False)\n",
- " print(f\"Dati della training history salvati come: {csv_filename}\")\n",
+ " print(f\"Training history data saved as: {csv_filename}\")\n",
"\n",
- " # Calcola e salva le statistiche finali\n",
+ " # Calculate and save final statistics\n",
" final_stats = {\n",
" 'final_training_loss': history.history['loss'][-1],\n",
" 'final_validation_loss': history.history['val_loss'][-1],\n",
@@ -660,56 +1013,56 @@
" }\n",
"\n",
" if folder_name is not None:\n",
- " # Salva le statistiche in formato JSON\n",
+ " # Save statistics in JSON format\n",
" stats_filename = os.path.join(folder_name, 'training_stats.json')\n",
" with open(stats_filename, 'w') as f:\n",
" json.dump(final_stats, f, indent=4)\n",
- " print(f\"Statistiche finali salvate come: {stats_filename}\")\n",
+ " print(f\"Final statistics saved as: {stats_filename}\")\n",
"\n",
- " # Stampa le statistiche principali\n",
- " print(\"\\nStatistiche finali del training:\")\n",
- " print(f\"Loss finale (train/val): {final_stats['final_training_loss']:.4f}/{final_stats['final_validation_loss']:.4f}\")\n",
- " print(f\"MAE finale (train/val): {final_stats['final_training_mae']:.4f}/{final_stats['final_validation_mae']:.4f}\")\n",
- " print(f\"Miglior validation loss: {final_stats['best_validation_loss']:.4f}\")\n",
- " print(f\"Miglior validation MAE: {final_stats['best_validation_mae']:.4f}\")\n",
+ " # Print main statistics\n",
+ " print(\"\\nFinal training statistics:\")\n",
+ " print(f\"Final Loss (train/val): {final_stats['final_training_loss']:.4f}/{final_stats['final_validation_loss']:.4f}\")\n",
+ " print(f\"Final MAE (train/val): {final_stats['final_training_mae']:.4f}/{final_stats['final_validation_mae']:.4f}\")\n",
+ " print(f\"Best validation loss: {final_stats['best_validation_loss']:.4f}\")\n",
+ " print(f\"Best validation MAE: {final_stats['best_validation_mae']:.4f}\")\n",
"\n",
" plt.show()\n",
"\n",
" except Exception as e:\n",
- " print(f\"\\nErrore durante la creazione o il salvataggio dei plot: {str(e)}\")\n",
+ " print(f\"\\nError during plot creation or saving: {str(e)}\")\n",
"\n",
"\n",
"def train_hybrid_model(model, X_train, y_train, X_test, y_test, epochs=100, batch_size=32, folder_name='uv_index'):\n",
" \"\"\"\n",
- " Funzione di training avanzata per il modello ibrido UV index con monitoraggio dettagliato\n",
- " e gestione del training.\n",
+ " Advanced training function for the hybrid UV index model with detailed monitoring\n",
+ " and training management.\n",
"\n",
" Parameters:\n",
" -----------\n",
" model : keras.Model\n",
- " Il modello ibrido compilato\n",
+ " The compiled hybrid model\n",
" X_train : numpy.ndarray\n",
- " Dati di training\n",
+ " Training data\n",
" y_train : numpy.ndarray\n",
- " Target di training\n",
+ " Training targets\n",
" X_test : numpy.ndarray\n",
- " Dati di validation\n",
+ " Validation data\n",
" y_test : numpy.ndarray\n",
- " Target di validation\n",
+ " Validation targets\n",
" epochs : int, optional\n",
- " Numero massimo di epoche di training\n",
+ " Maximum number of training epochs\n",
" batch_size : int, optional\n",
- " Dimensione del batch\n",
+ " Batch size\n",
"\n",
" Returns:\n",
" --------\n",
" history : keras.callbacks.History\n",
- " Storia del training con tutte le metriche\n",
+ " Training history with all metrics\n",
" \"\"\"\n",
"\n",
- " # Callbacks avanzati per il training\n",
+ " # Advanced callbacks for training\n",
" callbacks = [\n",
- " # Early Stopping avanzato\n",
+ " # Advanced Early Stopping\n",
" EarlyStopping(\n",
" monitor='mae',\n",
" patience=15,\n",
@@ -752,7 +1105,7 @@
" ),\n",
" tf.keras.callbacks.LambdaCallback(\n",
" on_epoch_end=lambda epoch, logs: print(\n",
- " f\"\\nEpoch {epoch + 1}: Predizioni fuori range: \"\n",
+ " f\"\\nEpoch {epoch + 1}: Out of range predictions: \"\n",
" f\"{np.sum((model.predict(X_test) < 0) | (model.predict(X_test) > 11))}\"\n",
" ) if epoch % 20 == 0 else None\n",
" )\n",
@@ -770,142 +1123,104 @@
" validation_freq=1,\n",
" )\n",
"\n",
- " # Analisi post-training\n",
- " print(\"\\nTraining completato con successo!\")\n",
+ " # Post-training analysis\n",
+ " print(\"\\nTraining completed successfully!\")\n",
"\n",
- " # Valutazione finale sul test set\n",
+ " # Final evaluation on test set\n",
" test_loss, test_mae, test_mse = model.evaluate(X_test, y_test, verbose=0)\n",
- " print(f\"\\nMetriche finali sul test set:\")\n",
+ " print(f\"\\nFinal metrics on test set:\")\n",
" print(f\"Loss: {test_loss:.4f}\")\n",
" print(f\"MAE: {test_mae:.4f}\")\n",
" print(f\"MSE: {test_mse:.4f}\")\n",
"\n",
- " # Analisi delle predizioni\n",
+ " # Prediction analysis\n",
" predictions = model.predict(X_test)\n",
" out_of_range = np.sum((predictions < 0) | (predictions > 11))\n",
- " print(f\"\\nPredizioni fuori range: {out_of_range} ({out_of_range / len(predictions) * 100:.2f}%)\")\n",
+ " print(f\"\\nOut of range predictions: {out_of_range} ({out_of_range / len(predictions) * 100:.2f}%)\")\n",
"\n",
" plot_training_history(history, folder_name=folder_name)\n",
"\n",
" return history\n",
"\n",
" except Exception as e:\n",
- " print(f\"\\nErrore durante il training: {str(e)}\")\n",
+ " print(f\"\\nError during training: {str(e)}\")\n",
" raise\n",
"\n",
" finally:\n",
- " # Pulizia della memoria\n",
+ " # Memory cleanup\n",
" tf.keras.backend.clear_session()\n",
"\n",
"\n",
- "def calculate_class_weights(y_train, n_classes=12):\n",
- " \"\"\"\n",
- " Calcola i pesi delle classi per bilanciare il dataset UV index.\n",
- " \n",
- " Parameters:\n",
- " -----------\n",
- " y_train : numpy.ndarray\n",
- " Array dei valori UV di training\n",
- " n_classes : int, optional\n",
- " Numero di classi possibili (0-11 per UV index, quindi 12 classi)\n",
- " \n",
- " Returns:\n",
- " --------\n",
- " dict:\n",
- " Dizionario con i pesi per ogni classe\n",
- " \"\"\"\n",
- " # Arrotonda i valori UV al più vicino intero e converti in intero\n",
- " y_discrete = np.clip(np.round(y_train), 0, 11).astype(int)\n",
- "\n",
- " # Calcola la frequenza di ogni classe\n",
- " unique, counts = np.unique(y_discrete, return_counts=True)\n",
- " total_samples = len(y_discrete)\n",
- "\n",
- " # Calcola i pesi inversamente proporzionali alla frequenza\n",
- " weights = {}\n",
- " for i in range(n_classes):\n",
- " if i in unique:\n",
- " # Se la classe è presente, calcola il peso\n",
- " weight = total_samples / (len(unique) * counts[unique == i][0])\n",
- " else:\n",
- " # Se la classe non è presente, assegna un peso neutro\n",
- " weight = 1.0\n",
- " weights[i] = weight\n",
- "\n",
- " return weights\n",
- "\n",
- "\n",
"def integrate_predictions(df, predictions, sequence_length=24):\n",
" \"\"\"\n",
- " Integra le predizioni dell'UV index nel dataset originale per i dati precedenti al 2010.\n",
- " \n",
+ " Fill missing UV index values in the dataset with model predictions for pre-2010 rows.\n",
+ "\n",
" Parameters:\n",
" -----------\n",
" df : pandas.DataFrame\n",
- " Dataset originale\n",
+ " Original dataset with 'datetime' and 'uvindex' columns (its 'datetime' column is converted in place)\n",
" predictions : numpy.ndarray\n",
- " Array delle predizioni UV index\n",
+ " Array of UV index predictions, flattened and paired with the pre-2010 timestamps\n",
" sequence_length : int\n",
- " Lunghezza della sequenza usata per le predizioni\n",
- " \n",
+ " Sequence length used for predictions; the first sequence_length - 1 pre-2010 timestamps get no prediction\n",
+ "\n",
" Returns:\n",
" --------\n",
" pandas.DataFrame\n",
- " Dataset aggiornato con le predizioni UV index\n",
+ " Updated dataset with UV index predictions\n",
" \"\"\"\n",
- " # Converti datetime in formato datetime se non lo è già\n",
+ " # Ensure the 'datetime' column has a datetime dtype\n",
" df['datetime'] = pd.to_datetime(df['datetime'])\n",
"\n",
- " # Identifica le righe precedenti al 2010\n",
+ " # Identify pre-2010 rows\n",
" mask_pre_2010 = df['datetime'].dt.year < 2010\n",
"\n",
- " # Crea un DataFrame temporaneo con le predizioni\n",
+ " # Pair each prediction with its timestamp, skipping the first sequence_length - 1 pre-2010 rows\n",
" dates_pre_2010 = df[mask_pre_2010]['datetime'].iloc[sequence_length - 1:]\n",
" predictions_df = pd.DataFrame({\n",
" 'datetime': dates_pre_2010,\n",
" 'uvindex_predicted': predictions.flatten()\n",
" })\n",
"\n",
- " # Merge con il dataset originale\n",
+ " # Left merge on 'datetime' so every original row is kept\n",
" df = df.merge(predictions_df, on='datetime', how='left')\n",
"\n",
- " # Aggiorna la colonna uvindex dove manca\n",
+ " # Fill only the missing uvindex values; existing values are left untouched\n",
" df['uvindex'] = df['uvindex'].fillna(df['uvindex_predicted'])\n",
"\n",
- " # Rimuovi la colonna temporanea\n",
+ " # Remove temporary column\n",
" df = df.drop('uvindex_predicted', axis=1)\n",
"\n",
- " print(f\"Aggiunte {len(predictions)} predizioni al dataset\")\n",
- " print(f\"Righe con UV index dopo l'integrazione: {df['uvindex'].notna().sum()}\")\n",
+ " print(f\"Added {len(predictions)} predictions to dataset\")\n",
+ " print(f\"Rows with UV index after integration: {df['uvindex'].notna().sum()}\")\n",
"\n",
" return df\n",
"\n",
"\n",
"def train_uvindex_bounded_model(df):\n",
" \"\"\"\n",
- " Training del modello con vincoli specifici per UV index\n",
+ " Training of the model with specific constraints for UV index\n",
" \"\"\"\n",
- " print(\"Inizializzazione del training del modello UV index...\")\n",
+ " print(\"Initializing UV index model training...\")\n",
"\n",
" try:\n",
- "\n",
- " # Preparazione dei dati\n",
- " print(\"\\n1. Preparazione dei dati...\")\n",
+ " # Data preparation\n",
+ " print(\"\\n1. Preparing data...\")\n",
" X_train_seq, X_test_seq, y_train, y_test, scaler, features, X_to_predict_seq = prepare_hybrid_data(df)\n",
"\n",
- " print(f\"Shape dei dati di training: {X_train_seq.shape}\")\n",
- " print(f\"Shape dei dati di test: {X_test_seq.shape}\")\n",
+ " print(f\"Training data shape: {X_train_seq.shape}\")\n",
+ " print(f\"Test data shape: {X_test_seq.shape}\")\n",
"\n",
- " # Verifica della qualità dei dati\n",
+ " # Data quality verification\n",
" if np.isnan(X_train_seq).any() or np.isnan(y_train).any():\n",
- " raise ValueError(\"Trovati valori NaN nei dati di training\")\n",
+ " raise ValueError(\"Found NaN values in training data\")\n",
"\n",
- " # Creazione del modello\n",
- " print(\"\\n2. Creazione del modello...\")\n",
+ " # Model creation\n",
+ " print(\"\\n2. Creating model...\")\n",
" input_shape = (X_train_seq.shape[1], X_train_seq.shape[2])\n",
" model = create_uv_index_model(input_shape, folder_name)\n",
"\n",
- " print(\"\\n4. Avvio del training...\")\n",
+ " print(\"\\n4. Starting training...\")\n",
" history = train_hybrid_model(\n",
" model=model,\n",
" X_train=X_train_seq,\n",
@@ -917,14 +1232,14 @@
" folder_name=folder_name\n",
" )\n",
"\n",
- " print(\"\\n5. Generazione delle predizioni...\")\n",
+ " print(\"\\n5. Generating predictions...\")\n",
" predictions = model.predict(X_test_seq)\n",
" predictions = np.clip(predictions, 0, 11)\n",
"\n",
- " print(\"\\n6. Valutazione del modello...\")\n",
+ " print(\"\\n6. Model evaluation...\")\n",
" metrics = evaluate_uv_predictions(y_test, predictions, folder_name=folder_name)\n",
"\n",
- " # Creazione del dizionario dei risultati\n",
+ " # Creating results dictionary\n",
" training_results = {\n",
" 'model_params': {\n",
" 'input_shape': input_shape,\n",
@@ -935,7 +1250,6 @@
" 'batch_size': 32,\n",
" 'total_epochs': len(history.history['loss']),\n",
" 'best_epoch': np.argmin(history.history['val_loss']) + 1,\n",
- " #'class_weights': {str(k): float(v) for k, v in class_weights.items()}\n",
" },\n",
" 'performance_metrics': {\n",
" 'final_loss': float(history.history['val_loss'][-1]),\n",
@@ -945,16 +1259,16 @@
" }\n",
" }\n",
"\n",
- " print(\"\\n7. Predizione dei dati mancanti risultati...\")\n",
+ " print(\"\\n7. Predicting missing data results...\")\n",
" to_predict_predictions = model.predict(X_to_predict_seq)\n",
" to_predict_predictions = np.clip(to_predict_predictions, 0, 11)\n",
"\n",
- " print(\"\\n8. Integrazione delle predizioni nel dataset originale...\")\n",
+ " print(\"\\n8. Integrating predictions into original dataset...\")\n",
" df_updated = integrate_predictions(df.copy(), to_predict_predictions)\n",
"\n",
- " df_updated.to_parquet('./data/weather_data_uvindex.parquet')\n",
+ " df_updated.to_parquet('../../sources/weather_data_uvindex.parquet')\n",
"\n",
- " # Aggiungi statistiche sulle predizioni al training_results\n",
+ " # Add prediction statistics to training_results\n",
" training_results['prediction_stats'] = {\n",
" 'n_predictions_added': len(to_predict_predictions),\n",
" 'mean_predicted_uv': float(to_predict_predictions.mean()),\n",
@@ -962,16 +1276,16 @@
" 'max_predicted_uv': float(to_predict_predictions.max()),\n",
" }\n",
"\n",
- " print(\"\\nTraining completato con successo!\")\n",
+ " print(\"\\nTraining completed successfully!\")\n",
"\n",
" return model, scaler, features, history, predictions, y_test, metrics, training_results\n",
"\n",
" except Exception as e:\n",
- " print(f\"\\nErrore durante il training: {str(e)}\")\n",
+ " print(f\"\\nError during training: {str(e)}\")\n",
" raise\n",
"\n",
" finally:\n",
- " # Pulizia della memoria\n",
+ " # Memory cleanup\n",
" tf.keras.backend.clear_session()"
]
},
@@ -1349,120 +1663,11 @@
}
],
"source": [
- "df = pd.read_parquet('../data/weather_data.parquet')\n",
+ "df = pd.read_parquet('../../sources/weather_data.parquet')\n",
"\n",
- "# Esegui il training\n",
"model, scaler, features, history, predictions, y_test, metrics, training_results = train_uvindex_bounded_model(df)"
]
},
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "637891db-8d55-4232-a56e-9759dbcc8c2f",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Analisi Precisione Predizioni UV Index:\n",
- "Precisione esatta: 61.3%\n",
- "Precisione entro 0.5 punti: 79.4%\n",
- "Precisione entro 1.0 punti: 90.4%\n",
- "Precisione livello di rischio: 88.0%\n",
- "\n",
- "Analisi errori per livello UV:\n",
- "MAE per UV Basso (0-2): 0.138 (n=41346)\n",
- "MAE per UV Moderato (2-5): 0.906 (n=11522)\n",
- "MAE per UV Alto (5-7): 0.877 (n=5468)\n",
- "MAE per UV Molto Alto (7-10): 0.758 (n=6278)\n",
- "MAE per UV Estremo (10-11): 1.528 (n=252)\n",
- "\n",
- "Statistiche degli errori:\n",
- "Media errori: 0.006\n",
- "Deviazione standard errori: 0.783\n",
- "Errore mediano: 0.000\n",
- "95° percentile errore assoluto: 1.723\n"
- ]
- }
- ],
- "source": [
- "def analyze_uv_prediction_quality(y_true, y_pred):\n",
- " \"\"\"\n",
- " Analisi dettagliata della qualità delle predizioni UV\n",
- " \"\"\"\n",
- " # Converti in numpy array e appiattisci\n",
- " y_true = np.array(y_true).ravel()\n",
- " y_pred = np.array(y_pred).ravel()\n",
- "\n",
- " # Arrotonda le predizioni al più vicino 0.5\n",
- " y_pred_rounded = np.round(y_pred * 2) / 2\n",
- "\n",
- " # Calcola diverse metriche di accuratezza usando array numpy\n",
- " exact_match = np.mean(np.abs(y_pred_rounded - y_true) < 1e-6) * 100 # uso di tolleranza per confronti float\n",
- " within_half = np.mean(np.abs(y_pred_rounded - y_true) <= 0.5) * 100\n",
- " within_one = np.mean(np.abs(y_pred_rounded - y_true) <= 1.0) * 100\n",
- "\n",
- " # Analisi per livello di rischio UV\n",
- " def get_uv_risk_level(values):\n",
- " # Vettorizzazione della funzione per array numpy\n",
- " levels = np.zeros_like(values, dtype=str)\n",
- " levels[values <= 2] = 'Basso'\n",
- " levels[(values > 2) & (values <= 5)] = 'Moderato'\n",
- " levels[(values > 5) & (values <= 7)] = 'Alto'\n",
- " levels[(values > 7) & (values <= 10)] = 'Molto Alto'\n",
- " levels[values > 10] = 'Estremo'\n",
- " return levels\n",
- "\n",
- " y_true_risk = get_uv_risk_level(y_true)\n",
- " y_pred_risk = get_uv_risk_level(y_pred_rounded)\n",
- "\n",
- " risk_accuracy = np.mean(y_true_risk == y_pred_risk) * 100\n",
- "\n",
- " print(\"Analisi Precisione Predizioni UV Index:\")\n",
- " print(f\"Precisione esatta: {exact_match:.1f}%\")\n",
- " print(f\"Precisione entro 0.5 punti: {within_half:.1f}%\")\n",
- " print(f\"Precisione entro 1.0 punti: {within_one:.1f}%\")\n",
- " print(f\"Precisione livello di rischio: {risk_accuracy:.1f}%\")\n",
- "\n",
- " # Distribuzione degli errori per livello UV\n",
- " uv_ranges = [(0, 2), (2, 5), (5, 7), (7, 10), (10, 11)]\n",
- " labels = ['Basso', 'Moderato', 'Alto', 'Molto Alto', 'Estremo']\n",
- "\n",
- " print(\"\\nAnalisi errori per livello UV:\")\n",
- " for (low, high), label in zip(uv_ranges, labels):\n",
- " mask = (y_true >= low) & (y_true < high)\n",
- " if np.sum(mask) > 0:\n",
- " mae_range = np.mean(np.abs(y_pred[mask] - y_true[mask]))\n",
- " n_samples = np.sum(mask)\n",
- " print(f\"MAE per UV {label} ({low}-{high}): {mae_range:.3f} (n={n_samples})\")\n",
- "\n",
- " # Analisi aggiuntiva della distribuzione degli errori\n",
- " errors = y_pred - y_true\n",
- " print(\"\\nStatistiche degli errori:\")\n",
- " print(f\"Media errori: {np.mean(errors):.3f}\")\n",
- " print(f\"Deviazione standard errori: {np.std(errors):.3f}\")\n",
- " print(f\"Errore mediano: {np.median(errors):.3f}\")\n",
- " print(f\"95° percentile errore assoluto: {np.percentile(np.abs(errors), 95):.3f}\")\n",
- "\n",
- " return {\n",
- " 'exact_match': exact_match,\n",
- " 'within_half': within_half,\n",
- " 'within_one': within_one,\n",
- " 'risk_accuracy': risk_accuracy,\n",
- " 'error_stats': {\n",
- " 'mean': float(np.mean(errors)),\n",
- " 'std': float(np.std(errors)),\n",
- " 'median': float(np.median(errors)),\n",
- " 'p95_abs': float(np.percentile(np.abs(errors), 95))\n",
- " }\n",
- " }\n",
- "\n",
- "\n",
- "# Per utilizzare l'analisi:\n",
- "metrics = analyze_uv_prediction_quality(y_test, predictions)"
- ]
- },
{
"cell_type": "code",
"execution_count": 11,
@@ -1500,21 +1705,21 @@
"source": [
"def plot_error_analysis(y_true, y_pred, folder_name=None):\n",
" \"\"\"\n",
- " Funzione per visualizzare l'analisi degli errori di predizione\n",
+ " Function to visualize prediction error analysis\n",
"\n",
" Parameters:\n",
" -----------\n",
" y_true : array-like\n",
- " Valori reali\n",
+ " Actual values\n",
" y_pred : array-like\n",
- " Valori predetti\n",
+ " Predicted values\n",
" folder_name : str, optional\n",
- " Cartella dove salvare i plot. Se None, i plot non vengono salvati.\n",
+ " Folder to save plots. If None, plots are not saved.\n",
" \"\"\"\n",
" import os\n",
" from datetime import datetime\n",
"\n",
- " # Converti in array numpy 1D se necessario\n",
+ " # Convert to 1D numpy array if necessary\n",
" if isinstance(y_true, pd.Series):\n",
" y_true = y_true.values\n",
" if isinstance(y_pred, pd.Series):\n",
@@ -1523,330 +1728,72 @@
" y_true = y_true.ravel()\n",
" y_pred = y_pred.ravel()\n",
"\n",
- " # Calcola gli errori\n",
+ " # Calculate errors\n",
" errors = y_pred - y_true\n",
"\n",
- " # Crea la figura principale\n",
+ " # Create main figure\n",
" fig = plt.figure(figsize=(15, 5))\n",
"\n",
- " # Plot 1: Distribuzione degli errori\n",
+ " # Plot 1: Error Distribution\n",
" plt.subplot(1, 3, 1)\n",
" plt.hist(errors, bins=50, alpha=0.7)\n",
- " plt.title('Distribuzione degli Errori di Predizione')\n",
- " plt.xlabel('Errore')\n",
- " plt.ylabel('Frequenza')\n",
+ " plt.title('Prediction Error Distribution')\n",
+ " plt.xlabel('Error')\n",
+ " plt.ylabel('Frequency')\n",
"\n",
" # Plot 2: Actual vs Predicted\n",
" plt.subplot(1, 3, 2)\n",
" plt.scatter(y_true, y_pred, alpha=0.5)\n",
" plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)\n",
- " plt.title('Valori Reali vs Predetti')\n",
- " plt.xlabel('Valori Reali')\n",
- " plt.ylabel('Valori Predetti')\n",
+ " plt.title('Actual vs Predicted Values')\n",
+ " plt.xlabel('Actual Values')\n",
+ " plt.ylabel('Predicted Values')\n",
"\n",
- " # Plot 3: Errori vs Valori Reali\n",
+ " # Plot 3: Errors vs Actual Values\n",
" plt.subplot(1, 3, 3)\n",
" plt.scatter(y_true, errors, alpha=0.5)\n",
" plt.axhline(y=0, color='r', linestyle='--')\n",
- " plt.title('Errori vs Valori Reali')\n",
- " plt.xlabel('Valori Reali')\n",
- " plt.ylabel('Errore')\n",
+ " plt.title('Errors vs Actual Values')\n",
+ " plt.xlabel('Actual Values')\n",
+ " plt.ylabel('Error')\n",
"\n",
" plt.tight_layout()\n",
"\n",
- " # Salva il plot se è specificata una cartella\n",
+ " # Save plot if folder is specified\n",
" if folder_name is not None:\n",
" try:\n",
- " # Crea la cartella se non esiste\n",
+ " # Create folder if it doesn't exist\n",
" os.makedirs(folder_name, exist_ok=True)\n",
"\n",
- " # Genera il nome del file con timestamp\n",
+ " # Generate filename with timestamp\n",
" timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
" filename = os.path.join(folder_name, f'error_analysis_{timestamp}.png')\n",
"\n",
- " # Salva la figura\n",
+ " # Save figure\n",
" plt.savefig(filename, dpi=300, bbox_inches='tight')\n",
- " print(f\"\\nPlot salvato come: {filename}\")\n",
+ " print(f\"\\nPlot saved as: {filename}\")\n",
" except Exception as e:\n",
- " print(f\"\\nErrore nel salvare il plot: {str(e)}\")\n",
+ " print(f\"\\nError saving plot: {str(e)}\")\n",
"\n",
" plt.show()\n",
"\n",
- " # Stampa statistiche degli errori\n",
- " print(\"\\nStatistiche degli errori:\")\n",
+ " # Print error statistics\n",
+ " print(\"\\nError statistics:\")\n",
" print(f\"MAE: {np.mean(np.abs(errors)):.4f}\")\n",
" print(f\"MSE: {np.mean(errors ** 2):.4f}\")\n",
" print(f\"RMSE: {np.sqrt(np.mean(errors ** 2)):.4f}\")\n",
- " print(f\"Media errori: {np.mean(errors):.4f}\")\n",
- " print(f\"Std errori: {np.std(errors):.4f}\")\n",
+ " print(f\"Mean errors: {np.mean(errors):.4f}\")\n",
+ " print(f\"Std errors: {np.std(errors):.4f}\")\n",
"\n",
- " # Calcola percentuali di errori entro certe soglie\n",
+ " # Calculate percentage of errors within thresholds\n",
" thresholds = [0.5, 1.0, 1.5, 2.0]\n",
" for threshold in thresholds:\n",
" within_threshold = np.mean(np.abs(errors) <= threshold) * 100\n",
- " print(f\"Predizioni entro ±{threshold}: {within_threshold:.1f}%\")\n",
+ " print(f\"Predictions within ±{threshold}: {within_threshold:.1f}%\")\n",
"\n",
"\n",
"plot_error_analysis(y_test, predictions, folder_name=folder_name)"
]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "03bb9564-e518-4662-b3ee-4cfa96cdf696",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Matrice di Confusione per Livelli di Rischio UV:\n",
- " Basso Moderato Alto Molto Alto Estremo\n",
- "Basso 2931 19 0 2128 390\n",
- "Moderato 41 40299 0 995 11\n",
- "Alto 13 0 0 0 239\n",
- "Molto Alto 863 2454 0 8063 142\n",
- "Estremo 1878 1 0 169 4230\n",
- "\n",
- "Analisi Precisione Predizioni UV Index:\n",
- "Precisione esatta (±0.1): 55.1%\n",
- "Precisione entro 0.5 punti: 71.1%\n",
- "Precisione entro 1.0 punti: 86.1%\n",
- "Precisione livello di rischio: 85.6%\n",
- "\n",
- "Analisi errori per livello UV:\n",
- "MAE per UV Basso (0-2): 0.138 (n=41346)\n",
- "MAE per UV Moderato (2-5): 0.906 (n=11522)\n",
- "MAE per UV Alto (5-7): 0.877 (n=5468)\n",
- "MAE per UV Molto Alto (7-10): 0.758 (n=6278)\n",
- "MAE per UV Estremo (10-11): 1.528 (n=252)\n",
- "\n",
- "Statistiche degli errori:\n",
- "Media errori: 0.006\n",
- "Deviazione standard errori: 0.783\n",
- "Errore mediano: 0.000\n",
- "95° percentile errore assoluto: 1.723\n",
- "\n",
- "Distribuzione degli errori:\n",
- "Predizioni entro ±0.5: 71.1%\n",
- "Predizioni entro ±1.0: 86.1%\n",
- "Predizioni entro ±1.5: 93.2%\n",
- "Predizioni entro ±2.0: 96.5%\n"
- ]
- }
- ],
- "source": [
- "def plot_advanced_prediction_analysis(y_true, y_pred, folder_name=None):\n",
- " \"\"\"\n",
- " Funzione per visualizzare l'analisi degli errori di predizione e la precisione\n",
- "\n",
- " Parameters:\n",
- " -----------\n",
- " y_true : array-like\n",
- " Valori reali\n",
- " y_pred : array-like\n",
- " Valori predetti\n",
- " folder_name : str, optional\n",
- " Cartella dove salvare i plot. Se None, i plot non vengono salvati.\n",
- " \"\"\"\n",
- " import os\n",
- " from datetime import datetime\n",
- " import seaborn as sns\n",
- "\n",
- " # Converti in array numpy 1D se necessario\n",
- " if isinstance(y_true, pd.Series):\n",
- " y_true = y_true.values\n",
- " if isinstance(y_pred, pd.Series):\n",
- " y_pred = y_pred.values\n",
- "\n",
- " y_true = y_true.ravel()\n",
- " y_pred = y_pred.ravel()\n",
- "\n",
- " # Calcola gli errori\n",
- " errors = y_pred - y_true\n",
- "\n",
- " # Calcola accuracy per diversi livelli di tolleranza\n",
- " exact_accuracy = np.mean(np.abs(errors) < 0.1) * 100\n",
- " accuracy_05 = np.mean(np.abs(errors) <= 0.5) * 100\n",
- " accuracy_10 = np.mean(np.abs(errors) <= 1.0) * 100\n",
- "\n",
- " def get_risk_level(uv):\n",
- " if uv < 2:\n",
- " return 'Basso'\n",
- " elif uv < 5:\n",
- " return 'Moderato'\n",
- " elif uv < 7:\n",
- " return 'Alto'\n",
- " elif uv < 10:\n",
- " return 'Molto Alto'\n",
- " else:\n",
- " return 'Estremo'\n",
- "\n",
- " y_true_risk = [get_risk_level(x) for x in y_true]\n",
- " y_pred_risk = [get_risk_level(x) for x in y_pred]\n",
- " risk_accuracy = np.mean(np.array(y_true_risk) == np.array(y_pred_risk)) * 100\n",
- "\n",
- " # Crea la figura principale\n",
- " fig = plt.figure(figsize=(20, 10))\n",
- "\n",
- " # Plot 1: Distribuzione degli errori\n",
- " plt.subplot(2, 2, 1)\n",
- " plt.hist(errors, bins=50, alpha=0.7)\n",
- " plt.title('Distribuzione degli Errori di Predizione')\n",
- " plt.xlabel('Errore')\n",
- " plt.ylabel('Frequenza')\n",
- "\n",
- " # Plot 2: Actual vs Predicted\n",
- " plt.subplot(2, 2, 2)\n",
- " plt.scatter(y_true, y_pred, alpha=0.5)\n",
- " plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)\n",
- " plt.title('Valori Reali vs Predetti')\n",
- " plt.xlabel('Valori Reali')\n",
- " plt.ylabel('Valori Predetti')\n",
- "\n",
- " # Plot 3: Errori vs Valori Reali\n",
- " plt.subplot(2, 2, 3)\n",
- " plt.scatter(y_true, errors, alpha=0.5)\n",
- " plt.axhline(y=0, color='r', linestyle='--')\n",
- " plt.title('Errori vs Valori Reali')\n",
- " plt.xlabel('Valori Reali')\n",
- " plt.ylabel('Errore')\n",
- "\n",
- " # Plot 4: Precisione per intervallo di UV\n",
- " plt.subplot(2, 2, 4)\n",
- "\n",
- " uv_ranges = [(0, 2), (2, 5), (5, 7), (7, 10), (10, 11)]\n",
- " range_labels = ['Basso\\n(0-2)', 'Moderato\\n(2-5)', 'Alto\\n(5-7)', 'Molto Alto\\n(7-10)', 'Estremo\\n(10-11)']\n",
- "\n",
- " accuracies = []\n",
- " counts = []\n",
- " mae_per_range = []\n",
- "\n",
- " for (low, high) in uv_ranges:\n",
- " mask = (y_true >= low) & (y_true < high)\n",
- " if mask.any():\n",
- " mae = np.mean(np.abs(y_pred[mask] - y_true[mask]))\n",
- " mae_per_range.append(mae)\n",
- " count = np.sum(mask)\n",
- " counts.append(count)\n",
- " accuracy = np.mean(np.abs(y_pred[mask] - y_true[mask]) <= 0.5) * 100\n",
- " accuracies.append(accuracy)\n",
- "\n",
- " # Crea il grafico a barre con doppio asse y\n",
- " ax = plt.gca()\n",
- " bars = plt.bar(range_labels, accuracies, alpha=0.6, color='skyblue')\n",
- " plt.ylabel('Precisione (%)')\n",
- " plt.title('Precisione e MAE per Range UV')\n",
- "\n",
- " for bar in bars:\n",
- " height = bar.get_height()\n",
- " plt.text(bar.get_x() + bar.get_width() / 2., height,\n",
- " f'{height:.1f}%\\n(n={counts[bars.index(bar)]})',\n",
- " ha='center', va='bottom')\n",
- "\n",
- " ax2 = ax.twinx()\n",
- " line = ax2.plot(range_labels, mae_per_range, 'r-', marker='o', label='MAE')\n",
- " ax2.set_ylabel('MAE', color='red')\n",
- "\n",
- " for i, mae in enumerate(mae_per_range):\n",
- " ax2.text(i, mae, f'MAE: {mae:.3f}', color='red', ha='center', va='bottom')\n",
- "\n",
- " plt.xticks(rotation=45)\n",
- " plt.tight_layout()\n",
- "\n",
- " # Salva la figura principale se è specificata una cartella\n",
- " if folder_name is not None:\n",
- " try:\n",
- " # Crea la cartella se non esiste\n",
- " os.makedirs(folder_name, exist_ok=True)\n",
- "\n",
- " # Genera il timestamp\n",
- " timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
- "\n",
- " # Salva la figura principale\n",
- " main_plot_filename = os.path.join(folder_name, f'advanced_analysis_{timestamp}.png')\n",
- " plt.savefig(main_plot_filename, dpi=300, bbox_inches='tight')\n",
- " print(f\"\\nPlot principale salvato come: {main_plot_filename}\")\n",
- "\n",
- " # Crea e salva la matrice di confusione come plot separato\n",
- " plt.figure(figsize=(10, 8))\n",
- " cm = confusion_matrix(y_true_risk, y_pred_risk)\n",
- " risk_levels = ['Basso', 'Moderato', 'Alto', 'Molto Alto', 'Estremo']\n",
- " cm_df = pd.DataFrame(cm, columns=risk_levels, index=risk_levels)\n",
- "\n",
- " sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')\n",
- " plt.title('Matrice di Confusione per Livelli di Rischio UV')\n",
- " plt.tight_layout()\n",
- "\n",
- " conf_matrix_filename = os.path.join(folder_name, f'confusion_matrix_{timestamp}.png')\n",
- " plt.savefig(conf_matrix_filename, dpi=300, bbox_inches='tight')\n",
- " print(f\"Matrice di confusione salvata come: {conf_matrix_filename}\")\n",
- "\n",
- " except Exception as e:\n",
- " print(f\"\\nErrore nel salvare i plot: {str(e)}\")\n",
- "\n",
- " plt.show()\n",
- "\n",
- " # Stampa delle statistiche e analisi\n",
- " cm = confusion_matrix(y_true_risk, y_pred_risk)\n",
- " risk_levels = ['Basso', 'Moderato', 'Alto', 'Molto Alto', 'Estremo']\n",
- " cm_df = pd.DataFrame(cm, columns=risk_levels, index=risk_levels)\n",
- "\n",
- " print(\"\\nMatrice di Confusione per Livelli di Rischio UV:\")\n",
- " print(cm_df)\n",
- "\n",
- " print(\"\\nAnalisi Precisione Predizioni UV Index:\")\n",
- " print(f\"Precisione esatta (±0.1): {exact_accuracy:.1f}%\")\n",
- " print(f\"Precisione entro 0.5 punti: {accuracy_05:.1f}%\")\n",
- " print(f\"Precisione entro 1.0 punti: {accuracy_10:.1f}%\")\n",
- " print(f\"Precisione livello di rischio: {risk_accuracy:.1f}%\")\n",
- "\n",
- " print(\"\\nAnalisi errori per livello UV:\")\n",
- " uv_ranges = [(0, 2, 'Basso'), (2, 5, 'Moderato'), (5, 7, 'Alto'),\n",
- " (7, 10, 'Molto Alto'), (10, 11, 'Estremo')]\n",
- "\n",
- " for low, high, label in uv_ranges:\n",
- " mask = (y_true >= low) & (y_true < high)\n",
- " if mask.any():\n",
- " mae = np.mean(np.abs(errors[mask]))\n",
- " n_samples = np.sum(mask)\n",
- " print(f\"MAE per UV {label} ({low}-{high}): {mae:.3f} (n={n_samples})\")\n",
- "\n",
- " print(\"\\nStatistiche degli errori:\")\n",
- " print(f\"Media errori: {np.mean(errors):.3f}\")\n",
- " print(f\"Deviazione standard errori: {np.std(errors):.3f}\")\n",
- " print(f\"Errore mediano: {np.median(errors):.3f}\")\n",
- " print(f\"95° percentile errore assoluto: {np.percentile(np.abs(errors), 95):.3f}\")\n",
- "\n",
- " print(\"\\nDistribuzione degli errori:\")\n",
- " thresholds = [0.5, 1.0, 1.5, 2.0]\n",
- " for threshold in thresholds:\n",
- " within_threshold = np.mean(np.abs(errors) <= threshold) * 100\n",
- " print(f\"Predizioni entro ±{threshold}: {within_threshold:.1f}%\")\n",
- "\n",
- "\n",
- "# Usa la funzione\n",
- "plot_advanced_prediction_analysis(y_test, predictions, folder_name=folder_name)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "fe898941-2338-4157-b624-680bc2c517d8",
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
diff --git a/src/olive-oil-dashboard.py b/src/olive-oil-dashboard.py
old mode 100644
new mode 100755
diff --git a/src/olive_config.json b/src/olive_config.json
old mode 100644
new mode 100755
diff --git a/src/olive_oil_train_dataset/__pycache__/create_train_dataset.cpython-39.pyc b/src/olive_oil_train_dataset/__pycache__/create_train_dataset.cpython-39.pyc
old mode 100644
new mode 100755
diff --git a/src/olive_oil_train_dataset/create_train_dataset.py b/src/olive_oil_train_dataset/create_train_dataset.py
old mode 100644
new mode 100755
diff --git a/src/setup.py b/src/setup.py
old mode 100644
new mode 100755
diff --git a/src/sources.dvc b/src/sources.dvc
new file mode 100644
index 0000000..5c75a9d
--- /dev/null
+++ b/src/sources.dvc
@@ -0,0 +1,6 @@
+outs:
+- md5: 23e7daa876590e1c6ae9cb7af3be8028.dir
+ size: 984847509
+ nfiles: 5
+ hash: md5
+ path: sources
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
old mode 100644
new mode 100755
diff --git a/src/utils/__pycache__/__init__.cpython-39.pyc b/src/utils/__pycache__/__init__.cpython-39.pyc
old mode 100644
new mode 100755
diff --git a/src/utils/__pycache__/helpers.cpython-39.pyc b/src/utils/__pycache__/helpers.cpython-39.pyc
old mode 100644
new mode 100755
diff --git a/src/utils/helpers.py b/src/utils/helpers.py
old mode 100644
new mode 100755
index b408295..3f35a74
--- a/src/utils/helpers.py
+++ b/src/utils/helpers.py
@@ -429,7 +429,7 @@ def calculate_water_need(weather_data, base_need, optimal_temp):
rain_factor = 1 - 0.001 * weather_data['precip_sum'] # Diminuisce leggermente con l'aumentare delle precipitazioni
return base_need * temp_factor * rain_factor
-def create_technique_mapping(olive_varieties, mapping_path='./kaggle/working/models/technique_mapping.joblib'):
+def create_technique_mapping(olive_varieties, mapping_path='./sources/technique_mapping.joblib'):
# Estrai tutte le tecniche uniche dal dataset e convertile in lowercase
all_techniques = olive_varieties['Tecnica di Coltivazione'].str.lower().unique()
@@ -443,7 +443,7 @@ def create_technique_mapping(olive_varieties, mapping_path='./kaggle/working/mod
return technique_mapping
-def encode_techniques(df, mapping_path='./kaggle/working/models/technique_mapping.joblib'):
+def encode_techniques(df, mapping_path='./sources/technique_mapping.joblib'):
if not os.path.exists(mapping_path):
raise FileNotFoundError(f"Mapping not found at {mapping_path}. Run create_technique_mapping first.")
@@ -459,7 +459,7 @@ def encode_techniques(df, mapping_path='./kaggle/working/models/technique_mappin
return df
-def decode_techniques(df, mapping_path='./kaggle/working/models/technique_mapping.joblib'):
+def decode_techniques(df, mapping_path='./sources/technique_mapping.joblib'):
if not os.path.exists(mapping_path):
raise FileNotFoundError(f"Mapping not found at {mapping_path}")
@@ -477,7 +477,7 @@ def decode_techniques(df, mapping_path='./kaggle/working/models/technique_mappin
return df
-def decode_single_technique(technique_value, mapping_path='./kaggle/working/models/technique_mapping.joblib'):
+def decode_single_technique(technique_value, mapping_path='./sources/technique_mapping.joblib'):
if not os.path.exists(mapping_path):
raise FileNotFoundError(f"Mapping not found at {mapping_path}")