stop tracking src/sources

This commit is contained in:
Giuseppe Nucifora 2024-11-20 00:56:04 +01:00
parent ca5e1ddbc0
commit ffc74dc262
20 changed files with 5736 additions and 988 deletions

3
.dvc/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
/config.local
/tmp
/cache

3
.dvcignore Normal file
View File

@ -0,0 +1,3 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore

37
.idea/csv-editor.xml generated
View File

@ -1,37 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CsvFileAttributes">
<option name="attributeMap">
<map>
<entry key="$USER_HOME$/Downloads/Breadcrumb_Data.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/data/olive_varieties.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/data/simulated_data.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/data/variety_olive_oil_production.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
</map>
</option>
</component>
</project>

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -61,27 +61,23 @@
],
"source": [
"import tensorflow as tf\n",
"from tensorflow.keras.layers import Dense, LSTM, Conv1D, MultiHeadAttention, Dropout, BatchNormalization, \\\n",
" LayerNormalization, GlobalAveragePooling1D, GlobalMaxPooling1D, Concatenate, Input, Reshape, Activation, Lambda, \\\n",
" Bidirectional, Add, Multiply, MaxPooling1D\n",
"from tensorflow.keras.layers import Dense, LSTM, MultiHeadAttention, Dropout, BatchNormalization, LayerNormalization, Input, Activation, Lambda, Bidirectional, Add, MaxPooling1D\n",
"from tensorflow.keras import regularizers\n",
"from tensorflow.keras.models import Model\n",
"import tensorflow.keras.backend as K\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
"from sklearn.preprocessing import StandardScaler\n",
"from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau\n",
"from tensorflow.keras.optimizers import Adam, AdamW\n",
"import matplotlib.pyplot as plt\n",
"from tensorflow.keras.optimizers import AdamW\n",
"import json\n",
"import joblib\n",
"from sklearn.utils.class_weight import compute_class_weight\n",
"from datetime import datetime\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics import confusion_matrix\n",
"from tensorflow.keras.utils import plot_model"
"from tensorflow.keras.utils import plot_model\n",
"\n",
"folder_name = datetime.now().strftime(\"%Y-%m-%d_%H-%M\")"
]
},
{
@ -175,7 +171,6 @@
" return df\n",
"\n",
"\n",
"\n",
"def add_advanced_features(df):\n",
" # Features esistenti\n",
" df = add_time_features(df)\n",
@ -188,7 +183,7 @@
" # One-hot encoding per le feature categoriche\n",
" df = pd.get_dummies(df, columns=['season', 'time_period'])\n",
"\n",
" # Aggiungi interazioni tra variabili meteorologiche\n",
" # Interazioni tra variabili meteorologiche\n",
" df['temp_humidity'] = df['temp'] * df['humidity']\n",
" df['temp_cloudcover'] = df['temp'] * df['cloudcover']\n",
" df['visibility_cloudcover'] = df['visibility'] * df['cloudcover']\n",
@ -207,22 +202,21 @@
" df['cloudcover_rolling_mean_6h'] = df['cloudcover'].rolling(window=6).mean()\n",
"\n",
" df['temp_humidity_interaction'] = df['temp'] * df['humidity'] / 100\n",
" \n",
"\n",
" # Indicatore di condizioni estreme\n",
" df['extreme_conditions'] = ((df['temp'] > df['temp'].quantile(0.75)) & \n",
" (df['humidity'] < df['humidity'].quantile(0.25))).astype(int)\n",
" \n",
" df['extreme_conditions'] = ((df['temp'] > df['temp'].quantile(0.75)) & (df['humidity'] < df['humidity'].quantile(0.25))).astype(int)\n",
"\n",
" # Feature composite per la trasparenza atmosferica\n",
" df['atmospheric_transparency'] = (100 - df['cloudcover']) * (df['visibility'] / 10)\n",
" \n",
"\n",
" # Indicatori temporali più granulari per mezze stagioni\n",
" df['is_transition_season'] = ((df['season_Spring'] | df['season_Autumn'])).astype(int)\n",
" \n",
"\n",
" # Interazione tra angolo solare e copertura nuvolosa normalizzata\n",
" df['solar_cloud_effect'] = df['solar_elevation'] * (100 - df['cloudcover']) / 100\n",
" \n",
"\n",
" # Indicatore di stabilità atmosferica\n",
" df['pressure_stability'] = df.groupby(df.index.date if isinstance(df.index, pd.DatetimeIndex) \n",
" df['pressure_stability'] = df.groupby(df.index.date if isinstance(df.index, pd.DatetimeIndex)\n",
" else df.index.to_series().dt.date)['pressure'].transform(\n",
" lambda x: x.std()\n",
" ).fillna(0)\n",
@ -235,6 +229,7 @@
" df = add_advanced_features(df)\n",
"\n",
" target_variables = ['solarradiation', 'solarenergy', 'uvindex']\n",
"\n",
" # Selezione delle feature più rilevanti per UV index\n",
" selected_features = [\n",
" # Features meteorologiche base\n",
@ -256,7 +251,7 @@
" 'cloud_rolling_12h', 'temp_rolling_12h',\n",
" 'temp_rolling_mean_6h', 'cloudcover_rolling_mean_6h',\n",
"\n",
" # Features categoriche (da encodare)\n",
" # Features categoriche\n",
" 'season', 'time_period'\n",
" ]\n",
"\n",
@ -266,14 +261,12 @@
"\n",
" df = df.sort_values('datetime')\n",
" df.set_index('datetime', inplace=True)\n",
" # Rimozione delle righe con valori NaN (create dai rolling features)\n",
" #df = df.dropna()\n",
"\n",
" columns_to_interpolate = final_features + target_variables\n",
" for column in columns_to_interpolate:\n",
" df[column] = df[column].interpolate(method='time')\n",
"\n",
" # Rimuovi eventuali valori mancanti residui\n",
" #df.dropna(subset=features + selected_features, inplace=True)\n",
" df.fillna(0, inplace=True)\n",
"\n",
" data_after_2010 = df[df['year'] >= 2010].copy()\n",
@ -286,6 +279,7 @@
"\n",
" #print(X.head())\n",
" #print(X.columns)\n",
"\n",
" y = data_after_2010['uvindex']\n",
"\n",
" X_to_predict = data_before_2010[final_features]\n",
@ -361,7 +355,7 @@
" return x\n",
"\n",
"\n",
"def create_optimized_model(input_shape,folder_name, l2_lambda=0.005):\n",
"def create_uv_index_model(input_shape, folder_name, l2_lambda=0.005):\n",
" inputs = Input(shape=input_shape)\n",
"\n",
" # Primi due layer LSTM con sequenze\n",
@ -386,20 +380,19 @@
" x = BatchNormalization()(x)\n",
" x = Activation('swish')(x)\n",
" x = Dropout(0.1)(x)\n",
" \n",
"\n",
" outputs = Dense(1)(x)\n",
" outputs = Lambda(lambda x: tf.clip_by_value(x, 0, 11))(outputs)\n",
" \n",
"\n",
" model = Model(inputs=inputs, outputs=outputs, name=\"UvModel\")\n",
" \n",
" # 4. Optimizer con parametri conservativi\n",
" optimizer = Adam(\n",
" learning_rate=0.0005, # Learning rate più basso\n",
"\n",
" optimizer = AdamW(\n",
" learning_rate=0.0005,\n",
" beta_1=0.9,\n",
" beta_2=0.999,\n",
" epsilon=1e-07\n",
" )\n",
" \n",
"\n",
" model.compile(\n",
" optimizer=optimizer,\n",
" loss='huber',\n",
@ -407,48 +400,83 @@
" )\n",
" model.summary()\n",
"\n",
" plot_model(model, \n",
" plot_model(model,\n",
" to_file=f'{folder_name}_model_architecture.png',\n",
" show_shapes=True, # Mostra le dimensioni dei tensori\n",
" show_layer_names=True, # Mostra i nomi dei layer\n",
" dpi=96, # Risoluzione dell'immagine\n",
" show_layer_activations=True) \n",
" show_layer_activations=True)\n",
"\n",
" return model\n",
"\n",
"def evaluate_uv_predictions(y_true, y_pred):\n",
"\n",
"def evaluate_uv_predictions(y_true, y_pred, folder_name=None):\n",
" \"\"\"\n",
" Valutazione specifica per UV index con metriche categoriche\n",
" Valutazione specifica per UV index con metriche sia raw che categoriche\n",
"\n",
" Parameters:\n",
" -----------\n",
" y_true : array-like\n",
" Valori reali dell'UV index\n",
" y_pred : array-like\n",
" Valori predetti dell'UV index\n",
" folder_name : str, optional\n",
" Cartella dove salvare eventuali plot di analisi\n",
"\n",
" Returns:\n",
" --------\n",
" dict\n",
" Dizionario contenente tutte le metriche calcolate\n",
" \"\"\"\n",
" # Converti in numpy array se necessario\n",
" y_true = np.array(y_true)\n",
" y_pred = np.array(y_pred)\n",
" import os\n",
" from datetime import datetime\n",
"\n",
" y_true = np.array(y_true).ravel()\n",
" y_pred = np.array(y_pred).ravel()\n",
"\n",
" # Calcolo metriche sui valori raw\n",
" mae_raw = mean_absolute_error(y_true, y_pred)\n",
" rmse_raw = np.sqrt(mean_squared_error(y_true, y_pred))\n",
" r2_raw = r2_score(y_true, y_pred)\n",
"\n",
" # Arrotonda le predizioni al più vicino intero\n",
" y_pred_rounded = np.round(y_pred)\n",
"\n",
" # Clip dei valori tra 0 e 11\n",
" y_pred_clipped = np.clip(y_pred_rounded, 0, 11)\n",
"\n",
" # Calcolo metriche\n",
" mae = mean_absolute_error(y_true, y_pred_clipped)\n",
" rmse = np.sqrt(mean_squared_error(y_true, y_pred_clipped))\n",
" r2 = r2_score(y_true, y_pred_clipped)\n",
" # Calcolo metriche sui valori arrotondati\n",
" mae_rounded = mean_absolute_error(y_true, y_pred_clipped)\n",
" rmse_rounded = np.sqrt(mean_squared_error(y_true, y_pred_clipped))\n",
" r2_rounded = r2_score(y_true, y_pred_clipped)\n",
"\n",
" # Calcolo accuratezza per diversi margini di errore\n",
" exact_accuracy = np.mean(y_pred_clipped == y_true.ravel())\n",
" one_off_accuracy = np.mean(np.abs(y_pred_clipped - y_true.ravel()) <= 1)\n",
" two_off_accuracy = np.mean(np.abs(y_pred_clipped - y_true.ravel()) <= 2)\n",
" # Calcolo accuratezza per diversi margini di errore (sia raw che rounded)\n",
" # Raw\n",
" within_05_raw = np.mean(np.abs(y_pred - y_true) <= 0.5)\n",
" within_1_raw = np.mean(np.abs(y_pred - y_true) <= 1.0)\n",
" within_2_raw = np.mean(np.abs(y_pred - y_true) <= 2.0)\n",
"\n",
" # Rounded\n",
" exact_accuracy = np.mean(y_pred_clipped == y_true)\n",
" one_off_accuracy = np.mean(np.abs(y_pred_clipped - y_true) <= 1)\n",
" two_off_accuracy = np.mean(np.abs(y_pred_clipped - y_true) <= 2)\n",
"\n",
" print(\"\\nUV Index Prediction Metrics:\")\n",
" print(f\"MAE: {mae:.3f}\")\n",
" print(f\"RMSE: {rmse:.3f}\")\n",
" print(f\"R² Score: {r2:.3f}\")\n",
" print(f\"Exact Match Accuracy: {exact_accuracy:.3f}\")\n",
" print(\"\\nRaw Predictions:\")\n",
" print(f\"MAE: {mae_raw:.3f}\")\n",
" print(f\"RMSE: {rmse_raw:.3f}\")\n",
" print(f\"R² Score: {r2_raw:.3f}\")\n",
" print(f\"Within ±0.5: {within_05_raw:.3f}\")\n",
" print(f\"Within ±1.0: {within_1_raw:.3f}\")\n",
" print(f\"Within ±2.0: {within_2_raw:.3f}\")\n",
"\n",
" print(\"\\nRounded Predictions:\")\n",
" print(f\"MAE: {mae_rounded:.3f}\")\n",
" print(f\"RMSE: {rmse_rounded:.3f}\")\n",
" print(f\"R² Score: {r2_rounded:.3f}\")\n",
" print(f\"Exact Match: {exact_accuracy:.3f}\")\n",
" print(f\"±1 Accuracy: {one_off_accuracy:.3f}\")\n",
" print(f\"±2 Accuracy: {two_off_accuracy:.3f}\")\n",
"\n",
" # Confusion Matrix per livelli di UV\n",
" # Analisi dei livelli UV\n",
" def get_uv_level(value):\n",
" if value <= 2:\n",
" return 'Low'\n",
@ -461,49 +489,195 @@
" else:\n",
" return 'Extreme'\n",
"\n",
" y_true_levels = [get_uv_level(v) for v in y_true.ravel()]\n",
" y_pred_levels = [get_uv_level(v) for v in y_pred_clipped]\n",
" # Calcola livelli UV sia per raw che rounded\n",
" y_true_levels = [get_uv_level(v) for v in y_true]\n",
" y_pred_levels_raw = [get_uv_level(v) for v in y_pred]\n",
" y_pred_levels_rounded = [get_uv_level(v) for v in y_pred_clipped]\n",
"\n",
" print(\"\\nUV Level Confusion Matrix:\")\n",
" # Calcola accuracy dei livelli\n",
" level_accuracy_raw = np.mean([t == p for t, p in zip(y_true_levels, y_pred_levels_raw)])\n",
" level_accuracy_rounded = np.mean([t == p for t, p in zip(y_true_levels, y_pred_levels_rounded)])\n",
"\n",
" print(\"\\nUV Level Accuracy:\")\n",
" print(f\"Raw predictions: {level_accuracy_raw:.3f}\")\n",
" print(f\"Rounded predictions: {level_accuracy_rounded:.3f}\")\n",
"\n",
" print(\"\\nUV Level Confusion Matrix (Raw Predictions):\")\n",
" print(pd.crosstab(\n",
" pd.Series(y_true_levels, name='Actual'),\n",
" pd.Series(y_pred_levels, name='Predicted')\n",
" pd.Series(y_pred_levels_raw, name='Predicted')\n",
" ))\n",
"\n",
" return mae, rmse, r2, exact_accuracy, one_off_accuracy\n",
" print(\"\\nUV Level Confusion Matrix (Rounded Predictions):\")\n",
" print(pd.crosstab(\n",
" pd.Series(y_true_levels, name='Actual'),\n",
" pd.Series(y_pred_levels_rounded, name='Predicted')\n",
" ))\n",
"\n",
" # Se specificata una cartella, salva i plot di analisi\n",
" if folder_name is not None:\n",
" try:\n",
" os.makedirs(folder_name, exist_ok=True)\n",
" timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"\n",
"def plot_uv_predictions(y_true, y_pred):\n",
" \"\"\"\n",
" Visualizzazione delle predizioni specifica per UV index\n",
" \"\"\"\n",
" # Converti in numpy array se necessario\n",
" y_true = np.array(y_true).ravel()\n",
" y_pred = np.array(y_pred).ravel()\n",
"\n",
" # Plot di confronto tra raw e rounded predictions\n",
" plt.figure(figsize=(15, 5))\n",
"\n",
" # Plot 1: Actual vs Predicted\n",
" plt.subplot(1, 2, 1)\n",
" plt.scatter(y_true, y_pred, alpha=0.5)\n",
" # Plot 1: Scatter plot confronto\n",
" plt.subplot(1, 3, 1)\n",
" plt.scatter(y_true, y_pred, alpha=0.5, label='Raw')\n",
" plt.scatter(y_true, y_pred_clipped, alpha=0.5, label='Rounded')\n",
" plt.plot([0, 11], [0, 11], 'r--', lw=2)\n",
" plt.xlabel('Actual UV Index')\n",
" plt.ylabel('Predicted UV Index')\n",
" plt.title('Actual vs Predicted UV Index')\n",
" plt.title('Raw vs Rounded Predictions')\n",
" plt.legend()\n",
" plt.grid(True)\n",
"\n",
" # Plot 2: Distribution of Errors\n",
" plt.subplot(1, 2, 2)\n",
" errors = y_pred - y_true\n",
" plt.hist(errors, bins=20, alpha=0.7)\n",
" plt.xlabel('Prediction Error')\n",
" # Plot 2: Distribuzione errori raw\n",
" plt.subplot(1, 3, 2)\n",
" plt.hist(y_pred - y_true, bins=50, alpha=0.7)\n",
" plt.xlabel('Prediction Error (Raw)')\n",
" plt.ylabel('Frequency')\n",
" plt.title('Distribution of Prediction Errors')\n",
" plt.title('Distribution of Raw Errors')\n",
" plt.grid(True)\n",
"\n",
" # Plot 3: Distribuzione errori rounded\n",
" plt.subplot(1, 3, 3)\n",
" plt.hist(y_pred_clipped - y_true, bins=50, alpha=0.7)\n",
" plt.xlabel('Prediction Error (Rounded)')\n",
" plt.ylabel('Frequency')\n",
" plt.title('Distribution of Rounded Errors')\n",
" plt.grid(True)\n",
"\n",
" plt.tight_layout()\n",
"\n",
" # Salva il plot\n",
" filename = os.path.join(folder_name, f'uv_prediction_analysis_{timestamp}.png')\n",
" plt.savefig(filename, dpi=300, bbox_inches='tight')\n",
" print(f\"\\nPlot di analisi salvato come: {filename}\")\n",
"\n",
" plt.show()\n",
"\n",
" except Exception as e:\n",
" print(f\"\\nErrore nel salvare i plot: {str(e)}\")\n",
"\n",
" # Restituisci tutte le metriche in un dizionario\n",
" metrics = {\n",
" 'raw': {\n",
" 'mae': mae_raw,\n",
" 'rmse': rmse_raw,\n",
" 'r2': r2_raw,\n",
" 'within_05': within_05_raw,\n",
" 'within_1': within_1_raw,\n",
" 'within_2': within_2_raw,\n",
" 'level_accuracy': level_accuracy_raw\n",
" },\n",
" 'rounded': {\n",
" 'mae': mae_rounded,\n",
" 'rmse': rmse_rounded,\n",
" 'r2': r2_rounded,\n",
" 'exact_match': exact_accuracy,\n",
" 'one_off': one_off_accuracy,\n",
" 'two_off': two_off_accuracy,\n",
" 'level_accuracy': level_accuracy_rounded\n",
" }\n",
" }\n",
"\n",
" return metrics\n",
"\n",
"def plot_training_history(history, folder_name=None):\n",
"    \"\"\"\n",
"    Plot the training/validation loss and MAE curves and optionally save them.\n",
"\n",
"    Parameters:\n",
"    -----------\n",
"    history : tensorflow.keras.callbacks.History\n",
"        History object returned by model.fit. Its `history` dict must contain\n",
"        the keys 'loss', 'val_loss', 'mae' and 'val_mae'.\n",
"    folder_name : str, optional\n",
"        Output folder. When given, it is created if missing and receives\n",
"        training_history.png, training_history.csv and training_stats.json\n",
"        (fixed names, overwritten on every call). When None nothing is\n",
"        written to disk.\n",
"\n",
"    Returns:\n",
"    --------\n",
"    None\n",
"        Best-effort helper: any exception is caught and reported on stdout.\n",
"    \"\"\"\n",
"    import os\n",
"\n",
"    fig = None\n",
"    try:\n",
"        logs = history.history\n",
"\n",
"        # One panel per (training key, validation key, display label)\n",
"        curves = [\n",
"            ('loss', 'val_loss', 'Loss'),\n",
"            ('mae', 'val_mae', 'MAE'),\n",
"        ]\n",
"\n",
"        fig, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
"        for ax, (train_key, val_key, label) in zip(axes, curves):\n",
"            ax.plot(logs[train_key], label=f'Training {label}')\n",
"            ax.plot(logs[val_key], label=f'Validation {label}')\n",
"            ax.set_title(f'Model {label}')\n",
"            ax.set_xlabel('Epoch')\n",
"            ax.set_ylabel(label)\n",
"            ax.legend()\n",
"            ax.grid(True)\n",
"\n",
"        fig.tight_layout()\n",
"\n",
"        if folder_name is not None:\n",
"            os.makedirs(folder_name, exist_ok=True)\n",
"\n",
"            # Figure\n",
"            filename = os.path.join(folder_name, 'training_history.png')\n",
"            fig.savefig(filename, dpi=300, bbox_inches='tight')\n",
"            print(f\"\\nPlot della training history salvato come: {filename}\")\n",
"\n",
"            # Per-epoch numbers; only built when they are actually written\n",
"            history_df = pd.DataFrame({\n",
"                'epoch': range(1, len(logs['loss']) + 1),\n",
"                'training_loss': logs['loss'],\n",
"                'validation_loss': logs['val_loss'],\n",
"                'training_mae': logs['mae'],\n",
"                'validation_mae': logs['val_mae']\n",
"            })\n",
"            csv_filename = os.path.join(folder_name, 'training_history.csv')\n",
"            history_df.to_csv(csv_filename, index=False)\n",
"            print(f\"Dati della training history salvati come: {csv_filename}\")\n",
"\n",
"        # Cast to built-in floats so json.dump never trips on numpy scalars\n",
"        final_stats = {\n",
"            'final_training_loss': float(logs['loss'][-1]),\n",
"            'final_validation_loss': float(logs['val_loss'][-1]),\n",
"            'final_training_mae': float(logs['mae'][-1]),\n",
"            'final_validation_mae': float(logs['val_mae'][-1]),\n",
"            'best_validation_loss': float(min(logs['val_loss'])),\n",
"            'best_validation_mae': float(min(logs['val_mae'])),\n",
"            'epochs': len(logs['loss']),\n",
"        }\n",
"\n",
"        if folder_name is not None:\n",
"            stats_filename = os.path.join(folder_name, 'training_stats.json')\n",
"            with open(stats_filename, 'w') as f:\n",
"                json.dump(final_stats, f, indent=4)\n",
"            print(f\"Statistiche finali salvate come: {stats_filename}\")\n",
"\n",
"        # Console summary of the headline numbers\n",
"        print(\"\\nStatistiche finali del training:\")\n",
"        print(f\"Loss finale (train/val): {final_stats['final_training_loss']:.4f}/{final_stats['final_validation_loss']:.4f}\")\n",
"        print(f\"MAE finale (train/val): {final_stats['final_training_mae']:.4f}/{final_stats['final_validation_mae']:.4f}\")\n",
"        print(f\"Miglior validation loss: {final_stats['best_validation_loss']:.4f}\")\n",
"        print(f\"Miglior validation MAE: {final_stats['best_validation_mae']:.4f}\")\n",
"\n",
"        plt.show()\n",
"\n",
"    except Exception as e:\n",
"        # Do not leave a half-drawn figure registered with pyplot after a failure\n",
"        if fig is not None:\n",
"            plt.close(fig)\n",
"        print(f\"\\nErrore durante la creazione o il salvataggio dei plot: {str(e)}\")\n",
"\n",
"\n",
"def train_hybrid_model(model, X_train, y_train, X_test, y_test, epochs=100, batch_size=32, folder_name='uv_index'):\n",
" \"\"\"\n",
@ -538,50 +712,44 @@
" # Early Stopping avanzato\n",
" EarlyStopping(\n",
" monitor='mae',\n",
" patience=15, # Aumentato per dare più tempo dopo le riduzioni del LR\n",
" patience=15,\n",
" restore_best_weights=True,\n",
" mode='min',\n",
" verbose=1,\n",
" min_delta=1e-6 # Reso più sensibile\n",
" min_delta=1e-6\n",
" ),\n",
" # Learning Rate Schedule molto più aggressivo\n",
" ReduceLROnPlateau(\n",
" monitor='mae',\n",
" factor=0.05, # Riduzione molto più aggressiva (95% di riduzione)\n",
" patience=3, # Ridotto per reagire più velocemente\n",
" factor=0.05,\n",
" patience=3,\n",
" verbose=1,\n",
" mode='min',\n",
" min_delta=1e-6,\n",
" cooldown=2, # Ridotto per permettere riduzioni più frequenti\n",
" min_lr=1e-7 # LR minimo più basso\n",
" cooldown=2,\n",
" min_lr=1e-7\n",
" ),\n",
" # Aggiungiamo un secondo scheduler per riduzioni ancora più graduali\n",
" ReduceLROnPlateau(\n",
" monitor='val_loss',\n",
" factor=0.2, # Riduzione più moderata come backup\n",
" patience=2, # Ancora più reattivo\n",
" factor=0.2,\n",
" patience=2,\n",
" verbose=1,\n",
" mode='min',\n",
" min_delta=1e-6,\n",
" cooldown=1,\n",
" min_lr=1e-7\n",
" ),\n",
" # Model Checkpoint per salvare i migliori modelli\n",
" tf.keras.callbacks.ModelCheckpoint(\n",
" filepath=f'{folder_name}_best_uv_model.h5',\n",
" monitor='mae',\n",
" save_best_only=True,\n",
" mode='min'\n",
" ),\n",
" # TensorBoard callback per il monitoraggio\n",
" tf.keras.callbacks.TensorBoard(\n",
" log_dir=f'./logs_{folder_name}',\n",
" histogram_freq=1,\n",
" write_graph=True,\n",
" update_freq='epoch'\n",
" ),\n",
"\n",
" # Custom Callback per monitorare le predizioni fuori range\n",
" tf.keras.callbacks.LambdaCallback(\n",
" on_epoch_end=lambda epoch, logs: print(\n",
" f\"\\nEpoch {epoch + 1}: Predizioni fuori range: \"\n",
@ -590,19 +758,6 @@
" )\n",
" ]\n",
"\n",
" # Calcolo dei class weights se non forniti\n",
" '''\n",
" if class_weights is None:\n",
" # Discretizziamo i valori UV per il calcolo dei pesi\n",
" y_discrete = np.round(y_train).astype(int)\n",
" class_weights = compute_class_weight(\n",
" 'balanced',\n",
" classes=np.unique(y_discrete),\n",
" y=y_discrete\n",
" )\n",
" class_weights = dict(enumerate(class_weights))\n",
" '''\n",
" # Training con gestione degli errori e logging\n",
" try:\n",
" history = model.fit(\n",
" X_train, y_train,\n",
@ -610,9 +765,8 @@
" epochs=epochs,\n",
" batch_size=batch_size,\n",
" callbacks=callbacks,\n",
" #class_weight=class_weights,\n",
" verbose=1,\n",
" shuffle=False, # Abilitato shuffle\n",
" shuffle=False,\n",
" validation_freq=1,\n",
" )\n",
"\n",
@ -631,41 +785,7 @@
" out_of_range = np.sum((predictions < 0) | (predictions > 11))\n",
" print(f\"\\nPredizioni fuori range: {out_of_range} ({out_of_range / len(predictions) * 100:.2f}%)\")\n",
"\n",
" # Plot della loss durante il training\n",
" plt.figure(figsize=(12, 4))\n",
"\n",
" plt.subplot(1, 2, 1)\n",
" plt.plot(history.history['loss'], label='Training Loss')\n",
" plt.plot(history.history['val_loss'], label='Validation Loss')\n",
" plt.title('Model Loss')\n",
" plt.xlabel('Epoch')\n",
" plt.ylabel('Loss')\n",
" plt.legend()\n",
"\n",
" plt.subplot(1, 2, 2)\n",
" plt.plot(history.history['mae'], label='Training MAE')\n",
" plt.plot(history.history['val_mae'], label='Validation MAE')\n",
" plt.title('Model MAE')\n",
" plt.xlabel('Epoch')\n",
" plt.ylabel('MAE')\n",
" plt.legend()\n",
"\n",
" plt.tight_layout()\n",
" plt.show()\n",
"\n",
" # Salvataggio dei risultati del training\n",
" training_results = {\n",
" 'final_loss': float(test_loss),\n",
" 'final_mae': float(test_mae),\n",
" 'final_mse': float(test_mse),\n",
" 'out_of_range_predictions': int(out_of_range),\n",
" 'training_time': int(len(history.history['loss'])),\n",
" 'best_epoch': int(np.argmin(history.history['val_loss'])) + 1\n",
" }\n",
"\n",
" # Salvataggio su file\n",
" with open('training_results.json', 'w') as f:\n",
" json.dump(training_results, f, indent=4)\n",
" plot_training_history(history, folder_name=folder_name)\n",
"\n",
" return history\n",
"\n",
@ -768,7 +888,7 @@
" print(\"Inizializzazione del training del modello UV index...\")\n",
"\n",
" try:\n",
" folder_name = datetime.now().strftime(\"%Y-%m-%d_%H-%M\")\n",
"\n",
" # Preparazione dei dati\n",
" print(\"\\n1. Preparazione dei dati...\")\n",
" X_train_seq, X_test_seq, y_train, y_test, scaler, features, X_to_predict_seq = prepare_hybrid_data(df)\n",
@ -783,7 +903,7 @@
" # Creazione del modello\n",
" print(\"\\n2. Creazione del modello...\")\n",
" input_shape = (X_train_seq.shape[1], X_train_seq.shape[2])\n",
" model = create_optimized_model(input_shape, folder_name)\n",
" model = create_uv_index_model(input_shape, folder_name)\n",
"\n",
" print(\"\\n4. Avvio del training...\")\n",
" history = train_hybrid_model(\n",
@ -792,7 +912,6 @@
" y_train=y_train,\n",
" X_test=X_test_seq,\n",
" y_test=y_test,\n",
" #class_weights=class_weights, # Ora passiamo direttamente il dizionario\n",
" epochs=100,\n",
" batch_size=128,\n",
" folder_name=folder_name\n",
@ -803,10 +922,7 @@
" predictions = np.clip(predictions, 0, 11)\n",
"\n",
" print(\"\\n6. Valutazione del modello...\")\n",
" metrics = evaluate_uv_predictions(y_test, predictions)\n",
"\n",
" print(\"\\n7. Visualizzazione risultati...\")\n",
" plot_uv_predictions(y_test, predictions)\n",
" metrics = evaluate_uv_predictions(y_test, predictions, folder_name=folder_name)\n",
"\n",
" # Creazione del dizionario dei risultati\n",
" training_results = {\n",
@ -829,13 +945,15 @@
" }\n",
" }\n",
"\n",
" print(\"\\n8. Predizione dei dati mancanti risultati...\")\n",
" print(\"\\n7. Predizione dei dati mancanti risultati...\")\n",
" to_predict_predictions = model.predict(X_to_predict_seq)\n",
" to_predict_predictions = np.clip(to_predict_predictions, 0, 11)\n",
"\n",
" print(\"\\n9. Integrazione delle predizioni nel dataset originale...\")\n",
" print(\"\\n8. Integrazione delle predizioni nel dataset originale...\")\n",
" df_updated = integrate_predictions(df.copy(), to_predict_predictions)\n",
"\n",
" df_updated.to_parquet('./data/weather_data_uvindex.parquet')\n",
"\n",
" # Aggiungi statistiche sulle predizioni al training_results\n",
" training_results['prediction_stats'] = {\n",
" 'n_predictions_added': len(to_predict_predictions),\n",
@ -1380,31 +1498,44 @@
}
],
"source": [
"def plot_error_analysis(y_true, y_pred):\n",
"def plot_error_analysis(y_true, y_pred, folder_name=None):\n",
" \"\"\"\n",
" Funzione per visualizzare l'analisi degli errori di predizione\n",
"\n",
" Parameters:\n",
" -----------\n",
" y_true : array-like\n",
" Valori reali\n",
" y_pred : array-like\n",
" Valori predetti\n",
" folder_name : str, optional\n",
" Cartella dove salvare i plot. Se None, i plot non vengono salvati.\n",
" \"\"\"\n",
" import os\n",
" from datetime import datetime\n",
"\n",
" # Converti in array numpy 1D se necessario\n",
" if isinstance(y_true, pd.Series):\n",
" y_true = y_true.values\n",
" if isinstance(y_pred, pd.Series):\n",
" y_pred = y_pred.values\n",
" \n",
"\n",
" y_true = y_true.ravel()\n",
" y_pred = y_pred.ravel()\n",
" \n",
"\n",
" # Calcola gli errori\n",
" errors = y_pred - y_true\n",
" \n",
" plt.figure(figsize=(15, 5))\n",
" \n",
"\n",
" # Crea la figura principale\n",
" fig = plt.figure(figsize=(15, 5))\n",
"\n",
" # Plot 1: Distribuzione degli errori\n",
" plt.subplot(1, 3, 1)\n",
" plt.hist(errors, bins=50, alpha=0.7)\n",
" plt.title('Distribuzione degli Errori di Predizione')\n",
" plt.xlabel('Errore')\n",
" plt.ylabel('Frequenza')\n",
" \n",
"\n",
" # Plot 2: Actual vs Predicted\n",
" plt.subplot(1, 3, 2)\n",
" plt.scatter(y_true, y_pred, alpha=0.5)\n",
@ -1412,7 +1543,7 @@
" plt.title('Valori Reali vs Predetti')\n",
" plt.xlabel('Valori Reali')\n",
" plt.ylabel('Valori Predetti')\n",
" \n",
"\n",
" # Plot 3: Errori vs Valori Reali\n",
" plt.subplot(1, 3, 3)\n",
" plt.scatter(y_true, errors, alpha=0.5)\n",
@ -1420,18 +1551,35 @@
" plt.title('Errori vs Valori Reali')\n",
" plt.xlabel('Valori Reali')\n",
" plt.ylabel('Errore')\n",
" \n",
"\n",
" plt.tight_layout()\n",
"\n",
" # Salva il plot se è specificata una cartella\n",
" if folder_name is not None:\n",
" try:\n",
" # Crea la cartella se non esiste\n",
" os.makedirs(folder_name, exist_ok=True)\n",
"\n",
" # Genera il nome del file con timestamp\n",
" timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
" filename = os.path.join(folder_name, f'error_analysis_{timestamp}.png')\n",
"\n",
" # Salva la figura\n",
" plt.savefig(filename, dpi=300, bbox_inches='tight')\n",
" print(f\"\\nPlot salvato come: {filename}\")\n",
" except Exception as e:\n",
" print(f\"\\nErrore nel salvare il plot: {str(e)}\")\n",
"\n",
" plt.show()\n",
" \n",
"\n",
" # Stampa statistiche degli errori\n",
" print(\"\\nStatistiche degli errori:\")\n",
" print(f\"MAE: {np.mean(np.abs(errors)):.4f}\")\n",
" print(f\"MSE: {np.mean(errors**2):.4f}\")\n",
" print(f\"RMSE: {np.sqrt(np.mean(errors**2)):.4f}\")\n",
" print(f\"MSE: {np.mean(errors ** 2):.4f}\")\n",
" print(f\"RMSE: {np.sqrt(np.mean(errors ** 2)):.4f}\")\n",
" print(f\"Media errori: {np.mean(errors):.4f}\")\n",
" print(f\"Std errori: {np.std(errors):.4f}\")\n",
" \n",
"\n",
" # Calcola percentuali di errori entro certe soglie\n",
" thresholds = [0.5, 1.0, 1.5, 2.0]\n",
" for threshold in thresholds:\n",
@ -1439,7 +1587,7 @@
" print(f\"Predizioni entro ±{threshold}: {within_threshold:.1f}%\")\n",
"\n",
"\n",
"plot_error_analysis(y_test, predictions)"
"plot_error_analysis(y_test, predictions, folder_name=folder_name)"
]
},
{
@ -1499,30 +1647,40 @@
}
],
"source": [
"def plot_advanced_prediction_analysis(y_true, y_pred):\n",
"def plot_advanced_prediction_analysis(y_true, y_pred, folder_name=None):\n",
" \"\"\"\n",
" Funzione per visualizzare l'analisi degli errori di predizione e la precisione\n",
"\n",
" Parameters:\n",
" -----------\n",
" y_true : array-like\n",
" Valori reali\n",
" y_pred : array-like\n",
" Valori predetti\n",
" folder_name : str, optional\n",
" Cartella dove salvare i plot. Se None, i plot non vengono salvati.\n",
" \"\"\"\n",
" import os\n",
" from datetime import datetime\n",
" import seaborn as sns\n",
"\n",
" # Converti in array numpy 1D se necessario\n",
" if isinstance(y_true, pd.Series):\n",
" y_true = y_true.values\n",
" if isinstance(y_pred, pd.Series):\n",
" y_pred = y_pred.values\n",
"\n",
"\n",
" y_true = y_true.ravel()\n",
" y_pred = y_pred.ravel()\n",
" \n",
"\n",
" # Calcola gli errori\n",
" errors = y_pred - y_true\n",
" \n",
"\n",
" # Calcola accuracy per diversi livelli di tolleranza\n",
" exact_accuracy = np.mean(np.abs(errors) < 0.1) * 100 # Precisione esatta (±0.1)\n",
" accuracy_05 = np.mean(np.abs(errors) <= 0.5) * 100 # Precisione entro ±0.5\n",
" accuracy_10 = np.mean(np.abs(errors) <= 1.0) * 100 # Precisione entro ±1.0\n",
" \n",
" \n",
" # Calcola accuracy per livelli di rischio UV\n",
" exact_accuracy = np.mean(np.abs(errors) < 0.1) * 100\n",
" accuracy_05 = np.mean(np.abs(errors) <= 0.5) * 100\n",
" accuracy_10 = np.mean(np.abs(errors) <= 1.0) * 100\n",
"\n",
" def get_risk_level(uv):\n",
" if uv < 2:\n",
" return 'Basso'\n",
@ -1534,21 +1692,21 @@
" return 'Molto Alto'\n",
" else:\n",
" return 'Estremo'\n",
" \n",
" \n",
"\n",
" y_true_risk = [get_risk_level(x) for x in y_true]\n",
" y_pred_risk = [get_risk_level(x) for x in y_pred]\n",
" risk_accuracy = np.mean(np.array(y_true_risk) == np.array(y_pred_risk)) * 100\n",
" \n",
" plt.figure(figsize=(20, 10))\n",
" \n",
"\n",
" # Crea la figura principale\n",
" fig = plt.figure(figsize=(20, 10))\n",
"\n",
" # Plot 1: Distribuzione degli errori\n",
" plt.subplot(2, 2, 1)\n",
" plt.hist(errors, bins=50, alpha=0.7)\n",
" plt.title('Distribuzione degli Errori di Predizione')\n",
" plt.xlabel('Errore')\n",
" plt.ylabel('Frequenza')\n",
" \n",
"\n",
" # Plot 2: Actual vs Predicted\n",
" plt.subplot(2, 2, 2)\n",
" plt.scatter(y_true, y_pred, alpha=0.5)\n",
@ -1556,7 +1714,7 @@
" plt.title('Valori Reali vs Predetti')\n",
" plt.xlabel('Valori Reali')\n",
" plt.ylabel('Valori Predetti')\n",
" \n",
"\n",
" # Plot 3: Errori vs Valori Reali\n",
" plt.subplot(2, 2, 3)\n",
" plt.scatter(y_true, errors, alpha=0.5)\n",
@ -1564,100 +1722,122 @@
" plt.title('Errori vs Valori Reali')\n",
" plt.xlabel('Valori Reali')\n",
" plt.ylabel('Errore')\n",
" \n",
"\n",
" # Plot 4: Precisione per intervallo di UV\n",
" plt.subplot(2, 2, 4)\n",
" \n",
" # Definisci gli intervalli UV\n",
"\n",
" uv_ranges = [(0, 2), (2, 5), (5, 7), (7, 10), (10, 11)]\n",
" range_labels = ['Basso\\n(0-2)', 'Moderato\\n(2-5)', 'Alto\\n(5-7)', 'Molto Alto\\n(7-10)', 'Estremo\\n(10-11)']\n",
" \n",
"\n",
" accuracies = []\n",
" counts = []\n",
" mae_per_range = []\n",
" \n",
"\n",
" for (low, high) in uv_ranges:\n",
" mask = (y_true >= low) & (y_true < high)\n",
" if mask.any():\n",
" # Calcola MAE per questo range\n",
" mae = np.mean(np.abs(y_pred[mask] - y_true[mask]))\n",
" mae_per_range.append(mae)\n",
" # Conta quanti valori in questo range\n",
" count = np.sum(mask)\n",
" counts.append(count)\n",
" # Calcola precisione entro 0.5 punti UV\n",
" accuracy = np.mean(np.abs(y_pred[mask] - y_true[mask]) <= 0.5) * 100\n",
" accuracies.append(accuracy)\n",
" \n",
"\n",
" # Crea il grafico a barre con doppio asse y\n",
" fig = plt.gca()\n",
" \n",
" # Barre per accuratezza\n",
" ax = plt.gca()\n",
" bars = plt.bar(range_labels, accuracies, alpha=0.6, color='skyblue')\n",
" plt.ylabel('Precisione (%)')\n",
" plt.title('Precisione e MAE per Range UV')\n",
" \n",
" # Aggiungi etichette sopra le barre\n",
"\n",
" for bar in bars:\n",
" height = bar.get_height()\n",
" plt.text(bar.get_x() + bar.get_width() / 2., height,\n",
" f'{height:.1f}%\\n(n={counts[bars.index(bar)]})',\n",
" ha='center', va='bottom')\n",
" \n",
" # Secondo asse y per MAE\n",
" ax2 = fig.twinx()\n",
"\n",
" ax2 = ax.twinx()\n",
" line = ax2.plot(range_labels, mae_per_range, 'r-', marker='o', label='MAE')\n",
" ax2.set_ylabel('MAE', color='red')\n",
" \n",
" # Aggiungi valori MAE\n",
"\n",
" for i, mae in enumerate(mae_per_range):\n",
" ax2.text(i, mae, f'MAE: {mae:.3f}', color='red', ha='center', va='bottom')\n",
" \n",
"\n",
" plt.xticks(rotation=45)\n",
" plt.tight_layout()\n",
" plt.show()\n",
" \n",
"\n",
" # Salva la figura principale se è specificata una cartella\n",
" if folder_name is not None:\n",
" try:\n",
" # Crea la cartella se non esiste\n",
" os.makedirs(folder_name, exist_ok=True)\n",
"\n",
" # Genera il timestamp\n",
" timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"\n",
" # Salva la figura principale\n",
" main_plot_filename = os.path.join(folder_name, f'advanced_analysis_{timestamp}.png')\n",
" plt.savefig(main_plot_filename, dpi=300, bbox_inches='tight')\n",
" print(f\"\\nPlot principale salvato come: {main_plot_filename}\")\n",
"\n",
" # Crea e salva la matrice di confusione come plot separato\n",
" plt.figure(figsize=(10, 8))\n",
" cm = confusion_matrix(y_true_risk, y_pred_risk)\n",
" risk_levels = ['Basso', 'Moderato', 'Alto', 'Molto Alto', 'Estremo']\n",
" cm_df = pd.DataFrame(cm,\n",
" columns=risk_levels,\n",
" index=risk_levels)\n",
" \n",
" cm_df = pd.DataFrame(cm, columns=risk_levels, index=risk_levels)\n",
"\n",
" sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')\n",
" plt.title('Matrice di Confusione per Livelli di Rischio UV')\n",
" plt.tight_layout()\n",
"\n",
" conf_matrix_filename = os.path.join(folder_name, f'confusion_matrix_{timestamp}.png')\n",
" plt.savefig(conf_matrix_filename, dpi=300, bbox_inches='tight')\n",
" print(f\"Matrice di confusione salvata come: {conf_matrix_filename}\")\n",
"\n",
" except Exception as e:\n",
" print(f\"\\nErrore nel salvare i plot: {str(e)}\")\n",
"\n",
" plt.show()\n",
"\n",
" # Stampa delle statistiche e analisi\n",
" cm = confusion_matrix(y_true_risk, y_pred_risk)\n",
" risk_levels = ['Basso', 'Moderato', 'Alto', 'Molto Alto', 'Estremo']\n",
" cm_df = pd.DataFrame(cm, columns=risk_levels, index=risk_levels)\n",
"\n",
" print(\"\\nMatrice di Confusione per Livelli di Rischio UV:\")\n",
" print(cm_df)\n",
" \n",
"\n",
" print(\"\\nAnalisi Precisione Predizioni UV Index:\")\n",
" print(f\"Precisione esatta (±0.1): {exact_accuracy:.1f}%\")\n",
" print(f\"Precisione entro 0.5 punti: {accuracy_05:.1f}%\")\n",
" print(f\"Precisione entro 1.0 punti: {accuracy_10:.1f}%\")\n",
" print(f\"Precisione livello di rischio: {risk_accuracy:.1f}%\")\n",
" \n",
"\n",
" print(\"\\nAnalisi errori per livello UV:\")\n",
" uv_ranges = [(0, 2, 'Basso'), (2, 5, 'Moderato'), (5, 7, 'Alto'),\n",
" (7, 10, 'Molto Alto'), (10, 11, 'Estremo')]\n",
" \n",
"\n",
" for low, high, label in uv_ranges:\n",
" mask = (y_true >= low) & (y_true < high)\n",
" if mask.any():\n",
" mae = np.mean(np.abs(errors[mask]))\n",
" n_samples = np.sum(mask)\n",
" print(f\"MAE per UV {label} ({low}-{high}): {mae:.3f} (n={n_samples})\")\n",
" \n",
"\n",
" print(\"\\nStatistiche degli errori:\")\n",
" print(f\"Media errori: {np.mean(errors):.3f}\")\n",
" print(f\"Deviazione standard errori: {np.std(errors):.3f}\")\n",
" print(f\"Errore mediano: {np.median(errors):.3f}\")\n",
" print(f\"95° percentile errore assoluto: {np.percentile(np.abs(errors), 95):.3f}\")\n",
" \n",
" # Calcola percentuali di errori entro certe soglie\n",
" thresholds = [0.5, 1.0, 1.5, 2.0]\n",
"\n",
" print(\"\\nDistribuzione degli errori:\")\n",
" thresholds = [0.5, 1.0, 1.5, 2.0]\n",
" for threshold in thresholds:\n",
" within_threshold = np.mean(np.abs(errors) <= threshold) * 100\n",
" print(f\"Predizioni entro ±{threshold}: {within_threshold:.1f}%\")\n",
"\n",
"\n",
"# Usa la funzione\n",
"plot_advanced_prediction_analysis(y_test, predictions)"
"plot_advanced_prediction_analysis(y_test, predictions, folder_name=folder_name)"
]
},
{

View File

@ -1,7 +0,0 @@
Metadata-Version: 2.1
Name: olive_oil_dashboard
Version: 0.1
Requires-Dist: pandas
Requires-Dist: numpy
Requires-Dist: tensorflow
Requires-Dist: scikit-learn

View File

@ -1,11 +0,0 @@
README.md
setup.py
model_train/__init__.py
model_train/create_train_dataset.py
olive_oil_dashboard.egg-info/PKG-INFO
olive_oil_dashboard.egg-info/SOURCES.txt
olive_oil_dashboard.egg-info/dependency_links.txt
olive_oil_dashboard.egg-info/requires.txt
olive_oil_dashboard.egg-info/top_level.txt
utils/__init__.py
utils/helpers.py

View File

@ -1,4 +0,0 @@
pandas
numpy
tensorflow
scikit-learn

View File

@ -1,2 +0,0 @@
model_train
utils

Binary file not shown.

View File

@ -1,697 +0,0 @@
import tensorflow as tf
from tf.keras.layers import Dense, LSTM, Conv1D, MultiHeadAttention, Dropout, BatchNormalization, LayerNormalization, GlobalAveragePooling1D, Concatenate, Input, Reshape, Activation
from tf.keras.models import Model
import tf.keras.backend as K
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tf.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tf.keras.optimizers import Adam
import matplotlib.pyplot as plt
import json
import joblib
from sklearn.utils.class_weight import compute_class_weight
def get_season(date):
    """Return the season name for a date using fixed calendar boundaries.

    Boundaries (start day inclusive):
      - Spring: 20 March
      - Summer: 21 June
      - Autumn: 23 September
      - Winter: 21 December

    Parameters
    ----------
    date : object exposing integer ``month`` and ``day`` attributes
        For example ``datetime.date``, ``datetime.datetime`` or
        ``pandas.Timestamp``.

    Returns
    -------
    str
        One of ``'Winter'``, ``'Spring'``, ``'Summer'``, ``'Autumn'``.
    """
    # Comparing (month, day) tuples orders dates lexicographically within the
    # year. The previous "month <= N and day < D" tests applied the day limit
    # to every month in the range, so e.g. 25 January matched no branch.
    month_day = (date.month, date.day)
    if month_day < (3, 20) or month_day >= (12, 21):
        return 'Winter'
    if month_day < (6, 21):
        return 'Spring'
    if month_day < (9, 23):
        return 'Summer'
    return 'Autumn'
def get_time_period(hour):
    """Map an hour of the day to a coarse part-of-day label.

    Parameters
    ----------
    hour : number
        Hour of the day.

    Returns
    -------
    str
        ``'Morning'`` for [5, 12), ``'Afternoon'`` for [12, 17),
        ``'Evening'`` for [17, 21), otherwise ``'Night'``.
    """
    # Half-open [start, end) windows; anything outside them is night.
    day_parts = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for start, end, label in day_parts:
        if start <= hour < end:
            return label
    return 'Night'
def add_time_features(df):
    """Add calendar, cyclical and categorical time columns derived from ``datetime``.

    The frame is modified in place (columns are assigned on ``df``) and the
    same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``datetime`` column parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the added time features.
    """
    df['datetime'] = pd.to_datetime(df['datetime'])
    stamps = df['datetime']

    # Integer timestamp: raw int64 representation divided down by 10**9.
    df['timestamp'] = stamps.astype(np.int64) // 10 ** 9

    # Plain calendar components.
    for component in ('year', 'month', 'day', 'hour', 'minute'):
        df[component] = getattr(stamps.dt, component)

    # Cyclical encoding of the hour of day.
    hour_angle = df['hour'] * (2 * np.pi / 24)
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    df['day_of_week'] = stamps.dt.dayofweek
    df['day_of_year'] = stamps.dt.dayofyear
    df['week_of_year'] = stamps.dt.isocalendar().week.astype(int)
    df['quarter'] = stamps.dt.quarter

    # Period-end flags stored as 0/1 integers.
    for flag in ('is_month_end', 'is_quarter_end', 'is_year_end'):
        df[flag] = getattr(stamps.dt, flag).astype(int)

    # Cyclical encoding of month and day of year.
    month_angle = df['month'] * (2 * np.pi / 12)
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)
    year_angle = df['day_of_year'] * (2 * np.pi / 365.25)
    df['day_of_year_sin'] = np.sin(year_angle)
    df['day_of_year_cos'] = np.cos(year_angle)

    # Categorical labels computed row by row.
    df['season'] = df['datetime'].apply(get_season)
    df['time_period'] = df['hour'].apply(get_time_period)
    return df
def add_solar_features(df):
    """Add simple solar and cloud-interaction columns to ``df`` in place.

    Reads ``day_of_year``, ``hour``, ``cloudcover``, ``temp``, ``visibility``
    and ``tempmin``; writes ``solar_angle``, ``cloud_temp_interaction``,
    ``visibility_cloud_interaction``, ``clear_sky_index`` and
    ``temp_gradient``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns.
    """
    # Product of a yearly and a daily sine term.
    year_phase = df['day_of_year'] * (2 * np.pi / 365.25)
    day_phase = df['hour'] * (2 * np.pi / 24)
    df['solar_angle'] = np.sin(year_phase) * np.sin(day_phase)

    # Complement of the cloud cover (cloudcover is subtracted from 100).
    clear_share = 100 - df['cloudcover']

    # Pairwise interactions between weather variables.
    df['cloud_temp_interaction'] = df['cloudcover'] * df['temp']
    df['visibility_cloud_interaction'] = df['visibility'] * clear_share

    # Derived features.
    df['clear_sky_index'] = clear_share / 100
    df['temp_gradient'] = df['temp'] - df['tempmin']
    return df
def add_solar_specific_features(df):
    """Add day-length, elevation-style and 12-row rolling columns to ``df`` in place.

    Reads ``day_of_year``, ``hour``, ``cloudcover``, ``visibility`` and
    ``temp``. The rolling means use a window of 12 rows, so the first 11 rows
    of those columns are NaN.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns.
    """
    day_of_year = df['day_of_year']

    # Sinusoidal day length around 12, plus offset of the hour from 12.
    df['day_length'] = 12 + 3 * np.sin(2 * np.pi * (day_of_year - 81) / 365.25)
    df['solar_noon'] = 12 - df['hour']
    df['solar_elevation'] = (
        np.sin(2 * np.pi * day_of_year / 365.25)
        * np.cos(2 * np.pi * df['solar_noon'] / 24)
    )

    # Interactions with the elevation term.
    for source, target in (('cloudcover', 'cloud_elevation'),
                           ('visibility', 'visibility_elevation')):
        df[target] = df[source] * df['solar_elevation']

    # Rolling means over a 12-row window.
    for source, target in (('cloudcover', 'cloud_rolling_12h'),
                           ('temp', 'temp_rolling_12h')):
        df[target] = df[source].rolling(window=12).mean()
    return df
def add_advanced_features(df):
    """Run all feature-engineering steps and add interaction, lag and rolling columns.

    Applies ``add_time_features``, ``add_solar_features`` and
    ``add_solar_specific_features`` in that order, then adds weather
    interactions, one-row lags and 6-row rolling means.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last feature step, with the extra columns.
    """
    # Existing feature builders, applied in sequence.
    for enrich in (add_time_features, add_solar_features, add_solar_specific_features):
        df = enrich(df)

    temperature = df['temp']
    cloud = df['cloudcover']

    # Interactions between weather variables.
    df['temp_humidity'] = temperature * df['humidity']
    df['temp_cloudcover'] = temperature * cloud
    df['visibility_cloudcover'] = df['visibility'] * cloud

    # Derived features for solar radiation.
    df['clear_sky_factor'] = (100 - cloud) / 100
    # NOTE(review): this overwrites the 'day_length' column written by
    # add_solar_specific_features and takes the sine of an already-sine value;
    # confirm which definition is intended before changing it.
    df['day_length'] = np.sin(df['day_of_year_sin']) * 12 + 12

    # One-row lag features.
    for column in ('temp', 'cloudcover', 'humidity'):
        df[f'{column}_1h_lag'] = df[column].shift(1)

    # Rolling means over a 6-row window.
    for column in ('temp', 'cloudcover'):
        df[f'{column}_rolling_mean_6h'] = df[column].rolling(window=6).mean()
    return df
def prepare_advanced_data(df, shuffle=True):
    """Engineer features, split into train/test and standard-scale the inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw weather data; must contain the columns used by
        ``add_advanced_features`` plus the ``uvindex`` target.
    shuffle : bool, default True
        Passed to ``train_test_split``. ``True`` keeps the original behaviour
        (random 80/20 split with ``random_state=42``). ``False`` keeps row
        order, so the first 80% of rows form the training set; this is
        required when the rows are later grouped into time windows.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler, final_features)``
        where the ``X`` values are NumPy arrays, the ``y`` values are pandas
        Series, ``scaler`` is the fitted ``StandardScaler`` and
        ``final_features`` is the ordered list of column names.
    """
    # Apply the feature-engineering functions.
    df = add_advanced_features(df)

    # Features considered most relevant for the UV index.
    selected_features = [
        # Basic weather features
        'temp', 'humidity', 'cloudcover', 'visibility', 'pressure',
        # Cyclical time features
        'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
        'day_of_year_sin', 'day_of_year_cos',
        # Solar features
        'solar_angle', 'solar_elevation', 'day_length',
        'clear_sky_index', 'solar_noon',
        # Interactions
        'cloud_temp_interaction', 'visibility_cloud_interaction',
        'cloud_elevation', 'visibility_elevation',
        # Rolling features
        'cloud_rolling_12h', 'temp_rolling_12h',
        'temp_rolling_mean_6h', 'cloudcover_rolling_mean_6h',
        # Categorical features (to be encoded)
        'season', 'time_period'
    ]

    # One-hot encode the categorical features.
    df = pd.get_dummies(df, columns=['season', 'time_period'])

    # Replace the raw categorical names with their one-hot columns.
    categorical_columns = [col for col in df.columns if col.startswith(('season_', 'time_period_'))]
    final_features = [f for f in selected_features if f not in ['season', 'time_period']] + categorical_columns

    # Drop rows containing NaN (created by the lag/rolling features).
    df = df.dropna()

    X = df[final_features]
    y = df['uvindex']

    # Train/test split; random_state only has an effect when shuffling.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=shuffle
    )

    # Fit the scaler on the training rows only, then apply it to both sets.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test, scaler, final_features


def create_sequence_data(X, sequence_length=24):
    """Convert row-wise data into overlapping windows for sequence models.

    Parameters
    ----------
    X : array-like
        Data whose first axis is the row/time axis.
    sequence_length : int, default 24
        Number of consecutive rows per window.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X) - sequence_length + 1, sequence_length, ...)``.
        When ``X`` has fewer rows than ``sequence_length`` an empty array with
        shape ``(0, sequence_length, ...)`` is returned so that downstream
        shape lookups still work.
    """
    X = np.asarray(X)
    n_windows = len(X) - sequence_length + 1
    if n_windows <= 0:
        return np.empty((0, sequence_length) + X.shape[1:], dtype=X.dtype)
    return np.array([X[start:start + sequence_length] for start in range(n_windows)])


def prepare_hybrid_data(df):
    """Prepare windowed train/test data for the hybrid sequence model.

    Uses a chronological (unshuffled) split: a window of 24 consecutive rows
    only represents 24 consecutive observations if the rows keep their
    original order. With a shuffled split every window would be built from
    unrelated rows.

    Returns
    -------
    tuple
        ``(X_train_seq, X_test_seq, y_train, y_test, scaler, features)``; each
        target series is trimmed so that element ``i`` is the target of the
        last row of window ``i``.
    """
    # Reuse the tabular preparation, but keep the rows in their original order.
    X_train_scaled, X_test_scaled, y_train, y_test, scaler, features = prepare_advanced_data(
        df, shuffle=False
    )

    # Build the sliding windows.
    sequence_length = 24  # 24 rows of history per sample
    X_train_seq = create_sequence_data(X_train_scaled, sequence_length)
    X_test_seq = create_sequence_data(X_test_scaled, sequence_length)

    # Drop the first (sequence_length - 1) targets, which have no full window.
    y_train = y_train.iloc[sequence_length - 1:]
    y_test = y_test.iloc[sequence_length - 1:]
    return X_train_seq, X_test_seq, y_train, y_test, scaler, features
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Build one Transformer-encoder block on top of ``inputs``.

    The block is self-attention followed by a two-layer feed-forward network.
    Each sub-block is wrapped with dropout, a residual connection and layer
    normalization.

    Parameters
    ----------
    inputs : tensor
        Input tensor; its last dimension is reused as the output width.
    head_size : int
        ``key_dim`` of the attention layer.
    num_heads : int
        Number of attention heads.
    ff_dim : int
        Width of the hidden feed-forward layer.
    dropout : float, default 0
        Dropout rate used after attention and after the feed-forward network.

    Returns
    -------
    tensor
        Output with the same last dimension as ``inputs``.
    """
    # Self-attention sub-block (query and value are both the input).
    self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=head_size)
    attended = self_attention(inputs, inputs)
    attended = Dropout(dropout)(attended)
    attention_block = LayerNormalization(epsilon=1e-6)(inputs + attended)

    # Position-wise feed-forward sub-block, projected back to the input width.
    hidden = Dense(ff_dim, activation="relu")(attention_block)
    projected = Dense(inputs.shape[-1])(hidden)
    projected = Dropout(dropout)(projected)
    return LayerNormalization(epsilon=1e-6)(attention_block + projected)
def custom_activation(x):
    """Scaled sigmoid activation that bounds the output between 0 and 11."""
    squashed = tf.sigmoid(x)
    return 11 * squashed
def custom_loss(y_true, y_pred):
    """Mean squared error plus a weighted penalty for predictions outside [0, 11].

    Returns
    -------
    tensor
        ``mse + 10.0 * (mean(below**2) + mean(above**2))`` where ``below`` and
        ``above`` are the amounts by which predictions fall under 0 or exceed 11.
    """
    # Base MSE term.
    residual = y_true - y_pred
    mse = K.mean(K.square(residual))

    # Distance outside the valid range; zero for in-range predictions.
    undershoot = K.relu(0 - y_pred)
    overshoot = K.relu(y_pred - 11)

    # Strong penalty for out-of-range values.
    range_penalty = 10.0 * (K.mean(K.square(undershoot)) + K.mean(K.square(overshoot)))
    return mse + range_penalty
def create_hybrid_model(input_shape, n_features):
    """Build and compile the hybrid CNN + LSTM + attention regressor.

    The output passes through ``custom_activation`` and the model is compiled
    with ``custom_loss``, Adam (learning rate 0.001) and MAE/MSE metrics.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to ``Input(shape=...)``.
    n_features : int
        Accepted for interface compatibility; not used inside this function.

    Returns
    -------
    keras Model
        The compiled model.
    """
    inputs = Input(shape=input_shape)

    # CNN branch: stacked same-padded convolutions with growing kernels.
    conv_branch = inputs
    for filters, kernel_size in ((64, 3), (128, 5), (256, 7)):
        conv_branch = Conv1D(
            filters=filters, kernel_size=kernel_size, activation='relu', padding='same'
        )(conv_branch)
    conv_branch = BatchNormalization()(conv_branch)

    # LSTM branch: two stacked recurrent layers that keep the time axis.
    lstm_branch = inputs
    for units in (128, 64):
        lstm_branch = LSTM(units, return_sequences=True)(lstm_branch)
    lstm_branch = BatchNormalization()(lstm_branch)

    # Merge both branches and run them through one attention block.
    merged = Concatenate()([conv_branch, lstm_branch])
    encoded = transformer_encoder(
        merged,
        head_size=32,
        num_heads=8,
        ff_dim=256,
        dropout=0.1
    )

    # Collapse the time axis.
    head = GlobalAveragePooling1D()(encoded)

    # Dense head: (units, dropout rate) per stage.
    for units, rate in ((128, 0.3), (64, 0.2)):
        head = Dense(units)(head)
        head = Activation('relu')(head)
        head = Dropout(rate)(head)

    # Single output unit followed by the bounded activation.
    raw_output = Dense(1)(head)
    outputs = Activation(custom_activation)(raw_output)

    model = Model(inputs=inputs, outputs=outputs)

    # Compile with the custom loss function.
    adam = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(
        optimizer=adam,
        loss=custom_loss,
        metrics=['mae', 'mse']
    )
    return model
def evaluate_uv_predictions(y_true, y_pred):
    """Evaluate UV-index predictions with regression and categorical metrics.

    Predictions are rounded to the nearest integer and clipped to [0, 11]
    before scoring. Metrics and a confusion table of UV levels are printed.

    Parameters
    ----------
    y_true : array-like
        Observed UV index values (any shape; flattened internally).
    y_pred : array-like
        Predicted UV index values (any shape; flattened internally). Column
        vectors of shape ``(n, 1)`` such as model outputs are accepted.

    Returns
    -------
    tuple
        ``(mae, rmse, r2, exact_accuracy, one_off_accuracy)``.

    Raises
    ------
    ValueError
        If the flattened inputs do not have the same length.
    """
    # Flatten both inputs. Without this an (n, 1) prediction array combined
    # with a 1-D target either broadcasts to (n, n) or raises, depending on
    # the container types.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )

    # Round the predictions to the nearest integer, then clip to [0, 11].
    y_pred_rounded = np.round(y_pred)
    y_pred_clipped = np.clip(y_pred_rounded, 0, 11)

    # Regression metrics.
    mae = mean_absolute_error(y_true, y_pred_clipped)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred_clipped))
    r2 = r2_score(y_true, y_pred_clipped)

    # Accuracy for several error margins.
    abs_error = np.abs(y_pred_clipped - y_true)
    exact_accuracy = np.mean(y_pred_clipped == y_true)
    one_off_accuracy = np.mean(abs_error <= 1)
    two_off_accuracy = np.mean(abs_error <= 2)

    print("\nUV Index Prediction Metrics:")
    print(f"MAE: {mae:.3f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"R² Score: {r2:.3f}")
    print(f"Exact Match Accuracy: {exact_accuracy:.3f}")
    print(f"±1 Accuracy: {one_off_accuracy:.3f}")
    print(f"±2 Accuracy: {two_off_accuracy:.3f}")

    # Confusion matrix over UV levels.
    def get_uv_level(value):
        if value <= 2:
            return 'Low'
        elif value <= 5:
            return 'Moderate'
        elif value <= 7:
            return 'High'
        elif value <= 10:
            return 'Very High'
        else:
            return 'Extreme'

    y_true_levels = [get_uv_level(v) for v in y_true]
    y_pred_levels = [get_uv_level(v) for v in y_pred_clipped]
    print("\nUV Level Confusion Matrix:")
    print(pd.crosstab(
        pd.Series(y_true_levels, name='Actual'),
        pd.Series(y_pred_levels, name='Predicted')
    ))
    return mae, rmse, r2, exact_accuracy, one_off_accuracy
def plot_uv_predictions(y_true, y_pred):
    """Plot actual-vs-predicted UV index and the distribution of prediction errors.

    Parameters
    ----------
    y_true : array-like
        Observed UV index values (flattened internally).
    y_pred : array-like
        Predicted UV index values (flattened internally); ``(n, 1)`` model
        outputs are accepted.
    """
    # Flatten so the error is computed element-wise. An (n, 1) array minus a
    # 1-D target would otherwise broadcast or raise.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    plt.figure(figsize=(15, 5))

    # Plot 1: Actual vs Predicted, with the ideal diagonal for reference.
    plt.subplot(1, 2, 1)
    plt.scatter(y_true, y_pred, alpha=0.5)
    plt.plot([0, 11], [0, 11], 'r--', lw=2)
    plt.xlabel('Actual UV Index')
    plt.ylabel('Predicted UV Index')
    plt.title('Actual vs Predicted UV Index')
    plt.grid(True)

    # Plot 2: Distribution of errors.
    plt.subplot(1, 2, 2)
    errors = y_pred - y_true
    plt.hist(errors, bins=20, alpha=0.7)
    plt.xlabel('Prediction Error')
    plt.ylabel('Frequency')
    plt.title('Distribution of Prediction Errors')
    plt.grid(True)

    plt.tight_layout()
    plt.show()
def train_hybrid_model(model, X_train, y_train, X_test, y_test, class_weights=None, epochs=100, batch_size=32):
    """Train the hybrid UV-index model with callbacks, evaluation and result logging.

    Side effects: writes ``best_uv_model.h5`` (checkpoint), TensorBoard logs
    under ``./logs`` and ``training_results.json``; shows loss/MAE plots; and
    clears the Keras backend session on exit.

    Parameters
    ----------
    model : keras.Model
        The compiled hybrid model.
    X_train : numpy.ndarray
        Training inputs.
    y_train : numpy.ndarray
        Training targets.
    X_test : numpy.ndarray
        Validation inputs.
    y_test : numpy.ndarray
        Validation targets.
    class_weights : dict, optional
        Weights used to balance the UV classes. Computed from ``y_train``
        (rounded to integers) when omitted.
    epochs : int, optional
        Maximum number of training epochs.
    batch_size : int, optional
        Batch size.

    Returns
    -------
    keras.callbacks.History
        Training history with all tracked metrics.

    Raises
    ------
    Exception
        Any error raised during training is printed and re-raised.
    """
    def _report_out_of_range(epoch, logs=None):
        # Every 10th epoch, count validation predictions outside [0, 11].
        # Predict once and reuse the result for both bounds.
        if epoch % 10 != 0:
            return
        epoch_predictions = model.predict(X_test)
        n_outside = np.sum((epoch_predictions < 0) | (epoch_predictions > 11))
        print(f"\nEpoch {epoch + 1}: Predizioni fuori range: {n_outside}")

    callbacks = [
        # Early stopping on validation loss, restoring the best weights.
        EarlyStopping(
            monitor='val_loss',
            patience=20,
            restore_best_weights=True,
            mode='min',
            verbose=1,
            min_delta=1e-4
        ),
        # Learning-rate schedule.
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=10,
            verbose=1,
            mode='min',
            min_delta=1e-4,
            cooldown=5,
            min_lr=1e-6
        ),
        # Checkpoint that keeps the best model seen so far.
        tf.keras.callbacks.ModelCheckpoint(
            filepath='best_uv_model.h5',
            monitor='val_loss',
            save_best_only=True,
            mode='min',
            verbose=1
        ),
        # TensorBoard monitoring.
        tf.keras.callbacks.TensorBoard(
            log_dir='./logs',
            histogram_freq=1,
            write_graph=True,
            update_freq='epoch'
        ),
        # Custom callback to monitor out-of-range predictions.
        tf.keras.callbacks.LambdaCallback(on_epoch_end=_report_out_of_range)
    ]

    # Compute class weights if none were supplied.
    if class_weights is None:
        # Discretize the UV values to compute the weights.
        y_discrete = np.round(y_train).astype(int)
        class_weights = compute_class_weight(
            'balanced',
            classes=np.unique(y_discrete),
            y=y_discrete
        )
        # NOTE(review): keys are positions 0..k-1, not the UV class values;
        # the two only coincide when the observed classes are exactly 0..k-1.
        # Confirm against how Keras interprets class_weight for this target.
        class_weights = dict(enumerate(class_weights))

    try:
        # workers/use_multiprocessing are intentionally not passed: they only
        # apply to generator/Sequence inputs and newer Keras releases reject
        # them; array inputs train identically without them.
        history = model.fit(
            X_train, y_train,
            validation_data=(X_test, y_test),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            class_weight=class_weights,
            verbose=1,
            shuffle=True
        )

        print("\nTraining completato con successo!")

        # Final evaluation on the test set.
        test_loss, test_mae, test_mse = model.evaluate(X_test, y_test, verbose=0)
        print(f"\nMetriche finali sul test set:")
        print(f"Loss: {test_loss:.4f}")
        print(f"MAE: {test_mae:.4f}")
        print(f"MSE: {test_mse:.4f}")

        # Analysis of the predictions.
        predictions = model.predict(X_test)
        # Cast to a built-in int so the value can be written to JSON below.
        out_of_range = int(np.sum((predictions < 0) | (predictions > 11)))
        print(f"\nPredizioni fuori range: {out_of_range} ({out_of_range / len(predictions) * 100:.2f}%)")

        # Plot loss and MAE over the epochs.
        plt.figure(figsize=(12, 4))
        plt.subplot(1, 2, 1)
        plt.plot(history.history['loss'], label='Training Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.title('Model Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.subplot(1, 2, 2)
        plt.plot(history.history['mae'], label='Training MAE')
        plt.plot(history.history['val_mae'], label='Validation MAE')
        plt.title('Model MAE')
        plt.xlabel('Epoch')
        plt.ylabel('MAE')
        plt.legend()
        plt.tight_layout()
        plt.show()

        # Training summary. Every value is converted to a built-in type:
        # json.dump raises TypeError on NumPy integers such as the results of
        # np.sum and np.argmin.
        training_results = {
            'final_loss': float(test_loss),
            'final_mae': float(test_mae),
            'final_mse': float(test_mse),
            'out_of_range_predictions': out_of_range,
            'training_time': len(history.history['loss']),
            'best_epoch': int(np.argmin(history.history['val_loss'])) + 1
        }

        # Save to file.
        with open('training_results.json', 'w') as f:
            json.dump(training_results, f, indent=4)
        return history
    except Exception as e:
        print(f"\nErrore durante il training: {str(e)}")
        raise
    finally:
        # Free backend memory.
        tf.keras.backend.clear_session()
def _first_layer_feature_importance(model, features):
    """Return a rough per-feature score from the first kernel that consumes the features.

    Looks for the first layer whose kernel has an input axis (second-to-last
    axis) of length ``len(features)`` and averages the absolute weights over
    all other axes. This is only a magnitude heuristic, not a calibrated
    importance measure. Returns an empty dict when no such layer exists.
    """
    for layer in model.layers:
        weights = layer.get_weights()
        if not weights:
            # e.g. the input layer, which carries no weights
            continue
        kernel = np.abs(np.asarray(weights[0]))
        if kernel.ndim >= 2 and kernel.shape[-2] == len(features):
            feature_axis = kernel.ndim - 2
            other_axes = tuple(ax for ax in range(kernel.ndim) if ax != feature_axis)
            scores = kernel.mean(axis=other_axes)
            return {feature: float(score) for feature, score in zip(features, scores)}
    return {}


def train_uvindex_bounded_model(df):
    """Run the full UV-index pipeline: data preparation, training, evaluation, plots, saving.

    Side effects: writes ``uv_index_model.h5``, ``training_results.json`` and
    ``scaler.pkl`` to the working directory (in addition to the files written
    by ``train_hybrid_model``) and clears the Keras backend session on exit.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing the weather data and the UV index.

    Returns
    -------
    tuple
        - model: trained model
        - scaler: scaler used for normalization
        - features: list of features used
        - history: training history
        - predictions: predictions on the test set (clipped to [0, 11])
        - y_test: true values of the test set
        - metrics: evaluation metrics
        - training_results: dict with the detailed training results

    Raises
    ------
    ValueError
        If NaN values are found in the training data.
    Exception
        Any other error is printed and re-raised.
    """
    print("Inizializzazione del training del modello UV index...")
    try:
        # Data preparation.
        print("\n1. Preparazione dei dati...")
        X_train_seq, X_test_seq, y_train, y_test, scaler, features = prepare_hybrid_data(df)
        print(f"Shape dei dati di training: {X_train_seq.shape}")
        print(f"Shape dei dati di test: {X_test_seq.shape}")
        print(f"Numero di feature utilizzate: {len(features)}")

        # Data-quality check.
        if np.isnan(X_train_seq).any() or np.isnan(y_train).any():
            raise ValueError("Trovati valori NaN nei dati di training")

        # Range check on both ends; checking only the maximum would miss
        # negative targets.
        if (y_train.min() < 0 or y_train.max() > 11
                or y_test.min() < 0 or y_test.max() > 11):
            print("WARNING: Trovati valori UV index fuori range (0-11)")

        # Model creation.
        print("\n2. Creazione del modello...")
        input_shape = (X_train_seq.shape[1], X_train_seq.shape[2])
        model = create_hybrid_model(input_shape, len(features))
        model.summary()

        # Class weights to balance the dataset.
        y_discrete = np.round(y_train).astype(int)
        class_weights = compute_class_weight(
            'balanced',
            classes=np.unique(y_discrete),
            y=y_discrete
        )
        # NOTE(review): keys are positions 0..k-1, not UV class values; they
        # only coincide when the observed classes are exactly 0..k-1.
        class_weights_dict = dict(enumerate(class_weights))

        print("\n3. Avvio del training...")
        history = train_hybrid_model(
            model=model,
            X_train=X_train_seq,
            y_train=y_train,
            X_test=X_test_seq,
            y_test=y_test,
            class_weights=class_weights_dict,
            epochs=100,
            batch_size=32
        )

        print("\n4. Generazione delle predizioni...")
        raw_predictions = model.predict(X_test_seq)
        # Count violations before clipping; after clipping the count is always 0.
        out_of_range = int(np.sum((raw_predictions < 0) | (raw_predictions > 11)))
        # Clip the predictions to the valid range.
        predictions = np.clip(raw_predictions, 0, 11)

        print("\n5. Valutazione del modello...")
        metrics = evaluate_uv_predictions(y_test, predictions)

        print("\n6. Creazione delle visualizzazioni...")
        plot_uv_predictions(y_test, predictions)

        # Results dictionary. Values are converted to built-in types so that
        # json.dump does not fail on NumPy scalars.
        val_loss_history = history.history['val_loss']
        training_results = {
            'model_params': {
                'input_shape': input_shape,
                'n_features': len(features),
                'sequence_length': X_train_seq.shape[1]
            },
            'training_params': {
                'batch_size': 32,
                'total_epochs': len(history.history['loss']),
                'best_epoch': int(np.argmin(val_loss_history)) + 1
            },
            'performance_metrics': {
                'final_loss': float(val_loss_history[-1]),
                'final_mae': float(history.history['val_mae'][-1]),
                'best_val_loss': float(min(val_loss_history)),
                'out_of_range_predictions': out_of_range,
                'accuracy_metrics': [float(m) for m in metrics]
            },
            # model.layers[0] is the input layer and has no weights, so the
            # scores come from the first kernel that consumes the features.
            'feature_importance': _first_layer_feature_importance(model, features)
        }

        # Saving the results.
        print("\n7. Salvataggio dei risultati...")
        # Save the model.
        model.save('uv_index_model.h5')
        # Save the training results.
        with open('training_results.json', 'w') as f:
            json.dump(training_results, f, indent=4)
        # Save the scaler.
        joblib.dump(scaler, 'scaler.pkl')

        print("\nTraining completato con successo!")
        return (
            model, scaler, features, history,
            predictions, y_test, metrics, training_results
        )
    except Exception as e:
        print(f"\nErrore durante il training: {str(e)}")
        raise
    finally:
        # Free backend memory.
        tf.keras.backend.clear_session()
# Script entry: load the weather dataset from a path relative to the current
# working directory.
df = pd.read_parquet('../data/weather_data.parquet')
# Run the training and unpack all eight results into module-level names.
(model, scaler, features, history,
predictions, y_test, metrics,
training_results) = train_uvindex_bounded_model(df)