wip model

This commit is contained in:
Giuseppe Nucifora 2024-11-07 19:02:42 +01:00
parent e9ec5af072
commit 549ced1aea
21 changed files with 6417 additions and 204 deletions

2
.idea/TesiPegaso.iml generated
View File

@ -4,7 +4,7 @@
<content url="file://$MODULE_DIR$"> <content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/models" /> <excludeFolder url="file://$MODULE_DIR$/models" />
</content> </content>
<orderEntry type="jdk" jdkName="ml_pegaso" jdkType="Python SDK" /> <orderEntry type="jdk" jdkName="ml_env" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />
</component> </component>
</module> </module>

2
.idea/misc.xml generated
View File

@ -3,7 +3,7 @@
<component name="Black"> <component name="Black">
<option name="sdkName" value="/usr/local/anaconda3" /> <option name="sdkName" value="/usr/local/anaconda3" />
</component> </component>
<component name="ProjectRootManager" version="2" project-jdk-name="ml_pegaso" project-jdk-type="Python SDK" /> <component name="ProjectRootManager" version="2" project-jdk-name="ml_env" project-jdk-type="Python SDK" />
<component name="PyCharmDSProjectLayout"> <component name="PyCharmDSProjectLayout">
<option name="id" value="JupyterRightHiddenStructureLayout" /> <option name="id" value="JupyterRightHiddenStructureLayout" />
</component> </component>

View File

@ -32,7 +32,8 @@
"!pip install seaborn\n", "!pip install seaborn\n",
"!pip install tqdm\n", "!pip install tqdm\n",
"!pip install pydot\n", "!pip install pydot\n",
"!pip install tensorflow-io" "!pip install tensorflow-io\n",
"!pip install pvlib"
], ],
"outputs": [], "outputs": [],
"execution_count": null "execution_count": null
@ -123,6 +124,7 @@
"from tensorflow.keras.optimizers import Adam\n", "from tensorflow.keras.optimizers import Adam\n",
"from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint\n", "from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint\n",
"from datetime import datetime\n", "from datetime import datetime\n",
"from pvlib import solarposition\n",
"import os\n", "import os\n",
"import json\n", "import json\n",
"import joblib\n", "import joblib\n",
@ -527,207 +529,7 @@
" }, scaler_y\n", " }, scaler_y\n",
"\n", "\n",
"\n", "\n",
"def create_radiation_model(input_shape, solar_params_shape=(3,)):\n",
" \"\"\"\n",
" Modello per la radiazione solare con vincoli di non-negatività.\n",
" \"\"\"\n",
" # Input layers\n",
" main_input = Input(shape=input_shape, name='main_input')\n",
" solar_input = Input(shape=solar_params_shape, name='solar_params')\n",
"\n", "\n",
" # Branch CNN\n",
" x1 = Conv1D(32, 3, padding='same')(main_input)\n",
" x1 = BatchNormalization()(x1)\n",
" x1 = Activation('relu')(x1)\n",
" x1 = Conv1D(64, 3, padding='same')(x1)\n",
" x1 = BatchNormalization()(x1)\n",
" x1 = Activation('relu')(x1)\n",
" x1 = GlobalAveragePooling1D()(x1)\n",
"\n",
" # Branch LSTM\n",
" x2 = Bidirectional(LSTM(64, return_sequences=True))(main_input)\n",
" x2 = Bidirectional(LSTM(32))(x2)\n",
" x2 = BatchNormalization()(x2)\n",
"\n",
" # Solar parameters processing\n",
" x3 = Dense(32)(solar_input)\n",
" x3 = BatchNormalization()(x3)\n",
" x3 = Activation('relu')(x3)\n",
"\n",
" # Combine all branches\n",
" x = concatenate([x1, x2, x3])\n",
"\n",
" # Dense layers with non-negativity constraints\n",
" x = Dense(64, kernel_constraint=tf.keras.constraints.NonNeg())(x)\n",
" x = BatchNormalization()(x)\n",
" x = Activation('relu')(x)\n",
" x = Dropout(0.2)(x)\n",
"\n",
" x = Dense(32, kernel_constraint=tf.keras.constraints.NonNeg())(x)\n",
" x = BatchNormalization()(x)\n",
" x = Activation('relu')(x)\n",
"\n",
" # Output layer con vincoli di non-negatività\n",
" output = Dense(1,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" activation='relu')(x)\n",
"\n",
" model = Model(inputs=[main_input, solar_input], outputs=output, name=\"SolarRadiation\")\n",
" return model\n",
"\n",
"\n",
"def create_energy_model(input_shape):\n",
" \"\"\"\n",
" Modello migliorato per l'energia solare che sfrutta la relazione con la radiazione.\n",
" Include vincoli di non-negatività e migliore gestione delle dipendenze temporali.\n",
" \"\"\"\n",
" inputs = Input(shape=input_shape)\n",
"\n",
" # Branch 1: Elaborazione temporale con attention\n",
" # Multi-head attention per catturare relazioni temporali\n",
" x1 = MultiHeadAttention(num_heads=8, key_dim=32)(inputs, inputs)\n",
" x1 = BatchNormalization()(x1)\n",
" x1 = Activation('relu')(x1)\n",
"\n",
" # Temporal Convolution branch per catturare pattern locali\n",
" x2 = Conv1D(\n",
" filters=64,\n",
" kernel_size=3,\n",
" padding='same',\n",
" kernel_constraint=tf.keras.constraints.NonNeg()\n",
" )(inputs)\n",
" x2 = BatchNormalization()(x2)\n",
" x2 = Activation('relu')(x2)\n",
" x2 = Conv1D(\n",
" filters=32,\n",
" kernel_size=3,\n",
" padding='same',\n",
" kernel_constraint=tf.keras.constraints.NonNeg()\n",
" )(x2)\n",
" x2 = BatchNormalization()(x2)\n",
" x2 = Activation('relu')(x2)\n",
"\n",
" # LSTM branch per memoria a lungo termine\n",
" x3 = LSTM(64, return_sequences=True)(inputs)\n",
" x3 = LSTM(32, return_sequences=False)(x3)\n",
" x3 = BatchNormalization()(x3)\n",
" x3 = Activation('relu')(x3)\n",
"\n",
" # Global pooling per ogni branch\n",
" x1 = GlobalAveragePooling1D()(x1)\n",
" x2 = GlobalAveragePooling1D()(x2)\n",
"\n",
" # Concatena tutti i branch\n",
" x = concatenate([x1, x2, x3])\n",
"\n",
" # Dense layers con vincoli di non-negatività\n",
" x = Dense(\n",
" 128,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" kernel_regularizer=l2(0.01)\n",
" )(x)\n",
" x = BatchNormalization()(x)\n",
" x = Activation('relu')(x)\n",
" x = Dropout(0.3)(x)\n",
"\n",
" x = Dense(\n",
" 64,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" kernel_regularizer=l2(0.01)\n",
" )(x)\n",
" x = BatchNormalization()(x)\n",
" x = Activation('relu')(x)\n",
" x = Dropout(0.2)(x)\n",
"\n",
" # Output layer con vincolo di non-negatività\n",
" output = Dense(\n",
" 1,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" activation='relu', # Garantisce output non negativo\n",
" kernel_regularizer=l2(0.01)\n",
" )(x)\n",
"\n",
" model = Model(inputs=inputs, outputs=output, name=\"SolarEnergy\")\n",
" return model\n",
"\n",
"\n",
"def create_uv_model(input_shape):\n",
" \"\"\"\n",
" Modello migliorato per l'indice UV che sfrutta sia radiazione che energia solare.\n",
" Include vincoli di non-negatività e considera le relazioni non lineari tra le variabili.\n",
" \"\"\"\n",
" inputs = Input(shape=input_shape)\n",
"\n",
" # CNN branch per pattern locali\n",
" x1 = Conv1D(\n",
" filters=64,\n",
" kernel_size=3,\n",
" padding='same',\n",
" kernel_constraint=tf.keras.constraints.NonNeg()\n",
" )(inputs)\n",
" x1 = BatchNormalization()(x1)\n",
" x1 = Activation('relu')(x1)\n",
" x1 = MaxPooling1D(pool_size=2)(x1)\n",
"\n",
" x1 = Conv1D(\n",
" filters=32,\n",
" kernel_size=3,\n",
" padding='same',\n",
" kernel_constraint=tf.keras.constraints.NonNeg()\n",
" )(x1)\n",
" x1 = BatchNormalization()(x1)\n",
" x1 = Activation('relu')(x1)\n",
" x1 = GlobalAveragePooling1D()(x1)\n",
"\n",
" # Attention branch per relazioni complesse\n",
" # Specialmente utile per le relazioni con radiazione ed energia\n",
" x2 = MultiHeadAttention(num_heads=4, key_dim=32)(inputs, inputs)\n",
" x2 = BatchNormalization()(x2)\n",
" x2 = Activation('relu')(x2)\n",
" x2 = GlobalAveragePooling1D()(x2)\n",
"\n",
" # Dense branch per le feature più recenti\n",
" x3 = GlobalAveragePooling1D()(inputs)\n",
" x3 = Dense(\n",
" 64,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" kernel_regularizer=l2(0.01)\n",
" )(x3)\n",
" x3 = BatchNormalization()(x3)\n",
" x3 = Activation('relu')(x3)\n",
"\n",
" # Fusion dei branch\n",
" x = concatenate([x1, x2, x3])\n",
"\n",
" # Dense layers con vincoli di non-negatività\n",
" x = Dense(\n",
" 128,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" kernel_regularizer=l2(0.01)\n",
" )(x)\n",
" x = BatchNormalization()(x)\n",
" x = Activation('relu')(x)\n",
" x = Dropout(0.3)(x)\n",
"\n",
" x = Dense(\n",
" 64,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" kernel_regularizer=l2(0.01)\n",
" )(x)\n",
" x = BatchNormalization()(x)\n",
" x = Activation('relu')(x)\n",
" x = Dropout(0.2)(x)\n",
"\n",
" # Output layer con vincolo di non-negatività\n",
" output = Dense(\n",
" 1,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" activation='relu', # Garantisce output non negativo\n",
" kernel_regularizer=l2(0.01)\n",
" )(x)\n",
"\n",
" model = Model(inputs=inputs, outputs=output, name=\"SolarUV\")\n",
" return model\n",
"\n", "\n",
"\n", "\n",
"class CustomCallback(tf.keras.callbacks.Callback):\n", "class CustomCallback(tf.keras.callbacks.Callback):\n",
@ -2434,7 +2236,9 @@
" solar_params_val,\n", " solar_params_val,\n",
" scalers=radiation_scalers,\n", " scalers=radiation_scalers,\n",
" **training_params\n", " **training_params\n",
")" ")\n",
"\n",
"predict_radiation()"
], ],
"outputs": [], "outputs": [],
"execution_count": null "execution_count": null

94
src/README.md Normal file
View File

@ -0,0 +1,94 @@
src/data/data_loader.py:
- load_weather_data()
- load_olive_varieties()
- read_json_files()
- load_single_model_and_scalers()
- save_single_model_and_scalers()
src/data/data_processor.py:
- preprocess_weather_data()
- prepare_solar_data()
- prepare_transformer_data()
- create_sequences()
- encode_techniques()
- decode_techniques()
src/data/data_simulator.py:
- simulate_zone()
- simulate_olive_production_parallel()
- calculate_weather_effect()
- calculate_water_need()
- add_olive_water_consumption_correlation()
src/features/temporal_features.py:
- add_time_features()
- get_season()
- get_time_period()
- create_time_based_features()
src/features/weather_features.py:
- add_solar_features()
- add_solar_specific_features()
- add_environmental_features()
- calculate_vpd()
- add_weather_indicators()
src/features/olive_features.py:
- create_technique_mapping()
- add_olive_features()
- calculate_stress_index()
- calculate_quality_indicators()
- add_production_features()
src/models/transformer.py:
- create_olive_oil_transformer()
- OliveTransformerBlock
- PositionalEncoding
- DataAugmentation
src/models/layers.py:
- MultiScaleAttention
- TemporalConvBlock
- WeatherEmbedding
- OliveVarietyEmbedding
src/models/callbacks.py:
- CustomCallback
- WarmUpLearningRateSchedule
- MetricLogger
- EarlyStoppingWithBest
src/models/training.py:
- compile_model()
- setup_transformer_training()
- train_transformer()
- retrain_model()
- create_callbacks()
src/visualization/plots.py:
- plot_variety_comparison()
- plot_efficiency_vs_production()
- plot_water_efficiency_vs_production()
- plot_water_need_vs_oil_production()
- save_plot()
src/visualization/dashboard.py:
- create_production_dashboard()
- create_weather_dashboard()
- create_efficiency_dashboard()
- update_dashboard_data()
- create_forecast_view()
src/utils/metrics.py:
- calculate_real_error()
- evaluate_model_performance()
- calculate_efficiency_metrics()
- calculate_forecast_accuracy()
- compute_confidence_intervals()
src/utils/helpers.py:
- get_optimal_workers()
- clean_column_name()
- clean_column_names()
- to_camel_case()
- get_full_data()
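
The modules above are only listed by function name; as an orientation, here is a minimal, hypothetical sketch of how the loaders, processors and feature builders might be chained together, based on the signatures in the files added by this commit (file paths and the simulated dataset are placeholders, not part of the repo):

from src.data.data_loader import load_weather_data, load_olive_varieties
from src.data.data_processor import preprocess_weather_data, prepare_transformer_data
from src.features.temporal_features import add_time_features

# Hypothetical pipeline; the './data/...' paths are illustrative only
weather = load_weather_data('./data/weather_data.parquet', start_year=2010)
weather = add_time_features(weather)
monthly = preprocess_weather_data(weather)
olives = load_olive_varieties('./data/olive_varieties.csv')
# (train, y_train), (val, y_val), (test, y_test), scalers = prepare_transformer_data(simulated_df, olives)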

441
src/data/data_loader.py Normal file
View File

@ -0,0 +1,441 @@
import os
import json
import numpy as np
import pandas as pd
import joblib
import tensorflow as tf
from src.models.solar_models import create_uv_model, create_energy_model, create_radiation_model
from typing import Tuple, Optional
import datetime
def read_json_files(folder_path):
all_data = []
file_list = sorted(os.listdir(folder_path))
for filename in file_list:
if filename.endswith('.json'):
file_path = os.path.join(folder_path, filename)
try:
with open(file_path, 'r') as file:
data = json.load(file)
all_data.extend(data['days'])
except Exception as e:
print(f"Error processing file '{filename}': {str(e)}")
return all_data
def save_single_model_and_scalers(model, model_name, scalers=None, base_path='./kaggle/working/models'):
"""
Salva un singolo modello con tutti i suoi artefatti associati e multipli scaler.
Parameters:
-----------
model : keras.Model
Il modello da salvare
model_name : str
Nome del modello (es. 'solarradiation', 'solarenergy', 'uvindex')
scalers : dict, optional
Dizionario degli scaler associati al modello (es. {'X': x_scaler, 'y': y_scaler})
base_path : str
Percorso base dove salvare il modello
"""
if isinstance(base_path, list):
base_path = './kaggle/working/models'
# Crea la cartella base se non esiste
os.makedirs(base_path, exist_ok=True)
# Crea la sottocartella per il modello specifico
model_path = os.path.join(base_path, model_name)
os.makedirs(model_path, exist_ok=True)
try:
print(f"\nSalvataggio modello {model_name}...")
# 1. Salva il modello completo
model_file = os.path.join(model_path, 'model.keras')
model.save(model_file, save_format='keras')
print(f"- Salvato modello completo: {model_file}")
# 2. Salva i pesi separatamente
weights_path = os.path.join(model_path, 'weights')
os.makedirs(weights_path, exist_ok=True)
weight_file = os.path.join(weights_path, 'weights')
model.save_weights(weight_file)
print(f"- Salvati pesi: {weight_file}")
# 3. Salva il plot del modello
plot_path = os.path.join(model_path, f'{model_name}_architecture.png')
tf.keras.utils.plot_model(
model,
to_file=plot_path,
show_shapes=True,
show_layer_names=True,
rankdir='TB',
expand_nested=True,
dpi=150
)
print(f"- Salvato plot architettura: {plot_path}")
# 4. Salva il summary del modello
summary_path = os.path.join(model_path, f'{model_name}_summary.txt')
with open(summary_path, 'w') as f:
model.summary(print_fn=lambda x: f.write(x + '\n'))
print(f"- Salvato summary modello: {summary_path}")
# 5. Salva gli scaler se forniti
if scalers is not None:
scaler_path = os.path.join(model_path, 'scalers')
os.makedirs(scaler_path, exist_ok=True)
for scaler_name, scaler in scalers.items():
scaler_file = os.path.join(scaler_path, f'{scaler_name}_scaler.joblib')
joblib.dump(scaler, scaler_file)
print(f"- Salvato scaler {scaler_name}: {scaler_file}")
# 6. Salva la configurazione del modello
model_config = {
'has_solar_params': True if model_name == 'solarradiation' else False,
'scalers': list(scalers.keys()) if scalers else []
}
config_path = os.path.join(model_path, 'model_config.joblib')
joblib.dump(model_config, config_path)
print(f"- Salvata configurazione: {config_path}")
# 7. Crea un README specifico per il modello
readme_path = os.path.join(model_path, 'README.txt')
with open(readme_path, 'w') as f:
f.write(f"{model_name.upper()} Model Artifacts\n")
f.write("=" * (len(model_name) + 15) + "\n\n")
f.write("Directory structure:\n")
f.write("- model.keras: Complete model\n")
f.write("- weights/: Model weights\n")
f.write(f"- {model_name}_architecture.png: Visual representation of model architecture\n")
f.write(f"- {model_name}_summary.txt: Detailed model summary\n")
f.write("- model_config.joblib: Model configuration\n")
if scalers:
f.write("- scalers/: Directory containing model scalers\n")
for scaler_name in scalers.keys():
f.write(f" - {scaler_name}_scaler.joblib: {scaler_name} scaler\n")
print(f"\nTutti gli artefatti per {model_name} salvati in: {model_path}")
print(f"Consulta {readme_path} per i dettagli sulla struttura")
except Exception as e:
print(f"Errore nel salvataggio degli artefatti per {model_name}: {str(e)}")
raise
return model_path
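
A short usage sketch for the saver above; the model object and the fitted scalers are placeholders, only the call pattern is taken from this file:

from sklearn.preprocessing import MinMaxScaler

scalers = {'X': MinMaxScaler(), 'y': MinMaxScaler()}  # assume these were fitted during training
# 'model' is any compiled Keras model built elsewhere (placeholder)
path = save_single_model_and_scalers(model, 'solarenergy', scalers=scalers,
                                     base_path='./kaggle/working/models')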
def load_single_model_and_scalers(model_name, base_path='./kaggle/working/models'):
"""
Carica un singolo modello con tutti i suoi artefatti e scaler associati.
Parameters:
-----------
model_name : str
Nome del modello da caricare (es. 'solarradiation', 'solarenergy', 'uvindex')
base_path : str
Percorso base dove sono salvati i modelli
Returns:
--------
tuple
(model, scalers, model_config)
"""
model_path = os.path.join(base_path, model_name)
if not os.path.exists(model_path):
print(f"Directory del modello non trovata: {model_path}")
return None, None, None
try:
print(f"\nCaricamento modello {model_name}...")
# 1. Carica la configurazione del modello
config_path = os.path.join(model_path, 'model_config.joblib')
try:
model_config = joblib.load(config_path)
print("- Configurazione modello caricata")
except:
print("! Configurazione modello non trovata, usando configurazione di default")
model_config = {
'has_solar_params': True if model_name == 'solarradiation' else False,
'scalers': ['X', 'y']
}
# 2. Carica il modello
try:
# Prima prova a caricare il modello completo
model_file = os.path.join(model_path, 'model.keras')
model = tf.keras.models.load_model(model_file)
print(f"- Modello caricato da: {model_file}")
# Verifica i pesi
weights_path = os.path.join(model_path, 'weights', 'weights')
if os.path.exists(weights_path + '.index'):
model.load_weights(weights_path)
print("- Pesi verificati con successo")
except Exception as e:
print(f"! Errore nel caricamento del modello: {str(e)}")
print("Tentativo di ricostruzione del modello...")
try:
# Ricostruzione del modello
if model_name == 'solarradiation':
model = create_radiation_model(input_shape=(24, 8))
elif model_name == 'solarenergy':
model = create_energy_model(input_shape=(24, 8))
elif model_name == 'uvindex':
model = create_uv_model(input_shape=(24, 8))
else:
raise ValueError(f"Tipo di modello non riconosciuto: {model_name}")
# Carica i pesi
model.load_weights(weights_path)
print("- Modello ricostruito dai pesi con successo")
except Exception as e:
print(f"! Errore nella ricostruzione del modello: {str(e)}")
return None, None, None
# 3. Carica gli scaler
scalers = {}
scaler_path = os.path.join(model_path, 'scalers')
if os.path.exists(scaler_path):
print("\nCaricamento scaler:")
for scaler_file in os.listdir(scaler_path):
if scaler_file.endswith('_scaler.joblib'):
scaler_name = scaler_file.replace('_scaler.joblib', '')
scaler_file_path = os.path.join(scaler_path, scaler_file)
try:
scalers[scaler_name] = joblib.load(scaler_file_path)
print(f"- Caricato scaler {scaler_name}")
except Exception as e:
print(f"! Errore nel caricamento dello scaler {scaler_name}: {str(e)}")
else:
print("! Directory degli scaler non trovata")
# 4. Verifica integrità del modello
try:
# Verifica che il modello possa fare predizioni
if model_name == 'solarradiation':
dummy_input = [np.zeros((1, 24, 8)), np.zeros((1, 3))]
else:
dummy_input = np.zeros((1, 24, 8))
model.predict(dummy_input, verbose=0)
print("\n✓ Verifica integrità modello completata con successo")
except Exception as e:
print(f"\n! Attenzione: il modello potrebbe non funzionare correttamente: {str(e)}")
# 5. Carica e verifica il summary del modello
summary_path = os.path.join(model_path, f'{model_name}_summary.txt')
if os.path.exists(summary_path):
print("\nSummary del modello disponibile in:", summary_path)
# 6. Verifica il plot dell'architettura
plot_path = os.path.join(model_path, f'{model_name}_architecture.png')
if os.path.exists(plot_path):
print("Plot dell'architettura disponibile in:", plot_path)
print(f"\nCaricamento di {model_name} completato con successo!")
return model, scalers, model_config
except Exception as e:
print(f"\nErrore critico nel caricamento del modello {model_name}: {str(e)}")
return None, None, None
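
A matching load sketch for the function above; the three return values are (model, scalers, model_config), or (None, None, None) on failure. X_new is a placeholder array shaped like the training sequences:

model, scalers, config = load_single_model_and_scalers('solarenergy',
                                                       base_path='./kaggle/working/models')
if model is not None:
    X_new_scaled = scalers['X'].transform(X_new.reshape(-1, X_new.shape[-1])).reshape(X_new.shape)
    y_pred = scalers['y'].inverse_transform(model.predict(X_new_scaled, verbose=0))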
def load_weather_data(
data_path: str,
start_year: Optional[int] = None,
end_year: Optional[int] = None
) -> pd.DataFrame:
"""
Carica e preprocessa i dati meteorologici da file JSON o Parquet.
Parameters
----------
data_path : str
Percorso al file dei dati (può essere .json o .parquet)
start_year : int, optional
Anno di inizio per filtrare i dati
end_year : int, optional
Anno di fine per filtrare i dati
Returns
-------
pd.DataFrame
DataFrame contenente i dati meteo preprocessati
Examples
--------
>>> weather_data = load_weather_data('./data/weather_data.parquet', start_year=2010)
"""
try:
# Determina il tipo di file e carica di conseguenza
if data_path.endswith('.parquet'):
weather_data = pd.read_parquet(data_path)
elif data_path.endswith('.json'):
# Se è un file JSON, prima lo convertiamo in DataFrame
with open(data_path, 'r') as f:
raw_data = json.load(f)
weather_data = create_weather_dataset(raw_data)
else:
raise ValueError(f"Formato file non supportato: {data_path}")
# Converti la colonna datetime
weather_data['datetime'] = pd.to_datetime(weather_data['datetime'], errors='coerce')
# Filtra per anno se specificato
if start_year is not None:
weather_data = weather_data[weather_data['datetime'].dt.year >= start_year]
if end_year is not None:
weather_data = weather_data[weather_data['datetime'].dt.year <= end_year]
# Aggiungi colonne di data
weather_data['date'] = weather_data['datetime'].dt.date
weather_data['year'] = weather_data['datetime'].dt.year
weather_data['month'] = weather_data['datetime'].dt.month
weather_data['day'] = weather_data['datetime'].dt.day
# Rimuovi righe con datetime nullo
weather_data = weather_data.dropna(subset=['datetime'])
# Ordina per datetime
weather_data = weather_data.sort_values('datetime')
# Gestione valori mancanti nelle colonne principali
numeric_columns = weather_data.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
if weather_data[col].isnull().any():
# Interpolazione lineare per i valori mancanti
weather_data[col] = weather_data[col].interpolate(method='linear')
# Rimuovi eventuali duplicati
weather_data = weather_data.drop_duplicates(subset=['datetime'])
# Verifica la completezza dei dati
print(f"Dati caricati dal {weather_data['datetime'].min()} al {weather_data['datetime'].max()}")
print(f"Numero totale di records: {len(weather_data)}")
return weather_data
except Exception as e:
print(f"Errore nel caricamento dei dati meteo: {str(e)}")
raise
def create_weather_dataset(raw_data: list) -> pd.DataFrame:
"""
Converte i dati JSON grezzi in un DataFrame strutturato.
Parameters
----------
raw_data : list
Lista di dizionari contenenti i dati meteo
Returns
-------
pd.DataFrame
DataFrame strutturato con i dati meteo
"""
dataset = []
seen_datetimes = set()
for day in raw_data:
date = day['datetime']
for hour in day['hours']:
datetime_str = f"{date} {hour['datetime']}"
# Verifica duplicati
if datetime_str in seen_datetimes:
continue
seen_datetimes.add(datetime_str)
# Gestione preciptype
if isinstance(hour['preciptype'], list):
preciptype = "__".join(hour['preciptype'])
else:
preciptype = hour['preciptype'] if hour['preciptype'] else ""
# Gestione conditions
conditions = hour['conditions'].replace(', ', '__').replace(' ', '_').lower()
# Crea la riga
row = {
'datetime': datetime_str,
'temp': hour['temp'],
'feelslike': hour['feelslike'],
'humidity': hour['humidity'],
'dew': hour['dew'],
'precip': hour['precip'],
'snow': hour['snow'],
'preciptype': preciptype.lower(),
'windspeed': hour['windspeed'],
'winddir': hour['winddir'],
'pressure': hour['pressure'],
'cloudcover': hour['cloudcover'],
'visibility': hour['visibility'],
'solarradiation': hour['solarradiation'],
'solarenergy': hour['solarenergy'],
'uvindex': hour['uvindex'],
'conditions': conditions,
'tempmax': day['tempmax'],
'tempmin': day['tempmin'],
'precipprob': day['precipprob'],
'precipcover': day['precipcover']
}
dataset.append(row)
# Sort by datetime (this module imports the datetime package, so the full datetime.datetime path is needed)
dataset.sort(key=lambda x: datetime.datetime.strptime(x['datetime'], "%Y-%m-%d %H:%M:%S"))
return pd.DataFrame(dataset)
def load_olive_varieties(
data_path: str,
add_water_features: bool = True
) -> pd.DataFrame:
"""
Carica e preprocessa i dati delle varietà di olive.
Parameters
----------
data_path : str
Percorso al file dei dati
add_water_features : bool
Se True, aggiunge feature relative al consumo d'acqua
Returns
-------
pd.DataFrame
DataFrame contenente i dati delle varietà di olive
"""
try:
if data_path.endswith('.csv'):
olive_varieties = pd.read_csv(data_path)
elif data_path.endswith('.parquet'):
olive_varieties = pd.read_parquet(data_path)
else:
raise ValueError(f"Formato file non supportato: {data_path}")
# Se richiesto, aggiungi feature sul consumo d'acqua
if add_water_features and 'Fabbisogno Acqua Primavera (m³/ettaro)' not in olive_varieties.columns:
from src.data.data_simulator import add_olive_water_consumption_correlation
olive_varieties = add_olive_water_consumption_correlation(olive_varieties)
print(f"Dati varietà olive caricati: {len(olive_varieties)} varietà")
return olive_varieties
except Exception as e:
print(f"Errore nel caricamento dei dati delle varietà: {str(e)}")
raise

324
src/data/data_processor.py Normal file
View File

@ -0,0 +1,324 @@
# src/data/data_processor.py
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import os
from typing import Tuple, List, Dict, Optional, Union
from src.utils.helpers import clean_column_name  # used by prepare_transformer_data below
# NOTE: prepare_solar_data also calls add_advanced_features, which is not imported here;
# it is expected to be provided by one of the src.features modules.
def preprocess_weather_data(weather_df: pd.DataFrame) -> pd.DataFrame:
"""
Calcola statistiche mensili per ogni anno dai dati meteo.
Parameters
----------
weather_df : pd.DataFrame
DataFrame contenente i dati meteorologici
Returns
-------
pd.DataFrame
DataFrame con statistiche mensili
"""
# Calcola statistiche mensili per ogni anno
monthly_weather = weather_df.groupby(['year', 'month']).agg({
'temp': ['mean', 'min', 'max'],
'humidity': 'mean',
'precip': 'sum',
'windspeed': 'mean',
'cloudcover': 'mean',
'solarradiation': 'sum',
'solarenergy': 'sum',
'uvindex': 'max'
}).reset_index()
# Rinomina le colonne
monthly_weather.columns = ['year', 'month'] + [
f'{col[0]}_{col[1]}' for col in monthly_weather.columns[2:]
]
return monthly_weather
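
Because the multi-level aggregation is flattened with f'{col[0]}_{col[1]}', the returned frame has columns such as the following (a small illustrative check, not part of the module):

monthly = preprocess_weather_data(weather_df)  # weather_df must already contain 'year' and 'month'
print(monthly.columns.tolist())
# ['year', 'month', 'temp_mean', 'temp_min', 'temp_max', 'humidity_mean',
#  'precip_sum', 'windspeed_mean', 'cloudcover_mean', 'solarradiation_sum',
#  'solarenergy_sum', 'uvindex_max']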
def create_sequences(timesteps: int, X: np.ndarray, y: Optional[np.ndarray] = None) -> Union[
np.ndarray, Tuple[np.ndarray, np.ndarray]]:
"""
Crea sequenze temporali dai dati.
Parameters
----------
timesteps : int
Numero di timestep per ogni sequenza
X : array-like
Dati di input
y : array-like, optional
Target values
Returns
-------
tuple o array
Se y è fornito: (X_sequences, y_sequences)
Se y è None: X_sequences
"""
Xs = []
for i in range(len(X) - timesteps):
Xs.append(X[i:i + timesteps])
if y is not None:
ys = []
for i in range(len(X) - timesteps):
ys.append(y[i + timesteps])
return np.array(Xs), np.array(ys)
return np.array(Xs)
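
A tiny worked example of the windowing above: with timesteps=3 and 5 rows the function yields 2 windows and, when targets are passed, the value that follows each window.

import numpy as np

X = np.arange(10).reshape(5, 2)   # 5 timesteps, 2 features
y = np.arange(5)
X_seq, y_seq = create_sequences(3, X, y)
print(X_seq.shape, y_seq.shape)   # (2, 3, 2) (2,)
print(y_seq)                      # [3 4] -> the target at t + timesteps for each window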
def prepare_solar_data(weather_data: pd.DataFrame, features: List[str]) -> Tuple:
"""
Prepara i dati per i modelli solari.
Parameters
----------
weather_data : pd.DataFrame
DataFrame contenente i dati meteorologici
features : list
Lista delle feature da utilizzare
Returns
-------
tuple
(X_scaled, scaler_X, y_scaled, scaler_y, data_after_2010)
"""
# Aggiunge le caratteristiche temporali
weather_data = add_advanced_features(weather_data)
weather_data = pd.get_dummies(weather_data, columns=['season', 'time_period'], drop_first=True)
# Filtra dati dopo 2010
data_after_2010 = weather_data[weather_data['year'] >= 2010].copy()
data_after_2010 = data_after_2010.sort_values('datetime')
data_after_2010.set_index('datetime', inplace=True)
# Interpola valori mancanti
target_variables = ['solarradiation', 'solarenergy', 'uvindex']
for column in target_variables:
data_after_2010[column] = data_after_2010[column].interpolate(method='time')
# Rimuovi righe con valori mancanti
data_after_2010.dropna(subset=features + target_variables, inplace=True)
# Prepara X e y
X = data_after_2010[features].values
y = data_after_2010[target_variables].values
# Normalizza features
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)
scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y)
return X_scaled, scaler_X, y_scaled, scaler_y, data_after_2010
def prepare_transformer_data(df: pd.DataFrame, olive_varieties_df: pd.DataFrame) -> Tuple:
"""
Prepare the data for the transformer model.
"""
# Copia del DataFrame
df = df.copy()
# Ordina per zona e anno
df = df.sort_values(['zone', 'year'])
# Feature definition
temporal_features = ['temp_mean', 'precip_sum', 'solar_energy_sum']
static_features = ['ha']
target_features = ['olive_prod', 'min_oil_prod', 'max_oil_prod', 'avg_oil_prod', 'total_water_need']
# Get clean varieties
all_varieties = olive_varieties_df['Varietà di Olive'].unique()
varieties = [clean_column_name(variety) for variety in all_varieties]
# Variety features structure
variety_features = [
'tech', 'pct', 'prod_t_ha', 'oil_prod_t_ha', 'oil_prod_l_ha',
'min_yield_pct', 'max_yield_pct', 'min_oil_prod_l_ha', 'max_oil_prod_l_ha',
'avg_oil_prod_l_ha', 'l_per_t', 'min_l_per_t', 'max_l_per_t', 'avg_l_per_t'
]
# Prepare columns
new_columns = {}
# Prepare features for each variety
for variety in varieties:
for feature in variety_features:
col_name = f"{variety}_{feature}"
if col_name in df.columns:
if feature != 'tech':
static_features.append(col_name)
# Binary features for cultivation techniques
for technique in ['tradizionale', 'intensiva', 'superintensiva']:
col_name = f"{variety}_{technique}"
new_columns[col_name] = df[f"{variety}_tech"].notna() & (
df[f"{variety}_tech"].str.lower() == technique
).fillna(False)
static_features.append(col_name)
# Add all new columns at once
new_df = pd.concat([df] + [pd.Series(v, name=k) for k, v in new_columns.items()], axis=1)
# Sort by zone and year
df_sorted = new_df.sort_values(['zone', 'year'])
# Window size definition
window_size = 41
# Prepare lists for data collection
temporal_sequences = []
static_features_list = []
targets_list = []
# Process data by zone
for zone in df_sorted['zone'].unique():
zone_data = df_sorted[df_sorted['zone'] == zone].reset_index(drop=True)
if len(zone_data) >= window_size:
for i in range(len(zone_data) - window_size + 1):
temporal_window = zone_data.iloc[i:i + window_size][temporal_features].values
if not np.isnan(temporal_window).any():
temporal_sequences.append(temporal_window)
static_features_list.append(zone_data.iloc[i + window_size - 1][static_features].values)
targets_list.append(zone_data.iloc[i + window_size - 1][target_features].values)
# Convert to numpy arrays
X_temporal = np.array(temporal_sequences)
X_static = np.array(static_features_list)
y = np.array(targets_list)
# Split data
indices = np.random.permutation(len(X_temporal))
train_idx = int(len(indices) * 0.65)
val_idx = int(len(indices) * 0.85)
train_indices = indices[:train_idx]
val_indices = indices[train_idx:val_idx]
test_indices = indices[val_idx:]
# Split datasets
X_temporal_train = X_temporal[train_indices]
X_temporal_val = X_temporal[val_indices]
X_temporal_test = X_temporal[test_indices]
X_static_train = X_static[train_indices]
X_static_val = X_static[val_indices]
X_static_test = X_static[test_indices]
y_train = y[train_indices]
y_val = y[val_indices]
y_test = y[test_indices]
# Standardization
scaler_temporal = StandardScaler()
scaler_static = StandardScaler()
scaler_y = StandardScaler()
# Apply standardization
X_temporal_train = scaler_temporal.fit_transform(X_temporal_train.reshape(-1, len(temporal_features))).reshape(
X_temporal_train.shape)
X_temporal_val = scaler_temporal.transform(X_temporal_val.reshape(-1, len(temporal_features))).reshape(
X_temporal_val.shape)
X_temporal_test = scaler_temporal.transform(X_temporal_test.reshape(-1, len(temporal_features))).reshape(
X_temporal_test.shape)
X_static_train = scaler_static.fit_transform(X_static_train)
X_static_val = scaler_static.transform(X_static_val)
X_static_test = scaler_static.transform(X_static_test)
y_train = scaler_y.fit_transform(y_train)
y_val = scaler_y.transform(y_val)
y_test = scaler_y.transform(y_test)
# Prepare input dictionaries
train_data = {'temporal': X_temporal_train, 'static': X_static_train}
val_data = {'temporal': X_temporal_val, 'static': X_static_val}
test_data = {'temporal': X_temporal_test, 'static': X_static_test}
# Save scalers
base_path = './kaggle/working/models/oil_transformer/'
os.makedirs(base_path, exist_ok=True)
joblib.dump(scaler_temporal, os.path.join(base_path, 'scaler_temporal.joblib'))
joblib.dump(scaler_static, os.path.join(base_path, 'scaler_static.joblib'))
joblib.dump(scaler_y, os.path.join(base_path, 'scaler_y.joblib'))
return (train_data, y_train), (val_data, y_val), (test_data, y_test), (scaler_temporal, scaler_static, scaler_y)
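
The return value packs the two input branches as dictionaries keyed 'temporal' and 'static'; a minimal unpacking sketch follows (simulated_df is a placeholder for the simulated production dataset, and the transformer itself lives in src/models/transformer.py):

(train_data, y_train), (val_data, y_val), (test_data, y_test), scalers = \
    prepare_transformer_data(simulated_df, olive_varieties)

print(train_data['temporal'].shape)  # (n_train, 41, 3): window_size x temporal features
print(train_data['static'].shape)    # (n_train, n_static_features)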
def encode_techniques(df: pd.DataFrame,
mapping_path: str = './kaggle/working/models/technique_mapping.joblib') -> pd.DataFrame:
"""
Codifica le tecniche di coltivazione usando un mapping salvato.
Parameters
----------
df : pd.DataFrame
DataFrame contenente le colonne delle tecniche
mapping_path : str
Percorso al file di mapping
Returns
-------
pd.DataFrame
DataFrame con le tecniche codificate
"""
if not os.path.exists(mapping_path):
raise FileNotFoundError(f"Mapping not found at {mapping_path}. Run create_technique_mapping first.")
technique_mapping = joblib.load(mapping_path)
# Trova tutte le colonne delle tecniche
tech_columns = [col for col in df.columns if col.endswith('_tech')]
# Applica il mapping a tutte le colonne delle tecniche
for col in tech_columns:
df[col] = df[col].str.lower().map(technique_mapping).fillna(0).astype(int)
return df
def decode_techniques(df: pd.DataFrame,
mapping_path: str = './kaggle/working/models/technique_mapping.joblib') -> pd.DataFrame:
"""
Decodifica le tecniche di coltivazione usando un mapping salvato.
Parameters
----------
df : pd.DataFrame
DataFrame contenente le colonne delle tecniche codificate
mapping_path : str
Percorso al file di mapping
Returns
-------
pd.DataFrame
DataFrame con le tecniche decodificate
"""
if not os.path.exists(mapping_path):
raise FileNotFoundError(f"Mapping not found at {mapping_path}")
technique_mapping = joblib.load(mapping_path)
reverse_mapping = {v: k for k, v in technique_mapping.items()}
reverse_mapping[0] = '' # Mapping per 0 a stringa vuota
# Trova tutte le colonne delle tecniche
tech_columns = [col for col in df.columns if col.endswith('_tech')]
# Applica il reverse mapping
for col in tech_columns:
df[col] = df[col].map(reverse_mapping)
return df
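
A small round-trip sketch for the two helpers above, assuming create_technique_mapping (see src/features/olive_features.py) has already written the mapping file:

df_encoded = encode_techniques(df.copy())   # '*_tech' strings -> integer codes (0 = missing)
df_decoded = decode_techniques(df_encoded)  # integer codes -> lowercase technique names ('' for 0)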

332
src/data/data_simulator.py Normal file
View File

@ -0,0 +1,332 @@
import pandas as pd
import numpy as np
from typing import Dict
from src.utils.helpers import clean_column_name
def calculate_weather_effect(row: pd.Series, optimal_temp: float) -> float:
"""
Calcola l'effetto delle condizioni meteorologiche sulla produzione.
Parameters
----------
row : pd.Series
Serie contenente i dati meteorologici
optimal_temp : float
Temperatura ottimale per la varietà
Returns
-------
float
Effetto combinato delle condizioni meteo
"""
# Effetti base
temp_effect = -0.1 * (row['temp_mean'] - optimal_temp) ** 2
rain_effect = -0.05 * (row['precip_sum'] - 600) ** 2 / 10000
sun_effect = 0.1 * row['solarenergy_sum'] / 1000
# Fattori di scala basati sulla fase di crescita
if row['growth_phase'] == 'dormancy':
temp_scale = 0.5
rain_scale = 0.2
sun_scale = 0.1
elif row['growth_phase'] == 'flowering':
temp_scale = 2.0
rain_scale = 1.5
sun_scale = 1.0
elif row['growth_phase'] == 'fruit_set':
temp_scale = 1.5
rain_scale = 1.0
sun_scale = 0.8
else: # ripening
temp_scale = 1.0
rain_scale = 0.5
sun_scale = 1.2
# Calcolo dell'effetto combinato
combined_effect = (
temp_scale * temp_effect +
rain_scale * rain_effect +
sun_scale * sun_effect
)
# Aggiustamenti specifici per fase
if row['growth_phase'] == 'flowering':
combined_effect -= 0.5 * max(0, row['precip_sum'] - 50) # Penalità per pioggia eccessiva
elif row['growth_phase'] == 'fruit_set':
combined_effect += 0.3 * max(0, row['temp_mean'] - (optimal_temp + 5)) # Bonus temperature alte
return combined_effect
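
A worked call with a hand-built row, just to show the scale of the numbers (the values are illustrative, not from real data):

import pandas as pd

row = pd.Series({'temp_mean': 22.0, 'precip_sum': 40.0,
                 'solarenergy_sum': 150.0, 'growth_phase': 'flowering'})
effect = calculate_weather_effect(row, optimal_temp=20.0)
# temp_effect = -0.1*(22-20)^2 = -0.4, rain_effect = -0.05*(40-600)^2/10000 ≈ -1.57,
# sun_effect = 0.1*150/1000 = 0.015; the flowering scales (2.0, 1.5, 1.0) give ≈ -3.14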
def calculate_water_need(weather_data: pd.Series, base_need: float, optimal_temp: float) -> float:
"""
Calcola il fabbisogno idrico basato su temperatura e precipitazioni.
Parameters
----------
weather_data : pd.Series
Serie contenente i dati meteorologici
base_need : float
Fabbisogno idrico base
optimal_temp : float
Temperatura ottimale per la varietà
Returns
-------
float
Fabbisogno idrico calcolato
"""
temp_factor = 1 + 0.05 * (weather_data['temp_mean'] - optimal_temp)
rain_factor = 1 - 0.001 * weather_data['precip_sum']
return base_need * temp_factor * rain_factor
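
Similarly for the water requirement: at 2 degrees above the optimum and precip_sum = 100, the base need is scaled by (1 + 0.05*2) * (1 - 0.001*100) = 1.10 * 0.90 = 0.99, e.g.:

import pandas as pd

need = calculate_water_need(pd.Series({'temp_mean': 22.0, 'precip_sum': 100.0}),
                            base_need=1500.0, optimal_temp=20.0)
# 1500 * 1.10 * 0.90 = 1485.0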
def add_olive_water_consumption_correlation(dataset: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge dati correlati al consumo d'acqua per ogni varietà di oliva.
Parameters
----------
dataset : pd.DataFrame
DataFrame contenente i dati delle varietà di olive
Returns
-------
pd.DataFrame
DataFrame con dati aggiuntivi sul consumo d'acqua
"""
# Dati simulati per il fabbisogno d'acqua e correlazione con temperatura
fabbisogno_acqua = {
"Nocellara dell'Etna": {"Primavera": 1200, "Estate": 2000, "Autunno": 1000, "Inverno": 500,
"Temperatura Ottimale": 18, "Resistenza": "Media"},
"Leccino": {"Primavera": 1000, "Estate": 1800, "Autunno": 800, "Inverno": 400, "Temperatura Ottimale": 20,
"Resistenza": "Alta"},
"Frantoio": {"Primavera": 1100, "Estate": 1900, "Autunno": 900, "Inverno": 450, "Temperatura Ottimale": 19,
"Resistenza": "Alta"},
"Coratina": {"Primavera": 1300, "Estate": 2200, "Autunno": 1100, "Inverno": 550, "Temperatura Ottimale": 17,
"Resistenza": "Media"},
"Moraiolo": {"Primavera": 1150, "Estate": 2100, "Autunno": 900, "Inverno": 480, "Temperatura Ottimale": 18,
"Resistenza": "Media"},
"Pendolino": {"Primavera": 1050, "Estate": 1850, "Autunno": 850, "Inverno": 430, "Temperatura Ottimale": 20,
"Resistenza": "Alta"},
"Taggiasca": {"Primavera": 1000, "Estate": 1750, "Autunno": 800, "Inverno": 400, "Temperatura Ottimale": 19,
"Resistenza": "Alta"},
"Canino": {"Primavera": 1100, "Estate": 1900, "Autunno": 900, "Inverno": 450, "Temperatura Ottimale": 18,
"Resistenza": "Media"},
"Itrana": {"Primavera": 1200, "Estate": 2000, "Autunno": 1000, "Inverno": 500, "Temperatura Ottimale": 17,
"Resistenza": "Media"},
"Ogliarola": {"Primavera": 1150, "Estate": 1950, "Autunno": 900, "Inverno": 480, "Temperatura Ottimale": 18,
"Resistenza": "Media"},
"Biancolilla": {"Primavera": 1050, "Estate": 1800, "Autunno": 850, "Inverno": 430, "Temperatura Ottimale": 19,
"Resistenza": "Alta"}
}
# Calcola fabbisogno idrico annuale
for varieta in fabbisogno_acqua:
fabbisogno_acqua[varieta]["Annuale"] = sum(
fabbisogno_acqua[varieta][stagione]
for stagione in ["Primavera", "Estate", "Autunno", "Inverno"]
)
# Aggiungi colonne al dataset
dataset["Fabbisogno Acqua Primavera (m³/ettaro)"] = dataset["Varietà di Olive"].apply(
lambda x: fabbisogno_acqua[x]["Primavera"])
dataset["Fabbisogno Acqua Estate (m³/ettaro)"] = dataset["Varietà di Olive"].apply(
lambda x: fabbisogno_acqua[x]["Estate"])
dataset["Fabbisogno Acqua Autunno (m³/ettaro)"] = dataset["Varietà di Olive"].apply(
lambda x: fabbisogno_acqua[x]["Autunno"])
dataset["Fabbisogno Acqua Inverno (m³/ettaro)"] = dataset["Varietà di Olive"].apply(
lambda x: fabbisogno_acqua[x]["Inverno"])
dataset["Fabbisogno Idrico Annuale (m³/ettaro)"] = dataset["Varietà di Olive"].apply(
lambda x: fabbisogno_acqua[x]["Annuale"])
dataset["Temperatura Ottimale"] = dataset["Varietà di Olive"].apply(
lambda x: fabbisogno_acqua[x]["Temperatura Ottimale"])
dataset["Resistenza alla Siccità"] = dataset["Varietà di Olive"].apply(
lambda x: fabbisogno_acqua[x]["Resistenza"])
return dataset
def simulate_zone(base_weather: pd.DataFrame,
olive_varieties: pd.DataFrame,
year: int,
zone: int,
all_varieties: np.ndarray,
variety_techniques: Dict) -> Dict:
"""
Simula la produzione di olive per una singola zona.
Parameters
----------
base_weather : pd.DataFrame
DataFrame contenente i dati meteo di base
olive_varieties : pd.DataFrame
DataFrame con le informazioni sulle varietà
year : int
Anno della simulazione
zone : int
ID della zona
all_varieties : np.ndarray
Array con tutte le varietà disponibili
variety_techniques : Dict
Dizionario con le tecniche disponibili per ogni varietà
Returns
-------
Dict
Dizionario con i risultati della simulazione
"""
# Crea una copia dei dati meteo per questa zona
zone_weather = base_weather.copy()
# Genera variazioni meteorologiche specifiche per questa zona
zone_weather['temp_mean'] *= np.random.uniform(0.95, 1.05, len(zone_weather))
zone_weather['precip_sum'] *= np.random.uniform(0.9, 1.1, len(zone_weather))
zone_weather['solarenergy_sum'] *= np.random.uniform(0.95, 1.05, len(zone_weather))
# Genera caratteristiche specifiche della zona
num_varieties = np.random.randint(1, 4) # 1-3 varietà per zona
selected_varieties = np.random.choice(all_varieties, size=num_varieties, replace=False)
hectares = np.random.uniform(1, 10) # Dimensione del terreno
percentages = np.random.dirichlet(np.ones(num_varieties)) # Distribuzione delle varietà
# Inizializzazione contatori annuali
annual_production = 0
annual_min_oil = 0
annual_max_oil = 0
annual_avg_oil = 0
annual_water_need = 0
# Inizializzazione dizionario dati varietà
variety_data = {clean_column_name(variety): {
'tech': '',
'pct': 0,
'prod_t_ha': 0,
'oil_prod_t_ha': 0,
'oil_prod_l_ha': 0,
'min_yield_pct': 0,
'max_yield_pct': 0,
'min_oil_prod_l_ha': 0,
'max_oil_prod_l_ha': 0,
'avg_oil_prod_l_ha': 0,
'l_per_t': 0,
'min_l_per_t': 0,
'max_l_per_t': 0,
'avg_l_per_t': 0,
'olive_prod': 0,
'min_oil_prod': 0,
'max_oil_prod': 0,
'avg_oil_prod': 0,
'water_need': 0
} for variety in all_varieties}
# Simula produzione per ogni varietà selezionata
for i, variety in enumerate(selected_varieties):
# Seleziona tecnica di coltivazione casuale per questa varietà
technique = np.random.choice(variety_techniques[variety])
percentage = percentages[i]
# Ottieni informazioni specifiche della varietà
variety_info = olive_varieties[
(olive_varieties['Varietà di Olive'] == variety) &
(olive_varieties['Tecnica di Coltivazione'] == technique)
].iloc[0]
# Calcola produzione base con variabilità
base_production = variety_info['Produzione (tonnellate/ettaro)'] * 1000 * percentage * hectares / 12
base_production *= np.random.uniform(0.9, 1.1)
# Calcola effetti meteo sulla produzione
weather_effect = zone_weather.apply(
lambda row: calculate_weather_effect(row, variety_info['Temperatura Ottimale']),
axis=1
)
monthly_production = base_production * (1 + weather_effect / 10000)
monthly_production *= np.random.uniform(0.95, 1.05, len(zone_weather))
# Calcola produzione annuale per questa varietà
annual_variety_production = monthly_production.sum()
# Calcola rese di olio con variabilità
min_yield_factor = np.random.uniform(0.95, 1.05)
max_yield_factor = np.random.uniform(0.95, 1.05)
avg_yield_factor = (min_yield_factor + max_yield_factor) / 2
min_oil_production = annual_variety_production * variety_info[
'Min Litri per Tonnellata'] / 1000 * min_yield_factor
max_oil_production = annual_variety_production * variety_info[
'Max Litri per Tonnellata'] / 1000 * max_yield_factor
avg_oil_production = annual_variety_production * variety_info[
'Media Litri per Tonnellata'] / 1000 * avg_yield_factor
# Calcola fabbisogno idrico
base_water_need = (
variety_info['Fabbisogno Acqua Primavera (m³/ettaro)'] +
variety_info['Fabbisogno Acqua Estate (m³/ettaro)'] +
variety_info['Fabbisogno Acqua Autunno (m³/ettaro)'] +
variety_info['Fabbisogno Acqua Inverno (m³/ettaro)']
) / 4
monthly_water_need = zone_weather.apply(
lambda row: calculate_water_need(row, base_water_need, variety_info['Temperatura Ottimale']),
axis=1
)
monthly_water_need *= np.random.uniform(0.95, 1.05, len(monthly_water_need))
annual_variety_water_need = monthly_water_need.sum() * percentage * hectares
# Aggiorna totali annuali
annual_production += annual_variety_production
annual_min_oil += min_oil_production
annual_max_oil += max_oil_production
annual_avg_oil += avg_oil_production
annual_water_need += annual_variety_water_need
# Aggiorna dati varietà
clean_variety = clean_column_name(variety)
variety_data[clean_variety].update({
'tech': clean_column_name(technique),
'pct': percentage,
'prod_t_ha': variety_info['Produzione (tonnellate/ettaro)'] * np.random.uniform(0.95, 1.05),
'oil_prod_t_ha': variety_info['Produzione Olio (tonnellate/ettaro)'] * np.random.uniform(0.95, 1.05),
'oil_prod_l_ha': variety_info['Produzione Olio (litri/ettaro)'] * np.random.uniform(0.95, 1.05),
'min_yield_pct': variety_info['Min % Resa'] * min_yield_factor,
'max_yield_pct': variety_info['Max % Resa'] * max_yield_factor,
'min_oil_prod_l_ha': variety_info['Min Produzione Olio (litri/ettaro)'] * min_yield_factor,
'max_oil_prod_l_ha': variety_info['Max Produzione Olio (litri/ettaro)'] * max_yield_factor,
'avg_oil_prod_l_ha': variety_info['Media Produzione Olio (litri/ettaro)'] * avg_yield_factor,
'l_per_t': variety_info['Litri per Tonnellata'] * np.random.uniform(0.98, 1.02),
'min_l_per_t': variety_info['Min Litri per Tonnellata'] * min_yield_factor,
'max_l_per_t': variety_info['Max Litri per Tonnellata'] * max_yield_factor,
'avg_l_per_t': variety_info['Media Litri per Tonnellata'] * avg_yield_factor,
'olive_prod': annual_variety_production,
'min_oil_prod': min_oil_production,
'max_oil_prod': max_oil_production,
'avg_oil_prod': avg_oil_production,
'water_need': annual_variety_water_need
})
# Appiattisci i dati delle varietà
flattened_variety_data = {
f'{variety}_{key}': value
for variety, data in variety_data.items()
for key, value in data.items()
}
# Restituisci il risultato della zona
return {
'year': year,
'zone_id': zone + 1,
'temp_mean': zone_weather['temp_mean'].mean(),
'precip_sum': zone_weather['precip_sum'].sum(),
'solar_energy_sum': zone_weather['solarenergy_sum'].sum(),
'ha': hectares,
'zone': f"zone_{zone + 1}",
'olive_prod': annual_production,
'min_oil_prod': annual_min_oil,
'max_oil_prod': annual_max_oil,
'avg_oil_prod': annual_avg_oil,
'total_water_need': annual_water_need,
**flattened_variety_data
}

220
src/features/olive_features.py Normal file
View File

@ -0,0 +1,220 @@
import pandas as pd
import numpy as np
import joblib
import os
from typing import Dict
def create_technique_mapping(olive_varieties: pd.DataFrame,
mapping_path: str = './kaggle/working/models/technique_mapping.joblib') -> Dict[str, int]:
"""
Crea un mapping numerico per le tecniche di coltivazione.
Parameters
----------
olive_varieties : pd.DataFrame
DataFrame contenente le varietà di olive e le tecniche
mapping_path : str
Percorso dove salvare il mapping
Returns
-------
Dict[str, int]
Dizionario di mapping tecnica -> codice numerico
"""
# Estrai tecniche uniche e convertile in lowercase
all_techniques = olive_varieties['Tecnica di Coltivazione'].str.lower().unique()
# Crea il mapping partendo da 1 (0 è riservato per valori mancanti)
technique_mapping = {tech: i + 1 for i, tech in enumerate(sorted(all_techniques))}
# Salva il mapping
os.makedirs(os.path.dirname(mapping_path), exist_ok=True)
joblib.dump(technique_mapping, mapping_path)
return technique_mapping
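
For the three techniques used elsewhere in this commit, the resulting mapping would look like this (assuming exactly those values appear in the 'Tecnica di Coltivazione' column):

mapping = create_technique_mapping(olive_varieties)
# {'intensiva': 1, 'superintensiva': 2, 'tradizionale': 3}  # sorted alphabetically, 0 reserved for missing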
def calculate_stress_index(weather_data: pd.DataFrame,
olive_info: pd.Series,
vpd_threshold: float = 2.0) -> float:
"""
Calcola l'indice di stress per le olive basato su condizioni ambientali.
Parameters
----------
weather_data : pd.DataFrame
Dati meteorologici
olive_info : pd.Series
Informazioni sulla varietà di oliva
vpd_threshold : float
Soglia VPD per lo stress
Returns
-------
float
Indice di stress calcolato
"""
# Calcola componenti di stress
temp_stress = np.where(
weather_data['temp'] > olive_info['Temperatura Ottimale'],
(weather_data['temp'] - olive_info['Temperatura Ottimale']) / 10,
0
)
water_stress = np.where(
weather_data['vpd'] > vpd_threshold,
(weather_data['vpd'] - vpd_threshold) / 2,
0
)
# Considera la resistenza alla siccità
resistance_factor = 1.0
if olive_info['Resistenza alla Siccità'] == 'Alta':
resistance_factor = 0.7
elif olive_info['Resistenza alla Siccità'] == 'Media':
resistance_factor = 0.85
# Calcola stress complessivo
total_stress = (temp_stress + water_stress * resistance_factor)
return total_stress.mean()
def calculate_quality_indicators(olive_data: pd.DataFrame,
weather_data: pd.DataFrame) -> pd.DataFrame:
"""
Calcola indicatori di qualità per le olive.
Parameters
----------
olive_data : pd.DataFrame
Dati sulle olive
weather_data : pd.DataFrame
Dati meteorologici
Returns
-------
pd.DataFrame
DataFrame con indicatori di qualità aggiunti
"""
result = olive_data.copy()
# Base quality indicators
result['oil_content_index'] = result['Max % Resa'] * (1 - result['stress_index'] * 0.1)
result['fruit_size_index'] = np.clip(
result['Produzione (tonnellate/ettaro)'] * (1 - result['water_stress'] * 0.15), 0, None
)
# Calcola indice di maturazione ottimale
optimal_harvest_conditions = (
(weather_data['temp'].between(15, 25)) &
(weather_data['humidity'].between(50, 70)) &
(weather_data['cloudcover'] < 60)
)
result['maturity_index'] = optimal_harvest_conditions.mean()
# Calcola indice di qualità complessivo
result['quality_index'] = (
result['oil_content_index'] * 0.4 +
result['fruit_size_index'] * 0.3 +
result['maturity_index'] * 0.3
)
return result
def add_olive_features(df: pd.DataFrame,
weather_data: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge feature specifiche per le olive.
Parameters
----------
df : pd.DataFrame
DataFrame delle varietà di olive
weather_data : pd.DataFrame
Dati meteorologici
Returns
-------
pd.DataFrame
DataFrame con feature aggiuntive
"""
result = df.copy()
# Calcola stress index per ogni varietà
result['stress_index'] = result.apply(
lambda row: calculate_stress_index(weather_data, row),
axis=1
)
# Aggiungi indicatori di qualità
result = calculate_quality_indicators(result, weather_data)
# Calcola efficienza produttiva
result['production_efficiency'] = result['Produzione (tonnellate/ettaro)'] / \
result['Fabbisogno Idrico Annuale (m³/ettaro)']
# Calcola indice di adattamento climatico
result['climate_adaptation'] = np.where(
result['Resistenza alla Siccità'] == 'Alta',
0.9,
np.where(result['Resistenza alla Siccità'] == 'Media', 0.7, 0.5)
)
# Aggiungi feature di produzione
result = add_production_features(result, weather_data)
return result
def add_production_features(df: pd.DataFrame,
weather_data: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge feature relative alla produzione di olive.
Parameters
----------
df : pd.DataFrame
DataFrame delle varietà di olive
weather_data : pd.DataFrame
Dati meteorologici
Returns
-------
pd.DataFrame
DataFrame con feature di produzione
"""
result = df.copy()
# Calcola i rapporti di produzione
result['oil_yield_ratio'] = result['Produzione Olio (tonnellate/ettaro)'] / result['Produzione (tonnellate/ettaro)']
result['water_efficiency'] = result['Produzione (tonnellate/ettaro)'] / result['Fabbisogno Idrico Annuale (m³/ettaro)']
# Calcola indici di produttività
result['productivity_index'] = (
result['oil_yield_ratio'] * 0.4 +
result['water_efficiency'] * 0.3 +
result['climate_adaptation'] * 0.3
)
# Aggiungi indicatori di rendimento
result['yield_stability'] = 1 - (
(result['Max % Resa'] - result['Min % Resa']) / result['Max % Resa']
)
result['oil_quality_potential'] = (
result['Max Litri per Tonnellata'] / 1000 * result['yield_stability'] * (1 - result['stress_index'] * 0.1)
)
# Calcola intervalli di produzione ottimale
result['optimal_production_lower'] = result['Produzione (tonnellate/ettaro)'] * 0.8
result['optimal_production_upper'] = result['Produzione (tonnellate/ettaro)'] * 1.2
# Aggiungi indici economici
result['economic_efficiency'] = (result['Produzione Olio (litri/ettaro)'] / result['Fabbisogno Idrico Annuale (m³/ettaro)']) * result['productivity_index']
return result

205
src/features/temporal_features.py Normal file
View File

@ -0,0 +1,205 @@
import pandas as pd
import numpy as np
from typing import Union, Optional
from datetime import datetime
def get_season(date: datetime) -> str:
"""
Determina la stagione in base alla data.
Parameters
----------
date : datetime
Data per cui determinare la stagione
Returns
-------
str
Nome della stagione ('Winter', 'Spring', 'Summer', 'Autumn')
"""
month = date.month
day = date.day
# Astronomical season boundaries: Dec 21, Mar 20, Jun 21, Sep 23
if (month == 12 and day >= 21) or month in (1, 2) or (month == 3 and day < 20):
return 'Winter'
elif (month == 3 and day >= 20) or month in (4, 5) or (month == 6 and day < 21):
return 'Spring'
elif (month == 6 and day >= 21) or month in (7, 8) or (month == 9 and day < 23):
return 'Summer'
else:
return 'Autumn'
def get_time_period(hour: int) -> str:
"""
Determina il periodo del giorno in base all'ora.
Parameters
----------
hour : int
Ora del giorno (0-23)
Returns
-------
str
Periodo del giorno ('Morning', 'Afternoon', 'Evening', 'Night')
"""
if 5 <= hour < 12:
return 'Morning'
elif 12 <= hour < 17:
return 'Afternoon'
elif 17 <= hour < 21:
return 'Evening'
else:
return 'Night'
def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge feature temporali al DataFrame.
Parameters
----------
df : pd.DataFrame
DataFrame contenente una colonna 'datetime'
Returns
-------
pd.DataFrame
DataFrame con feature temporali aggiuntive
"""
# Assicurati che datetime sia nel formato corretto
df['datetime'] = pd.to_datetime(df['datetime'])
# Feature temporali di base
df['timestamp'] = df['datetime'].astype(np.int64) // 10 ** 9
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour
df['minute'] = df['datetime'].dt.minute
# Feature cicliche
df['hour_sin'] = np.sin(df['hour'] * (2 * np.pi / 24))
df['hour_cos'] = np.cos(df['hour'] * (2 * np.pi / 24))
df['month_sin'] = np.sin(df['month'] * (2 * np.pi / 12))
df['month_cos'] = np.cos(df['month'] * (2 * np.pi / 12))
# Feature calendario
df['day_of_week'] = df['datetime'].dt.dayofweek
df['day_of_year'] = df['datetime'].dt.dayofyear
df['week_of_year'] = df['datetime'].dt.isocalendar().week.astype(int)
df['quarter'] = df['datetime'].dt.quarter
# Feature cicliche giorno dell'anno
df['day_of_year_sin'] = np.sin(df['day_of_year'] * (2 * np.pi / 365.25))
df['day_of_year_cos'] = np.cos(df['day_of_year'] * (2 * np.pi / 365.25))
# Flag speciali
df['is_month_end'] = df['datetime'].dt.is_month_end.astype(int)
df['is_quarter_end'] = df['datetime'].dt.is_quarter_end.astype(int)
df['is_year_end'] = df['datetime'].dt.is_year_end.astype(int)
# Periodi del giorno e stagioni
df['season'] = df['datetime'].apply(get_season)
df['time_period'] = df['hour'].apply(get_time_period)
return df
def create_time_based_features(
df: pd.DataFrame,
datetime_col: str = 'datetime',
add_cyclical: bool = True,
add_time_periods: bool = True,
add_seasons: bool = True,
custom_features: Optional[list] = None
) -> pd.DataFrame:
"""
Crea feature temporali personalizzate.
Parameters
----------
df : pd.DataFrame
DataFrame di input
datetime_col : str
Nome della colonna datetime
add_cyclical : bool
Se True, aggiunge feature cicliche
add_time_periods : bool
Se True, aggiunge periodi del giorno
add_seasons : bool
Se True, aggiunge stagioni
custom_features : list, optional
Lista di feature temporali personalizzate da aggiungere
Returns
-------
pd.DataFrame
DataFrame con le nuove feature temporali
"""
# Crea una copia del DataFrame
result = df.copy()
# Converti la colonna datetime se necessario
if not pd.api.types.is_datetime64_any_dtype(result[datetime_col]):
result[datetime_col] = pd.to_datetime(result[datetime_col])
# Feature temporali di base
result['year'] = result[datetime_col].dt.year
result['month'] = result[datetime_col].dt.month
result['day'] = result[datetime_col].dt.day
result['hour'] = result[datetime_col].dt.hour
result['day_of_week'] = result[datetime_col].dt.dayofweek
result['day_of_year'] = result[datetime_col].dt.dayofyear
# Feature cicliche
if add_cyclical:
# Ora
result['hour_sin'] = np.sin(result['hour'] * (2 * np.pi / 24))
result['hour_cos'] = np.cos(result['hour'] * (2 * np.pi / 24))
# Mese
result['month_sin'] = np.sin((result['month'] - 1) * (2 * np.pi / 12))
result['month_cos'] = np.cos((result['month'] - 1) * (2 * np.pi / 12))
# Giorno dell'anno
result['day_of_year_sin'] = np.sin((result['day_of_year'] - 1) * (2 * np.pi / 365.25))
result['day_of_year_cos'] = np.cos((result['day_of_year'] - 1) * (2 * np.pi / 365.25))
# Giorno della settimana
result['day_of_week_sin'] = np.sin(result['day_of_week'] * (2 * np.pi / 7))
result['day_of_week_cos'] = np.cos(result['day_of_week'] * (2 * np.pi / 7))
# Periodi del giorno
if add_time_periods:
result['time_period'] = result['hour'].apply(get_time_period)
# One-hot encoding del periodo del giorno
time_period_dummies = pd.get_dummies(result['time_period'], prefix='time_period')
result = pd.concat([result, time_period_dummies], axis=1)
# Stagioni
if add_seasons:
result['season'] = result[datetime_col].apply(get_season)
# One-hot encoding delle stagioni
season_dummies = pd.get_dummies(result['season'], prefix='season')
result = pd.concat([result, season_dummies], axis=1)
# Feature personalizzate
if custom_features:
for feature in custom_features:
if feature == 'is_weekend':
result['is_weekend'] = result['day_of_week'].isin([5, 6]).astype(int)
elif feature == 'is_business_hour':
# Weekday check uses day_of_week directly, so this flag does not require 'is_weekend' to exist
result['is_business_hour'] = ((result['hour'] >= 9) &
(result['hour'] < 18) &
(result['day_of_week'] < 5)).astype(int)
elif feature == 'season_progress':
result['season_progress'] = result.apply(
lambda x: (x['day_of_year'] % 91) / 91.0, axis=1
)
return result
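
A minimal usage sketch for the feature builder above; the column names are the ones created in the function, while the input frame is illustrative:

import pandas as pd

df = pd.DataFrame({'datetime': pd.date_range('2023-06-01', periods=48, freq='h')})
feats = create_time_based_features(df, add_cyclical=True, add_seasons=True,
                                   custom_features=['is_weekend', 'is_business_hour'])
print(feats[['hour', 'hour_sin', 'season', 'is_business_hour']].head())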

186
src/features/weather_features.py Normal file
View File

@ -0,0 +1,186 @@
import pandas as pd
import numpy as np
from typing import Union
def calculate_vpd(temp: Union[float, np.ndarray], humidity: Union[float, np.ndarray]) -> Union[float, np.ndarray]:
"""
Calcola il Deficit di Pressione di Vapore (VPD).
VPD è una misura della domanda evaporativa dell'aria.
Parameters
----------
temp : float or np.ndarray
Temperatura in Celsius
humidity : float or np.ndarray
Umidità relativa (0-100)
Returns
-------
float or np.ndarray
VPD in kPa
"""
# Pressione di vapore saturo (kPa)
es = 0.6108 * np.exp((17.27 * temp) / (temp + 237.3))
# Pressione di vapore attuale (kPa)
ea = es * (humidity / 100.0)
# VPD (kPa)
vpd = es - ea
    return np.maximum(vpd, 0)  # VPD cannot be negative
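As a quick sanity check (illustrative only): at 25 °C and 60% relative humidity the saturation vapour pressure is about 3.17 kPa and the actual vapour pressure about 1.90 kPa, so the VPD is roughly 1.27 kPa.

print(round(float(calculate_vpd(25.0, 60.0)), 2))  # expected ~1.27 kPa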
def add_solar_features(df: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge feature relative alla radiazione solare.
Parameters
----------
df : pd.DataFrame
DataFrame di input
Returns
-------
pd.DataFrame
DataFrame con feature solari aggiunte
"""
# Calcola angolo solare
df['solar_angle'] = np.sin(df['day_of_year'] * (2 * np.pi / 365.25)) * \
np.sin(df['hour'] * (2 * np.pi / 24))
# Interazioni tra feature rilevanti
df['cloud_temp_interaction'] = df['cloudcover'] * df['temp']
df['visibility_cloud_interaction'] = df['visibility'] * (100 - df['cloudcover'])
# Feature derivate
df['clear_sky_index'] = (100 - df['cloudcover']) / 100
df['temp_gradient'] = df['temp'] - df['tempmin']
# Feature di efficienza solare
df['solar_efficiency'] = df['solarenergy'] / (df['solarradiation'] + 1e-6) # evita divisione per zero
df['solar_temp_ratio'] = df['solarradiation'] / (df['temp'] + 273.15) # temperatura in Kelvin
return df
def add_solar_specific_features(df: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge feature specifiche per l'analisi solare.
Parameters
----------
df : pd.DataFrame
DataFrame di input
Returns
-------
pd.DataFrame
DataFrame con feature solari specifiche aggiunte
"""
# Angolo solare e durata del giorno
df['day_length'] = 12 + 3 * np.sin(2 * np.pi * (df['day_of_year'] - 81) / 365.25)
df['solar_noon'] = 12 - df['hour']
df['solar_elevation'] = np.sin(2 * np.pi * df['day_of_year'] / 365.25) * \
np.cos(2 * np.pi * df['solar_noon'] / 24)
# Interazioni
df['cloud_elevation'] = df['cloudcover'] * df['solar_elevation']
df['visibility_elevation'] = df['visibility'] * df['solar_elevation']
# Rolling features
df['cloud_rolling_12h'] = df['cloudcover'].rolling(window=12, min_periods=1).mean()
df['temp_rolling_12h'] = df['temp'].rolling(window=12, min_periods=1).mean()
# Feature di efficienza energetica
df['solar_energy_density'] = df['solarenergy'] / df['day_length']
df['cloud_impact'] = df['solarradiation'] * (1 - df['cloudcover'] / 100)
return df
def add_environmental_features(df: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge feature ambientali derivate.
Parameters
----------
df : pd.DataFrame
DataFrame di input
Returns
-------
pd.DataFrame
DataFrame con feature ambientali aggiunte
"""
# Calcola VPD
df['vpd'] = calculate_vpd(df['temp'], df['humidity'])
# Feature di stress idrico
df['water_stress_index'] = df['vpd'] * (1 - df['humidity'] / 100)
df['dryness_index'] = (df['temp'] - df['dew']) * (100 - df['humidity']) / 100
    # Comfort indices: the Rothfusz heat-index regression expects temperature in
    # Fahrenheit, so convert before applying it (above ~27 °C) and convert back.
    temp_f = df['temp'] * 9 / 5 + 32
    heat_index_f = (-42.379 + 2.04901523 * temp_f + 10.14333127 * df['humidity'] -
                    0.22475541 * temp_f * df['humidity'] - 0.00683783 * temp_f ** 2 -
                    0.05481717 * df['humidity'] ** 2 + 0.00122874 * temp_f ** 2 * df['humidity'] +
                    0.00085282 * temp_f * df['humidity'] ** 2 -
                    0.00000199 * temp_f ** 2 * df['humidity'] ** 2)
    df['heat_index'] = np.where(df['temp'] >= 27, (heat_index_f - 32) * 5 / 9, df['temp'])
# Rolling means per trend
windows = [3, 6, 12, 24] # ore
for window in windows:
df[f'temp_rolling_mean_{window}h'] = df['temp'].rolling(window=window, min_periods=1).mean()
df[f'humid_rolling_mean_{window}h'] = df['humidity'].rolling(window=window, min_periods=1).mean()
df[f'precip_rolling_sum_{window}h'] = df['precip'].rolling(window=window, min_periods=1).sum()
return df
def add_weather_indicators(df: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge indicatori meteorologici complessi.
Parameters
----------
df : pd.DataFrame
DataFrame di input
Returns
-------
pd.DataFrame
DataFrame con indicatori meteorologici aggiunti
"""
    # Atmospheric stability indicators: use a rolling standard deviation so the
    # column varies over time instead of collapsing to a single scalar.
    df['temp_stability'] = df['temp_rolling_mean_12h'].rolling(window=12, min_periods=2).std().fillna(0)
    df['pressure_tendency'] = df['pressure'].diff()
# Indicatori di precipitazioni
df['rain_intensity'] = np.where(
df['precip'] > 0,
df['precip'] / (df['precip_rolling_sum_24h'] + 1e-6),
0
)
df['dry_spell'] = (df['precip'] == 0).astype(int).groupby(
(df['precip'] != 0).cumsum()
).cumsum()
# Indicatori di comfort termico
df['apparent_temp'] = df['temp'] + 0.33 * df['vpd'] - 0.7 * df['windspeed'] - 4.0
df['frost_risk'] = (df['temp'] < 2).astype(int)
df['heat_stress'] = (df['temp'] > 30).astype(int) * (df['humidity'] > 70).astype(int)
# Indicatori di qualità dell'aria
df['stagnation_index'] = (df['windspeed'] < 5).astype(int) * (df['cloudcover'] > 80).astype(int)
df['visibility_index'] = df['visibility'] * (1 - df['cloudcover'] / 100)
# Indicatori agrometeorologici
df['growing_degree_days'] = np.maximum(0, df['temp'] - 10) # base 10°C
df['chill_hours'] = (df['temp'] < 7).astype(int)
df['evapotranspiration_proxy'] = df['vpd'] * df['solarradiation'] * (1 + 0.536 * df['windspeed'])
return df
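A note on ordering, with a small wiring sketch (the helper name build_weather_features is hypothetical): add_weather_indicators reads columns created by add_environmental_features (vpd, temp_rolling_mean_12h, precip_rolling_sum_24h), so the environmental features must be computed first.

def build_weather_features(raw_df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical convenience wrapper: enforce the required call order.
    df = raw_df.copy()
    df = add_environmental_features(df)   # creates vpd and the rolling statistics
    df = add_weather_indicators(df)       # consumes them
    return df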

207
src/models/callbacks.py Normal file
View File

@ -0,0 +1,207 @@
import tensorflow as tf
import numpy as np
from typing import Dict, Optional, List
import os
import json
from datetime import datetime
@tf.keras.saving.register_keras_serializable()
class CustomCallback(tf.keras.callbacks.Callback):
"""
Callback personalizzato per monitorare la non-negatività delle predizioni
e altre metriche durante il training.
"""
def __init__(self, validation_data: Optional[tuple] = None):
super().__init__()
self.validation_data = validation_data
def on_epoch_end(self, epoch: int, logs: Optional[Dict] = None):
try:
            if self.validation_data is not None:
                # model.predict handles both the single-array case and the list of
                # inputs used by the radiation model.
                val_x = self.validation_data[0]
                val_pred = self.model.predict(val_x, verbose=0)
                # Check non-negativity
                if np.any(val_pred < 0):
                    print("\nWarning: negative values detected in the predictions")
                    print(f"Min value: {np.min(val_pred)}")
                # Prediction statistics
                print(f"\nPrediction statistics for epoch {epoch}:")
                print(f"Min: {np.min(val_pred):.4f}")
                print(f"Max: {np.max(val_pred):.4f}")
                print(f"Mean: {np.mean(val_pred):.4f}")
                # Add the metrics to the logs
                if logs is not None:
                    logs['val_pred_min'] = np.min(val_pred)
                    logs['val_pred_max'] = np.max(val_pred)
                    logs['val_pred_mean'] = np.mean(val_pred)
        except Exception as e:
            print(f"\nWarning in CustomCallback: {str(e)}")
@tf.keras.saving.register_keras_serializable()
class WarmUpLearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
"""
Schedule del learning rate con warm-up lineare e decay esponenziale.
"""
def __init__(self, initial_learning_rate: float = 1e-3,
warmup_steps: int = 500,
decay_steps: int = 5000):
super().__init__()
self.initial_learning_rate = initial_learning_rate
self.warmup_steps = warmup_steps
self.decay_steps = decay_steps
def __call__(self, step):
warmup_pct = tf.cast(step, tf.float32) / self.warmup_steps
warmup_lr = self.initial_learning_rate * warmup_pct
decay_factor = tf.pow(0.1, tf.cast(step, tf.float32) / self.decay_steps)
decayed_lr = self.initial_learning_rate * decay_factor
return tf.where(step < self.warmup_steps, warmup_lr, decayed_lr)
def get_config(self):
return {
'initial_learning_rate': self.initial_learning_rate,
'warmup_steps': self.warmup_steps,
'decay_steps': self.decay_steps
}
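A usage sketch (assumed wiring, not taken verbatim from the training code in this commit): a LearningRateSchedule instance can be passed directly to a Keras optimizer in place of a fixed learning rate.

lr_schedule = WarmUpLearningRateSchedule(initial_learning_rate=1e-3, warmup_steps=500, decay_steps=5000)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)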
class MetricLogger(tf.keras.callbacks.Callback):
"""
Logger avanzato per metriche di training che salva i risultati in JSON
e crea grafici di progresso.
"""
def __init__(self, log_dir: str = './logs',
metric_list: Optional[List[str]] = None,
save_freq: int = 1):
super().__init__()
self.log_dir = log_dir
os.makedirs(log_dir, exist_ok=True)
self.metric_list = metric_list or ['loss', 'val_loss', 'mae', 'val_mae']
self.save_freq = save_freq
self.history = {metric: [] for metric in self.metric_list}
# Timestamp per il nome del file
self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
self.log_file = os.path.join(log_dir, f'metrics_{self.timestamp}.json')
def on_epoch_end(self, epoch: int, logs: Dict = None):
# Aggiorna lo storico
for metric in self.metric_list:
if metric in logs:
self.history[metric].append(float(logs[metric]))
# Salva i log periodicamente
if (epoch + 1) % self.save_freq == 0:
self._save_logs()
self._create_plots()
def _save_logs(self):
"""Salva i log in formato JSON."""
with open(self.log_file, 'w') as f:
json.dump({
'history': self.history,
'epochs': len(next(iter(self.history.values())))
}, f, indent=4)
def _create_plots(self):
"""Crea grafici delle metriche."""
import matplotlib.pyplot as plt
# Plot per ogni metrica
for metric in self.metric_list:
if metric in self.history and len(self.history[metric]) > 0:
plt.figure(figsize=(10, 6))
plt.plot(self.history[metric])
plt.title(f'Model {metric}')
plt.ylabel(metric)
plt.xlabel('Epoch')
plt.savefig(os.path.join(self.log_dir, f'{metric}_{self.timestamp}.png'))
plt.close()
class EarlyStoppingWithBest(tf.keras.callbacks.EarlyStopping):
"""
Early stopping avanzato che salva il miglior modello e fornisce
analisi dettagliate sulla convergenza.
"""
def __init__(self,
monitor: str = 'val_loss',
min_delta: float = 0,
patience: int = 0,
verbose: int = 0,
mode: str = 'auto',
baseline: Optional[float] = None,
restore_best_weights: bool = True,
start_from_epoch: int = 0):
super().__init__(
monitor=monitor,
min_delta=min_delta,
patience=patience,
verbose=verbose,
mode=mode,
baseline=baseline,
restore_best_weights=restore_best_weights,
start_from_epoch=start_from_epoch
)
self.best_epoch = 0
self.convergence_history = []
def on_epoch_end(self, epoch: int, logs: Optional[Dict] = None):
current = self.get_monitor_value(logs)
if current is None:
return
# Aggiungi il valore corrente alla storia
self.convergence_history.append(float(current))
# Calcola statistiche di convergenza
if len(self.convergence_history) > 1:
improvement = self.convergence_history[-2] - current
pct_improvement = (improvement / self.convergence_history[-2]) * 100
if self.verbose > 0:
print(f"\nEpoch {epoch + 1}: {self.monitor} improved by {pct_improvement:.2f}%")
# Aggiorna best_epoch se necessario
if self.monitor_op(current - self.min_delta, self.best):
self.best = current
self.best_epoch = epoch
self.wait = 0
else:
self.wait += 1
if self.wait >= self.patience:
self.stopped_epoch = epoch
self.model.stop_training = True
if self.restore_best_weights and self.best_weights is not None:
if self.verbose > 0:
print(f"\nRestoring model weights from epoch {self.best_epoch + 1}")
self.model.set_weights(self.best_weights)
def get_convergence_stats(self) -> Dict:
"""
Restituisce statistiche dettagliate sulla convergenza.
"""
if len(self.convergence_history) < 2:
return {}
improvements = np.diff(self.convergence_history)
return {
'best_epoch': self.best_epoch + 1,
'best_value': float(self.best),
'avg_improvement': float(np.mean(improvements)),
'total_improvement': float(self.convergence_history[0] - self.best),
'convergence_rate': float(np.mean(np.abs(improvements[1:] / improvements[:-1]))),
'final_value': float(self.convergence_history[-1])
}
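An illustrative attachment of the custom early stopping (the surrounding model.fit call and data are assumed, not shown here):

early_stop = EarlyStoppingWithBest(monitor='val_loss', patience=15, verbose=1)
# history = model.fit(x_train, y_train, validation_data=(x_val, y_val), callbacks=[early_stop])
# print(early_stop.get_convergence_stats())  # best epoch, improvements, convergence rate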

327
src/models/layers.py Normal file
View File

@ -0,0 +1,327 @@
import tensorflow as tf
from tensorflow.keras import layers
from typing import List, Optional
@tf.keras.saving.register_keras_serializable()
class MultiScaleAttention(layers.Layer):
"""
Layer di attenzione multi-scala per catturare pattern temporali a diverse granularità.
Attributes
----------
num_heads : int
Numero di teste di attenzione
head_dim : int
Dimensionalità per ogni testa
scales : List[int]
Lista delle scale temporali da considerare
"""
def __init__(
self,
num_heads: int = 8,
head_dim: int = 64,
scales: List[int] = [1, 2, 4],
dropout: float = 0.1,
**kwargs
):
super().__init__(**kwargs)
self.num_heads = num_heads
self.head_dim = head_dim
self.scales = scales
self.dropout = dropout
# Creiamo un'attention layer per ogni scala
self.attention_layers = [
layers.MultiHeadAttention(
num_heads=num_heads,
key_dim=head_dim,
dropout=dropout,
name=f'attention_scale_{scale}'
) for scale in scales
]
# Layer per combinare le diverse scale
self.combine = layers.Dense(
head_dim * num_heads,
activation='gelu',
name='scale_combination'
)
def call(self, inputs: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
# Lista per salvare gli output delle diverse scale
scale_outputs = []
for scale, attention in zip(self.scales, self.attention_layers):
# Applica max pooling per ridurre la sequenza alla scala corrente
if scale > 1:
pooled = tf.keras.layers.MaxPool1D(
pool_size=scale,
strides=scale
)(inputs)
else:
pooled = inputs
# Applica attenzione alla sequenza ridotta
attended = attention(pooled, pooled)
# Se necessario, riporta alla dimensione originale
if scale > 1:
attended = tf.keras.layers.UpSampling1D(size=scale)(attended)
# Taglia eventuali timestep in eccesso
attended = attended[:, :tf.shape(inputs)[1], :]
scale_outputs.append(attended)
# Concatena e combina gli output delle diverse scale
concatenated = tf.concat(scale_outputs, axis=-1)
output = self.combine(concatenated)
return output
def get_config(self) -> dict:
config = super().get_config()
config.update({
"num_heads": self.num_heads,
"head_dim": self.head_dim,
"scales": self.scales,
"dropout": self.dropout
})
return config
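A shape sanity check for the layer, as a sketch under assumed dimensions (hourly sequences of length 24 with 32 features):

x = tf.random.normal((2, 24, 32))                      # (batch, timesteps, features)
msa = MultiScaleAttention(num_heads=4, head_dim=16, scales=[1, 2, 4])
y = msa(x)
print(y.shape)                                         # (2, 24, 64) = head_dim * num_heads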
@tf.keras.saving.register_keras_serializable()
class TemporalConvBlock(layers.Layer):
"""
Blocco di convoluzione temporale con residual connection.
Attributes
----------
filters : int
Numero di filtri convoluzionali
kernel_sizes : List[int]
Lista delle dimensioni dei kernel da utilizzare
dilation_rates : List[int]
Lista dei tassi di dilatazione
"""
def __init__(
self,
filters: int = 64,
kernel_sizes: List[int] = [3, 5, 7],
dilation_rates: List[int] = [1, 2, 4],
dropout: float = 0.1,
**kwargs
):
super().__init__(**kwargs)
self.filters = filters
self.kernel_sizes = kernel_sizes
self.dilation_rates = dilation_rates
self.dropout = dropout
# Crea i layer convoluzionali
self.conv_layers = []
for k_size in kernel_sizes:
for d_rate in dilation_rates:
self.conv_layers.append(
layers.Conv1D(
filters=filters // (len(kernel_sizes) * len(dilation_rates)),
kernel_size=k_size,
dilation_rate=d_rate,
padding='same',
activation='gelu'
)
)
# Layer per il processing finale
self.combine = layers.Conv1D(filters, 1)
self.layernorm = layers.LayerNormalization()
        # Store the layer under a separate name so the dropout *rate* kept in
        # self.dropout (returned by get_config) is not overwritten.
        self.dropout_layer = layers.Dropout(dropout)
def call(self, inputs: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
# Lista per gli output di ogni convoluzione
conv_outputs = []
# Applica ogni combinazione di kernel size e dilation rate
for conv in self.conv_layers:
conv_outputs.append(conv(inputs))
# Concatena tutti gli output
concatenated = tf.concat(conv_outputs, axis=-1)
# Combinazione finale
x = self.combine(concatenated)
x = self.layernorm(x)
        x = self.dropout_layer(x, training=training)
# Residual connection
return x + inputs
def get_config(self) -> dict:
config = super().get_config()
config.update({
"filters": self.filters,
"kernel_sizes": self.kernel_sizes,
"dilation_rates": self.dilation_rates,
"dropout": self.dropout
})
return config
@tf.keras.saving.register_keras_serializable()
class WeatherEmbedding(layers.Layer):
"""
Layer per l'embedding di feature meteorologiche.
Combina embedding categorici e numerici.
Attributes
----------
embedding_dim : int
Dimensionalità dell'embedding
num_numerical : int
Numero di feature numeriche
categorical_features : dict
Dizionario con feature categoriche e loro cardinalità
"""
def __init__(
self,
embedding_dim: int = 32,
num_numerical: int = 8,
categorical_features: Optional[dict] = None,
**kwargs
):
super().__init__(**kwargs)
self.embedding_dim = embedding_dim
self.num_numerical = num_numerical
self.categorical_features = categorical_features or {
'season': 4,
'time_period': 4,
'weather_condition': 10
}
# Layer per feature numeriche
self.numerical_projection = layers.Dense(
embedding_dim,
activation='gelu'
)
# Layer per feature categoriche
self.categorical_embeddings = {
name: layers.Embedding(
input_dim=num_categories,
output_dim=embedding_dim
)
for name, num_categories in self.categorical_features.items()
}
# Layer di combinazione finale
self.combine = layers.Dense(embedding_dim, activation='gelu')
def call(self, inputs: dict) -> tf.Tensor:
# Processa feature numeriche
numerical = self.numerical_projection(inputs['numerical'])
# Lista per gli embedding categorici
categorical_outputs = []
# Processa ogni feature categorica
for name, embedding_layer in self.categorical_embeddings.items():
if name in inputs['categorical']:
embedded = embedding_layer(inputs['categorical'][name])
categorical_outputs.append(embedded)
# Combina tutti gli embedding
if categorical_outputs:
categorical = tf.reduce_mean(tf.stack(categorical_outputs, axis=1), axis=1)
combined = tf.concat([numerical, categorical], axis=-1)
else:
combined = numerical
return self.combine(combined)
def get_config(self) -> dict:
config = super().get_config()
config.update({
"embedding_dim": self.embedding_dim,
"num_numerical": self.num_numerical,
"categorical_features": self.categorical_features
})
return config
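A sketch of the input structure the embedding expects (dimensions assumed): a dense tensor of numerical features plus integer-encoded categorical features keyed by name.

emb = WeatherEmbedding(embedding_dim=32, num_numerical=8)
out = emb({
    'numerical': tf.random.normal((4, 8)),
    'categorical': {
        'season': tf.constant([0, 1, 2, 3]),
        'time_period': tf.constant([0, 1, 2, 3]),
    }
})
print(out.shape)  # (4, 32)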
@tf.keras.saving.register_keras_serializable()
class OliveVarietyEmbedding(layers.Layer):
"""
Layer per l'embedding delle varietà di olive e delle loro caratteristiche.
Attributes
----------
embedding_dim : int
Dimensionalità dell'embedding
num_varieties : int
Numero di varietà di olive
num_techniques : int
Numero di tecniche di coltivazione
"""
def __init__(
self,
embedding_dim: int = 32,
num_varieties: int = 11,
num_techniques: int = 3,
**kwargs
):
super().__init__(**kwargs)
self.embedding_dim = embedding_dim
self.num_varieties = num_varieties
self.num_techniques = num_techniques
# Embedding per varietà e tecniche
self.variety_embedding = layers.Embedding(
input_dim=num_varieties,
output_dim=embedding_dim
)
self.technique_embedding = layers.Embedding(
input_dim=num_techniques,
output_dim=embedding_dim
)
# Layer per feature continue
self.continuous_projection = layers.Dense(
embedding_dim,
activation='gelu'
)
# Layer di combinazione
self.combine = layers.Dense(embedding_dim, activation='gelu')
def call(self, inputs: dict) -> tf.Tensor:
# Embedding varietà
variety_embedded = self.variety_embedding(inputs['variety'])
# Embedding tecniche
technique_embedded = self.technique_embedding(inputs['technique'])
# Proiezione feature continue
continuous_projected = self.continuous_projection(inputs['continuous'])
# Combinazione
combined = tf.concat([
variety_embedded,
technique_embedded,
continuous_projected
], axis=-1)
return self.combine(combined)
def get_config(self) -> dict:
config = super().get_config()
config.update({
"embedding_dim": self.embedding_dim,
"num_varieties": self.num_varieties,
"num_techniques": self.num_techniques
})
return config

204
src/models/solar_models.py Normal file
View File

@ -0,0 +1,204 @@
import tensorflow as tf
from tensorflow.keras import layers
def create_radiation_model(input_shape, solar_params_shape=(3,)):
"""
Modello per la radiazione solare con vincoli di non-negatività.
"""
# Input layers
main_input = layers.Input(shape=input_shape, name='main_input')
solar_input = layers.Input(shape=solar_params_shape, name='solar_params')
# Branch CNN
x1 = layers.Conv1D(32, 3, padding='same')(main_input)
x1 = layers.BatchNormalization()(x1)
x1 = layers.Activation('relu')(x1)
x1 = layers.Conv1D(64, 3, padding='same')(x1)
x1 = layers.BatchNormalization()(x1)
x1 = layers.Activation('relu')(x1)
x1 = layers.GlobalAveragePooling1D()(x1)
# Branch LSTM
x2 = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(main_input)
x2 = layers.Bidirectional(layers.LSTM(32))(x2)
x2 = layers.BatchNormalization()(x2)
# Solar parameters processing
x3 = layers.Dense(32)(solar_input)
x3 = layers.BatchNormalization()(x3)
x3 = layers.Activation('relu')(x3)
# Combine all branches
x = layers.concatenate([x1, x2, x3])
# Dense layers with non-negativity constraints
x = layers.Dense(64, kernel_constraint=tf.keras.constraints.NonNeg())(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(32, kernel_constraint=tf.keras.constraints.NonNeg())(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)
# Output layer con vincoli di non-negatività
output = layers.Dense(1,
kernel_constraint=tf.keras.constraints.NonNeg(),
activation='relu')(x)
    model = tf.keras.Model(inputs=[main_input, solar_input], outputs=output, name="SolarRadiation")
return model
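A build sketch with assumed shapes (24 hourly timesteps, 16 weather features, 3 solar-geometry parameters); the compile settings here are illustrative, not the ones used elsewhere in this commit.

radiation_model = create_radiation_model(input_shape=(24, 16))
radiation_model.compile(optimizer='adam', loss='mse', metrics=['mae'])
radiation_model.summary()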
def create_energy_model(input_shape):
"""
Modello migliorato per l'energia solare che sfrutta la relazione con la radiazione.
Include vincoli di non-negatività e migliore gestione delle dipendenze temporali.
"""
inputs = layers.Input(shape=input_shape)
# Branch 1: Elaborazione temporale con attention
# Multi-head attention per catturare relazioni temporali
x1 = layers.MultiHeadAttention(num_heads=8, key_dim=32)(inputs, inputs)
x1 = layers.BatchNormalization()(x1)
x1 = layers.Activation('relu')(x1)
# Temporal Convolution branch per catturare pattern locali
x2 = layers.Conv1D(
filters=64,
kernel_size=3,
padding='same',
kernel_constraint=tf.keras.constraints.NonNeg()
)(inputs)
x2 = layers.BatchNormalization()(x2)
x2 = layers.Activation('relu')(x2)
x2 = layers.Conv1D(
filters=32,
kernel_size=3,
padding='same',
kernel_constraint=tf.keras.constraints.NonNeg()
)(x2)
x2 = layers.BatchNormalization()(x2)
x2 = layers.Activation('relu')(x2)
# LSTM branch per memoria a lungo termine
x3 = layers.LSTM(64, return_sequences=True)(inputs)
x3 = layers.LSTM(32, return_sequences=False)(x3)
x3 = layers.BatchNormalization()(x3)
x3 = layers.Activation('relu')(x3)
# Global pooling per ogni branch
x1 = layers.GlobalAveragePooling1D()(x1)
x2 = layers.GlobalAveragePooling1D()(x2)
# Concatena tutti i branch
x = layers.concatenate([x1, x2, x3])
# Dense layers con vincoli di non-negatività
x = layers.Dense(
128,
kernel_constraint=tf.keras.constraints.NonNeg(),
        kernel_regularizer=tf.keras.regularizers.l2(0.01)
)(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)
x = layers.Dropout(0.3)(x)
x = layers.Dense(
64,
kernel_constraint=tf.keras.constraints.NonNeg(),
        kernel_regularizer=tf.keras.regularizers.l2(0.01)
)(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)
x = layers.Dropout(0.2)(x)
# Output layer con vincolo di non-negatività
output = layers.Dense(
1,
kernel_constraint=tf.keras.constraints.NonNeg(),
activation='relu', # Garantisce output non negativo
        kernel_regularizer=tf.keras.regularizers.l2(0.01)
)(x)
    model = tf.keras.Model(inputs=inputs, outputs=output, name="SolarEnergy")
return model
def create_uv_model(input_shape):
"""
Modello migliorato per l'indice UV che sfrutta sia radiazione che energia solare.
Include vincoli di non-negatività e considera le relazioni non lineari tra le variabili.
"""
inputs = layers.Input(shape=input_shape)
# CNN branch per pattern locali
x1 = layers.Conv1D(
filters=64,
kernel_size=3,
padding='same',
kernel_constraint=tf.keras.constraints.NonNeg()
)(inputs)
x1 = layers.BatchNormalization()(x1)
x1 = layers.Activation('relu')(x1)
x1 = layers.MaxPooling1D(pool_size=2)(x1)
x1 = layers.Conv1D(
filters=32,
kernel_size=3,
padding='same',
kernel_constraint=tf.keras.constraints.NonNeg()
)(x1)
x1 = layers.BatchNormalization()(x1)
x1 = layers.Activation('relu')(x1)
x1 = layers.GlobalAveragePooling1D()(x1)
# Attention branch per relazioni complesse
# Specialmente utile per le relazioni con radiazione ed energia
x2 = layers.MultiHeadAttention(num_heads=4, key_dim=32)(inputs, inputs)
x2 = layers.BatchNormalization()(x2)
x2 = layers.Activation('relu')(x2)
x2 = layers.GlobalAveragePooling1D()(x2)
# Dense branch per le feature più recenti
x3 = layers.GlobalAveragePooling1D()(inputs)
x3 = layers.Dense(
64,
kernel_constraint=tf.keras.constraints.NonNeg(),
        kernel_regularizer=tf.keras.regularizers.l2(0.01)
)(x3)
x3 = layers.BatchNormalization()(x3)
x3 = layers.Activation('relu')(x3)
# Fusion dei branch
x = layers.concatenate([x1, x2, x3])
# Dense layers con vincoli di non-negatività
x = layers.Dense(
128,
kernel_constraint=tf.keras.constraints.NonNeg(),
        kernel_regularizer=tf.keras.regularizers.l2(0.01)
)(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)
x = layers.Dropout(0.3)(x)
x = layers.Dense(
64,
kernel_constraint=tf.keras.constraints.NonNeg(),
        kernel_regularizer=tf.keras.regularizers.l2(0.01)
)(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)
x = layers.Dropout(0.2)(x)
# Output layer con vincolo di non-negatività
output = layers.Dense(
1,
kernel_constraint=tf.keras.constraints.NonNeg(),
activation='relu', # Garantisce output non negativo
        kernel_regularizer=tf.keras.regularizers.l2(0.01)
)(x)
    model = tf.keras.Model(inputs=inputs, outputs=output, name="SolarUV")
return model

385
src/models/training.py Normal file
View File

@ -0,0 +1,385 @@
import tensorflow as tf
import numpy as np
from typing import Dict, Tuple, List
import os
import keras
from src.models.transformer import create_olive_oil_transformer
from src.models.callbacks import CustomCallback, WarmUpLearningRateSchedule
def compile_model(model: tf.keras.Model, learning_rate: float = 1e-3) -> tf.keras.Model:
"""
Compila il modello con le impostazioni ottimizzate.
Parameters
----------
model : tf.keras.Model
Modello da compilare
learning_rate : float
Learning rate iniziale
Returns
-------
tf.keras.Model
Modello compilato
"""
lr_schedule = WarmUpLearningRateSchedule(
initial_learning_rate=learning_rate,
warmup_steps=500,
decay_steps=5000
)
model.compile(
optimizer=tf.keras.optimizers.AdamW(
learning_rate=lr_schedule,
weight_decay=0.01
),
loss=tf.keras.losses.Huber(),
metrics=['mae']
)
return model
def create_callbacks(target_names: List[str],
val_data: Dict,
val_targets: np.ndarray) -> List[tf.keras.callbacks.Callback]:
"""
Crea i callbacks per il training del modello.
Parameters
----------
target_names : list
Lista dei nomi dei target
val_data : dict
Dati di validazione
val_targets : np.ndarray
Target di validazione
Returns
-------
list
Lista dei callbacks configurati
"""
class TargetSpecificMetric(tf.keras.callbacks.Callback):
def __init__(self, validation_data, target_names):
super().__init__()
self.validation_data = validation_data
self.target_names = target_names
        def on_epoch_end(self, epoch, logs=None):
            logs = logs if logs is not None else {}
x_val, y_val = self.validation_data
y_pred = self.model.predict(x_val, verbose=0)
for i, name in enumerate(self.target_names):
mae = np.mean(np.abs(y_val[:, i] - y_pred[:, i]))
logs[f'val_{name}_mae'] = mae
# Crea le cartelle per i checkpoint e i log
os.makedirs('./kaggle/working/models/oil_transformer/checkpoints', exist_ok=True)
os.makedirs('./kaggle/working/models/oil_transformer/logs', exist_ok=True)
callbacks = [
# Early Stopping
tf.keras.callbacks.EarlyStopping(
monitor='val_loss',
patience=20,
restore_best_weights=True,
min_delta=0.0005,
mode='min'
),
# Model Checkpoint
tf.keras.callbacks.ModelCheckpoint(
            filepath='./kaggle/working/models/oil_transformer/checkpoints/model_{epoch:02d}_{val_loss:.4f}.weights.h5',  # Keras 3 requires the .weights.h5 suffix when save_weights_only=True
monitor='val_loss',
save_best_only=True,
mode='min',
save_weights_only=True
),
# Target specific metrics
TargetSpecificMetric(
validation_data=(val_data, val_targets),
target_names=target_names
),
# Reduce LR on Plateau
tf.keras.callbacks.ReduceLROnPlateau(
monitor='val_loss',
factor=0.5,
patience=10,
min_lr=1e-6,
verbose=1
),
# TensorBoard logging
tf.keras.callbacks.TensorBoard(
log_dir='./kaggle/working/models/oil_transformer/logs',
histogram_freq=1,
write_graph=True,
update_freq='epoch'
)
]
return callbacks
def setup_transformer_training(train_data: Dict,
train_targets: np.ndarray,
val_data: Dict,
val_targets: np.ndarray) -> Tuple[tf.keras.Model, List, List[str]]:
"""
Configura e prepara il transformer con dimensioni dinamiche.
Parameters
----------
train_data : dict
Dati di training
train_targets : np.ndarray
Target di training
val_data : dict
Dati di validazione
val_targets : np.ndarray
Target di validazione
Returns
-------
tuple
(model, callbacks, target_names)
"""
# Estrai le shape dai dati
temporal_shape = (train_data['temporal'].shape[1], train_data['temporal'].shape[2])
static_shape = (train_data['static'].shape[1],)
num_outputs = train_targets.shape[1]
    print("Detected shapes:")
print(f"- Temporal shape: {temporal_shape}")
print(f"- Static shape: {static_shape}")
print(f"- Numero di output: {num_outputs}")
# Target names
target_names = ['olive_prod', 'min_oil_prod', 'max_oil_prod', 'avg_oil_prod', 'total_water_need']
assert len(target_names) == num_outputs, \
f"Il numero di target names ({len(target_names)}) non corrisponde al numero di output ({num_outputs})"
# Crea il modello
model = create_olive_oil_transformer(
temporal_shape=temporal_shape,
static_shape=static_shape,
num_outputs=num_outputs
)
# Compila il modello
model = compile_model(model)
# Crea i callbacks
callbacks = create_callbacks(target_names, val_data, val_targets)
return model, callbacks, target_names
def train_transformer(train_data: Dict,
train_targets: np.ndarray,
val_data: Dict,
val_targets: np.ndarray,
epochs: int = 150,
batch_size: int = 64,
save_name: str = 'final_model') -> Tuple[tf.keras.Model, tf.keras.callbacks.History]:
"""
Funzione principale per l'addestramento del transformer.
Parameters
----------
train_data : dict
Dati di training
train_targets : np.ndarray
Target di training
val_data : dict
Dati di validazione
val_targets : np.ndarray
Target di validazione
epochs : int
Numero di epoche
batch_size : int
Dimensione del batch
save_name : str
Nome per salvare il modello
Returns
-------
tuple
(model, history)
"""
# Setup del modello
model, callbacks, target_names = setup_transformer_training(
train_data, train_targets, val_data, val_targets
)
# Mostra il summary del modello
model.summary()
os.makedirs(f"./kaggle/working/models/oil_transformer/", exist_ok=True)
keras.utils.plot_model(model, f"./kaggle/working/models/oil_transformer/{save_name}.png", show_shapes=True)
# Training
history = model.fit(
x=train_data,
y=train_targets,
validation_data=(val_data, val_targets),
epochs=epochs,
batch_size=batch_size,
callbacks=callbacks,
verbose=1,
shuffle=True
)
# Salva il modello
save_path = f'./kaggle/working/models/oil_transformer/{save_name}.keras'
    model.save(save_path)  # the .keras extension already selects the native Keras format
    os.makedirs(f'./kaggle/working/models/oil_transformer/weights/', exist_ok=True)
    model.save_weights(f'./kaggle/working/models/oil_transformer/weights/{save_name}.weights.h5')
print(f"\nModello salvato in: {save_path}")
return model, history
def retrain_model(base_model: tf.keras.Model,
train_data: Dict,
train_targets: np.ndarray,
val_data: Dict,
val_targets: np.ndarray,
test_data: Dict,
test_targets: np.ndarray,
epochs: int = 50,
batch_size: int = 128) -> Tuple[tf.keras.Model, tf.keras.callbacks.History, Dict]:
"""
Implementa il retraining del modello con i dati combinati.
Parameters
----------
base_model : tf.keras.Model
        Modello base da riaddestrare
train_data : dict
Dati di training
train_targets : np.ndarray
Target di training
val_data : dict
Dati di validazione
val_targets : np.ndarray
Target di validazione
test_data : dict
Dati di test
test_targets : np.ndarray
Target di test
epochs : int
Numero di epoche
batch_size : int
Dimensione del batch
Returns
-------
tuple
(model, history, final_metrics)
"""
print("Valutazione performance iniziali del modello...")
initial_metrics = {
'train': evaluate_model_performance(base_model, train_data, train_targets, "training"),
'val': evaluate_model_performance(base_model, val_data, val_targets, "validazione"),
'test': evaluate_model_performance(base_model, test_data, test_targets, "test")
}
# Combina i dati
combined_data = {
'temporal': np.concatenate([
train_data['temporal'],
val_data['temporal'],
test_data['temporal']
]),
'static': np.concatenate([
train_data['static'],
val_data['static'],
test_data['static']
])
}
combined_targets = np.concatenate([train_targets, val_targets, test_targets])
# Nuova suddivisione
indices = np.arange(len(combined_targets))
np.random.shuffle(indices)
split_idx = int(len(indices) * 0.9)
train_idx, val_idx = indices[:split_idx], indices[split_idx:]
# Prepara i dati per il retraining
retrain_data = {k: v[train_idx] for k, v in combined_data.items()}
retrain_targets = combined_targets[train_idx]
retrain_val_data = {k: v[val_idx] for k, v in combined_data.items()}
retrain_val_targets = combined_targets[val_idx]
# Callbacks
callbacks = [
tf.keras.callbacks.EarlyStopping(
monitor='val_loss',
patience=10,
restore_best_weights=True,
min_delta=0.0001
),
tf.keras.callbacks.ReduceLROnPlateau(
monitor='val_loss',
factor=0.2,
patience=5,
min_lr=1e-6,
verbose=1
),
tf.keras.callbacks.ModelCheckpoint(
            filepath='./kaggle/working/models/oil_transformer/retrain_checkpoints/model_{epoch:02d}_{val_loss:.4f}.weights.h5',  # weights-only checkpoints need the .weights.h5 suffix
monitor='val_loss',
save_best_only=True,
mode='min',
save_weights_only=True
)
]
# Ricompila il modello
base_model = compile_model(
base_model,
learning_rate=1e-4 # Learning rate più basso per il fine-tuning
)
print("\nAvvio retraining...")
history = base_model.fit(
retrain_data,
retrain_targets,
validation_data=(retrain_val_data, retrain_val_targets),
epochs=epochs,
batch_size=batch_size,
callbacks=callbacks,
verbose=1
)
print("\nValutazione performance finali...")
final_metrics = {
'train': evaluate_model_performance(base_model, train_data, train_targets, "training"),
'val': evaluate_model_performance(base_model, val_data, val_targets, "validazione"),
'test': evaluate_model_performance(base_model, test_data, test_targets, "test")
}
# Salva il modello
save_path = './kaggle/working/models/oil_transformer/retrained_model.keras'
    base_model.save(save_path)
print(f"\nModello riaddestrato salvato in: {save_path}")
# Report miglioramenti
print("\nMiglioramenti delle performance:")
for dataset in ['train', 'val', 'test']:
print(f"\nSet {dataset}:")
for metric in initial_metrics[dataset].keys():
initial = initial_metrics[dataset][metric]
final = final_metrics[dataset][metric]
improvement = ((initial - final) / initial) * 100
print(f"{metric}: {improvement:.2f}% di miglioramento")
return base_model, history, final_metrics

332
src/models/transformer.py Normal file
View File

@ -0,0 +1,332 @@
import tensorflow as tf
from tensorflow.keras import layers
from typing import Tuple, Optional, List
@tf.keras.saving.register_keras_serializable()
class DataAugmentation(layers.Layer):
"""
Layer personalizzato per l'augmentation dei dati temporali.
Attributes
----------
noise_stddev : float
Deviazione standard del rumore gaussiano
"""
def __init__(self, noise_stddev: float = 0.03, **kwargs):
super().__init__(**kwargs)
self.noise_stddev = noise_stddev
def call(self, inputs: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
"""
Applica l'augmentation durante il training.
Parameters
----------
inputs : tf.Tensor
Dati di input
training : bool, optional
Flag che indica se siamo in fase di training
Returns
-------
tf.Tensor
Dati aumentati se in training, altrimenti dati originali
"""
if training:
return inputs + tf.random.normal(
shape=tf.shape(inputs),
mean=0.0,
stddev=self.noise_stddev
)
return inputs
def get_config(self) -> dict:
config = super().get_config()
config.update({"noise_stddev": self.noise_stddev})
return config
@tf.keras.saving.register_keras_serializable()
class PositionalEncoding(layers.Layer):
"""
Layer per l'encoding posizionale nel transformer.
Attributes
----------
d_model : int
Dimensionalità del modello
"""
def __init__(self, d_model: int, **kwargs):
super().__init__(**kwargs)
self.d_model = d_model
def build(self, input_shape: tf.TensorShape):
"""
Costruisce la matrice di encoding posizionale.
Parameters
----------
input_shape : tf.TensorShape
Shape dell'input
"""
_, seq_length, _ = input_shape
# Crea la matrice di encoding posizionale
position = tf.range(seq_length, dtype=tf.float32)[:, tf.newaxis]
div_term = tf.exp(
tf.range(0, self.d_model, 2, dtype=tf.float32) *
(-tf.math.log(10000.0) / self.d_model)
)
        # Compute the sin/cos components (the interleaving below builds the final matrix)
pos_encoding_even = tf.sin(position * div_term)
pos_encoding_odd = tf.cos(position * div_term)
# Assegna i valori alle posizioni pari e dispari
pos_encoding = tf.concat(
[tf.expand_dims(pos_encoding_even, -1),
tf.expand_dims(pos_encoding_odd, -1)],
axis=-1
)
pos_encoding = tf.reshape(pos_encoding, (1, seq_length, -1))
pos_encoding = pos_encoding[:, :, :self.d_model]
# Salva l'encoding come peso non trainabile
self.pos_encoding = self.add_weight(
shape=(1, seq_length, self.d_model),
initializer=tf.keras.initializers.Constant(pos_encoding),
trainable=False,
name='positional_encoding'
)
super().build(input_shape)
def call(self, inputs: tf.Tensor) -> tf.Tensor:
"""
Applica l'encoding posizionale.
Parameters
----------
inputs : tf.Tensor
Dati di input
Returns
-------
tf.Tensor
Dati con encoding posizionale aggiunto
"""
batch_size = tf.shape(inputs)[0]
return inputs + tf.tile(self.pos_encoding, [batch_size, 1, 1])
def get_config(self) -> dict:
config = super().get_config()
config.update({"d_model": self.d_model})
return config
@tf.keras.saving.register_keras_serializable()
class OliveTransformerBlock(layers.Layer):
"""
Blocco transformer personalizzato per dati di produzione olive.
Attributes
----------
num_heads : int
Numero di teste di attenzione
key_dim : int
Dimensione delle chiavi
ff_dim : int
Dimensione del feed-forward network
dropout : float
Tasso di dropout
"""
def __init__(self, num_heads: int, key_dim: int, ff_dim: int, dropout: float = 0.1, **kwargs):
super().__init__(**kwargs)
self.num_heads = num_heads
self.key_dim = key_dim
self.ff_dim = ff_dim
self.dropout = dropout
# Multi-head attention
self.mha = layers.MultiHeadAttention(
num_heads=num_heads,
key_dim=key_dim,
dropout=dropout
)
# Feed-forward network
self.ffn = tf.keras.Sequential([
layers.Dense(ff_dim, activation="gelu"),
layers.Dropout(dropout),
layers.Dense(key_dim)
])
# Layer normalization
self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
# Dropout layers
self.dropout1 = layers.Dropout(dropout)
self.dropout2 = layers.Dropout(dropout)
def call(self, inputs: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
"""
Forward pass del blocco transformer.
Parameters
----------
inputs : tf.Tensor
Dati di input
training : bool, optional
Flag di training
Returns
-------
tf.Tensor
Output del blocco transformer
"""
# Multi-head attention
attn_output = self.mha(inputs, inputs)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
# Feed-forward network
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
def get_config(self) -> dict:
config = super().get_config()
config.update({
"num_heads": self.num_heads,
"key_dim": self.key_dim,
"ff_dim": self.ff_dim,
"dropout": self.dropout
})
return config
def create_olive_oil_transformer(
temporal_shape: Tuple[int, int],
static_shape: Tuple[int],
num_outputs: int,
d_model: int = 128,
num_heads: int = 8,
ff_dim: int = 256,
num_transformer_blocks: int = 4,
mlp_units: List[int] = [256, 128, 64],
dropout: float = 0.2
) -> tf.keras.Model:
"""
Crea un transformer per la predizione della produzione di olio d'oliva.
Parameters
----------
temporal_shape : tuple
Shape dei dati temporali (timesteps, features)
static_shape : tuple
Shape dei dati statici (features,)
num_outputs : int
Numero di output del modello
d_model : int
Dimensionalità del modello
num_heads : int
Numero di teste di attenzione
ff_dim : int
Dimensione del feed-forward network
num_transformer_blocks : int
Numero di blocchi transformer
mlp_units : list
Unità nei layer MLP
dropout : float
Tasso di dropout
Returns
-------
tf.keras.Model
Modello transformer configurato
"""
# Input layers
temporal_input = layers.Input(shape=temporal_shape, name='temporal')
static_input = layers.Input(shape=static_shape, name='static')
# === TEMPORAL PATH ===
x = layers.LayerNormalization(epsilon=1e-6)(temporal_input)
x = DataAugmentation()(x)
# Temporal projection
x = layers.Dense(d_model // 2, activation='gelu',
kernel_regularizer=tf.keras.regularizers.l2(1e-5))(x)
x = layers.Dropout(dropout)(x)
x = layers.Dense(d_model, activation='gelu',
kernel_regularizer=tf.keras.regularizers.l2(1e-5))(x)
# Positional encoding
x = PositionalEncoding(d_model)(x)
# Transformer blocks
skip_connection = x
for _ in range(num_transformer_blocks):
x = OliveTransformerBlock(num_heads, d_model, ff_dim, dropout)(x)
# Add final skip connection
x = layers.Add()([x, skip_connection])
# Temporal pooling
attention_pooled = layers.MultiHeadAttention(
num_heads=num_heads,
key_dim=d_model // 4
)(x, x)
attention_pooled = layers.GlobalAveragePooling1D()(attention_pooled)
# Additional pooling operations
avg_pooled = layers.GlobalAveragePooling1D()(x)
max_pooled = layers.GlobalMaxPooling1D()(x)
# Combine pooling results
temporal_features = layers.Concatenate()([attention_pooled, avg_pooled, max_pooled])
# === STATIC PATH ===
static_features = layers.LayerNormalization(epsilon=1e-6)(static_input)
for units in [256, 128, 64]:
static_features = layers.Dense(
units,
activation='gelu',
kernel_regularizer=tf.keras.regularizers.l2(1e-5)
)(static_features)
static_features = layers.Dropout(dropout)(static_features)
# === FEATURE FUSION ===
combined = layers.Concatenate()([temporal_features, static_features])
# === MLP HEAD ===
x = combined
for units in mlp_units:
x = layers.BatchNormalization()(x)
x = layers.Dense(
units,
activation="gelu",
kernel_regularizer=tf.keras.regularizers.l2(1e-5)
)(x)
x = layers.Dropout(dropout)(x)
# Output layer
outputs = layers.Dense(
num_outputs,
activation='linear',
kernel_regularizer=tf.keras.regularizers.l2(1e-5)
)(x)
# Create model
model = tf.keras.Model(
inputs={'temporal': temporal_input, 'static': static_input},
outputs=outputs,
name='OliveOilTransformer'
)
return model
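A build sketch for the transformer with assumed dimensions (36 monthly timesteps with 20 weather features, 10 static agronomic features, and the 5 targets listed in training.py):

model = create_olive_oil_transformer(
    temporal_shape=(36, 20),
    static_shape=(10,),
    num_outputs=5,
)
model.summary()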

1862
src/olive-oil-dashboard.py Normal file

File diff suppressed because it is too large Load Diff

36
src/olive_config.json Normal file
View File

@ -0,0 +1,36 @@
{
"oliveto": {
"hectares": 10,
"varieties": [
{
"variety": "Nocellara dell'Etna",
"technique": "Tradizionale",
"percentage": 70
},
{
"variety": "Frantoio",
"technique": "Tradizionale",
"percentage": 30
}
]
},
"costs": {
"fixed": {
"ammortamento": 2000,
"assicurazione": 500,
"manutenzione": 800
},
"variable": {
"raccolta": 0.35,
"potatura": 600,
"fertilizzanti": 400
},
"transformation": {
"molitura": 0.15,
"stoccaggio": 0.2,
"bottiglia": 1.2,
"etichettatura": 0.3
},
"selling_price": 12
}
}
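For reference, a small Python sketch (not part of this commit, assumed to run from the repository root) that loads the configuration and derives per-variety hectares from the percentages:

import json

with open('src/olive_config.json') as f:
    config = json.load(f)

total_ha = config['oliveto']['hectares']
for v in config['oliveto']['varieties']:
    print(f"{v['variety']}: {total_ha * v['percentage'] / 100:.1f} ha ({v['technique']})")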

502
src/training-notebook.ipynb Normal file
View File

@ -0,0 +1,502 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Training a Model to Forecast Olive Oil Production\n",
"\n",
"This notebook uses the modularized functions to:\n",
"1. Load and preprocess the weather data\n",
"2. Prepare the data for training\n",
"3. Configure and train the model"
]
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-07T17:19:09.011001Z",
"start_time": "2024-11-07T17:18:13.513Z"
}
},
"cell_type": "code",
"source": [
"#!apt-get update\n",
"#!apt-get install graphviz -y\n",
"\n",
"!pip install tensorflow\n",
"!pip install numpy\n",
"!pip install pandas\n",
"\n",
"!pip install keras\n",
"!pip install scikit-learn\n",
"!pip install matplotlib\n",
"!pip install joblib\n",
"!pip install pyarrow\n",
"!pip install fastparquet\n",
"!pip install scipy\n",
"!pip install seaborn\n",
"!pip install tqdm\n",
"!pip install pydot\n",
"!pip install tensorflow-io\n",
"!pip install pvlib"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: tensorflow in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (2.16.2)\r\n",
"Requirement already satisfied: absl-py>=1.0.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (2.1.0)\r\n",
"Requirement already satisfied: astunparse>=1.6.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (1.6.3)\r\n",
"Requirement already satisfied: flatbuffers>=23.5.26 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (24.3.25)\r\n",
"Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (0.4.0)\r\n",
"Requirement already satisfied: google-pasta>=0.1.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (0.2.0)\r\n",
"Requirement already satisfied: h5py>=3.10.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (3.11.0)\r\n",
"Requirement already satisfied: libclang>=13.0.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (18.1.1)\r\n",
"Requirement already satisfied: ml-dtypes~=0.3.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (0.3.2)\r\n",
"Requirement already satisfied: opt-einsum>=2.3.2 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (3.3.0)\r\n",
"Requirement already satisfied: packaging in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (24.1)\r\n",
"Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (3.20.3)\r\n",
"Requirement already satisfied: requests<3,>=2.21.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (2.32.3)\r\n",
"Requirement already satisfied: setuptools in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (72.1.0)\r\n",
"Requirement already satisfied: six>=1.12.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (1.16.0)\r\n",
"Requirement already satisfied: termcolor>=1.1.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (2.1.0)\r\n",
"Requirement already satisfied: typing-extensions>=3.6.6 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (4.11.0)\r\n",
"Requirement already satisfied: wrapt>=1.11.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (1.14.1)\r\n",
"Requirement already satisfied: grpcio<2.0,>=1.24.3 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (1.48.2)\r\n",
"Requirement already satisfied: tensorboard<2.17,>=2.16 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (2.16.2)\r\n",
"Requirement already satisfied: keras>=3.0.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (3.5.0)\r\n",
"Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (0.37.1)\r\n",
"Requirement already satisfied: numpy<2.0.0,>=1.23.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (1.23.5)\r\n",
"Requirement already satisfied: wheel<1.0,>=0.23.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from astunparse>=1.6.0->tensorflow) (0.43.0)\r\n",
"Requirement already satisfied: rich in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras>=3.0.0->tensorflow) (13.8.0)\r\n",
"Requirement already satisfied: namex in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras>=3.0.0->tensorflow) (0.0.8)\r\n",
"Requirement already satisfied: optree in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras>=3.0.0->tensorflow) (0.12.1)\r\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests<3,>=2.21.0->tensorflow) (3.3.2)\r\n",
"Requirement already satisfied: idna<4,>=2.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests<3,>=2.21.0->tensorflow) (3.7)\r\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests<3,>=2.21.0->tensorflow) (2.2.2)\r\n",
"Requirement already satisfied: certifi>=2017.4.17 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests<3,>=2.21.0->tensorflow) (2024.8.30)\r\n",
"Requirement already satisfied: markdown>=2.6.8 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorboard<2.17,>=2.16->tensorflow) (3.4.1)\r\n",
"Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorboard<2.17,>=2.16->tensorflow) (0.7.0)\r\n",
"Requirement already satisfied: werkzeug>=1.0.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorboard<2.17,>=2.16->tensorflow) (3.0.3)\r\n",
"Requirement already satisfied: importlib-metadata>=4.4 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from markdown>=2.6.8->tensorboard<2.17,>=2.16->tensorflow) (7.0.1)\r\n",
"Requirement already satisfied: MarkupSafe>=2.1.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from werkzeug>=1.0.1->tensorboard<2.17,>=2.16->tensorflow) (2.1.3)\r\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from rich->keras>=3.0.0->tensorflow) (3.0.0)\r\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from rich->keras>=3.0.0->tensorflow) (2.15.1)\r\n",
"Requirement already satisfied: zipp>=0.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard<2.17,>=2.16->tensorflow) (3.17.0)\r\n",
"Requirement already satisfied: mdurl~=0.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from markdown-it-py>=2.2.0->rich->keras>=3.0.0->tensorflow) (0.1.2)\r\n",
"Requirement already satisfied: numpy in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (1.23.5)\r\n",
"Requirement already satisfied: pandas in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (2.2.2)\r\n",
"Requirement already satisfied: numpy>=1.22.4 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas) (1.23.5)\r\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas) (2.9.0.post0)\r\n",
"Requirement already satisfied: pytz>=2020.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas) (2024.1)\r\n",
"Requirement already satisfied: tzdata>=2022.7 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas) (2023.3)\r\n",
"Requirement already satisfied: six>=1.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\r\n",
"Requirement already satisfied: keras in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (3.5.0)\r\n",
"Requirement already satisfied: absl-py in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (2.1.0)\r\n",
"Requirement already satisfied: numpy in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (1.23.5)\r\n",
"Requirement already satisfied: rich in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (13.8.0)\r\n",
"Requirement already satisfied: namex in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (0.0.8)\r\n",
"Requirement already satisfied: h5py in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (3.11.0)\r\n",
"Requirement already satisfied: optree in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (0.12.1)\r\n",
"Requirement already satisfied: ml-dtypes in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (0.3.2)\r\n",
"Requirement already satisfied: packaging in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (24.1)\r\n",
"Requirement already satisfied: typing-extensions>=4.5.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from optree->keras) (4.11.0)\r\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from rich->keras) (3.0.0)\r\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from rich->keras) (2.15.1)\r\n",
"Requirement already satisfied: mdurl~=0.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from markdown-it-py>=2.2.0->rich->keras) (0.1.2)\r\n",
"Requirement already satisfied: scikit-learn in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (1.5.1)\r\n",
"Requirement already satisfied: numpy>=1.19.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from scikit-learn) (1.23.5)\r\n",
"Requirement already satisfied: scipy>=1.6.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from scikit-learn) (1.11.4)\r\n",
"Requirement already satisfied: joblib>=1.2.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from scikit-learn) (1.4.2)\r\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from scikit-learn) (3.5.0)\r\n",
"Requirement already satisfied: matplotlib in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (3.8.4)\r\n",
"Requirement already satisfied: contourpy>=1.0.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (1.2.0)\r\n",
"Requirement already satisfied: cycler>=0.10 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (0.11.0)\r\n",
"Requirement already satisfied: fonttools>=4.22.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (4.51.0)\r\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (1.4.4)\r\n",
"Requirement already satisfied: numpy>=1.21 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (1.23.5)\r\n",
"Requirement already satisfied: packaging>=20.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (24.1)\r\n",
"Requirement already satisfied: pillow>=8 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (10.4.0)\r\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (3.0.9)\r\n",
"Requirement already satisfied: python-dateutil>=2.7 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (2.9.0.post0)\r\n",
"Requirement already satisfied: importlib-resources>=3.2.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (6.4.0)\r\n",
"Requirement already satisfied: zipp>=3.1.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from importlib-resources>=3.2.0->matplotlib) (3.17.0)\r\n",
"Requirement already satisfied: six>=1.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\r\n",
"Requirement already satisfied: joblib in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (1.4.2)\r\n",
"Requirement already satisfied: pyarrow in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (17.0.0)\r\n",
"Requirement already satisfied: numpy>=1.16.6 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pyarrow) (1.23.5)\r\n",
"Requirement already satisfied: fastparquet in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (2024.5.0)\r\n",
"Requirement already satisfied: pandas>=1.5.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from fastparquet) (2.2.2)\r\n",
"Requirement already satisfied: numpy in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from fastparquet) (1.23.5)\r\n",
"Requirement already satisfied: cramjam>=2.3 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from fastparquet) (2.8.3)\r\n",
"Requirement already satisfied: fsspec in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from fastparquet) (2024.6.1)\r\n",
"Requirement already satisfied: packaging in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from fastparquet) (24.1)\r\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas>=1.5.0->fastparquet) (2.9.0.post0)\r\n",
"Requirement already satisfied: pytz>=2020.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas>=1.5.0->fastparquet) (2024.1)\r\n",
"Requirement already satisfied: tzdata>=2022.7 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas>=1.5.0->fastparquet) (2023.3)\r\n",
"Requirement already satisfied: six>=1.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas>=1.5.0->fastparquet) (1.16.0)\r\n",
"Requirement already satisfied: scipy in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (1.11.4)\r\n",
"Requirement already satisfied: numpy<1.28.0,>=1.21.6 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from scipy) (1.23.5)\r\n",
"Requirement already satisfied: seaborn in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (0.13.2)\r\n",
"Requirement already satisfied: numpy!=1.24.0,>=1.20 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from seaborn) (1.23.5)\r\n",
"Requirement already satisfied: pandas>=1.2 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from seaborn) (2.2.2)\r\n",
"Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from seaborn) (3.8.4)\r\n",
"Requirement already satisfied: contourpy>=1.0.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.2.0)\r\n",
"Requirement already satisfied: cycler>=0.10 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.11.0)\r\n",
"Requirement already satisfied: fonttools>=4.22.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.51.0)\r\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.4)\r\n",
"Requirement already satisfied: packaging>=20.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (24.1)\r\n",
"Requirement already satisfied: pillow>=8 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (10.4.0)\r\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.0.9)\r\n",
"Requirement already satisfied: python-dateutil>=2.7 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0)\r\n",
"Requirement already satisfied: importlib-resources>=3.2.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (6.4.0)\r\n",
"Requirement already satisfied: pytz>=2020.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas>=1.2->seaborn) (2024.1)\r\n",
"Requirement already satisfied: tzdata>=2022.7 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas>=1.2->seaborn) (2023.3)\r\n",
"Requirement already satisfied: zipp>=3.1.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from importlib-resources>=3.2.0->matplotlib!=3.6.1,>=3.4->seaborn) (3.17.0)\r\n",
"Requirement already satisfied: six>=1.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.16.0)\r\n",
"Collecting tqdm\r\n",
" Downloading tqdm-4.67.0-py3-none-any.whl.metadata (57 kB)\r\n",
"Downloading tqdm-4.67.0-py3-none-any.whl (78 kB)\r\n",
"Installing collected packages: tqdm\r\n",
"Successfully installed tqdm-4.67.0\r\n",
"Collecting pydot\r\n",
" Downloading pydot-3.0.2-py3-none-any.whl.metadata (10 kB)\r\n",
"Requirement already satisfied: pyparsing>=3.0.9 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pydot) (3.0.9)\r\n",
"Downloading pydot-3.0.2-py3-none-any.whl (35 kB)\r\n",
"Installing collected packages: pydot\r\n",
"Successfully installed pydot-3.0.2\r\n",
"Collecting tensorflow-io\r\n",
" Downloading tensorflow_io-0.37.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (14 kB)\r\n",
"Requirement already satisfied: tensorflow-io-gcs-filesystem==0.37.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow-io) (0.37.1)\r\n",
"Downloading tensorflow_io-0.37.1-cp39-cp39-macosx_12_0_arm64.whl (31.8 MB)\r\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m31.8/31.8 MB\u001B[0m \u001B[31m1.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m0m\r\n",
"\u001B[?25hInstalling collected packages: tensorflow-io\r\n",
"Successfully installed tensorflow-io-0.37.1\r\n",
"Collecting pvlib\r\n",
" Downloading pvlib-0.11.1-py3-none-any.whl.metadata (2.8 kB)\r\n",
"Requirement already satisfied: numpy>=1.19.3 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pvlib) (1.23.5)\r\n",
"Requirement already satisfied: pandas>=1.3.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pvlib) (2.2.2)\r\n",
"Requirement already satisfied: pytz in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pvlib) (2024.1)\r\n",
"Requirement already satisfied: requests in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pvlib) (2.32.3)\r\n",
"Requirement already satisfied: scipy>=1.6.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pvlib) (1.11.4)\r\n",
"Requirement already satisfied: h5py in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pvlib) (3.11.0)\r\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas>=1.3.0->pvlib) (2.9.0.post0)\r\n",
"Requirement already satisfied: tzdata>=2022.7 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas>=1.3.0->pvlib) (2023.3)\r\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests->pvlib) (3.3.2)\r\n",
"Requirement already satisfied: idna<4,>=2.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests->pvlib) (3.7)\r\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests->pvlib) (2.2.2)\r\n",
"Requirement already satisfied: certifi>=2017.4.17 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests->pvlib) (2024.8.30)\r\n",
"Requirement already satisfied: six>=1.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas>=1.3.0->pvlib) (1.16.0)\r\n",
"Downloading pvlib-0.11.1-py3-none-any.whl (29.5 MB)\r\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m29.5/29.5 MB\u001B[0m \u001B[31m2.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n",
"\u001B[?25hInstalling collected packages: pvlib\r\n",
"Successfully installed pvlib-0.11.1\r\n"
]
}
],
"execution_count": 1
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"import tensorflow as tf\n",
"import keras\n",
"\n",
"print(f\"Keras version: {keras.__version__}\")\n",
"print(f\"TensorFlow version: {tf.__version__}\")\n",
"print(f\"TensorFlow version: {tf.__version__}\")\n",
"print(f\"CUDA available: {tf.test.is_built_with_cuda()}\")\n",
"print(f\"GPU devices: {tf.config.list_physical_devices('GPU')}\")\n",
"\n",
"# GPU configuration\n",
"gpus = tf.config.experimental.list_physical_devices('GPU')\n",
"if gpus:\n",
" try:\n",
" for gpu in gpus:\n",
" tf.config.experimental.set_memory_growth(gpu, True)\n",
" logical_gpus = tf.config.experimental.list_logical_devices('GPU')\n",
" print(len(gpus), \"Physical GPUs,\", len(logical_gpus), \"Logical GPUs\")\n",
" except RuntimeError as e:\n",
" print(e)"
]
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Test semplice per verificare che la GPU funzioni\n",
"def test_gpu():\n",
" print(\"TensorFlow version:\", tf.__version__)\n",
" print(\"\\nDispositivi disponibili:\")\n",
" print(tf.config.list_physical_devices())\n",
"\n",
" # Creiamo e moltiplichiamo due tensori sulla GPU\n",
" with tf.device('/GPU:0'):\n",
" a = tf.random.normal([10000, 10000])\n",
" b = tf.random.normal([10000, 10000])\n",
" c = tf.matmul(a, b)\n",
"\n",
" print(\"\\nShape del risultato:\", c.shape)\n",
" print(\"Device del tensore:\", c.device)\n",
" return \"Test completato con successo!\"\n",
"\n",
"\n",
"test_gpu()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Imports necessari\n",
"from src.data.data_loader import load_weather_data, load_olive_varieties\n",
"from src.data.data_processor import prepare_solar_data, prepare_transformer_data\n",
"from src.features.weather_features import add_solar_features, add_environmental_features\n",
"from src.features.temporal_features import add_time_features\n",
"from src.models.training import train_transformer, setup_transformer_training\n",
"from src.utils.helpers import get_optimal_workers\n",
"from src.visualization.plots import plot_correlation_matrix\n",
"import pandas as pd\n",
"import os"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Caricamento e Preparazione Dati"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"random_state_value = 42\n",
"\n",
"base_dir = './kaggle'\n",
"input_dir = f'{base_dir}/input'\n",
"working_dir = f'{base_dir}/working'\n",
"working_data_dir = f'{working_dir}/data'\n",
"data_models_dir = f'{working_data_dir}/models'\n",
"\n",
"os.makedirs(working_dir, exist_ok=True)\n",
"os.makedirs(working_data_dir, exist_ok=True)\n",
"os.makedirs(data_models_dir, exist_ok=True)\n",
"\n",
"# Carica i dati meteorologici\n",
"weather_data = load_weather_data(\n",
" f'{input_dir}/olive-oil/weather_data.parquet',\n",
" start_year=2010\n",
")\n",
"\n",
"# Carica i dati delle varietà di olive\n",
"olive_varieties = load_olive_varieties(\n",
" f'{input_dir}/olive-oil/variety_olive_oil_production.csv'\n",
")\n",
"\n",
"print(f\"Shape dati meteo: {weather_data.shape}\")\n",
"print(f\"Shape dati olive: {olive_varieties.shape}\")"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Feature Engineering"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Aggiungi feature temporali\n",
"weather_data = add_time_features(weather_data)\n",
"\n",
"# Aggiungi feature solari e ambientali\n",
"weather_data = add_solar_features(weather_data)\n",
"weather_data = add_environmental_features(weather_data)\n",
"\n",
"# Definisci le feature da utilizzare\n",
"features = [\n",
" 'temp', 'tempmin', 'tempmax', 'humidity', 'cloudcover',\n",
" 'windspeed', 'pressure', 'visibility',\n",
" 'hour_sin', 'hour_cos', 'month_sin', 'month_cos',\n",
" 'day_of_year_sin', 'day_of_year_cos',\n",
" 'temp_humidity', 'temp_cloudcover', 'visibility_cloudcover',\n",
" 'clear_sky_factor', 'day_length',\n",
" 'temp_1h_lag', 'cloudcover_1h_lag', 'humidity_1h_lag',\n",
" 'temp_rolling_mean_6h', 'cloudcover_rolling_mean_6h'\n",
" ] + [col for col in weather_data.columns if 'season_' in col or 'time_period_' in col]\n",
"\n",
"print(f\"Numero totale di feature: {len(features)}\")"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Analisi delle Correlazioni"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Analizza correlazioni tra feature\n",
"plot_correlation_matrix(\n",
" weather_data[features + ['solarradiation', 'solarenergy', 'uvindex']],\n",
" title='Correlazioni tra Feature Meteorologiche'\n",
")"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Preparazione Dati per il Training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Prepara i dati per il modello\n",
"X_scaled, scaler_X, y_scaled, scaler_y, data_after_2010 = prepare_solar_data(\n",
" weather_data,\n",
" features\n",
")\n",
"\n",
"# Prepara i dati per il transformer\n",
"(train_data, train_targets), (val_data, val_targets), (test_data, test_targets), scalers = prepare_transformer_data(\n",
" data_after_2010, olive_varieties)\n",
"\n",
"print(\"\\nShape dei dati:\")\n",
"print(f\"Training - Temporal: {train_data['temporal'].shape}, Static: {train_data['static'].shape}\")\n",
"print(f\"Validation - Temporal: {val_data['temporal'].shape}, Static: {val_data['static'].shape}\")\n",
"print(f\"Test - Temporal: {test_data['temporal'].shape}, Static: {test_data['static'].shape}\")"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Training del Modello"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Training del transformer\n",
"model, history = train_transformer(\n",
" train_data=train_data,\n",
" train_targets=train_targets,\n",
" val_data=val_data,\n",
" val_targets=val_targets,\n",
" epochs=150,\n",
" batch_size=64,\n",
" save_name='weather_transformer'\n",
")"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Valutazione del Modello"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"from src.utils.metrics import calculate_real_error, evaluate_model_performance\n",
"\n",
"# Calcola gli errori reali\n",
"percentage_errors, absolute_errors = calculate_real_error(\n",
" model,\n",
" test_data,\n",
" test_targets,\n",
" scaler_y,\n",
" target_names=['solarradiation', 'solarenergy', 'uvindex']\n",
")\n",
"\n",
"# Valuta le performance del modello\n",
"metrics = evaluate_model_performance(\n",
" model,\n",
" test_data,\n",
" test_targets,\n",
" 'test'\n",
")"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Visualizzazione dei Risultati"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"from src.visualization.plots import (\n",
" plot_production_trends,\n",
" plot_correlation_matrix\n",
")\n",
"\n",
"# Plot dei trend di produzione\n",
"predictions = model.predict(test_data)\n",
"predictions_real = scaler_y.inverse_transform(predictions)\n",
"\n",
"# Crea DataFrame con predizioni\n",
"results_df = pd.DataFrame(\n",
" predictions_real,\n",
" columns=['solarradiation', 'solarenergy', 'uvindex']\n",
")\n",
"\n",
"# Plot delle correlazioni tra predizioni\n",
"plot_correlation_matrix(\n",
" results_df,\n",
" title='Correlazioni tra Predizioni'\n",
")\n",
"\n",
"# Plot dei trend temporali\n",
"plot_production_trends(results_df)"
],
"outputs": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

215
src/utils/helpers.py Normal file
View File

@ -0,0 +1,215 @@
import psutil
import multiprocessing
import re
import pandas as pd
from typing import List
def get_optimal_workers() -> int:
"""
Calcola il numero ottimale di workers basandosi sulle risorse del sistema.
Returns
-------
int
Numero ottimale di workers
"""
    # Get the number of logical CPUs (including virtual threads)
    cpu_count = multiprocessing.cpu_count()
    # Get total and available memory in GB
    memory = psutil.virtual_memory()
    total_memory_gb = memory.total / (1024 ** 3)
    available_memory_gb = memory.available / (1024 ** 3)
    # Estimated memory required per worker (example: 2 GB per worker)
    memory_per_worker_gb = 2
    # Maximum number of workers allowed by the available memory
    max_workers_by_memory = int(available_memory_gb / memory_per_worker_gb)
    # Use the minimum of:
    # - number of available CPUs - 1 (leave one CPU free for the system)
    # - maximum number of workers allowed by memory
    # - an arbitrary upper bound (e.g. 32) to avoid excessive overhead
    optimal_workers = min(
        cpu_count - 1,
        max_workers_by_memory,
        32  # arbitrary upper bound
    )
    # Ensure at least one worker
    return max(1, optimal_workers)
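

# Usage sketch (illustrative): the helper can size a process pool for CPU-bound
# preprocessing, e.g.
#
#     from multiprocessing import Pool
#     with Pool(processes=get_optimal_workers()) as pool:
#         results = pool.map(some_preprocessing_function, chunks)
#
# where `some_preprocessing_function` and `chunks` are hypothetical placeholders.
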
def clean_column_name(name: str) -> str:
"""
Rimuove caratteri speciali e spazi, converte in snake_case e abbrevia.
Parameters
----------
name : str
Nome della colonna da pulire
Returns
-------
str
Nome della colonna pulito
"""
    # Remove special characters
    name = re.sub(r'[^a-zA-Z0-9\s]', '', name)
    # Convert to snake_case
    name = name.lower().replace(' ', '_')
    # Common abbreviations
abbreviations = {
'production': 'prod',
'percentage': 'pct',
'hectare': 'ha',
'tonnes': 't',
'litres': 'l',
'minimum': 'min',
'maximum': 'max',
'average': 'avg'
}
for full, abbr in abbreviations.items():
name = name.replace(full, abbr)
return name
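
# Worked example (illustrative): clean_column_name('Min Production (tonnes/hectare)')
# strips the special characters, lowercases and snake_cases the result, and then
# applies the abbreviation map, yielding 'min_prod_tha'.
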
def clean_column_names(df: pd.DataFrame) -> List[str]:
"""
Pulisce tutti i nomi delle colonne in un DataFrame.
Parameters
----------
df : pd.DataFrame
DataFrame con le colonne da pulire
Returns
-------
list
Lista dei nuovi nomi delle colonne puliti
"""
new_columns = []
for col in df.columns:
        # Use a regex to separate the varieties
varieties = re.findall(r'([a-z]+)_([a-z_]+)', col)
if varieties:
new_columns.append(f"{varieties[0][0]}_{varieties[0][1]}")
else:
new_columns.append(col)
return new_columns
def to_camel_case(text: str) -> str:
"""
Converte una stringa in camelCase.
Gestisce stringhe con spazi, trattini o underscore.
Se è una sola parola, la restituisce in minuscolo.
Parameters
----------
text : str
Testo da convertire
Returns
-------
str
Testo convertito in camelCase
"""
    # Remove leading and trailing whitespace
    text = text.strip()
    # If the string is empty, return an empty string
    if not text:
        return ""
    # Replace hyphens and underscores with spaces
    text = text.replace('-', ' ').replace('_', ' ')
    # Split the string into words
    words = text.split()
    # If there are no words after the split, return an empty string
    if not words:
        return ""
    # If there is a single word, return it in lowercase
    if len(words) == 1:
        return words[0].lower()
    # Otherwise build the camelCase string
    result = words[0].lower()
    for word in words[1:]:
        result += word.capitalize()
    return result
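
# Illustrative examples: to_camel_case("olive production zone") returns
# "oliveProductionZone", while to_camel_case("Frantoio") returns "frantoio".
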
def get_full_data(simulated_data: pd.DataFrame,
olive_varieties: pd.DataFrame) -> pd.DataFrame:
"""
Ottiene il dataset completo combinando dati simulati e varietà di olive.
Parameters
----------
simulated_data : pd.DataFrame
DataFrame con i dati simulati
olive_varieties : pd.DataFrame
DataFrame con le informazioni sulle varietà
Returns
-------
pd.DataFrame
DataFrame completo con tutte le informazioni
"""
    # Relevant base columns
relevant_columns = [
'year', 'temp_mean', 'precip_sum', 'solar_energy_sum',
'ha', 'zone', 'olive_prod'
]
    # Add variety-specific columns
all_varieties = olive_varieties['Varietà di Olive'].unique()
varieties = [clean_column_name(variety) for variety in all_varieties]
for variety in varieties:
relevant_columns.extend([
f'{variety}_olive_prod',
f'{variety}_tech'
])
    # Select only the relevant columns
    full_data = simulated_data[relevant_columns].copy()
    # Add computed features
    for variety in varieties:
        # Compute production efficiency
if f'{variety}_olive_prod' in full_data.columns:
full_data[f'{variety}_efficiency'] = (
full_data[f'{variety}_olive_prod'] / full_data['ha']
)
        # Add cultivation technique indicators
if f'{variety}_tech' in full_data.columns:
technique_dummies = pd.get_dummies(
full_data[f'{variety}_tech'],
prefix=f'{variety}_technique'
)
full_data = pd.concat([full_data, technique_dummies], axis=1)
    # Add temporal features
    full_data['month'] = 1  # Data are assumed to be annual
    full_data['day'] = 1  # Data are assumed to be annual
    # Compute rolling means
for col in ['temp_mean', 'precip_sum', 'solar_energy_sum']:
full_data[f'{col}_ma3'] = full_data[col].rolling(window=3, min_periods=1).mean()
full_data[f'{col}_ma5'] = full_data[col].rolling(window=5, min_periods=1).mean()
return full_data

282
src/utils/metrics.py Normal file
View File

@ -0,0 +1,282 @@
import numpy as np
from typing import Dict, Tuple, List, Optional
from scipy import stats
def calculate_real_error(
model,
test_data: Dict,
test_targets: np.ndarray,
scaler_y,
target_names: Optional[List[str]] = None
) -> Tuple[List[float], List[float]]:
"""
Calcola l'errore reale denormalizzando le predizioni.
Parameters
----------
model : tf.keras.Model
Modello addestrato
test_data : dict
Dati di test
test_targets : np.ndarray
Target di test
scaler_y : scaler
Scaler utilizzato per normalizzare i target
target_names : list, optional
Nomi dei target
Returns
-------
tuple
(percentage_errors, absolute_errors)
"""
    # Predictions
    predictions = model.predict(test_data)
    # Denormalize predictions and targets
    predictions_real = scaler_y.inverse_transform(predictions)
    targets_real = scaler_y.inverse_transform(test_targets)
    # Percentage and absolute errors
    percentage_errors = []
    absolute_errors = []
    if target_names is None:
        target_names = [f'target_{i}' for i in range(predictions_real.shape[1])]
    # Compute errors for each target
    for i in range(predictions_real.shape[1]):
        mae = np.mean(np.abs(predictions_real[:, i] - targets_real[:, i]))
        # Small epsilon in the denominator avoids division by zero (e.g. night-time radiation)
        mape = np.mean(np.abs((predictions_real[:, i] - targets_real[:, i]) / (targets_real[:, i] + 1e-7))) * 100
        percentage_errors.append(mape)
        absolute_errors.append(mae)
        print(f"\n{target_names[i]}:")
        print(f"Absolute MAE: {mae:.2f}")
        print(f"Mean percentage error: {mape:.2f}%")
        print(f"Accuracy: {100 - mape:.2f}%")
        print("-" * 50)
    return percentage_errors, absolute_errors
def evaluate_model_performance(
model,
data: Dict,
targets: np.ndarray,
set_name: str = "",
threshold: Optional[float] = None
) -> Dict:
"""
Valuta le performance del modello su un set di dati.
Parameters
----------
model : tf.keras.Model
Modello da valutare
data : dict
Dati di input
targets : np.ndarray
Target reali
set_name : str
Nome del set di dati
threshold : float, optional
Soglia per calcolare accuracy binaria
Returns
-------
dict
Dizionario con le metriche calcolate
"""
predictions = model.predict(data, verbose=0)
metrics = {}
target_names = ['olive_prod', 'min_oil_prod', 'max_oil_prod', 'avg_oil_prod', 'total_water_need']
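    # Note: these names assume the five-output production model; for models with a
    # different output layout (e.g. the three solar-radiation targets used in the
    # notebook) this list would need to be adapted or passed in as a parameter.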
for i, name in enumerate(target_names):
        # Basic metrics
mae = np.mean(np.abs(targets[:, i] - predictions[:, i]))
mse = np.mean(np.square(targets[:, i] - predictions[:, i]))
rmse = np.sqrt(mse)
mape = np.mean(np.abs((targets[:, i] - predictions[:, i]) / (targets[:, i] + 1e-7))) * 100
# R2 score
ss_res = np.sum(np.square(targets[:, i] - predictions[:, i]))
ss_tot = np.sum(np.square(targets[:, i] - np.mean(targets[:, i])))
r2 = 1 - (ss_res / (ss_tot + 1e-7))
        # Store the metrics
metrics[f"{name}_mae"] = mae
metrics[f"{name}_rmse"] = rmse
metrics[f"{name}_mape"] = mape
metrics[f"{name}_r2"] = r2
        # Compute binary accuracy if a threshold is provided
if threshold is not None:
binary_acc = np.mean(
(predictions[:, i] > threshold) == (targets[:, i] > threshold)
)
metrics[f"{name}_binary_acc"] = binary_acc
if set_name:
print(f"\nPerformance sul set {set_name}:")
for metric, value in metrics.items():
print(f"{metric}: {value:.4f}")
return metrics
def calculate_efficiency_metrics(
predictions: np.ndarray,
targets: np.ndarray,
resource_usage: np.ndarray
) -> Dict:
"""
Calcola metriche di efficienza basate sull'utilizzo delle risorse.
Parameters
----------
predictions : np.ndarray
Predizioni del modello
targets : np.ndarray
Target reali
resource_usage : np.ndarray
Dati sull'utilizzo delle risorse
Returns
-------
dict
Metriche di efficienza
"""
metrics = {}
    # Production efficiency
    production_efficiency = predictions / (resource_usage + 1e-7)
    target_efficiency = targets / (resource_usage + 1e-7)
    # Compute metrics
metrics['mean_efficiency'] = np.mean(production_efficiency)
metrics['efficiency_error'] = np.mean(np.abs(production_efficiency - target_efficiency))
metrics['efficiency_std'] = np.std(production_efficiency)
    # Estimated ROI
estimated_roi = (predictions - resource_usage) / (resource_usage + 1e-7)
actual_roi = (targets - resource_usage) / (resource_usage + 1e-7)
metrics['roi_error'] = np.mean(np.abs(estimated_roi - actual_roi))
    # Sustainability
    metrics['resource_utilization'] = np.mean(predictions / (resource_usage + 1e-7))
metrics['efficiency_improvement'] = (
np.mean(production_efficiency) - np.mean(target_efficiency)
) / np.mean(target_efficiency) * 100
return metrics
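
# Worked example (illustrative numbers): with a predicted production of 5000 kg,
# an actual production of 4800 kg and a water usage of 1000 m³, the predicted
# efficiency is 5.0 kg/m³, the target efficiency is 4.8 kg/m³, and that sample
# contributes 0.2 kg/m³ to the efficiency_error.
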
def calculate_forecast_accuracy(
predictions: np.ndarray,
targets: np.ndarray,
horizons: List[int]
) -> Dict:
"""
Calcola l'accuratezza delle previsioni per diversi orizzonti temporali.
Parameters
----------
predictions : np.ndarray
Predizioni del modello
targets : np.ndarray
Target reali
horizons : list
Lista degli orizzonti temporali da valutare
Returns
-------
dict
Accuratezza per ogni orizzonte
"""
accuracy_metrics = {}
for horizon in horizons:
        # Select data for the current horizon
        pred_horizon = predictions[:-horizon]
        target_horizon = targets[horizon:]
        # Compute metrics
        mae = np.mean(np.abs(pred_horizon - target_horizon))
        mape = np.mean(np.abs((pred_horizon - target_horizon) / (target_horizon + 1e-7))) * 100
        rmse = np.sqrt(np.mean(np.square(pred_horizon - target_horizon)))
        # Compute the correlation coefficient
        corr = np.corrcoef(pred_horizon.flatten(), target_horizon.flatten())[0, 1]
        # Store the metrics
accuracy_metrics[f'horizon_{horizon}'] = {
'mae': mae,
'mape': mape,
'rmse': rmse,
'correlation': corr
}
print(f"\nMetriche per orizzonte {horizon}:")
print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.2f}%")
print(f"RMSE: {rmse:.4f}")
print(f"Correlazione: {corr:.4f}")
return accuracy_metrics
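
# Note on alignment (assumption): predictions[:-horizon] is compared against
# targets[horizon:], i.e. the value predicted at time t is scored against the
# observed value at time t + horizon; the inputs are therefore expected to be
# ordered in time and aligned at horizon 0.
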
def compute_confidence_intervals(
predictions: np.ndarray,
alpha: float = 0.05,
n_bootstrap: int = 1000
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Calcola intervalli di confidenza usando bootstrap.
Parameters
----------
predictions : np.ndarray
Predizioni del modello
alpha : float
Livello di significatività
n_bootstrap : int
Numero di campioni bootstrap
Returns
-------
tuple
(lower_bound, upper_bound, mean_predictions)
"""
n_samples, n_targets = predictions.shape
bootstrap_means = np.zeros((n_bootstrap, n_targets))
# Bootstrap sampling
for i in range(n_bootstrap):
indices = np.random.randint(0, n_samples, size=n_samples)
bootstrap_sample = predictions[indices]
bootstrap_means[i] = np.mean(bootstrap_sample, axis=0)
    # Compute confidence intervals from the bootstrap percentiles
    lower_percentile = alpha / 2 * 100
    upper_percentile = (1 - alpha / 2) * 100
    lower_bound = np.percentile(bootstrap_means, lower_percentile, axis=0)
    upper_bound = np.percentile(bootstrap_means, upper_percentile, axis=0)
    mean_predictions = np.mean(predictions, axis=0)
    # Compute the margin of error using the t-distribution
    std_error = np.std(bootstrap_means, axis=0)
    t_value = stats.t.ppf(1 - alpha / 2, df=n_samples - 1)
    margin_error = t_value * std_error
    print("\nConfidence Intervals:")
    for i in range(n_targets):
        print(f"\nTarget {i + 1}:")
        print(f"Mean: {mean_predictions[i]:.4f}")
        print(f"Interval: [{lower_bound[i]:.4f}, {upper_bound[i]:.4f}]")
        print(f"Margin of error: ±{margin_error[i]:.4f}")
return lower_bound, upper_bound, mean_predictions
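

# Minimal sketch of how the bootstrap intervals can be exercised on synthetic
# predictions (illustrative only; the array shapes and values are assumptions,
# not outputs of the trained model).
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    # 500 fake samples for 3 targets, e.g. solarradiation, solarenergy, uvindex
    fake_predictions = rng.normal(loc=[200.0, 15.0, 5.0],
                                  scale=[20.0, 2.0, 1.0],
                                  size=(500, 3))
    lower, upper, mean_pred = compute_confidence_intervals(fake_predictions,
                                                           alpha=0.05,
                                                           n_bootstrap=200)
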

255
src/visualization/plots.py Normal file
View File

@ -0,0 +1,255 @@
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from typing import Optional
def save_plot(plt, title: str, output_dir: str = './kaggle/working/plots'):
    """
    Save the current plot with a formatted file name.

    Parameters
    ----------
    plt : matplotlib.pyplot
        Reference to pyplot
    title : str
        Plot title
    output_dir : str
        Output directory for the plots
    """
    os.makedirs(output_dir, exist_ok=True)
    # Clean up the file name
    filename = "".join(x for x in title if x.isalnum() or x in [' ', '-', '_']).rstrip()
    filename = filename.replace(' ', '_').lower()
    filepath = os.path.join(output_dir, f"{filename}.png")
    plt.savefig(filepath, bbox_inches='tight', dpi=300)
    print(f"Plot saved as: {filepath}")
def plot_variety_comparison(comparison_data: pd.DataFrame, metric: str):
"""
Crea un grafico a barre per confrontare le varietà di olive su una metrica specifica.
Parameters
----------
comparison_data : pd.DataFrame
DataFrame contenente i dati di confronto
metric : str
Nome della metrica da visualizzare
"""
plt.figure(figsize=(12, 6))
bars = plt.bar(comparison_data['Variety'], comparison_data[metric])
    plt.title(f'Comparison of {metric} across Olive Varieties')
    plt.xlabel('Variety')
    plt.ylabel(metric)
    plt.xticks(rotation=45, ha='right')
    # Add labels above the bars
for bar in bars:
height = bar.get_height()
plt.text(bar.get_x() + bar.get_width() / 2., height,
f'{height:.2f}',
ha='center', va='bottom')
    plt.tight_layout()
    # Save the plot before showing it, so the figure is still available to savefig
    save_plot(plt,
              f'variety_comparison_{metric.lower().replace(" ", "_").replace("/", "_").replace("(", "").replace(")", "")}')
    plt.show()
    plt.close()
def plot_efficiency_vs_production(comparison_data: pd.DataFrame):
"""
Crea uno scatter plot dell'efficienza vs produzione.
Parameters
----------
comparison_data : pd.DataFrame
DataFrame contenente i dati di confronto
"""
plt.figure(figsize=(10, 6))
plt.scatter(comparison_data['Avg Olive Production (kg/ha)'],
comparison_data['Oil Efficiency (L/kg)'],
s=100)
    # Add a label for each point
    for i, row in comparison_data.iterrows():
        plt.annotate(row['Variety'],
                     (row['Avg Olive Production (kg/ha)'], row['Oil Efficiency (L/kg)']),
                     xytext=(5, 5), textcoords='offset points')
    plt.title('Oil Efficiency vs Olive Production by Variety')
    plt.xlabel('Average Olive Production (kg/ha)')
    plt.ylabel('Oil Efficiency (L oil / kg olives)')
plt.tight_layout()
    # Save the plot
save_plot(plt, 'efficiency_vs_production')
plt.close()
def plot_water_efficiency_vs_production(comparison_data: pd.DataFrame):
"""
Crea uno scatter plot dell'efficienza idrica vs produzione.
Parameters
----------
comparison_data : pd.DataFrame
DataFrame contenente i dati di confronto
"""
plt.figure(figsize=(10, 6))
plt.scatter(comparison_data['Avg Olive Production (kg/ha)'],
comparison_data['Water Efficiency (L oil/m³ water)'],
s=100)
    # Add a label for each point
    for i, row in comparison_data.iterrows():
        plt.annotate(row['Variety'],
                     (row['Avg Olive Production (kg/ha)'],
                      row['Water Efficiency (L oil/m³ water)']),
                     xytext=(5, 5), textcoords='offset points')
    plt.title('Water Efficiency vs Olive Production by Variety')
    plt.xlabel('Average Olive Production (kg/ha)')
    plt.ylabel('Water Efficiency (L oil / m³ water)')
    plt.tight_layout()
    # Save the plot before showing it
    save_plot(plt, 'water_efficiency_vs_production')
    plt.show()
    plt.close()
def plot_water_need_vs_oil_production(comparison_data: pd.DataFrame):
"""
Crea uno scatter plot del fabbisogno idrico vs produzione di olio.
Parameters
----------
comparison_data : pd.DataFrame
DataFrame contenente i dati di confronto
"""
plt.figure(figsize=(10, 6))
plt.scatter(comparison_data['Avg Water Need (m³/ha)'],
comparison_data['Avg Oil Production (L/ha)'],
s=100)
    # Add a label for each point
    for i, row in comparison_data.iterrows():
        plt.annotate(row['Variety'],
                     (row['Avg Water Need (m³/ha)'],
                      row['Avg Oil Production (L/ha)']),
                     xytext=(5, 5), textcoords='offset points')
    plt.title('Oil Production vs Water Need by Variety')
    plt.xlabel('Average Water Need (m³/ha)')
    plt.ylabel('Average Oil Production (L/ha)')
    plt.tight_layout()
    # Save the plot before showing it
    save_plot(plt, 'water_need_vs_oil_production')
    plt.show()
    plt.close()
def plot_production_trends(data: pd.DataFrame,
variety: Optional[str] = None,
metrics: Optional[list] = None):
"""
Crea grafici di trend per le metriche di produzione.
Parameters
----------
data : pd.DataFrame
DataFrame con i dati di produzione
variety : str, optional
Varietà specifica da visualizzare
metrics : list, optional
Lista delle metriche da visualizzare
"""
if metrics is None:
metrics = ['olive_prod', 'oil_prod', 'water_need']
    # Filter by variety if one is specified
    if variety:
        data = data[data['variety'] == variety]
    # Create a subplot for each metric
fig, axes = plt.subplots(len(metrics), 1, figsize=(12, 4 * len(metrics)))
if len(metrics) == 1:
axes = [axes]
for ax, metric in zip(axes, metrics):
sns.lineplot(data=data, x='year', y=metric, ax=ax)
if variety:
            ax.set_title(f'{metric} for {variety}')
        else:
            ax.set_title(f'{metric} - All varieties')
        ax.set_xlabel('Year')
    plt.tight_layout()
    # Save the plot
    title = f'production_trends{"_" + variety if variety else ""}'
    save_plot(plt, title)
    plt.close()
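
# Assumption: `data` is expected to carry a 'year' column plus the requested metric
# columns (and a 'variety' column when filtering); the notebook's DataFrame of solar
# predictions would need those columns added before this helper can plot it.
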
def plot_correlation_matrix(data: pd.DataFrame,
variables: Optional[list] = None,
title: str = "Matrice di Correlazione"):
"""
Crea una matrice di correlazione con heatmap.
Parameters
----------
data : pd.DataFrame
DataFrame con i dati
variables : list, optional
Lista delle variabili da includere
title : str
Titolo del plot
"""
if variables:
corr_matrix = data[variables].corr()
else:
corr_matrix = data.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix,
annot=True,
cmap='coolwarm',
center=0,
fmt='.2f')
plt.title(title)
plt.tight_layout()
    # Save the plot under the given title, so that multiple matrices do not overwrite each other
    save_plot(plt, title)
plt.close()
def setup_plotting_style():
"""
Configura lo stile dei plot per uniformità.
"""
plt.style.use('seaborn')
sns.set_palette("husl")
# Impostazioni personalizzate
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10