wip model

Giuseppe Nucifora 2024-11-07 19:02:42 +01:00
parent e9ec5af072
commit 549ced1aea
21 changed files with 6417 additions and 204 deletions

.idea/TesiPegaso.iml (generated, 2 lines changed)

@@ -4,7 +4,7 @@
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/models" />
</content>
<orderEntry type="jdk" jdkName="ml_pegaso" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="ml_env" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

.idea/misc.xml (generated, 2 lines changed)

@@ -3,7 +3,7 @@
<component name="Black">
<option name="sdkName" value="/usr/local/anaconda3" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="ml_pegaso" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="ml_env" project-jdk-type="Python SDK" />
<component name="PyCharmDSProjectLayout">
<option name="id" value="JupyterRightHiddenStructureLayout" />
</component>


@@ -32,7 +32,8 @@
"!pip install seaborn\n",
"!pip install tqdm\n",
"!pip install pydot\n",
"!pip install tensorflow-io"
"!pip install tensorflow-io\n",
"!pip install pvlib"
],
"outputs": [],
"execution_count": null
@@ -123,6 +124,7 @@
"from tensorflow.keras.optimizers import Adam\n",
"from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint\n",
"from datetime import datetime\n",
"from pvlib import solarposition\n",
"import os\n",
"import json\n",
"import joblib\n",
@@ -527,207 +529,7 @@
" }, scaler_y\n",
"\n",
"\n",
"def create_radiation_model(input_shape, solar_params_shape=(3,)):\n",
" \"\"\"\n",
" Modello per la radiazione solare con vincoli di non-negatività.\n",
" \"\"\"\n",
" # Input layers\n",
" main_input = Input(shape=input_shape, name='main_input')\n",
" solar_input = Input(shape=solar_params_shape, name='solar_params')\n",
"\n",
" # Branch CNN\n",
" x1 = Conv1D(32, 3, padding='same')(main_input)\n",
" x1 = BatchNormalization()(x1)\n",
" x1 = Activation('relu')(x1)\n",
" x1 = Conv1D(64, 3, padding='same')(x1)\n",
" x1 = BatchNormalization()(x1)\n",
" x1 = Activation('relu')(x1)\n",
" x1 = GlobalAveragePooling1D()(x1)\n",
"\n",
" # Branch LSTM\n",
" x2 = Bidirectional(LSTM(64, return_sequences=True))(main_input)\n",
" x2 = Bidirectional(LSTM(32))(x2)\n",
" x2 = BatchNormalization()(x2)\n",
"\n",
" # Solar parameters processing\n",
" x3 = Dense(32)(solar_input)\n",
" x3 = BatchNormalization()(x3)\n",
" x3 = Activation('relu')(x3)\n",
"\n",
" # Combine all branches\n",
" x = concatenate([x1, x2, x3])\n",
"\n",
" # Dense layers with non-negativity constraints\n",
" x = Dense(64, kernel_constraint=tf.keras.constraints.NonNeg())(x)\n",
" x = BatchNormalization()(x)\n",
" x = Activation('relu')(x)\n",
" x = Dropout(0.2)(x)\n",
"\n",
" x = Dense(32, kernel_constraint=tf.keras.constraints.NonNeg())(x)\n",
" x = BatchNormalization()(x)\n",
" x = Activation('relu')(x)\n",
"\n",
" # Output layer con vincoli di non-negatività\n",
" output = Dense(1,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" activation='relu')(x)\n",
"\n",
" model = Model(inputs=[main_input, solar_input], outputs=output, name=\"SolarRadiation\")\n",
" return model\n",
"\n",
"\n",
"def create_energy_model(input_shape):\n",
" \"\"\"\n",
" Modello migliorato per l'energia solare che sfrutta la relazione con la radiazione.\n",
" Include vincoli di non-negatività e migliore gestione delle dipendenze temporali.\n",
" \"\"\"\n",
" inputs = Input(shape=input_shape)\n",
"\n",
" # Branch 1: Elaborazione temporale con attention\n",
" # Multi-head attention per catturare relazioni temporali\n",
" x1 = MultiHeadAttention(num_heads=8, key_dim=32)(inputs, inputs)\n",
" x1 = BatchNormalization()(x1)\n",
" x1 = Activation('relu')(x1)\n",
"\n",
" # Temporal Convolution branch per catturare pattern locali\n",
" x2 = Conv1D(\n",
" filters=64,\n",
" kernel_size=3,\n",
" padding='same',\n",
" kernel_constraint=tf.keras.constraints.NonNeg()\n",
" )(inputs)\n",
" x2 = BatchNormalization()(x2)\n",
" x2 = Activation('relu')(x2)\n",
" x2 = Conv1D(\n",
" filters=32,\n",
" kernel_size=3,\n",
" padding='same',\n",
" kernel_constraint=tf.keras.constraints.NonNeg()\n",
" )(x2)\n",
" x2 = BatchNormalization()(x2)\n",
" x2 = Activation('relu')(x2)\n",
"\n",
" # LSTM branch per memoria a lungo termine\n",
" x3 = LSTM(64, return_sequences=True)(inputs)\n",
" x3 = LSTM(32, return_sequences=False)(x3)\n",
" x3 = BatchNormalization()(x3)\n",
" x3 = Activation('relu')(x3)\n",
"\n",
" # Global pooling per ogni branch\n",
" x1 = GlobalAveragePooling1D()(x1)\n",
" x2 = GlobalAveragePooling1D()(x2)\n",
"\n",
" # Concatena tutti i branch\n",
" x = concatenate([x1, x2, x3])\n",
"\n",
" # Dense layers con vincoli di non-negatività\n",
" x = Dense(\n",
" 128,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" kernel_regularizer=l2(0.01)\n",
" )(x)\n",
" x = BatchNormalization()(x)\n",
" x = Activation('relu')(x)\n",
" x = Dropout(0.3)(x)\n",
"\n",
" x = Dense(\n",
" 64,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" kernel_regularizer=l2(0.01)\n",
" )(x)\n",
" x = BatchNormalization()(x)\n",
" x = Activation('relu')(x)\n",
" x = Dropout(0.2)(x)\n",
"\n",
" # Output layer con vincolo di non-negatività\n",
" output = Dense(\n",
" 1,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" activation='relu', # Garantisce output non negativo\n",
" kernel_regularizer=l2(0.01)\n",
" )(x)\n",
"\n",
" model = Model(inputs=inputs, outputs=output, name=\"SolarEnergy\")\n",
" return model\n",
"\n",
"\n",
"def create_uv_model(input_shape):\n",
" \"\"\"\n",
" Modello migliorato per l'indice UV che sfrutta sia radiazione che energia solare.\n",
" Include vincoli di non-negatività e considera le relazioni non lineari tra le variabili.\n",
" \"\"\"\n",
" inputs = Input(shape=input_shape)\n",
"\n",
" # CNN branch per pattern locali\n",
" x1 = Conv1D(\n",
" filters=64,\n",
" kernel_size=3,\n",
" padding='same',\n",
" kernel_constraint=tf.keras.constraints.NonNeg()\n",
" )(inputs)\n",
" x1 = BatchNormalization()(x1)\n",
" x1 = Activation('relu')(x1)\n",
" x1 = MaxPooling1D(pool_size=2)(x1)\n",
"\n",
" x1 = Conv1D(\n",
" filters=32,\n",
" kernel_size=3,\n",
" padding='same',\n",
" kernel_constraint=tf.keras.constraints.NonNeg()\n",
" )(x1)\n",
" x1 = BatchNormalization()(x1)\n",
" x1 = Activation('relu')(x1)\n",
" x1 = GlobalAveragePooling1D()(x1)\n",
"\n",
" # Attention branch per relazioni complesse\n",
" # Specialmente utile per le relazioni con radiazione ed energia\n",
" x2 = MultiHeadAttention(num_heads=4, key_dim=32)(inputs, inputs)\n",
" x2 = BatchNormalization()(x2)\n",
" x2 = Activation('relu')(x2)\n",
" x2 = GlobalAveragePooling1D()(x2)\n",
"\n",
" # Dense branch per le feature più recenti\n",
" x3 = GlobalAveragePooling1D()(inputs)\n",
" x3 = Dense(\n",
" 64,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" kernel_regularizer=l2(0.01)\n",
" )(x3)\n",
" x3 = BatchNormalization()(x3)\n",
" x3 = Activation('relu')(x3)\n",
"\n",
" # Fusion dei branch\n",
" x = concatenate([x1, x2, x3])\n",
"\n",
" # Dense layers con vincoli di non-negatività\n",
" x = Dense(\n",
" 128,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" kernel_regularizer=l2(0.01)\n",
" )(x)\n",
" x = BatchNormalization()(x)\n",
" x = Activation('relu')(x)\n",
" x = Dropout(0.3)(x)\n",
"\n",
" x = Dense(\n",
" 64,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" kernel_regularizer=l2(0.01)\n",
" )(x)\n",
" x = BatchNormalization()(x)\n",
" x = Activation('relu')(x)\n",
" x = Dropout(0.2)(x)\n",
"\n",
" # Output layer con vincolo di non-negatività\n",
" output = Dense(\n",
" 1,\n",
" kernel_constraint=tf.keras.constraints.NonNeg(),\n",
" activation='relu', # Garantisce output non negativo\n",
" kernel_regularizer=l2(0.01)\n",
" )(x)\n",
"\n",
" model = Model(inputs=inputs, outputs=output, name=\"SolarUV\")\n",
" return model\n",
"\n",
"\n",
"class CustomCallback(tf.keras.callbacks.Callback):\n",
@@ -2434,7 +2236,9 @@
" solar_params_val,\n",
" scalers=radiation_scalers,\n",
" **training_params\n",
")"
")\n",
"\n",
"predict_radiation()"
],
"outputs": [],
"execution_count": null

src/README.md (new file, 94 lines)

@@ -0,0 +1,94 @@
src/data/data_loader.py:
- load_weather_data()
- load_olive_varieties()
- read_json_files()
- load_single_model_and_scalers()
- save_single_model_and_scalers()
src/data/data_processor.py:
- preprocess_weather_data()
- prepare_solar_data()
- prepare_transformer_data()
- create_sequences()
- encode_techniques()
- decode_techniques()
src/data/data_simulator.py:
- simulate_zone()
- simulate_olive_production_parallel()
- calculate_weather_effect()
- calculate_water_need()
- add_olive_water_consumption_correlation()
src/features/temporal_features.py:
- add_time_features()
- get_season()
- get_time_period()
- create_time_based_features()
src/features/weather_features.py:
- add_solar_features()
- add_solar_specific_features()
- add_environmental_features()
- calculate_vpd()
- add_weather_indicators()
src/features/olive_features.py:
- create_technique_mapping()
- add_olive_features()
- calculate_stress_index()
- calculate_quality_indicators()
- add_production_features()
src/models/transformer.py:
- create_olive_oil_transformer()
- OliveTransformerBlock
- PositionalEncoding
- DataAugmentation
src/models/layers.py:
- MultiScaleAttention
- TemporalConvBlock
- WeatherEmbedding
- OliveVarietyEmbedding
src/models/callbacks.py:
- CustomCallback
- WarmUpLearningRateSchedule
- MetricLogger
- EarlyStoppingWithBest
src/models/training.py:
- compile_model()
- setup_transformer_training()
- train_transformer()
- retrain_model()
- create_callbacks()
src/visualization/plots.py:
- plot_variety_comparison()
- plot_efficiency_vs_production()
- plot_water_efficiency_vs_production()
- plot_water_need_vs_oil_production()
- save_plot()
src/visualization/dashboard.py:
- create_production_dashboard()
- create_weather_dashboard()
- create_efficiency_dashboard()
- update_dashboard_data()
- create_forecast_view()
src/utils/metrics.py:
- calculate_real_error()
- evaluate_model_performance()
- calculate_efficiency_metrics()
- calculate_forecast_accuracy()
- compute_confidence_intervals()
src/utils/helpers.py:
- get_optimal_workers()
- clean_column_name()
- clean_column_names()
- to_camel_case()
- get_full_data()
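
Example pipeline (sketch):
An illustrative call order only; the parquet path matches the example in load_weather_data's docstring, while the CSV path and the 2010 cut-off are placeholders, not project defaults.

    from src.data.data_loader import load_weather_data, load_olive_varieties
    from src.features.temporal_features import add_time_features
    from src.features.weather_features import add_environmental_features

    weather = load_weather_data('./data/weather_data.parquet', start_year=2010)
    weather = add_time_features(weather)
    weather = add_environmental_features(weather)
    olive_varieties = load_olive_varieties('./data/olive_varieties.csv')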

src/data/data_loader.py (new file, 441 lines)

@@ -0,0 +1,441 @@
import os
import json
import numpy as np
import pandas as pd
import joblib
import tensorflow as tf
from src.models.solar_models import create_uv_model, create_energy_model, create_radiation_model
from typing import Tuple, Optional
import datetime
def read_json_files(folder_path):
all_data = []
file_list = sorted(os.listdir(folder_path))
for filename in file_list:
if filename.endswith('.json'):
file_path = os.path.join(folder_path, filename)
try:
with open(file_path, 'r') as file:
data = json.load(file)
all_data.extend(data['days'])
except Exception as e:
print(f"Error processing file '{filename}': {str(e)}")
return all_data
def save_single_model_and_scalers(model, model_name, scalers=None, base_path='./kaggle/working/models'):
"""
Salva un singolo modello con tutti i suoi artefatti associati e multipli scaler.
Parameters:
-----------
model : keras.Model
Il modello da salvare
model_name : str
Nome del modello (es. 'solarradiation', 'solarenergy', 'uvindex')
scalers : dict, optional
Dizionario degli scaler associati al modello (es. {'X': x_scaler, 'y': y_scaler})
base_path : str
Percorso base dove salvare il modello
"""
if isinstance(base_path, list):
base_path = './kaggle/working/models'
# Crea la cartella base se non esiste
os.makedirs(base_path, exist_ok=True)
# Crea la sottocartella per il modello specifico
model_path = os.path.join(base_path, model_name)
os.makedirs(model_path, exist_ok=True)
try:
print(f"\nSalvataggio modello {model_name}...")
# 1. Salva il modello completo
model_file = os.path.join(model_path, 'model.keras')
model.save(model_file, save_format='keras')
print(f"- Salvato modello completo: {model_file}")
# 2. Salva i pesi separatamente
weights_path = os.path.join(model_path, 'weights')
os.makedirs(weights_path, exist_ok=True)
weight_file = os.path.join(weights_path, 'weights')
model.save_weights(weight_file)
print(f"- Salvati pesi: {weight_file}")
# 3. Salva il plot del modello
plot_path = os.path.join(model_path, f'{model_name}_architecture.png')
tf.keras.utils.plot_model(
model,
to_file=plot_path,
show_shapes=True,
show_layer_names=True,
rankdir='TB',
expand_nested=True,
dpi=150
)
print(f"- Salvato plot architettura: {plot_path}")
# 4. Salva il summary del modello
summary_path = os.path.join(model_path, f'{model_name}_summary.txt')
with open(summary_path, 'w') as f:
model.summary(print_fn=lambda x: f.write(x + '\n'))
print(f"- Salvato summary modello: {summary_path}")
# 5. Salva gli scaler se forniti
if scalers is not None:
scaler_path = os.path.join(model_path, 'scalers')
os.makedirs(scaler_path, exist_ok=True)
for scaler_name, scaler in scalers.items():
scaler_file = os.path.join(scaler_path, f'{scaler_name}_scaler.joblib')
joblib.dump(scaler, scaler_file)
print(f"- Salvato scaler {scaler_name}: {scaler_file}")
# 6. Salva la configurazione del modello
model_config = {
'has_solar_params': True if model_name == 'solarradiation' else False,
'scalers': list(scalers.keys()) if scalers else []
}
config_path = os.path.join(model_path, 'model_config.joblib')
joblib.dump(model_config, config_path)
print(f"- Salvata configurazione: {config_path}")
# 7. Crea un README specifico per il modello
readme_path = os.path.join(model_path, 'README.txt')
with open(readme_path, 'w') as f:
f.write(f"{model_name.upper()} Model Artifacts\n")
f.write("=" * (len(model_name) + 15) + "\n\n")
f.write("Directory structure:\n")
f.write("- model.keras: Complete model\n")
f.write("- weights/: Model weights\n")
f.write(f"- {model_name}_architecture.png: Visual representation of model architecture\n")
f.write(f"- {model_name}_summary.txt: Detailed model summary\n")
f.write("- model_config.joblib: Model configuration\n")
if scalers:
f.write("- scalers/: Directory containing model scalers\n")
for scaler_name in scalers.keys():
f.write(f" - {scaler_name}_scaler.joblib: {scaler_name} scaler\n")
print(f"\nTutti gli artefatti per {model_name} salvati in: {model_path}")
print(f"Consulta {readme_path} per i dettagli sulla struttura")
except Exception as e:
print(f"Errore nel salvataggio degli artefatti per {model_name}: {str(e)}")
raise
return model_path
def load_single_model_and_scalers(model_name, base_path='./kaggle/working/models'):
"""
Carica un singolo modello con tutti i suoi artefatti e scaler associati.
Parameters:
-----------
model_name : str
Nome del modello da caricare (es. 'solarradiation', 'solarenergy', 'uvindex')
base_path : str
Percorso base dove sono salvati i modelli
Returns:
--------
tuple
(model, scalers, model_config)
"""
model_path = os.path.join(base_path, model_name)
if not os.path.exists(model_path):
print(f"Directory del modello non trovata: {model_path}")
return None, None, None
try:
print(f"\nCaricamento modello {model_name}...")
# 1. Carica la configurazione del modello
config_path = os.path.join(model_path, 'model_config.joblib')
try:
model_config = joblib.load(config_path)
print("- Configurazione modello caricata")
except:
print("! Configurazione modello non trovata, usando configurazione di default")
model_config = {
'has_solar_params': True if model_name == 'solarradiation' else False,
'scalers': ['X', 'y']
}
# 2. Carica il modello
try:
# Prima prova a caricare il modello completo
model_file = os.path.join(model_path, 'model.keras')
model = tf.keras.models.load_model(model_file)
print(f"- Modello caricato da: {model_file}")
# Verifica i pesi
weights_path = os.path.join(model_path, 'weights', 'weights')
if os.path.exists(weights_path + '.index'):
model.load_weights(weights_path)
print("- Pesi verificati con successo")
except Exception as e:
print(f"! Errore nel caricamento del modello: {str(e)}")
print("Tentativo di ricostruzione del modello...")
try:
# Ricostruzione del modello
if model_name == 'solarradiation':
model = create_radiation_model(input_shape=(24, 8))
elif model_name == 'solarenergy':
model = create_energy_model(input_shape=(24, 8))
elif model_name == 'uvindex':
model = create_uv_model(input_shape=(24, 8))
else:
raise ValueError(f"Tipo di modello non riconosciuto: {model_name}")
# Carica i pesi
model.load_weights(weights_path)
print("- Modello ricostruito dai pesi con successo")
except Exception as e:
print(f"! Errore nella ricostruzione del modello: {str(e)}")
return None, None, None
# 3. Carica gli scaler
scalers = {}
scaler_path = os.path.join(model_path, 'scalers')
if os.path.exists(scaler_path):
print("\nCaricamento scaler:")
for scaler_file in os.listdir(scaler_path):
if scaler_file.endswith('_scaler.joblib'):
scaler_name = scaler_file.replace('_scaler.joblib', '')
scaler_file_path = os.path.join(scaler_path, scaler_file)
try:
scalers[scaler_name] = joblib.load(scaler_file_path)
print(f"- Caricato scaler {scaler_name}")
except Exception as e:
print(f"! Errore nel caricamento dello scaler {scaler_name}: {str(e)}")
else:
print("! Directory degli scaler non trovata")
# 4. Verifica integrità del modello
try:
# Verifica che il modello possa fare predizioni
if model_name == 'solarradiation':
dummy_input = [np.zeros((1, 24, 8)), np.zeros((1, 3))]
else:
dummy_input = np.zeros((1, 24, 8))
model.predict(dummy_input, verbose=0)
print("\n✓ Verifica integrità modello completata con successo")
except Exception as e:
print(f"\n! Attenzione: il modello potrebbe non funzionare correttamente: {str(e)}")
# 5. Carica e verifica il summary del modello
summary_path = os.path.join(model_path, f'{model_name}_summary.txt')
if os.path.exists(summary_path):
print("\nSummary del modello disponibile in:", summary_path)
# 6. Verifica il plot dell'architettura
plot_path = os.path.join(model_path, f'{model_name}_architecture.png')
if os.path.exists(plot_path):
print("Plot dell'architettura disponibile in:", plot_path)
print(f"\nCaricamento di {model_name} completato con successo!")
return model, scalers, model_config
except Exception as e:
print(f"\nErrore critico nel caricamento del modello {model_name}: {str(e)}")
return None, None, None
def load_weather_data(
data_path: str,
start_year: Optional[int] = None,
end_year: Optional[int] = None
) -> pd.DataFrame:
"""
Carica e preprocessa i dati meteorologici da file JSON o Parquet.
Parameters
----------
data_path : str
Percorso al file dei dati (può essere .json o .parquet)
start_year : int, optional
Anno di inizio per filtrare i dati
end_year : int, optional
Anno di fine per filtrare i dati
Returns
-------
pd.DataFrame
DataFrame contenente i dati meteo preprocessati
Examples
--------
>>> weather_data = load_weather_data('./data/weather_data.parquet', start_year=2010)
"""
try:
# Determina il tipo di file e carica di conseguenza
if data_path.endswith('.parquet'):
weather_data = pd.read_parquet(data_path)
elif data_path.endswith('.json'):
# Se è un file JSON, prima lo convertiamo in DataFrame
with open(data_path, 'r') as f:
raw_data = json.load(f)
weather_data = create_weather_dataset(raw_data)
else:
raise ValueError(f"Formato file non supportato: {data_path}")
# Converti la colonna datetime
weather_data['datetime'] = pd.to_datetime(weather_data['datetime'], errors='coerce')
# Filtra per anno se specificato
if start_year is not None:
weather_data = weather_data[weather_data['datetime'].dt.year >= start_year]
if end_year is not None:
weather_data = weather_data[weather_data['datetime'].dt.year <= end_year]
# Aggiungi colonne di data
weather_data['date'] = weather_data['datetime'].dt.date
weather_data['year'] = weather_data['datetime'].dt.year
weather_data['month'] = weather_data['datetime'].dt.month
weather_data['day'] = weather_data['datetime'].dt.day
# Rimuovi righe con datetime nullo
weather_data = weather_data.dropna(subset=['datetime'])
# Ordina per datetime
weather_data = weather_data.sort_values('datetime')
# Gestione valori mancanti nelle colonne principali
numeric_columns = weather_data.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
if weather_data[col].isnull().any():
# Interpolazione lineare per i valori mancanti
weather_data[col] = weather_data[col].interpolate(method='linear')
# Rimuovi eventuali duplicati
weather_data = weather_data.drop_duplicates(subset=['datetime'])
# Verifica la completezza dei dati
print(f"Dati caricati dal {weather_data['datetime'].min()} al {weather_data['datetime'].max()}")
print(f"Numero totale di records: {len(weather_data)}")
return weather_data
except Exception as e:
print(f"Errore nel caricamento dei dati meteo: {str(e)}")
raise
def create_weather_dataset(raw_data: list) -> pd.DataFrame:
"""
Converte i dati JSON grezzi in un DataFrame strutturato.
Parameters
----------
raw_data : list
Lista di dizionari contenenti i dati meteo
Returns
-------
pd.DataFrame
DataFrame strutturato con i dati meteo
"""
dataset = []
seen_datetimes = set()
for day in raw_data:
date = day['datetime']
for hour in day['hours']:
datetime_str = f"{date} {hour['datetime']}"
# Verifica duplicati
if datetime_str in seen_datetimes:
continue
seen_datetimes.add(datetime_str)
# Gestione preciptype
if isinstance(hour['preciptype'], list):
preciptype = "__".join(hour['preciptype'])
else:
preciptype = hour['preciptype'] if hour['preciptype'] else ""
# Gestione conditions
conditions = hour['conditions'].replace(', ', '__').replace(' ', '_').lower()
# Crea la riga
row = {
'datetime': datetime_str,
'temp': hour['temp'],
'feelslike': hour['feelslike'],
'humidity': hour['humidity'],
'dew': hour['dew'],
'precip': hour['precip'],
'snow': hour['snow'],
'preciptype': preciptype.lower(),
'windspeed': hour['windspeed'],
'winddir': hour['winddir'],
'pressure': hour['pressure'],
'cloudcover': hour['cloudcover'],
'visibility': hour['visibility'],
'solarradiation': hour['solarradiation'],
'solarenergy': hour['solarenergy'],
'uvindex': hour['uvindex'],
'conditions': conditions,
'tempmax': day['tempmax'],
'tempmin': day['tempmin'],
'precipprob': day['precipprob'],
'precipcover': day['precipcover']
}
dataset.append(row)
    # Sort chronologically; `datetime` is imported as a module here, so the parser is datetime.datetime.strptime
    dataset.sort(key=lambda x: datetime.datetime.strptime(x['datetime'], "%Y-%m-%d %H:%M:%S"))
return pd.DataFrame(dataset)
def load_olive_varieties(
data_path: str,
add_water_features: bool = True
) -> pd.DataFrame:
"""
Carica e preprocessa i dati delle varietà di olive.
Parameters
----------
data_path : str
Percorso al file dei dati
add_water_features : bool
Se True, aggiunge feature relative al consumo d'acqua
Returns
-------
pd.DataFrame
DataFrame contenente i dati delle varietà di olive
"""
try:
if data_path.endswith('.csv'):
olive_varieties = pd.read_csv(data_path)
elif data_path.endswith('.parquet'):
olive_varieties = pd.read_parquet(data_path)
else:
raise ValueError(f"Formato file non supportato: {data_path}")
# Se richiesto, aggiungi feature sul consumo d'acqua
if add_water_features and 'Fabbisogno Acqua Primavera (m³/ettaro)' not in olive_varieties.columns:
from src.data.data_simulator import add_olive_water_consumption_correlation
olive_varieties = add_olive_water_consumption_correlation(olive_varieties)
print(f"Dati varietà olive caricati: {len(olive_varieties)} varietà")
return olive_varieties
except Exception as e:
print(f"Errore nel caricamento dei dati delle varietà: {str(e)}")
raise
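

# --- Usage sketch (illustrative only) -----------------------------------------
# Round trip for the model artifact helpers above. The tiny stand-in network and
# the default './kaggle/working/models' base path are assumptions for the demo;
# plotting the architecture additionally requires pydot/graphviz to be installed.
if __name__ == '__main__':
    demo_model = tf.keras.Sequential([
        tf.keras.Input(shape=(24, 8)),
        tf.keras.layers.LSTM(16),
        tf.keras.layers.Dense(1, activation='relu'),
    ], name='solarenergy_demo')

    # Save model, weights, architecture plot, summary and config under the base path.
    save_single_model_and_scalers(demo_model, 'solarenergy', scalers=None)

    # Reload everything; `scalers` comes back empty because none were saved.
    model, scalers, config = load_single_model_and_scalers('solarenergy')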

src/data/data_processor.py (new file, 324 lines)

@@ -0,0 +1,324 @@
# src/data/data_processor.py
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import os
from typing import Tuple, List, Dict, Optional, Union
from src.utils.helpers import clean_column_name
def preprocess_weather_data(weather_df: pd.DataFrame) -> pd.DataFrame:
"""
Calcola statistiche mensili per ogni anno dai dati meteo.
Parameters
----------
weather_df : pd.DataFrame
DataFrame contenente i dati meteorologici
Returns
-------
pd.DataFrame
DataFrame con statistiche mensili
"""
# Calcola statistiche mensili per ogni anno
monthly_weather = weather_df.groupby(['year', 'month']).agg({
'temp': ['mean', 'min', 'max'],
'humidity': 'mean',
'precip': 'sum',
'windspeed': 'mean',
'cloudcover': 'mean',
'solarradiation': 'sum',
'solarenergy': 'sum',
'uvindex': 'max'
}).reset_index()
# Rinomina le colonne
monthly_weather.columns = ['year', 'month'] + [
f'{col[0]}_{col[1]}' for col in monthly_weather.columns[2:]
]
return monthly_weather
def create_sequences(timesteps: int, X: np.ndarray, y: Optional[np.ndarray] = None) -> Union[
np.ndarray, Tuple[np.ndarray, np.ndarray]]:
"""
Crea sequenze temporali dai dati.
Parameters
----------
timesteps : int
Numero di timestep per ogni sequenza
X : array-like
Dati di input
y : array-like, optional
Target values
Returns
-------
tuple o array
Se y è fornito: (X_sequences, y_sequences)
Se y è None: X_sequences
"""
Xs = []
for i in range(len(X) - timesteps):
Xs.append(X[i:i + timesteps])
if y is not None:
ys = []
for i in range(len(X) - timesteps):
ys.append(y[i + timesteps])
return np.array(Xs), np.array(ys)
return np.array(Xs)
def prepare_solar_data(weather_data: pd.DataFrame, features: List[str]) -> Tuple:
"""
Prepara i dati per i modelli solari.
Parameters
----------
weather_data : pd.DataFrame
DataFrame contenente i dati meteorologici
features : list
Lista delle feature da utilizzare
Returns
-------
tuple
(X_scaled, scaler_X, y_scaled, scaler_y, data_after_2010)
"""
# Aggiunge le caratteristiche temporali
weather_data = add_advanced_features(weather_data)
weather_data = pd.get_dummies(weather_data, columns=['season', 'time_period'], drop_first=True)
# Filtra dati dopo 2010
data_after_2010 = weather_data[weather_data['year'] >= 2010].copy()
data_after_2010 = data_after_2010.sort_values('datetime')
data_after_2010.set_index('datetime', inplace=True)
# Interpola valori mancanti
target_variables = ['solarradiation', 'solarenergy', 'uvindex']
for column in target_variables:
data_after_2010[column] = data_after_2010[column].interpolate(method='time')
# Rimuovi righe con valori mancanti
data_after_2010.dropna(subset=features + target_variables, inplace=True)
# Prepara X e y
X = data_after_2010[features].values
y = data_after_2010[target_variables].values
# Normalizza features
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)
scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y)
return X_scaled, scaler_X, y_scaled, scaler_y, data_after_2010
def prepare_transformer_data(df: pd.DataFrame, olive_varieties_df: pd.DataFrame) -> Tuple:
"""
Prepara i dati per il modello transformer.
"""
# Copia del DataFrame
df = df.copy()
# Ordina per zona e anno
df = df.sort_values(['zone', 'year'])
# Feature definition
temporal_features = ['temp_mean', 'precip_sum', 'solar_energy_sum']
static_features = ['ha']
target_features = ['olive_prod', 'min_oil_prod', 'max_oil_prod', 'avg_oil_prod', 'total_water_need']
# Get clean varieties
all_varieties = olive_varieties_df['Varietà di Olive'].unique()
varieties = [clean_column_name(variety) for variety in all_varieties]
# Variety features structure
variety_features = [
'tech', 'pct', 'prod_t_ha', 'oil_prod_t_ha', 'oil_prod_l_ha',
'min_yield_pct', 'max_yield_pct', 'min_oil_prod_l_ha', 'max_oil_prod_l_ha',
'avg_oil_prod_l_ha', 'l_per_t', 'min_l_per_t', 'max_l_per_t', 'avg_l_per_t'
]
# Prepare columns
new_columns = {}
# Prepare features for each variety
for variety in varieties:
for feature in variety_features:
col_name = f"{variety}_{feature}"
if col_name in df.columns:
if feature != 'tech':
static_features.append(col_name)
# Binary features for cultivation techniques
for technique in ['tradizionale', 'intensiva', 'superintensiva']:
col_name = f"{variety}_{technique}"
new_columns[col_name] = df[f"{variety}_tech"].notna() & (
df[f"{variety}_tech"].str.lower() == technique
).fillna(False)
static_features.append(col_name)
# Add all new columns at once
new_df = pd.concat([df] + [pd.Series(v, name=k) for k, v in new_columns.items()], axis=1)
# Sort by zone and year
df_sorted = new_df.sort_values(['zone', 'year'])
# Window size definition
window_size = 41
# Prepare lists for data collection
temporal_sequences = []
static_features_list = []
targets_list = []
# Process data by zone
for zone in df_sorted['zone'].unique():
zone_data = df_sorted[df_sorted['zone'] == zone].reset_index(drop=True)
if len(zone_data) >= window_size:
for i in range(len(zone_data) - window_size + 1):
temporal_window = zone_data.iloc[i:i + window_size][temporal_features].values
if not np.isnan(temporal_window).any():
temporal_sequences.append(temporal_window)
static_features_list.append(zone_data.iloc[i + window_size - 1][static_features].values)
targets_list.append(zone_data.iloc[i + window_size - 1][target_features].values)
# Convert to numpy arrays
X_temporal = np.array(temporal_sequences)
X_static = np.array(static_features_list)
y = np.array(targets_list)
# Split data
indices = np.random.permutation(len(X_temporal))
train_idx = int(len(indices) * 0.65)
val_idx = int(len(indices) * 0.85)
train_indices = indices[:train_idx]
val_indices = indices[train_idx:val_idx]
test_indices = indices[val_idx:]
# Split datasets
X_temporal_train = X_temporal[train_indices]
X_temporal_val = X_temporal[val_indices]
X_temporal_test = X_temporal[test_indices]
X_static_train = X_static[train_indices]
X_static_val = X_static[val_indices]
X_static_test = X_static[test_indices]
y_train = y[train_indices]
y_val = y[val_indices]
y_test = y[test_indices]
# Standardization
scaler_temporal = StandardScaler()
scaler_static = StandardScaler()
scaler_y = StandardScaler()
# Apply standardization
X_temporal_train = scaler_temporal.fit_transform(X_temporal_train.reshape(-1, len(temporal_features))).reshape(
X_temporal_train.shape)
X_temporal_val = scaler_temporal.transform(X_temporal_val.reshape(-1, len(temporal_features))).reshape(
X_temporal_val.shape)
X_temporal_test = scaler_temporal.transform(X_temporal_test.reshape(-1, len(temporal_features))).reshape(
X_temporal_test.shape)
X_static_train = scaler_static.fit_transform(X_static_train)
X_static_val = scaler_static.transform(X_static_val)
X_static_test = scaler_static.transform(X_static_test)
y_train = scaler_y.fit_transform(y_train)
y_val = scaler_y.transform(y_val)
y_test = scaler_y.transform(y_test)
# Prepare input dictionaries
train_data = {'temporal': X_temporal_train, 'static': X_static_train}
val_data = {'temporal': X_temporal_val, 'static': X_static_val}
test_data = {'temporal': X_temporal_test, 'static': X_static_test}
# Save scalers
base_path = './kaggle/working/models/oil_transformer/'
os.makedirs(base_path, exist_ok=True)
joblib.dump(scaler_temporal, os.path.join(base_path, 'scaler_temporal.joblib'))
joblib.dump(scaler_static, os.path.join(base_path, 'scaler_static.joblib'))
joblib.dump(scaler_y, os.path.join(base_path, 'scaler_y.joblib'))
return (train_data, y_train), (val_data, y_val), (test_data, y_test), (scaler_temporal, scaler_static, scaler_y)
def encode_techniques(df: pd.DataFrame,
mapping_path: str = './kaggle/working/models/technique_mapping.joblib') -> pd.DataFrame:
"""
Codifica le tecniche di coltivazione usando un mapping salvato.
Parameters
----------
df : pd.DataFrame
DataFrame contenente le colonne delle tecniche
mapping_path : str
Percorso al file di mapping
Returns
-------
pd.DataFrame
DataFrame con le tecniche codificate
"""
if not os.path.exists(mapping_path):
raise FileNotFoundError(f"Mapping not found at {mapping_path}. Run create_technique_mapping first.")
technique_mapping = joblib.load(mapping_path)
# Trova tutte le colonne delle tecniche
tech_columns = [col for col in df.columns if col.endswith('_tech')]
# Applica il mapping a tutte le colonne delle tecniche
for col in tech_columns:
df[col] = df[col].str.lower().map(technique_mapping).fillna(0).astype(int)
return df
def decode_techniques(df: pd.DataFrame,
mapping_path: str = './kaggle/working/models/technique_mapping.joblib') -> pd.DataFrame:
"""
Decodifica le tecniche di coltivazione usando un mapping salvato.
Parameters
----------
df : pd.DataFrame
DataFrame contenente le colonne delle tecniche codificate
mapping_path : str
Percorso al file di mapping
Returns
-------
pd.DataFrame
DataFrame con le tecniche decodificate
"""
if not os.path.exists(mapping_path):
raise FileNotFoundError(f"Mapping not found at {mapping_path}")
technique_mapping = joblib.load(mapping_path)
reverse_mapping = {v: k for k, v in technique_mapping.items()}
reverse_mapping[0] = '' # Mapping per 0 a stringa vuota
# Trova tutte le colonne delle tecniche
tech_columns = [col for col in df.columns if col.endswith('_tech')]
# Applica il reverse mapping
for col in tech_columns:
df[col] = df[col].map(reverse_mapping)
return df
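

# --- Usage sketch (illustrative only) -----------------------------------------
# Shape contract of create_sequences: with 100 rows, 8 features and a 24-step
# window, 76 sequences are produced. The random arrays are stand-ins for the
# scaled weather features used elsewhere in the project.
if __name__ == '__main__':
    X_demo = np.random.rand(100, 8)
    y_demo = np.random.rand(100)

    X_seq, y_seq = create_sequences(24, X_demo, y_demo)
    print(X_seq.shape, y_seq.shape)  # (76, 24, 8) (76,)

    # At inference time only the input sequences are needed.
    X_only = create_sequences(24, X_demo)
    print(X_only.shape)  # (76, 24, 8)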

src/data/data_simulator.py (new file, 332 lines)

@@ -0,0 +1,332 @@
import pandas as pd
import numpy as np
from typing import Dict
from src.utils.helpers import clean_column_name
def calculate_weather_effect(row: pd.Series, optimal_temp: float) -> float:
"""
Calcola l'effetto delle condizioni meteorologiche sulla produzione.
Parameters
----------
row : pd.Series
Serie contenente i dati meteorologici
optimal_temp : float
Temperatura ottimale per la varietà
Returns
-------
float
Effetto combinato delle condizioni meteo
"""
# Effetti base
temp_effect = -0.1 * (row['temp_mean'] - optimal_temp) ** 2
rain_effect = -0.05 * (row['precip_sum'] - 600) ** 2 / 10000
sun_effect = 0.1 * row['solarenergy_sum'] / 1000
# Fattori di scala basati sulla fase di crescita
if row['growth_phase'] == 'dormancy':
temp_scale = 0.5
rain_scale = 0.2
sun_scale = 0.1
elif row['growth_phase'] == 'flowering':
temp_scale = 2.0
rain_scale = 1.5
sun_scale = 1.0
elif row['growth_phase'] == 'fruit_set':
temp_scale = 1.5
rain_scale = 1.0
sun_scale = 0.8
else: # ripening
temp_scale = 1.0
rain_scale = 0.5
sun_scale = 1.2
# Calcolo dell'effetto combinato
combined_effect = (
temp_scale * temp_effect +
rain_scale * rain_effect +
sun_scale * sun_effect
)
# Aggiustamenti specifici per fase
if row['growth_phase'] == 'flowering':
combined_effect -= 0.5 * max(0, row['precip_sum'] - 50) # Penalità per pioggia eccessiva
elif row['growth_phase'] == 'fruit_set':
combined_effect += 0.3 * max(0, row['temp_mean'] - (optimal_temp + 5)) # Bonus temperature alte
return combined_effect
def calculate_water_need(weather_data: pd.Series, base_need: float, optimal_temp: float) -> float:
"""
Calcola il fabbisogno idrico basato su temperatura e precipitazioni.
Parameters
----------
weather_data : pd.Series
Serie contenente i dati meteorologici
base_need : float
Fabbisogno idrico base
optimal_temp : float
Temperatura ottimale per la varietà
Returns
-------
float
Fabbisogno idrico calcolato
"""
temp_factor = 1 + 0.05 * (weather_data['temp_mean'] - optimal_temp)
rain_factor = 1 - 0.001 * weather_data['precip_sum']
return base_need * temp_factor * rain_factor
def add_olive_water_consumption_correlation(dataset: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge dati correlati al consumo d'acqua per ogni varietà di oliva.
Parameters
----------
dataset : pd.DataFrame
DataFrame contenente i dati delle varietà di olive
Returns
-------
pd.DataFrame
DataFrame con dati aggiuntivi sul consumo d'acqua
"""
# Dati simulati per il fabbisogno d'acqua e correlazione con temperatura
fabbisogno_acqua = {
"Nocellara dell'Etna": {"Primavera": 1200, "Estate": 2000, "Autunno": 1000, "Inverno": 500,
"Temperatura Ottimale": 18, "Resistenza": "Media"},
"Leccino": {"Primavera": 1000, "Estate": 1800, "Autunno": 800, "Inverno": 400, "Temperatura Ottimale": 20,
"Resistenza": "Alta"},
"Frantoio": {"Primavera": 1100, "Estate": 1900, "Autunno": 900, "Inverno": 450, "Temperatura Ottimale": 19,
"Resistenza": "Alta"},
"Coratina": {"Primavera": 1300, "Estate": 2200, "Autunno": 1100, "Inverno": 550, "Temperatura Ottimale": 17,
"Resistenza": "Media"},
"Moraiolo": {"Primavera": 1150, "Estate": 2100, "Autunno": 900, "Inverno": 480, "Temperatura Ottimale": 18,
"Resistenza": "Media"},
"Pendolino": {"Primavera": 1050, "Estate": 1850, "Autunno": 850, "Inverno": 430, "Temperatura Ottimale": 20,
"Resistenza": "Alta"},
"Taggiasca": {"Primavera": 1000, "Estate": 1750, "Autunno": 800, "Inverno": 400, "Temperatura Ottimale": 19,
"Resistenza": "Alta"},
"Canino": {"Primavera": 1100, "Estate": 1900, "Autunno": 900, "Inverno": 450, "Temperatura Ottimale": 18,
"Resistenza": "Media"},
"Itrana": {"Primavera": 1200, "Estate": 2000, "Autunno": 1000, "Inverno": 500, "Temperatura Ottimale": 17,
"Resistenza": "Media"},
"Ogliarola": {"Primavera": 1150, "Estate": 1950, "Autunno": 900, "Inverno": 480, "Temperatura Ottimale": 18,
"Resistenza": "Media"},
"Biancolilla": {"Primavera": 1050, "Estate": 1800, "Autunno": 850, "Inverno": 430, "Temperatura Ottimale": 19,
"Resistenza": "Alta"}
}
# Calcola fabbisogno idrico annuale
for varieta in fabbisogno_acqua:
fabbisogno_acqua[varieta]["Annuale"] = sum(
fabbisogno_acqua[varieta][stagione]
for stagione in ["Primavera", "Estate", "Autunno", "Inverno"]
)
# Aggiungi colonne al dataset
dataset["Fabbisogno Acqua Primavera (m³/ettaro)"] = dataset["Varietà di Olive"].apply(
lambda x: fabbisogno_acqua[x]["Primavera"])
dataset["Fabbisogno Acqua Estate (m³/ettaro)"] = dataset["Varietà di Olive"].apply(
lambda x: fabbisogno_acqua[x]["Estate"])
dataset["Fabbisogno Acqua Autunno (m³/ettaro)"] = dataset["Varietà di Olive"].apply(
lambda x: fabbisogno_acqua[x]["Autunno"])
dataset["Fabbisogno Acqua Inverno (m³/ettaro)"] = dataset["Varietà di Olive"].apply(
lambda x: fabbisogno_acqua[x]["Inverno"])
dataset["Fabbisogno Idrico Annuale (m³/ettaro)"] = dataset["Varietà di Olive"].apply(
lambda x: fabbisogno_acqua[x]["Annuale"])
dataset["Temperatura Ottimale"] = dataset["Varietà di Olive"].apply(
lambda x: fabbisogno_acqua[x]["Temperatura Ottimale"])
dataset["Resistenza alla Siccità"] = dataset["Varietà di Olive"].apply(
lambda x: fabbisogno_acqua[x]["Resistenza"])
return dataset
def simulate_zone(base_weather: pd.DataFrame,
olive_varieties: pd.DataFrame,
year: int,
zone: int,
all_varieties: np.ndarray,
variety_techniques: Dict) -> Dict:
"""
Simula la produzione di olive per una singola zona.
Parameters
----------
base_weather : pd.DataFrame
DataFrame contenente i dati meteo di base
olive_varieties : pd.DataFrame
DataFrame con le informazioni sulle varietà
year : int
Anno della simulazione
zone : int
ID della zona
all_varieties : np.ndarray
Array con tutte le varietà disponibili
variety_techniques : Dict
Dizionario con le tecniche disponibili per ogni varietà
Returns
-------
Dict
Dizionario con i risultati della simulazione
"""
# Crea una copia dei dati meteo per questa zona
zone_weather = base_weather.copy()
# Genera variazioni meteorologiche specifiche per questa zona
zone_weather['temp_mean'] *= np.random.uniform(0.95, 1.05, len(zone_weather))
zone_weather['precip_sum'] *= np.random.uniform(0.9, 1.1, len(zone_weather))
zone_weather['solarenergy_sum'] *= np.random.uniform(0.95, 1.05, len(zone_weather))
# Genera caratteristiche specifiche della zona
num_varieties = np.random.randint(1, 4) # 1-3 varietà per zona
selected_varieties = np.random.choice(all_varieties, size=num_varieties, replace=False)
hectares = np.random.uniform(1, 10) # Dimensione del terreno
percentages = np.random.dirichlet(np.ones(num_varieties)) # Distribuzione delle varietà
# Inizializzazione contatori annuali
annual_production = 0
annual_min_oil = 0
annual_max_oil = 0
annual_avg_oil = 0
annual_water_need = 0
# Inizializzazione dizionario dati varietà
variety_data = {clean_column_name(variety): {
'tech': '',
'pct': 0,
'prod_t_ha': 0,
'oil_prod_t_ha': 0,
'oil_prod_l_ha': 0,
'min_yield_pct': 0,
'max_yield_pct': 0,
'min_oil_prod_l_ha': 0,
'max_oil_prod_l_ha': 0,
'avg_oil_prod_l_ha': 0,
'l_per_t': 0,
'min_l_per_t': 0,
'max_l_per_t': 0,
'avg_l_per_t': 0,
'olive_prod': 0,
'min_oil_prod': 0,
'max_oil_prod': 0,
'avg_oil_prod': 0,
'water_need': 0
} for variety in all_varieties}
# Simula produzione per ogni varietà selezionata
for i, variety in enumerate(selected_varieties):
# Seleziona tecnica di coltivazione casuale per questa varietà
technique = np.random.choice(variety_techniques[variety])
percentage = percentages[i]
# Ottieni informazioni specifiche della varietà
variety_info = olive_varieties[
(olive_varieties['Varietà di Olive'] == variety) &
(olive_varieties['Tecnica di Coltivazione'] == technique)
].iloc[0]
# Calcola produzione base con variabilità
base_production = variety_info['Produzione (tonnellate/ettaro)'] * 1000 * percentage * hectares / 12
base_production *= np.random.uniform(0.9, 1.1)
# Calcola effetti meteo sulla produzione
weather_effect = zone_weather.apply(
lambda row: calculate_weather_effect(row, variety_info['Temperatura Ottimale']),
axis=1
)
monthly_production = base_production * (1 + weather_effect / 10000)
monthly_production *= np.random.uniform(0.95, 1.05, len(zone_weather))
# Calcola produzione annuale per questa varietà
annual_variety_production = monthly_production.sum()
# Calcola rese di olio con variabilità
min_yield_factor = np.random.uniform(0.95, 1.05)
max_yield_factor = np.random.uniform(0.95, 1.05)
avg_yield_factor = (min_yield_factor + max_yield_factor) / 2
min_oil_production = annual_variety_production * variety_info[
'Min Litri per Tonnellata'] / 1000 * min_yield_factor
max_oil_production = annual_variety_production * variety_info[
'Max Litri per Tonnellata'] / 1000 * max_yield_factor
avg_oil_production = annual_variety_production * variety_info[
'Media Litri per Tonnellata'] / 1000 * avg_yield_factor
# Calcola fabbisogno idrico
base_water_need = (
variety_info['Fabbisogno Acqua Primavera (m³/ettaro)'] +
variety_info['Fabbisogno Acqua Estate (m³/ettaro)'] +
variety_info['Fabbisogno Acqua Autunno (m³/ettaro)'] +
variety_info['Fabbisogno Acqua Inverno (m³/ettaro)']
) / 4
monthly_water_need = zone_weather.apply(
lambda row: calculate_water_need(row, base_water_need, variety_info['Temperatura Ottimale']),
axis=1
)
monthly_water_need *= np.random.uniform(0.95, 1.05, len(monthly_water_need))
annual_variety_water_need = monthly_water_need.sum() * percentage * hectares
# Aggiorna totali annuali
annual_production += annual_variety_production
annual_min_oil += min_oil_production
annual_max_oil += max_oil_production
annual_avg_oil += avg_oil_production
annual_water_need += annual_variety_water_need
# Aggiorna dati varietà
clean_variety = clean_column_name(variety)
variety_data[clean_variety].update({
'tech': clean_column_name(technique),
'pct': percentage,
'prod_t_ha': variety_info['Produzione (tonnellate/ettaro)'] * np.random.uniform(0.95, 1.05),
'oil_prod_t_ha': variety_info['Produzione Olio (tonnellate/ettaro)'] * np.random.uniform(0.95, 1.05),
'oil_prod_l_ha': variety_info['Produzione Olio (litri/ettaro)'] * np.random.uniform(0.95, 1.05),
'min_yield_pct': variety_info['Min % Resa'] * min_yield_factor,
'max_yield_pct': variety_info['Max % Resa'] * max_yield_factor,
'min_oil_prod_l_ha': variety_info['Min Produzione Olio (litri/ettaro)'] * min_yield_factor,
'max_oil_prod_l_ha': variety_info['Max Produzione Olio (litri/ettaro)'] * max_yield_factor,
'avg_oil_prod_l_ha': variety_info['Media Produzione Olio (litri/ettaro)'] * avg_yield_factor,
'l_per_t': variety_info['Litri per Tonnellata'] * np.random.uniform(0.98, 1.02),
'min_l_per_t': variety_info['Min Litri per Tonnellata'] * min_yield_factor,
'max_l_per_t': variety_info['Max Litri per Tonnellata'] * max_yield_factor,
'avg_l_per_t': variety_info['Media Litri per Tonnellata'] * avg_yield_factor,
'olive_prod': annual_variety_production,
'min_oil_prod': min_oil_production,
'max_oil_prod': max_oil_production,
'avg_oil_prod': avg_oil_production,
'water_need': annual_variety_water_need
})
# Appiattisci i dati delle varietà
flattened_variety_data = {
f'{variety}_{key}': value
for variety, data in variety_data.items()
for key, value in data.items()
}
# Restituisci il risultato della zona
return {
'year': year,
'zone_id': zone + 1,
'temp_mean': zone_weather['temp_mean'].mean(),
'precip_sum': zone_weather['precip_sum'].sum(),
'solar_energy_sum': zone_weather['solarenergy_sum'].sum(),
'ha': hectares,
'zone': f"zone_{zone + 1}",
'olive_prod': annual_production,
'min_oil_prod': annual_min_oil,
'max_oil_prod': annual_max_oil,
'avg_oil_prod': annual_avg_oil,
'total_water_need': annual_water_need,
**flattened_variety_data
}
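

# --- Usage sketch (illustrative only) -----------------------------------------
# One synthetic monthly row, to show the fields and rough units the two helper
# functions above expect. The numbers and the 'growth_phase' label (normally
# attached upstream of simulate_zone) are assumptions for the example only.
if __name__ == '__main__':
    month_row = pd.Series({
        'temp_mean': 22.0,         # monthly mean temperature, deg C
        'precip_sum': 40.0,        # monthly precipitation total, mm
        'solarenergy_sum': 650.0,  # monthly solar energy total (assumed unit)
        'growth_phase': 'flowering',
    })

    effect = calculate_weather_effect(month_row, optimal_temp=19.0)
    need = calculate_water_need(month_row, base_need=1200.0, optimal_temp=19.0)
    print(f"weather effect: {effect:.2f}, water need: {need:.1f} m3/ha")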


@@ -0,0 +1,220 @@
import pandas as pd
import numpy as np
import joblib
import os
from typing import Dict
def create_technique_mapping(olive_varieties: pd.DataFrame,
mapping_path: str = './kaggle/working/models/technique_mapping.joblib') -> Dict[str, int]:
"""
Crea un mapping numerico per le tecniche di coltivazione.
Parameters
----------
olive_varieties : pd.DataFrame
DataFrame contenente le varietà di olive e le tecniche
mapping_path : str
Percorso dove salvare il mapping
Returns
-------
Dict[str, int]
Dizionario di mapping tecnica -> codice numerico
"""
# Estrai tecniche uniche e convertile in lowercase
all_techniques = olive_varieties['Tecnica di Coltivazione'].str.lower().unique()
# Crea il mapping partendo da 1 (0 è riservato per valori mancanti)
technique_mapping = {tech: i + 1 for i, tech in enumerate(sorted(all_techniques))}
# Salva il mapping
os.makedirs(os.path.dirname(mapping_path), exist_ok=True)
joblib.dump(technique_mapping, mapping_path)
return technique_mapping
def calculate_stress_index(weather_data: pd.DataFrame,
olive_info: pd.Series,
vpd_threshold: float = 2.0) -> float:
"""
Calcola l'indice di stress per le olive basato su condizioni ambientali.
Parameters
----------
weather_data : pd.DataFrame
Dati meteorologici
olive_info : pd.Series
Informazioni sulla varietà di oliva
vpd_threshold : float
Soglia VPD per lo stress
Returns
-------
float
Indice di stress calcolato
"""
# Calcola componenti di stress
temp_stress = np.where(
weather_data['temp'] > olive_info['Temperatura Ottimale'],
(weather_data['temp'] - olive_info['Temperatura Ottimale']) / 10,
0
)
water_stress = np.where(
weather_data['vpd'] > vpd_threshold,
(weather_data['vpd'] - vpd_threshold) / 2,
0
)
# Considera la resistenza alla siccità
resistance_factor = 1.0
if olive_info['Resistenza alla Siccità'] == 'Alta':
resistance_factor = 0.7
elif olive_info['Resistenza alla Siccità'] == 'Media':
resistance_factor = 0.85
# Calcola stress complessivo
total_stress = (temp_stress + water_stress * resistance_factor)
return total_stress.mean()
def calculate_quality_indicators(olive_data: pd.DataFrame,
weather_data: pd.DataFrame) -> pd.DataFrame:
"""
Calcola indicatori di qualità per le olive.
Parameters
----------
olive_data : pd.DataFrame
Dati sulle olive
weather_data : pd.DataFrame
Dati meteorologici
Returns
-------
pd.DataFrame
DataFrame con indicatori di qualità aggiunti
"""
result = olive_data.copy()
# Calcola indicatori base
result['oil_content_index'] = result['Max % Resa'] * (1 - result['stress_index'] * 0.1)
result['fruit_size_index'] = np.clip(
result['Produzione (tonnellate/ettaro)'] * (1 - result['water_stress'] * 0.15),0, None
)
# Calcola indice di maturazione ottimale
optimal_harvest_conditions = (
(weather_data['temp'].between(15, 25)) &
(weather_data['humidity'].between(50, 70)) &
(weather_data['cloudcover'] < 60)
)
result['maturity_index'] = optimal_harvest_conditions.mean()
# Calcola indice di qualità complessivo
result['quality_index'] = (
result['oil_content_index'] * 0.4 +
result['fruit_size_index'] * 0.3 +
result['maturity_index'] * 0.3
)
return result
def add_olive_features(df: pd.DataFrame,
weather_data: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge feature specifiche per le olive.
Parameters
----------
df : pd.DataFrame
DataFrame delle varietà di olive
weather_data : pd.DataFrame
Dati meteorologici
Returns
-------
pd.DataFrame
DataFrame con feature aggiuntive
"""
result = df.copy()
# Calcola stress index per ogni varietà
result['stress_index'] = result.apply(
lambda row: calculate_stress_index(weather_data, row),
axis=1
)
# Aggiungi indicatori di qualità
result = calculate_quality_indicators(result, weather_data)
# Calcola efficienza produttiva
result['production_efficiency'] = result['Produzione (tonnellate/ettaro)'] / \
result['Fabbisogno Idrico Annuale (m³/ettaro)']
# Calcola indice di adattamento climatico
result['climate_adaptation'] = np.where(
result['Resistenza alla Siccità'] == 'Alta',
0.9,
np.where(result['Resistenza alla Siccità'] == 'Media', 0.7, 0.5)
)
# Aggiungi feature di produzione
result = add_production_features(result, weather_data)
return result
def add_production_features(df: pd.DataFrame,
weather_data: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge feature relative alla produzione di olive.
Parameters
----------
df : pd.DataFrame
DataFrame delle varietà di olive
weather_data : pd.DataFrame
Dati meteorologici
Returns
-------
pd.DataFrame
DataFrame con feature di produzione
"""
result = df.copy()
# Calcola i rapporti di produzione
result['oil_yield_ratio'] = result['Produzione Olio (tonnellate/ettaro)'] / result['Produzione (tonnellate/ettaro)']
result['water_efficiency'] = result['Produzione (tonnellate/ettaro)'] / result['Fabbisogno Idrico Annuale (m³/ettaro)']
# Calcola indici di produttività
result['productivity_index'] = (
result['oil_yield_ratio'] * 0.4 +
result['water_efficiency'] * 0.3 +
result['climate_adaptation'] * 0.3
)
# Aggiungi indicatori di rendimento
result['yield_stability'] = 1 - (
(result['Max % Resa'] - result['Min % Resa']) / result['Max % Resa']
)
result['oil_quality_potential'] = (
result['Max Litri per Tonnellata'] / 1000 * result['yield_stability'] * (1 - result['stress_index'] * 0.1)
)
# Calcola intervalli di produzione ottimale
result['optimal_production_lower'] = result['Produzione (tonnellate/ettaro)'] * 0.8
result['optimal_production_upper'] = result['Produzione (tonnellate/ettaro)'] * 1.2
# Aggiungi indici economici
result['economic_efficiency'] = (result['Produzione Olio (litri/ettaro)'] / result['Fabbisogno Idrico Annuale (m³/ettaro)']) * result['productivity_index']
return result
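

# --- Usage sketch (illustrative only) -----------------------------------------
# Builds the technique mapping from a toy varieties table. The Italian column
# name matches the one used throughout the project; the temporary output path
# is an assumption for the example only.
if __name__ == '__main__':
    toy_varieties = pd.DataFrame({
        'Tecnica di Coltivazione': ['Tradizionale', 'Intensiva', 'Superintensiva'],
    })
    mapping = create_technique_mapping(
        toy_varieties, mapping_path='/tmp/technique_mapping.joblib'
    )
    print(mapping)  # {'intensiva': 1, 'superintensiva': 2, 'tradizionale': 3}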


@@ -0,0 +1,205 @@
import pandas as pd
import numpy as np
from typing import Union, Optional
from datetime import datetime
def get_season(date: datetime) -> str:
"""
Determina la stagione in base alla data.
Parameters
----------
date : datetime
Data per cui determinare la stagione
Returns
-------
str
Nome della stagione ('Winter', 'Spring', 'Summer', 'Autumn')
"""
month = date.month
day = date.day
    # Astronomical season boundaries: Dec 21, Mar 20, Jun 21, Sep 23
    if (month == 12 and day >= 21) or month in (1, 2) or (month == 3 and day < 20):
        return 'Winter'
    elif (month == 3 and day >= 20) or month in (4, 5) or (month == 6 and day < 21):
        return 'Spring'
    elif (month == 6 and day >= 21) or month in (7, 8) or (month == 9 and day < 23):
        return 'Summer'
    else:
        return 'Autumn'
def get_time_period(hour: int) -> str:
"""
Determina il periodo del giorno in base all'ora.
Parameters
----------
hour : int
Ora del giorno (0-23)
Returns
-------
str
Periodo del giorno ('Morning', 'Afternoon', 'Evening', 'Night')
"""
if 5 <= hour < 12:
return 'Morning'
elif 12 <= hour < 17:
return 'Afternoon'
elif 17 <= hour < 21:
return 'Evening'
else:
return 'Night'
def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge feature temporali al DataFrame.
Parameters
----------
df : pd.DataFrame
DataFrame contenente una colonna 'datetime'
Returns
-------
pd.DataFrame
DataFrame con feature temporali aggiuntive
"""
# Assicurati che datetime sia nel formato corretto
df['datetime'] = pd.to_datetime(df['datetime'])
# Feature temporali di base
df['timestamp'] = df['datetime'].astype(np.int64) // 10 ** 9
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour
df['minute'] = df['datetime'].dt.minute
# Feature cicliche
df['hour_sin'] = np.sin(df['hour'] * (2 * np.pi / 24))
df['hour_cos'] = np.cos(df['hour'] * (2 * np.pi / 24))
df['month_sin'] = np.sin(df['month'] * (2 * np.pi / 12))
df['month_cos'] = np.cos(df['month'] * (2 * np.pi / 12))
# Feature calendario
df['day_of_week'] = df['datetime'].dt.dayofweek
df['day_of_year'] = df['datetime'].dt.dayofyear
df['week_of_year'] = df['datetime'].dt.isocalendar().week.astype(int)
df['quarter'] = df['datetime'].dt.quarter
# Feature cicliche giorno dell'anno
df['day_of_year_sin'] = np.sin(df['day_of_year'] * (2 * np.pi / 365.25))
df['day_of_year_cos'] = np.cos(df['day_of_year'] * (2 * np.pi / 365.25))
# Flag speciali
df['is_month_end'] = df['datetime'].dt.is_month_end.astype(int)
df['is_quarter_end'] = df['datetime'].dt.is_quarter_end.astype(int)
df['is_year_end'] = df['datetime'].dt.is_year_end.astype(int)
# Periodi del giorno e stagioni
df['season'] = df['datetime'].apply(get_season)
df['time_period'] = df['hour'].apply(get_time_period)
return df
def create_time_based_features(
df: pd.DataFrame,
datetime_col: str = 'datetime',
add_cyclical: bool = True,
add_time_periods: bool = True,
add_seasons: bool = True,
custom_features: Optional[list] = None
) -> pd.DataFrame:
"""
Crea feature temporali personalizzate.
Parameters
----------
df : pd.DataFrame
DataFrame di input
datetime_col : str
Nome della colonna datetime
add_cyclical : bool
Se True, aggiunge feature cicliche
add_time_periods : bool
Se True, aggiunge periodi del giorno
add_seasons : bool
Se True, aggiunge stagioni
custom_features : list, optional
Lista di feature temporali personalizzate da aggiungere
Returns
-------
pd.DataFrame
DataFrame con le nuove feature temporali
"""
# Crea una copia del DataFrame
result = df.copy()
# Converti la colonna datetime se necessario
if not pd.api.types.is_datetime64_any_dtype(result[datetime_col]):
result[datetime_col] = pd.to_datetime(result[datetime_col])
# Feature temporali di base
result['year'] = result[datetime_col].dt.year
result['month'] = result[datetime_col].dt.month
result['day'] = result[datetime_col].dt.day
result['hour'] = result[datetime_col].dt.hour
result['day_of_week'] = result[datetime_col].dt.dayofweek
result['day_of_year'] = result[datetime_col].dt.dayofyear
# Feature cicliche
if add_cyclical:
# Ora
result['hour_sin'] = np.sin(result['hour'] * (2 * np.pi / 24))
result['hour_cos'] = np.cos(result['hour'] * (2 * np.pi / 24))
# Mese
result['month_sin'] = np.sin((result['month'] - 1) * (2 * np.pi / 12))
result['month_cos'] = np.cos((result['month'] - 1) * (2 * np.pi / 12))
# Giorno dell'anno
result['day_of_year_sin'] = np.sin((result['day_of_year'] - 1) * (2 * np.pi / 365.25))
result['day_of_year_cos'] = np.cos((result['day_of_year'] - 1) * (2 * np.pi / 365.25))
# Giorno della settimana
result['day_of_week_sin'] = np.sin(result['day_of_week'] * (2 * np.pi / 7))
result['day_of_week_cos'] = np.cos(result['day_of_week'] * (2 * np.pi / 7))
# Periodi del giorno
if add_time_periods:
result['time_period'] = result['hour'].apply(get_time_period)
# One-hot encoding del periodo del giorno
time_period_dummies = pd.get_dummies(result['time_period'], prefix='time_period')
result = pd.concat([result, time_period_dummies], axis=1)
# Stagioni
if add_seasons:
result['season'] = result[datetime_col].apply(get_season)
# One-hot encoding delle stagioni
season_dummies = pd.get_dummies(result['season'], prefix='season')
result = pd.concat([result, season_dummies], axis=1)
# Feature personalizzate
if custom_features:
for feature in custom_features:
if feature == 'is_weekend':
result['is_weekend'] = result['day_of_week'].isin([5, 6]).astype(int)
elif feature == 'is_business_hour':
result['is_business_hour'] = ((result['hour'] >= 9) &
(result['hour'] < 18) &
~result['is_weekend']).astype(int)
elif feature == 'season_progress':
result['season_progress'] = result.apply(
lambda x: (x['day_of_year'] % 91) / 91.0, axis=1
)
return result
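

# --- Usage sketch (illustrative only) -----------------------------------------
# Two days of hourly timestamps run through the configurable builder; the two
# custom feature names are the ones handled explicitly above.
if __name__ == '__main__':
    demo = pd.DataFrame({'datetime': pd.date_range('2023-06-20', periods=48, freq='h')})
    demo = create_time_based_features(
        demo,
        add_cyclical=True,
        add_seasons=True,
        custom_features=['is_weekend', 'is_business_hour'],
    )
    print(demo[['datetime', 'hour_sin', 'season', 'is_weekend']].head())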


@@ -0,0 +1,186 @@
import pandas as pd
import numpy as np
from typing import Union
def calculate_vpd(temp: Union[float, np.ndarray], humidity: Union[float, np.ndarray]) -> Union[float, np.ndarray]:
"""
Calcola il Deficit di Pressione di Vapore (VPD).
VPD è una misura della domanda evaporativa dell'aria.
Parameters
----------
temp : float or np.ndarray
Temperatura in Celsius
humidity : float or np.ndarray
Umidità relativa (0-100)
Returns
-------
float or np.ndarray
VPD in kPa
"""
# Pressione di vapore saturo (kPa)
es = 0.6108 * np.exp((17.27 * temp) / (temp + 237.3))
# Pressione di vapore attuale (kPa)
ea = es * (humidity / 100.0)
# VPD (kPa)
vpd = es - ea
    return np.maximum(vpd, 0)  # VPD cannot be negative
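# Worked example (sketch): at 30 deg C the saturation vapour pressure is
#   es = 0.6108 * exp(17.27 * 30 / (30 + 237.3)) ≈ 4.24 kPa,
# so at 50% relative humidity ea ≈ 2.12 kPa and calculate_vpd(30.0, 50.0) ≈ 2.12 kPa.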
def add_solar_features(df: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge feature relative alla radiazione solare.
Parameters
----------
df : pd.DataFrame
DataFrame di input
Returns
-------
pd.DataFrame
DataFrame con feature solari aggiunte
"""
# Calcola angolo solare
df['solar_angle'] = np.sin(df['day_of_year'] * (2 * np.pi / 365.25)) * \
np.sin(df['hour'] * (2 * np.pi / 24))
# Interazioni tra feature rilevanti
df['cloud_temp_interaction'] = df['cloudcover'] * df['temp']
df['visibility_cloud_interaction'] = df['visibility'] * (100 - df['cloudcover'])
# Feature derivate
df['clear_sky_index'] = (100 - df['cloudcover']) / 100
df['temp_gradient'] = df['temp'] - df['tempmin']
# Feature di efficienza solare
df['solar_efficiency'] = df['solarenergy'] / (df['solarradiation'] + 1e-6) # evita divisione per zero
df['solar_temp_ratio'] = df['solarradiation'] / (df['temp'] + 273.15) # temperatura in Kelvin
return df
def add_solar_specific_features(df: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge feature specifiche per l'analisi solare.
Parameters
----------
df : pd.DataFrame
DataFrame di input
Returns
-------
pd.DataFrame
DataFrame con feature solari specifiche aggiunte
"""
# Angolo solare e durata del giorno
df['day_length'] = 12 + 3 * np.sin(2 * np.pi * (df['day_of_year'] - 81) / 365.25)
df['solar_noon'] = 12 - df['hour']
df['solar_elevation'] = np.sin(2 * np.pi * df['day_of_year'] / 365.25) * \
np.cos(2 * np.pi * df['solar_noon'] / 24)
# Interazioni
df['cloud_elevation'] = df['cloudcover'] * df['solar_elevation']
df['visibility_elevation'] = df['visibility'] * df['solar_elevation']
# Rolling features
df['cloud_rolling_12h'] = df['cloudcover'].rolling(window=12, min_periods=1).mean()
df['temp_rolling_12h'] = df['temp'].rolling(window=12, min_periods=1).mean()
# Feature di efficienza energetica
df['solar_energy_density'] = df['solarenergy'] / df['day_length']
df['cloud_impact'] = df['solarradiation'] * (1 - df['cloudcover'] / 100)
return df
def add_environmental_features(df: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge feature ambientali derivate.
Parameters
----------
df : pd.DataFrame
DataFrame di input
Returns
-------
pd.DataFrame
DataFrame con feature ambientali aggiunte
"""
# Calcola VPD
df['vpd'] = calculate_vpd(df['temp'], df['humidity'])
# Feature di stress idrico
df['water_stress_index'] = df['vpd'] * (1 - df['humidity'] / 100)
df['dryness_index'] = (df['temp'] - df['dew']) * (100 - df['humidity']) / 100
# Indici di comfort
df['heat_index'] = np.where(
df['temp'] >= 27,
-42.379 + 2.04901523 * df['temp'] + 10.14333127 * df['humidity'] -
0.22475541 * df['temp'] * df['humidity'] - 0.00683783 * df['temp'] ** 2 -
0.05481717 * df['humidity'] ** 2 + 0.00122874 * df['temp'] ** 2 * df['humidity'] +
0.00085282 * df['temp'] * df['humidity'] ** 2 -
0.00000199 * df['temp'] ** 2 * df['humidity'] ** 2,
df['temp']
)
# Rolling means per trend
windows = [3, 6, 12, 24] # ore
for window in windows:
df[f'temp_rolling_mean_{window}h'] = df['temp'].rolling(window=window, min_periods=1).mean()
df[f'humid_rolling_mean_{window}h'] = df['humidity'].rolling(window=window, min_periods=1).mean()
df[f'precip_rolling_sum_{window}h'] = df['precip'].rolling(window=window, min_periods=1).sum()
return df
def add_weather_indicators(df: pd.DataFrame) -> pd.DataFrame:
"""
Aggiunge indicatori meteorologici complessi.
Parameters
----------
df : pd.DataFrame
DataFrame di input
Returns
-------
pd.DataFrame
DataFrame con indicatori meteorologici aggiunti
"""
# Atmospheric stability: rolling 12-hour standard deviation of temperature
df['temp_stability'] = df['temp'].rolling(window=12, min_periods=1).std().fillna(0)
df['pressure_tendency'] = df['pressure'].diff()
# Indicatori di precipitazioni
df['rain_intensity'] = np.where(
df['precip'] > 0,
df['precip'] / (df['precip_rolling_sum_24h'] + 1e-6),
0
)
df['dry_spell'] = (df['precip'] == 0).astype(int).groupby(
(df['precip'] != 0).cumsum()
).cumsum()
# Indicatori di comfort termico
df['apparent_temp'] = df['temp'] + 0.33 * df['vpd'] - 0.7 * df['windspeed'] - 4.0
df['frost_risk'] = (df['temp'] < 2).astype(int)
df['heat_stress'] = (df['temp'] > 30).astype(int) * (df['humidity'] > 70).astype(int)
# Indicatori di qualità dell'aria
df['stagnation_index'] = (df['windspeed'] < 5).astype(int) * (df['cloudcover'] > 80).astype(int)
df['visibility_index'] = df['visibility'] * (1 - df['cloudcover'] / 100)
# Indicatori agrometeorologici
df['growing_degree_days'] = np.maximum(0, df['temp'] - 10) # base 10°C
df['chill_hours'] = (df['temp'] < 7).astype(int)
df['evapotranspiration_proxy'] = df['vpd'] * df['solarradiation'] * (1 + 0.536 * df['windspeed'])
return df
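# Minimal usage sketch (assumption: the hourly weather frame already carries the raw
# columns referenced above — temp, tempmin, dew, humidity, precip, pressure, windspeed,
# cloudcover, visibility, solarradiation, solarenergy — plus 'hour' and 'day_of_year'
# from the temporal-feature step; the variable name is illustrative):
#
#     hourly = add_environmental_features(hourly)   # adds vpd and the rolling means/sums
#     hourly = add_solar_features(hourly)
#     hourly = add_solar_specific_features(hourly)
#     hourly = add_weather_indicators(hourly)        # relies on columns created above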

207
src/models/callbacks.py Normal file
View File

@ -0,0 +1,207 @@
import tensorflow as tf
import numpy as np
from typing import Dict, Optional, List
import os
import json
from datetime import datetime
@tf.keras.saving.register_keras_serializable()
class CustomCallback(tf.keras.callbacks.Callback):
"""
Callback personalizzato per monitorare la non-negatività delle predizioni
e altre metriche durante il training.
"""
def __init__(self, validation_data: Optional[tuple] = None):
super().__init__()
self.validation_data = validation_data
def on_epoch_end(self, epoch: int, logs: Optional[Dict] = None):
try:
if self.validation_data is not None:
val_x = self.validation_data[0]
# predict() accepts both a single array and a list of inputs (radiation model)
val_pred = self.model.predict(val_x, verbose=0)
# Verifica non-negatività
if np.any(val_pred < 0):
print("\nWarning: Rilevati valori negativi nelle predizioni")
print(f"Min value: {np.min(val_pred)}")
# Statistiche predizioni
print(f"\nStatistiche predizioni epoca {epoch}:")
print(f"Min: {np.min(val_pred):.4f}")
print(f"Max: {np.max(val_pred):.4f}")
print(f"Media: {np.mean(val_pred):.4f}")
# Aggiunge le metriche ai logs
if logs is not None:
logs['val_pred_min'] = np.min(val_pred)
logs['val_pred_max'] = np.max(val_pred)
logs['val_pred_mean'] = np.mean(val_pred)
except Exception as e:
print(f"\nWarning nel CustomCallback: {str(e)}")
@tf.keras.saving.register_keras_serializable()
class WarmUpLearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
"""
Schedule del learning rate con warm-up lineare e decay esponenziale.
"""
def __init__(self, initial_learning_rate: float = 1e-3,
warmup_steps: int = 500,
decay_steps: int = 5000):
super().__init__()
self.initial_learning_rate = initial_learning_rate
self.warmup_steps = warmup_steps
self.decay_steps = decay_steps
def __call__(self, step):
warmup_pct = tf.cast(step, tf.float32) / self.warmup_steps
warmup_lr = self.initial_learning_rate * warmup_pct
decay_factor = tf.pow(0.1, tf.cast(step, tf.float32) / self.decay_steps)
decayed_lr = self.initial_learning_rate * decay_factor
return tf.where(step < self.warmup_steps, warmup_lr, decayed_lr)
def get_config(self):
return {
'initial_learning_rate': self.initial_learning_rate,
'warmup_steps': self.warmup_steps,
'decay_steps': self.decay_steps
}
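# Usage sketch: the schedule plugs directly into a Keras optimizer. During warm-up the
# rate grows linearly (e.g. at step 100 of 500 it is 1e-3 * 100/500 = 2e-4), then decays
# exponentially with base 0.1 every `decay_steps` steps.
#     lr_schedule = WarmUpLearningRateSchedule(initial_learning_rate=1e-3, warmup_steps=500, decay_steps=5000)
#     optimizer = tf.keras.optimizers.AdamW(learning_rate=lr_schedule, weight_decay=0.01)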
class MetricLogger(tf.keras.callbacks.Callback):
"""
Logger avanzato per metriche di training che salva i risultati in JSON
e crea grafici di progresso.
"""
def __init__(self, log_dir: str = './logs',
metric_list: Optional[List[str]] = None,
save_freq: int = 1):
super().__init__()
self.log_dir = log_dir
os.makedirs(log_dir, exist_ok=True)
self.metric_list = metric_list or ['loss', 'val_loss', 'mae', 'val_mae']
self.save_freq = save_freq
self.history = {metric: [] for metric in self.metric_list}
# Timestamp per il nome del file
self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
self.log_file = os.path.join(log_dir, f'metrics_{self.timestamp}.json')
def on_epoch_end(self, epoch: int, logs: Dict = None):
# Aggiorna lo storico
for metric in self.metric_list:
if metric in logs:
self.history[metric].append(float(logs[metric]))
# Salva i log periodicamente
if (epoch + 1) % self.save_freq == 0:
self._save_logs()
self._create_plots()
def _save_logs(self):
"""Salva i log in formato JSON."""
with open(self.log_file, 'w') as f:
json.dump({
'history': self.history,
'epochs': len(next(iter(self.history.values())))
}, f, indent=4)
def _create_plots(self):
"""Crea grafici delle metriche."""
import matplotlib.pyplot as plt
# Plot per ogni metrica
for metric in self.metric_list:
if metric in self.history and len(self.history[metric]) > 0:
plt.figure(figsize=(10, 6))
plt.plot(self.history[metric])
plt.title(f'Model {metric}')
plt.ylabel(metric)
plt.xlabel('Epoch')
plt.savefig(os.path.join(self.log_dir, f'{metric}_{self.timestamp}.png'))
plt.close()
class EarlyStoppingWithBest(tf.keras.callbacks.EarlyStopping):
"""
Early stopping avanzato che salva il miglior modello e fornisce
analisi dettagliate sulla convergenza.
"""
def __init__(self,
monitor: str = 'val_loss',
min_delta: float = 0,
patience: int = 0,
verbose: int = 0,
mode: str = 'auto',
baseline: Optional[float] = None,
restore_best_weights: bool = True,
start_from_epoch: int = 0):
super().__init__(
monitor=monitor,
min_delta=min_delta,
patience=patience,
verbose=verbose,
mode=mode,
baseline=baseline,
restore_best_weights=restore_best_weights,
start_from_epoch=start_from_epoch
)
self.best_epoch = 0
self.convergence_history = []
def on_epoch_end(self, epoch: int, logs: Optional[Dict] = None):
current = self.get_monitor_value(logs)
if current is None:
return
# Aggiungi il valore corrente alla storia
self.convergence_history.append(float(current))
# Calcola statistiche di convergenza
if len(self.convergence_history) > 1:
improvement = self.convergence_history[-2] - current
pct_improvement = (improvement / self.convergence_history[-2]) * 100
if self.verbose > 0:
print(f"\nEpoch {epoch + 1}: {self.monitor} improved by {pct_improvement:.2f}%")
# Update the best epoch and snapshot the best weights so restore_best_weights can work
if self.monitor_op(current - self.min_delta, self.best):
self.best = current
self.best_epoch = epoch
self.wait = 0
if self.restore_best_weights:
self.best_weights = self.model.get_weights()
else:
self.wait += 1
if self.wait >= self.patience:
self.stopped_epoch = epoch
self.model.stop_training = True
if self.restore_best_weights and self.best_weights is not None:
if self.verbose > 0:
print(f"\nRestoring model weights from epoch {self.best_epoch + 1}")
self.model.set_weights(self.best_weights)
def get_convergence_stats(self) -> Dict:
"""
Restituisce statistiche dettagliate sulla convergenza.
"""
if len(self.convergence_history) < 2:
return {}
improvements = np.diff(self.convergence_history)
return {
'best_epoch': self.best_epoch + 1,
'best_value': float(self.best),
'avg_improvement': float(np.mean(improvements)),
'total_improvement': float(self.convergence_history[0] - self.best),
'convergence_rate': float(np.mean(np.abs(improvements[1:] / improvements[:-1]))),
'final_value': float(self.convergence_history[-1])
}
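# Usage sketch (model and data names are hypothetical): combining the callbacks defined
# above in a standard Keras training run.
#
#     callbacks = [
#         MetricLogger(log_dir='./logs', save_freq=5),
#         EarlyStoppingWithBest(monitor='val_loss', patience=15, verbose=1, restore_best_weights=True),
#         CustomCallback(validation_data=(x_val, y_val)),
#     ]
#     model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=100, callbacks=callbacks)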

327
src/models/layers.py Normal file
View File

@ -0,0 +1,327 @@
import tensorflow as tf
from tensorflow.keras import layers
from typing import List, Optional
@tf.keras.saving.register_keras_serializable()
class MultiScaleAttention(layers.Layer):
"""
Layer di attenzione multi-scala per catturare pattern temporali a diverse granularità.
Attributes
----------
num_heads : int
Numero di teste di attenzione
head_dim : int
Dimensionalità per ogni testa
scales : List[int]
Lista delle scale temporali da considerare
"""
def __init__(
self,
num_heads: int = 8,
head_dim: int = 64,
scales: List[int] = [1, 2, 4],
dropout: float = 0.1,
**kwargs
):
super().__init__(**kwargs)
self.num_heads = num_heads
self.head_dim = head_dim
self.scales = scales
self.dropout = dropout
# Creiamo un'attention layer per ogni scala
self.attention_layers = [
layers.MultiHeadAttention(
num_heads=num_heads,
key_dim=head_dim,
dropout=dropout,
name=f'attention_scale_{scale}'
) for scale in scales
]
# Layer per combinare le diverse scale
self.combine = layers.Dense(
head_dim * num_heads,
activation='gelu',
name='scale_combination'
)
def call(self, inputs: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
# Lista per salvare gli output delle diverse scale
scale_outputs = []
for scale, attention in zip(self.scales, self.attention_layers):
# Applica max pooling per ridurre la sequenza alla scala corrente
if scale > 1:
pooled = tf.keras.layers.MaxPool1D(
pool_size=scale,
strides=scale
)(inputs)
else:
pooled = inputs
# Applica attenzione alla sequenza ridotta
attended = attention(pooled, pooled)
# Se necessario, riporta alla dimensione originale
if scale > 1:
attended = tf.keras.layers.UpSampling1D(size=scale)(attended)
# Taglia eventuali timestep in eccesso
attended = attended[:, :tf.shape(inputs)[1], :]
scale_outputs.append(attended)
# Concatena e combina gli output delle diverse scale
concatenated = tf.concat(scale_outputs, axis=-1)
output = self.combine(concatenated)
return output
def get_config(self) -> dict:
config = super().get_config()
config.update({
"num_heads": self.num_heads,
"head_dim": self.head_dim,
"scales": self.scales,
"dropout": self.dropout
})
return config
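# Shape sketch (assumes the sequence length is divisible by every scale, since the
# pooled branches are upsampled back to the input length before concatenation):
#     x = tf.random.normal((2, 24, 16))
#     y = MultiScaleAttention(num_heads=4, head_dim=16, scales=[1, 2, 4])(x)
#     # y.shape == (2, 24, 64), i.e. (batch, timesteps, num_heads * head_dim)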
@tf.keras.saving.register_keras_serializable()
class TemporalConvBlock(layers.Layer):
"""
Blocco di convoluzione temporale con residual connection.
Attributes
----------
filters : int
Numero di filtri convoluzionali
kernel_sizes : List[int]
Lista delle dimensioni dei kernel da utilizzare
dilation_rates : List[int]
Lista dei tassi di dilatazione
"""
def __init__(
self,
filters: int = 64,
kernel_sizes: List[int] = [3, 5, 7],
dilation_rates: List[int] = [1, 2, 4],
dropout: float = 0.1,
**kwargs
):
super().__init__(**kwargs)
self.filters = filters
self.kernel_sizes = kernel_sizes
self.dilation_rates = dilation_rates
self.dropout = dropout
# Crea i layer convoluzionali
self.conv_layers = []
for k_size in kernel_sizes:
for d_rate in dilation_rates:
self.conv_layers.append(
layers.Conv1D(
filters=filters // (len(kernel_sizes) * len(dilation_rates)),
kernel_size=k_size,
dilation_rate=d_rate,
padding='same',
activation='gelu'
)
)
# Layer per il processing finale
self.combine = layers.Conv1D(filters, 1)
self.layernorm = layers.LayerNormalization()
self.dropout_layer = layers.Dropout(dropout)  # keep self.dropout as the float rate for get_config()
def call(self, inputs: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
# Lista per gli output di ogni convoluzione
conv_outputs = []
# Applica ogni combinazione di kernel size e dilation rate
for conv in self.conv_layers:
conv_outputs.append(conv(inputs))
# Concatena tutti gli output
concatenated = tf.concat(conv_outputs, axis=-1)
# Combinazione finale
x = self.combine(concatenated)
x = self.layernorm(x)
x = self.dropout_layer(x, training=training)
# Residual connection
return x + inputs
def get_config(self) -> dict:
config = super().get_config()
config.update({
"filters": self.filters,
"kernel_sizes": self.kernel_sizes,
"dilation_rates": self.dilation_rates,
"dropout": self.dropout
})
return config
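# Shape sketch: the residual connection requires the input channel dimension to equal
# `filters`, so the block is shape-preserving:
#     x = tf.random.normal((2, 24, 64))
#     y = TemporalConvBlock(filters=64)(x)
#     # y.shape == (2, 24, 64)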
@tf.keras.saving.register_keras_serializable()
class WeatherEmbedding(layers.Layer):
"""
Layer per l'embedding di feature meteorologiche.
Combina embedding categorici e numerici.
Attributes
----------
embedding_dim : int
Dimensionalità dell'embedding
num_numerical : int
Numero di feature numeriche
categorical_features : dict
Dizionario con feature categoriche e loro cardinalità
"""
def __init__(
self,
embedding_dim: int = 32,
num_numerical: int = 8,
categorical_features: Optional[dict] = None,
**kwargs
):
super().__init__(**kwargs)
self.embedding_dim = embedding_dim
self.num_numerical = num_numerical
self.categorical_features = categorical_features or {
'season': 4,
'time_period': 4,
'weather_condition': 10
}
# Layer per feature numeriche
self.numerical_projection = layers.Dense(
embedding_dim,
activation='gelu'
)
# Layer per feature categoriche
self.categorical_embeddings = {
name: layers.Embedding(
input_dim=num_categories,
output_dim=embedding_dim
)
for name, num_categories in self.categorical_features.items()
}
# Layer di combinazione finale
self.combine = layers.Dense(embedding_dim, activation='gelu')
def call(self, inputs: dict) -> tf.Tensor:
# Processa feature numeriche
numerical = self.numerical_projection(inputs['numerical'])
# Lista per gli embedding categorici
categorical_outputs = []
# Processa ogni feature categorica
for name, embedding_layer in self.categorical_embeddings.items():
if name in inputs['categorical']:
embedded = embedding_layer(inputs['categorical'][name])
categorical_outputs.append(embedded)
# Combina tutti gli embedding
if categorical_outputs:
categorical = tf.reduce_mean(tf.stack(categorical_outputs, axis=1), axis=1)
combined = tf.concat([numerical, categorical], axis=-1)
else:
combined = numerical
return self.combine(combined)
def get_config(self) -> dict:
config = super().get_config()
config.update({
"embedding_dim": self.embedding_dim,
"num_numerical": self.num_numerical,
"categorical_features": self.categorical_features
})
return config
@tf.keras.saving.register_keras_serializable()
class OliveVarietyEmbedding(layers.Layer):
"""
Layer per l'embedding delle varietà di olive e delle loro caratteristiche.
Attributes
----------
embedding_dim : int
Dimensionalità dell'embedding
num_varieties : int
Numero di varietà di olive
num_techniques : int
Numero di tecniche di coltivazione
"""
def __init__(
self,
embedding_dim: int = 32,
num_varieties: int = 11,
num_techniques: int = 3,
**kwargs
):
super().__init__(**kwargs)
self.embedding_dim = embedding_dim
self.num_varieties = num_varieties
self.num_techniques = num_techniques
# Embedding per varietà e tecniche
self.variety_embedding = layers.Embedding(
input_dim=num_varieties,
output_dim=embedding_dim
)
self.technique_embedding = layers.Embedding(
input_dim=num_techniques,
output_dim=embedding_dim
)
# Layer per feature continue
self.continuous_projection = layers.Dense(
embedding_dim,
activation='gelu'
)
# Layer di combinazione
self.combine = layers.Dense(embedding_dim, activation='gelu')
def call(self, inputs: dict) -> tf.Tensor:
# Embedding varietà
variety_embedded = self.variety_embedding(inputs['variety'])
# Embedding tecniche
technique_embedded = self.technique_embedding(inputs['technique'])
# Proiezione feature continue
continuous_projected = self.continuous_projection(inputs['continuous'])
# Combinazione
combined = tf.concat([
variety_embedded,
technique_embedded,
continuous_projected
], axis=-1)
return self.combine(combined)
def get_config(self) -> dict:
config = super().get_config()
config.update({
"embedding_dim": self.embedding_dim,
"num_varieties": self.num_varieties,
"num_techniques": self.num_techniques
})
return config
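# Usage sketch (inputs are hypothetical): embedding two orchard entries, each with a
# variety id, a cultivation-technique id and a vector of 6 continuous agronomic features.
#     layer = OliveVarietyEmbedding(embedding_dim=32, num_varieties=11, num_techniques=3)
#     out = layer({
#         'variety': tf.constant([0, 5]),
#         'technique': tf.constant([1, 2]),
#         'continuous': tf.random.normal((2, 6)),
#     })
#     # out.shape == (2, 32)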

204
src/models/solar_models.py Normal file
View File

@ -0,0 +1,204 @@
import tensorflow as tf
from tensorflow.keras import layers
def create_radiation_model(input_shape, solar_params_shape=(3,)):
"""
Modello per la radiazione solare con vincoli di non-negatività.
"""
# Input layers
main_input = layers.Input(shape=input_shape, name='main_input')
solar_input = layers.Input(shape=solar_params_shape, name='solar_params')
# Branch CNN
x1 = layers.Conv1D(32, 3, padding='same')(main_input)
x1 = layers.BatchNormalization()(x1)
x1 = layers.Activation('relu')(x1)
x1 = layers.Conv1D(64, 3, padding='same')(x1)
x1 = layers.BatchNormalization()(x1)
x1 = layers.Activation('relu')(x1)
x1 = layers.GlobalAveragePooling1D()(x1)
# Branch LSTM
x2 = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(main_input)
x2 = layers.Bidirectional(layers.LSTM(32))(x2)
x2 = layers.BatchNormalization()(x2)
# Solar parameters processing
x3 = layers.Dense(32)(solar_input)
x3 = layers.BatchNormalization()(x3)
x3 = layers.Activation('relu')(x3)
# Combine all branches
x = layers.concatenate([x1, x2, x3])
# Dense layers with non-negativity constraints
x = layers.Dense(64, kernel_constraint=tf.keras.constraints.NonNeg())(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(32, kernel_constraint=tf.keras.constraints.NonNeg())(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)
# Output layer con vincoli di non-negatività
output = layers.Dense(1,
kernel_constraint=tf.keras.constraints.NonNeg(),
activation='relu')(x)
model = tf.keras.Model(inputs=[main_input, solar_input], outputs=output, name="SolarRadiation")
return model
def create_energy_model(input_shape):
"""
Modello migliorato per l'energia solare che sfrutta la relazione con la radiazione.
Include vincoli di non-negatività e migliore gestione delle dipendenze temporali.
"""
inputs = layers.Input(shape=input_shape)
# Branch 1: Elaborazione temporale con attention
# Multi-head attention per catturare relazioni temporali
x1 = layers.MultiHeadAttention(num_heads=8, key_dim=32)(inputs, inputs)
x1 = layers.BatchNormalization()(x1)
x1 = layers.Activation('relu')(x1)
# Temporal Convolution branch per catturare pattern locali
x2 = layers.Conv1D(
filters=64,
kernel_size=3,
padding='same',
kernel_constraint=tf.keras.constraints.NonNeg()
)(inputs)
x2 = layers.BatchNormalization()(x2)
x2 = layers.Activation('relu')(x2)
x2 = layers.Conv1D(
filters=32,
kernel_size=3,
padding='same',
kernel_constraint=tf.keras.constraints.NonNeg()
)(x2)
x2 = layers.BatchNormalization()(x2)
x2 = layers.Activation('relu')(x2)
# LSTM branch per memoria a lungo termine
x3 = layers.LSTM(64, return_sequences=True)(inputs)
x3 = layers.LSTM(32, return_sequences=False)(x3)
x3 = layers.BatchNormalization()(x3)
x3 = layers.Activation('relu')(x3)
# Global pooling per ogni branch
x1 = layers.GlobalAveragePooling1D()(x1)
x2 = layers.GlobalAveragePooling1D()(x2)
# Concatena tutti i branch
x = layers.concatenate([x1, x2, x3])
# Dense layers con vincoli di non-negatività
x = layers.Dense(
128,
kernel_constraint=tf.keras.constraints.NonNeg(),
kernel_regularizer=tf.keras.regularizers.l2(0.01)
)(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)
x = layers.Dropout(0.3)(x)
x = layers.Dense(
64,
kernel_constraint=tf.keras.constraints.NonNeg(),
kernel_regularizer=tf.keras.regularizers.l2(0.01)
)(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)
x = layers.Dropout(0.2)(x)
# Output layer con vincolo di non-negatività
output = layers.Dense(
1,
kernel_constraint=tf.keras.constraints.NonNeg(),
activation='relu', # Garantisce output non negativo
kernel_regularizer=tf.keras.regularizers.l2(0.01)
)(x)
model = tf.keras.Model(inputs=inputs, outputs=output, name="SolarEnergy")
return model
def create_uv_model(input_shape):
"""
Modello migliorato per l'indice UV che sfrutta sia radiazione che energia solare.
Include vincoli di non-negatività e considera le relazioni non lineari tra le variabili.
"""
inputs = layers.Input(shape=input_shape)
# CNN branch per pattern locali
x1 = layers.Conv1D(
filters=64,
kernel_size=3,
padding='same',
kernel_constraint=tf.keras.constraints.NonNeg()
)(inputs)
x1 = layers.BatchNormalization()(x1)
x1 = layers.Activation('relu')(x1)
x1 = layers.MaxPooling1D(pool_size=2)(x1)
x1 = layers.Conv1D(
filters=32,
kernel_size=3,
padding='same',
kernel_constraint=tf.keras.constraints.NonNeg()
)(x1)
x1 = layers.BatchNormalization()(x1)
x1 = layers.Activation('relu')(x1)
x1 = layers.GlobalAveragePooling1D()(x1)
# Attention branch per relazioni complesse
# Specialmente utile per le relazioni con radiazione ed energia
x2 = layers.MultiHeadAttention(num_heads=4, key_dim=32)(inputs, inputs)
x2 = layers.BatchNormalization()(x2)
x2 = layers.Activation('relu')(x2)
x2 = layers.GlobalAveragePooling1D()(x2)
# Dense branch per le feature più recenti
x3 = layers.GlobalAveragePooling1D()(inputs)
x3 = layers.Dense(
64,
kernel_constraint=tf.keras.constraints.NonNeg(),
kernel_regularizer=tf.keras.regularizers.l2(0.01)
)(x3)
x3 = layers.BatchNormalization()(x3)
x3 = layers.Activation('relu')(x3)
# Fusion dei branch
x = layers.concatenate([x1, x2, x3])
# Dense layers con vincoli di non-negatività
x = layers.Dense(
128,
kernel_constraint=tf.keras.constraints.NonNeg(),
kernel_regularizer=tf.keras.regularizers.l2(0.01)
)(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)
x = layers.Dropout(0.3)(x)
x = layers.Dense(
64,
kernel_constraint=tf.keras.constraints.NonNeg(),
kernel_regularizer=tf.keras.regularizers.l2(0.01)
)(x)
x = layers.BatchNormalization()(x)
x = layers.Activation('relu')(x)
x = layers.Dropout(0.2)(x)
# Output layer con vincolo di non-negatività
output = layers.Dense(
1,
kernel_constraint=tf.keras.constraints.NonNeg(),
activation='relu', # Garantisce output non negativo
kernel_regularizer=tf.keras.regularizers.l2(0.01)
)(x)
model = tf.keras.Model(inputs=inputs, outputs=output, name="SolarUV")
return model
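# Usage sketch: building the three models for a 24-step window with 10 weather features
# (window length and feature count are illustrative, not fixed by this module).
#     radiation_model = create_radiation_model(input_shape=(24, 10), solar_params_shape=(3,))
#     energy_model = create_energy_model(input_shape=(24, 10))
#     uv_model = create_uv_model(input_shape=(24, 10))
#     radiation_model.summary()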

385
src/models/training.py Normal file
View File

@ -0,0 +1,385 @@
import tensorflow as tf
import numpy as np
from typing import Dict, Tuple, List
import os
import keras
from src.models.transformer import create_olive_oil_transformer
from src.models.callbacks import CustomCallback, WarmUpLearningRateSchedule
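# NOTE: retrain_model() below also calls evaluate_model_performance(); that helper is
# assumed to live in the project's evaluation utilities and must be imported (or defined)
# before retraining is run.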
def compile_model(model: tf.keras.Model, learning_rate: float = 1e-3) -> tf.keras.Model:
"""
Compila il modello con le impostazioni ottimizzate.
Parameters
----------
model : tf.keras.Model
Modello da compilare
learning_rate : float
Learning rate iniziale
Returns
-------
tf.keras.Model
Modello compilato
"""
lr_schedule = WarmUpLearningRateSchedule(
initial_learning_rate=learning_rate,
warmup_steps=500,
decay_steps=5000
)
model.compile(
optimizer=tf.keras.optimizers.AdamW(
learning_rate=lr_schedule,
weight_decay=0.01
),
loss=tf.keras.losses.Huber(),
metrics=['mae']
)
return model
def create_callbacks(target_names: List[str],
val_data: Dict,
val_targets: np.ndarray) -> List[tf.keras.callbacks.Callback]:
"""
Crea i callbacks per il training del modello.
Parameters
----------
target_names : list
Lista dei nomi dei target
val_data : dict
Dati di validazione
val_targets : np.ndarray
Target di validazione
Returns
-------
list
Lista dei callbacks configurati
"""
class TargetSpecificMetric(tf.keras.callbacks.Callback):
def __init__(self, validation_data, target_names):
super().__init__()
self.validation_data = validation_data
self.target_names = target_names
def on_epoch_end(self, epoch, logs=None):
logs = logs if logs is not None else {}
x_val, y_val = self.validation_data
y_pred = self.model.predict(x_val, verbose=0)
for i, name in enumerate(self.target_names):
mae = np.mean(np.abs(y_val[:, i] - y_pred[:, i]))
logs[f'val_{name}_mae'] = mae
# Crea le cartelle per i checkpoint e i log
os.makedirs('./kaggle/working/models/oil_transformer/checkpoints', exist_ok=True)
os.makedirs('./kaggle/working/models/oil_transformer/logs', exist_ok=True)
callbacks = [
# Early Stopping
tf.keras.callbacks.EarlyStopping(
monitor='val_loss',
patience=20,
restore_best_weights=True,
min_delta=0.0005,
mode='min'
),
# Model Checkpoint
tf.keras.callbacks.ModelCheckpoint(
filepath='./kaggle/working/models/oil_transformer/checkpoints/model_{epoch:02d}_{val_loss:.4f}.weights.h5',
monitor='val_loss',
save_best_only=True,
mode='min',
save_weights_only=True
),
# Target specific metrics
TargetSpecificMetric(
validation_data=(val_data, val_targets),
target_names=target_names
),
# Reduce LR on Plateau
tf.keras.callbacks.ReduceLROnPlateau(
monitor='val_loss',
factor=0.5,
patience=10,
min_lr=1e-6,
verbose=1
),
# TensorBoard logging
tf.keras.callbacks.TensorBoard(
log_dir='./kaggle/working/models/oil_transformer/logs',
histogram_freq=1,
write_graph=True,
update_freq='epoch'
)
]
return callbacks
def setup_transformer_training(train_data: Dict,
train_targets: np.ndarray,
val_data: Dict,
val_targets: np.ndarray) -> Tuple[tf.keras.Model, List, List[str]]:
"""
Configura e prepara il transformer con dimensioni dinamiche.
Parameters
----------
train_data : dict
Dati di training
train_targets : np.ndarray
Target di training
val_data : dict
Dati di validazione
val_targets : np.ndarray
Target di validazione
Returns
-------
tuple
(model, callbacks, target_names)
"""
# Estrai le shape dai dati
temporal_shape = (train_data['temporal'].shape[1], train_data['temporal'].shape[2])
static_shape = (train_data['static'].shape[1],)
num_outputs = train_targets.shape[1]
print(f"Shape rilevate:")
print(f"- Temporal shape: {temporal_shape}")
print(f"- Static shape: {static_shape}")
print(f"- Numero di output: {num_outputs}")
# Target names
target_names = ['olive_prod', 'min_oil_prod', 'max_oil_prod', 'avg_oil_prod', 'total_water_need']
assert len(target_names) == num_outputs, \
f"Il numero di target names ({len(target_names)}) non corrisponde al numero di output ({num_outputs})"
# Crea il modello
model = create_olive_oil_transformer(
temporal_shape=temporal_shape,
static_shape=static_shape,
num_outputs=num_outputs
)
# Compila il modello
model = compile_model(model)
# Crea i callbacks
callbacks = create_callbacks(target_names, val_data, val_targets)
return model, callbacks, target_names
def train_transformer(train_data: Dict,
train_targets: np.ndarray,
val_data: Dict,
val_targets: np.ndarray,
epochs: int = 150,
batch_size: int = 64,
save_name: str = 'final_model') -> Tuple[tf.keras.Model, tf.keras.callbacks.History]:
"""
Funzione principale per l'addestramento del transformer.
Parameters
----------
train_data : dict
Dati di training
train_targets : np.ndarray
Target di training
val_data : dict
Dati di validazione
val_targets : np.ndarray
Target di validazione
epochs : int
Numero di epoche
batch_size : int
Dimensione del batch
save_name : str
Nome per salvare il modello
Returns
-------
tuple
(model, history)
"""
# Setup del modello
model, callbacks, target_names = setup_transformer_training(
train_data, train_targets, val_data, val_targets
)
# Mostra il summary del modello
model.summary()
os.makedirs(f"./kaggle/working/models/oil_transformer/", exist_ok=True)
keras.utils.plot_model(model, f"./kaggle/working/models/oil_transformer/{save_name}.png", show_shapes=True)
# Training
history = model.fit(
x=train_data,
y=train_targets,
validation_data=(val_data, val_targets),
epochs=epochs,
batch_size=batch_size,
callbacks=callbacks,
verbose=1,
shuffle=True
)
# Save the full model (.keras) plus a separate weights-only checkpoint
save_path = f'./kaggle/working/models/oil_transformer/{save_name}.keras'
model.save(save_path)
os.makedirs('./kaggle/working/models/oil_transformer/weights/', exist_ok=True)
model.save_weights(f'./kaggle/working/models/oil_transformer/weights/{save_name}.weights.h5')
print(f"\nModello salvato in: {save_path}")
return model, history
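# Usage sketch (data names are hypothetical): train_data/val_data are dicts with a
# 'temporal' array of shape (n, timesteps, features) and a 'static' array of shape
# (n, features); targets are (n, 5) in the order listed in setup_transformer_training().
#     model, history = train_transformer(
#         train_data, train_targets,
#         val_data, val_targets,
#         epochs=150, batch_size=64, save_name='final_model',
#     )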
def retrain_model(base_model: tf.keras.Model,
train_data: Dict,
train_targets: np.ndarray,
val_data: Dict,
val_targets: np.ndarray,
test_data: Dict,
test_targets: np.ndarray,
epochs: int = 50,
batch_size: int = 128) -> Tuple[tf.keras.Model, tf.keras.callbacks.History, Dict]:
"""
Implementa il retraining del modello con i dati combinati.
Parameters
----------
base_model : tf.keras.Model
Modello base da riaddestrate
train_data : dict
Dati di training
train_targets : np.ndarray
Target di training
val_data : dict
Dati di validazione
val_targets : np.ndarray
Target di validazione
test_data : dict
Dati di test
test_targets : np.ndarray
Target di test
epochs : int
Numero di epoche
batch_size : int
Dimensione del batch
Returns
-------
tuple
(model, history, final_metrics)
"""
print("Valutazione performance iniziali del modello...")
initial_metrics = {
'train': evaluate_model_performance(base_model, train_data, train_targets, "training"),
'val': evaluate_model_performance(base_model, val_data, val_targets, "validazione"),
'test': evaluate_model_performance(base_model, test_data, test_targets, "test")
}
# Combina i dati
combined_data = {
'temporal': np.concatenate([
train_data['temporal'],
val_data['temporal'],
test_data['temporal']
]),
'static': np.concatenate([
train_data['static'],
val_data['static'],
test_data['static']
])
}
combined_targets = np.concatenate([train_targets, val_targets, test_targets])
# Nuova suddivisione
indices = np.arange(len(combined_targets))
np.random.shuffle(indices)
split_idx = int(len(indices) * 0.9)
train_idx, val_idx = indices[:split_idx], indices[split_idx:]
# Prepara i dati per il retraining
retrain_data = {k: v[train_idx] for k, v in combined_data.items()}
retrain_targets = combined_targets[train_idx]
retrain_val_data = {k: v[val_idx] for k, v in combined_data.items()}
retrain_val_targets = combined_targets[val_idx]
# Callbacks
callbacks = [
tf.keras.callbacks.EarlyStopping(
monitor='val_loss',
patience=10,
restore_best_weights=True,
min_delta=0.0001
),
tf.keras.callbacks.ReduceLROnPlateau(
monitor='val_loss',
factor=0.2,
patience=5,
min_lr=1e-6,
verbose=1
),
tf.keras.callbacks.ModelCheckpoint(
filepath='./kaggle/working/models/oil_transformer/retrain_checkpoints/model_{epoch:02d}_{val_loss:.4f}.weights.h5',
monitor='val_loss',
save_best_only=True,
mode='min',
save_weights_only=True
)
]
# Ricompila il modello
base_model = compile_model(
base_model,
learning_rate=1e-4 # Learning rate più basso per il fine-tuning
)
print("\nAvvio retraining...")
history = base_model.fit(
retrain_data,
retrain_targets,
validation_data=(retrain_val_data, retrain_val_targets),
epochs=epochs,
batch_size=batch_size,
callbacks=callbacks,
verbose=1
)
print("\nValutazione performance finali...")
final_metrics = {
'train': evaluate_model_performance(base_model, train_data, train_targets, "training"),
'val': evaluate_model_performance(base_model, val_data, val_targets, "validazione"),
'test': evaluate_model_performance(base_model, test_data, test_targets, "test")
}
# Salva il modello
save_path = './kaggle/working/models/oil_transformer/retrained_model.keras'
base_model.save(save_path)
print(f"\nModello riaddestrato salvato in: {save_path}")
# Report miglioramenti
print("\nMiglioramenti delle performance:")
for dataset in ['train', 'val', 'test']:
print(f"\nSet {dataset}:")
for metric in initial_metrics[dataset].keys():
initial = initial_metrics[dataset][metric]
final = final_metrics[dataset][metric]
improvement = ((initial - final) / initial) * 100
print(f"{metric}: {improvement:.2f}% di miglioramento")
return base_model, history, final_metrics

332
src/models/transformer.py Normal file
View File

@ -0,0 +1,332 @@
import tensorflow as tf
from tensorflow.keras import layers
from typing import Tuple, Optional, List
@tf.keras.saving.register_keras_serializable()
class DataAugmentation(layers.Layer):
"""
Layer personalizzato per l'augmentation dei dati temporali.
Attributes
----------
noise_stddev : float
Deviazione standard del rumore gaussiano
"""
def __init__(self, noise_stddev: float = 0.03, **kwargs):
super().__init__(**kwargs)
self.noise_stddev = noise_stddev
def call(self, inputs: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
"""
Applica l'augmentation durante il training.
Parameters
----------
inputs : tf.Tensor
Dati di input
training : bool, optional
Flag che indica se siamo in fase di training
Returns
-------
tf.Tensor
Dati aumentati se in training, altrimenti dati originali
"""
if training:
return inputs + tf.random.normal(
shape=tf.shape(inputs),
mean=0.0,
stddev=self.noise_stddev
)
return inputs
def get_config(self) -> dict:
config = super().get_config()
config.update({"noise_stddev": self.noise_stddev})
return config
@tf.keras.saving.register_keras_serializable()
class PositionalEncoding(layers.Layer):
"""
Layer per l'encoding posizionale nel transformer.
Attributes
----------
d_model : int
Dimensionalità del modello
"""
def __init__(self, d_model: int, **kwargs):
super().__init__(**kwargs)
self.d_model = d_model
def build(self, input_shape: tf.TensorShape):
"""
Costruisce la matrice di encoding posizionale.
Parameters
----------
input_shape : tf.TensorShape
Shape dell'input
"""
_, seq_length, _ = input_shape
# Crea la matrice di encoding posizionale
position = tf.range(seq_length, dtype=tf.float32)[:, tf.newaxis]
div_term = tf.exp(
tf.range(0, self.d_model, 2, dtype=tf.float32) *
(-tf.math.log(10000.0) / self.d_model)
)
# Calcola sin e cos
pos_encoding_even = tf.sin(position * div_term)
pos_encoding_odd = tf.cos(position * div_term)
# Assegna i valori alle posizioni pari e dispari
pos_encoding = tf.concat(
[tf.expand_dims(pos_encoding_even, -1),
tf.expand_dims(pos_encoding_odd, -1)],
axis=-1
)
pos_encoding = tf.reshape(pos_encoding, (1, seq_length, -1))
pos_encoding = pos_encoding[:, :, :self.d_model]
# Salva l'encoding come peso non trainabile
self.pos_encoding = self.add_weight(
shape=(1, seq_length, self.d_model),
initializer=tf.keras.initializers.Constant(pos_encoding),
trainable=False,
name='positional_encoding'
)
super().build(input_shape)
def call(self, inputs: tf.Tensor) -> tf.Tensor:
"""
Applica l'encoding posizionale.
Parameters
----------
inputs : tf.Tensor
Dati di input
Returns
-------
tf.Tensor
Dati con encoding posizionale aggiunto
"""
batch_size = tf.shape(inputs)[0]
return inputs + tf.tile(self.pos_encoding, [batch_size, 1, 1])
def get_config(self) -> dict:
config = super().get_config()
config.update({"d_model": self.d_model})
return config
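# The weight built above stores the standard sinusoidal encoding, interleaving
#     PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#     PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
# and call() simply adds it to every sequence in the batch, leaving the shape unchanged.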
@tf.keras.saving.register_keras_serializable()
class OliveTransformerBlock(layers.Layer):
"""
Blocco transformer personalizzato per dati di produzione olive.
Attributes
----------
num_heads : int
Numero di teste di attenzione
key_dim : int
Dimensione delle chiavi
ff_dim : int
Dimensione del feed-forward network
dropout : float
Tasso di dropout
"""
def __init__(self, num_heads: int, key_dim: int, ff_dim: int, dropout: float = 0.1, **kwargs):
super().__init__(**kwargs)
self.num_heads = num_heads
self.key_dim = key_dim
self.ff_dim = ff_dim
self.dropout = dropout
# Multi-head attention
self.mha = layers.MultiHeadAttention(
num_heads=num_heads,
key_dim=key_dim,
dropout=dropout
)
# Feed-forward network
self.ffn = tf.keras.Sequential([
layers.Dense(ff_dim, activation="gelu"),
layers.Dropout(dropout),
layers.Dense(key_dim)
])
# Layer normalization
self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
# Dropout layers
self.dropout1 = layers.Dropout(dropout)
self.dropout2 = layers.Dropout(dropout)
def call(self, inputs: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor:
"""
Forward pass del blocco transformer.
Parameters
----------
inputs : tf.Tensor
Dati di input
training : bool, optional
Flag di training
Returns
-------
tf.Tensor
Output del blocco transformer
"""
# Multi-head attention
attn_output = self.mha(inputs, inputs)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
# Feed-forward network
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
def get_config(self) -> dict:
config = super().get_config()
config.update({
"num_heads": self.num_heads,
"key_dim": self.key_dim,
"ff_dim": self.ff_dim,
"dropout": self.dropout
})
return config
def create_olive_oil_transformer(
temporal_shape: Tuple[int, int],
static_shape: Tuple[int],
num_outputs: int,
d_model: int = 128,
num_heads: int = 8,
ff_dim: int = 256,
num_transformer_blocks: int = 4,
mlp_units: List[int] = [256, 128, 64],
dropout: float = 0.2
) -> tf.keras.Model:
"""
Crea un transformer per la predizione della produzione di olio d'oliva.
Parameters
----------
temporal_shape : tuple
Shape dei dati temporali (timesteps, features)
static_shape : tuple
Shape dei dati statici (features,)
num_outputs : int
Numero di output del modello
d_model : int
Dimensionalità del modello
num_heads : int
Numero di teste di attenzione
ff_dim : int
Dimensione del feed-forward network
num_transformer_blocks : int
Numero di blocchi transformer
mlp_units : list
Unità nei layer MLP
dropout : float
Tasso di dropout
Returns
-------
tf.keras.Model
Modello transformer configurato
"""
# Input layers
temporal_input = layers.Input(shape=temporal_shape, name='temporal')
static_input = layers.Input(shape=static_shape, name='static')
# === TEMPORAL PATH ===
x = layers.LayerNormalization(epsilon=1e-6)(temporal_input)
x = DataAugmentation()(x)
# Temporal projection
x = layers.Dense(d_model // 2, activation='gelu',
kernel_regularizer=tf.keras.regularizers.l2(1e-5))(x)
x = layers.Dropout(dropout)(x)
x = layers.Dense(d_model, activation='gelu',
kernel_regularizer=tf.keras.regularizers.l2(1e-5))(x)
# Positional encoding
x = PositionalEncoding(d_model)(x)
# Transformer blocks
skip_connection = x
for _ in range(num_transformer_blocks):
x = OliveTransformerBlock(num_heads, d_model, ff_dim, dropout)(x)
# Add final skip connection
x = layers.Add()([x, skip_connection])
# Temporal pooling
attention_pooled = layers.MultiHeadAttention(
num_heads=num_heads,
key_dim=d_model // 4
)(x, x)
attention_pooled = layers.GlobalAveragePooling1D()(attention_pooled)
# Additional pooling operations
avg_pooled = layers.GlobalAveragePooling1D()(x)
max_pooled = layers.GlobalMaxPooling1D()(x)
# Combine pooling results
temporal_features = layers.Concatenate()([attention_pooled, avg_pooled, max_pooled])
# === STATIC PATH ===
static_features = layers.LayerNormalization(epsilon=1e-6)(static_input)
for units in [256, 128, 64]:
static_features = layers.Dense(
units,
activation='gelu',
kernel_regularizer=tf.keras.regularizers.l2(1e-5)
)(static_features)
static_features = layers.Dropout(dropout)(static_features)
# === FEATURE FUSION ===
combined = layers.Concatenate()([temporal_features, static_features])
# === MLP HEAD ===
x = combined
for units in mlp_units:
x = layers.BatchNormalization()(x)
x = layers.Dense(
units,
activation="gelu",
kernel_regularizer=tf.keras.regularizers.l2(1e-5)
)(x)
x = layers.Dropout(dropout)(x)
# Output layer
outputs = layers.Dense(
num_outputs,
activation='linear',
kernel_regularizer=tf.keras.regularizers.l2(1e-5)
)(x)
# Create model
model = tf.keras.Model(
inputs={'temporal': temporal_input, 'static': static_input},
outputs=outputs,
name='OliveOilTransformer'
)
return model
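# Usage sketch: building the transformer for, e.g., 36 temporal steps with 15 weather
# features, 20 static orchard features and the 5 production/water targets (all sizes
# illustrative; the real shapes are inferred from the data in setup_transformer_training()).
#     model = create_olive_oil_transformer(
#         temporal_shape=(36, 15),
#         static_shape=(20,),
#         num_outputs=5,
#     )
#     model.summary()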

1862
src/olive-oil-dashboard.py Normal file

File diff suppressed because it is too large Load Diff

36
src/olive_config.json Normal file
View File

@ -0,0 +1,36 @@
{
"oliveto": {
"hectares": 10,
"varieties": [
{
"variety": "Nocellara dell'Etna",
"technique": "Tradizionale",
"percentage": 70
},
{
"variety": "Frantoio",
"technique": "Tradizionale",
"percentage": 30
}
]
},
"costs": {
"fixed": {
"ammortamento": 2000,
"assicurazione": 500,
"manutenzione": 800
},
"variable": {
"raccolta": 0.35,
"potatura": 600,
"fertilizzanti": 400
},
"transformation": {
"molitura": 0.15,
"stoccaggio": 0.2,
"bottiglia": 1.2,
"etichettatura": 0.3
},
"selling_price": 12
}
}
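A minimal sketch of how this configuration might be consumed downstream (file path and keys as above; the per-variety surface is derived from "hectares" and "percentage"):

import json

with open('src/olive_config.json') as f:
    config = json.load(f)

hectares = config['oliveto']['hectares']
for entry in config['oliveto']['varieties']:
    share_ha = hectares * entry['percentage'] / 100
    print(entry['variety'], entry['technique'], f"{share_ha:.1f} ha")
# -> Nocellara dell'Etna Tradizionale 7.0 ha / Frantoio Tradizionale 3.0 ha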

502
src/training-notebook.ipynb Normal file
View File

@ -0,0 +1,502 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Addestramento Modello per Previsione Produzione Olio d'Oliva\n",
"\n",
"Questo notebook utilizza le funzioni modularizzate per:\n",
"1. Caricare e preprocessare i dati meteorologici\n",
"2. Preparare i dati per il training\n",
"3. Configurare e addestrare il modello"
]
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-11-07T17:19:09.011001Z",
"start_time": "2024-11-07T17:18:13.513Z"
}
},
"cell_type": "code",
"source": [
"#!apt-get update\n",
"#!apt-get install graphviz -y\n",
"\n",
"!pip install tensorflow\n",
"!pip install numpy\n",
"!pip install pandas\n",
"\n",
"!pip install keras\n",
"!pip install scikit-learn\n",
"!pip install matplotlib\n",
"!pip install joblib\n",
"!pip install pyarrow\n",
"!pip install fastparquet\n",
"!pip install scipy\n",
"!pip install seaborn\n",
"!pip install tqdm\n",
"!pip install pydot\n",
"!pip install tensorflow-io\n",
"!pip install pvlib"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: tensorflow in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (2.16.2)\r\n",
"Requirement already satisfied: absl-py>=1.0.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (2.1.0)\r\n",
"Requirement already satisfied: astunparse>=1.6.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (1.6.3)\r\n",
"Requirement already satisfied: flatbuffers>=23.5.26 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (24.3.25)\r\n",
"Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (0.4.0)\r\n",
"Requirement already satisfied: google-pasta>=0.1.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (0.2.0)\r\n",
"Requirement already satisfied: h5py>=3.10.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (3.11.0)\r\n",
"Requirement already satisfied: libclang>=13.0.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (18.1.1)\r\n",
"Requirement already satisfied: ml-dtypes~=0.3.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (0.3.2)\r\n",
"Requirement already satisfied: opt-einsum>=2.3.2 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (3.3.0)\r\n",
"Requirement already satisfied: packaging in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (24.1)\r\n",
"Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (3.20.3)\r\n",
"Requirement already satisfied: requests<3,>=2.21.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (2.32.3)\r\n",
"Requirement already satisfied: setuptools in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (72.1.0)\r\n",
"Requirement already satisfied: six>=1.12.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (1.16.0)\r\n",
"Requirement already satisfied: termcolor>=1.1.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (2.1.0)\r\n",
"Requirement already satisfied: typing-extensions>=3.6.6 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (4.11.0)\r\n",
"Requirement already satisfied: wrapt>=1.11.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (1.14.1)\r\n",
"Requirement already satisfied: grpcio<2.0,>=1.24.3 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (1.48.2)\r\n",
"Requirement already satisfied: tensorboard<2.17,>=2.16 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (2.16.2)\r\n",
"Requirement already satisfied: keras>=3.0.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (3.5.0)\r\n",
"Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (0.37.1)\r\n",
"Requirement already satisfied: numpy<2.0.0,>=1.23.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow) (1.23.5)\r\n",
"Requirement already satisfied: wheel<1.0,>=0.23.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from astunparse>=1.6.0->tensorflow) (0.43.0)\r\n",
"Requirement already satisfied: rich in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras>=3.0.0->tensorflow) (13.8.0)\r\n",
"Requirement already satisfied: namex in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras>=3.0.0->tensorflow) (0.0.8)\r\n",
"Requirement already satisfied: optree in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras>=3.0.0->tensorflow) (0.12.1)\r\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests<3,>=2.21.0->tensorflow) (3.3.2)\r\n",
"Requirement already satisfied: idna<4,>=2.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests<3,>=2.21.0->tensorflow) (3.7)\r\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests<3,>=2.21.0->tensorflow) (2.2.2)\r\n",
"Requirement already satisfied: certifi>=2017.4.17 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests<3,>=2.21.0->tensorflow) (2024.8.30)\r\n",
"Requirement already satisfied: markdown>=2.6.8 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorboard<2.17,>=2.16->tensorflow) (3.4.1)\r\n",
"Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorboard<2.17,>=2.16->tensorflow) (0.7.0)\r\n",
"Requirement already satisfied: werkzeug>=1.0.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorboard<2.17,>=2.16->tensorflow) (3.0.3)\r\n",
"Requirement already satisfied: importlib-metadata>=4.4 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from markdown>=2.6.8->tensorboard<2.17,>=2.16->tensorflow) (7.0.1)\r\n",
"Requirement already satisfied: MarkupSafe>=2.1.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from werkzeug>=1.0.1->tensorboard<2.17,>=2.16->tensorflow) (2.1.3)\r\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from rich->keras>=3.0.0->tensorflow) (3.0.0)\r\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from rich->keras>=3.0.0->tensorflow) (2.15.1)\r\n",
"Requirement already satisfied: zipp>=0.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard<2.17,>=2.16->tensorflow) (3.17.0)\r\n",
"Requirement already satisfied: mdurl~=0.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from markdown-it-py>=2.2.0->rich->keras>=3.0.0->tensorflow) (0.1.2)\r\n",
"Requirement already satisfied: numpy in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (1.23.5)\r\n",
"Requirement already satisfied: pandas in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (2.2.2)\r\n",
"Requirement already satisfied: numpy>=1.22.4 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas) (1.23.5)\r\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas) (2.9.0.post0)\r\n",
"Requirement already satisfied: pytz>=2020.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas) (2024.1)\r\n",
"Requirement already satisfied: tzdata>=2022.7 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas) (2023.3)\r\n",
"Requirement already satisfied: six>=1.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\r\n",
"Requirement already satisfied: keras in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (3.5.0)\r\n",
"Requirement already satisfied: absl-py in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (2.1.0)\r\n",
"Requirement already satisfied: numpy in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (1.23.5)\r\n",
"Requirement already satisfied: rich in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (13.8.0)\r\n",
"Requirement already satisfied: namex in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (0.0.8)\r\n",
"Requirement already satisfied: h5py in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (3.11.0)\r\n",
"Requirement already satisfied: optree in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (0.12.1)\r\n",
"Requirement already satisfied: ml-dtypes in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (0.3.2)\r\n",
"Requirement already satisfied: packaging in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from keras) (24.1)\r\n",
"Requirement already satisfied: typing-extensions>=4.5.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from optree->keras) (4.11.0)\r\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from rich->keras) (3.0.0)\r\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from rich->keras) (2.15.1)\r\n",
"Requirement already satisfied: mdurl~=0.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from markdown-it-py>=2.2.0->rich->keras) (0.1.2)\r\n",
"Requirement already satisfied: scikit-learn in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (1.5.1)\r\n",
"Requirement already satisfied: numpy>=1.19.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from scikit-learn) (1.23.5)\r\n",
"Requirement already satisfied: scipy>=1.6.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from scikit-learn) (1.11.4)\r\n",
"Requirement already satisfied: joblib>=1.2.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from scikit-learn) (1.4.2)\r\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from scikit-learn) (3.5.0)\r\n",
"Requirement already satisfied: matplotlib in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (3.8.4)\r\n",
"Requirement already satisfied: contourpy>=1.0.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (1.2.0)\r\n",
"Requirement already satisfied: cycler>=0.10 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (0.11.0)\r\n",
"Requirement already satisfied: fonttools>=4.22.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (4.51.0)\r\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (1.4.4)\r\n",
"Requirement already satisfied: numpy>=1.21 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (1.23.5)\r\n",
"Requirement already satisfied: packaging>=20.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (24.1)\r\n",
"Requirement already satisfied: pillow>=8 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (10.4.0)\r\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (3.0.9)\r\n",
"Requirement already satisfied: python-dateutil>=2.7 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (2.9.0.post0)\r\n",
"Requirement already satisfied: importlib-resources>=3.2.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib) (6.4.0)\r\n",
"Requirement already satisfied: zipp>=3.1.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from importlib-resources>=3.2.0->matplotlib) (3.17.0)\r\n",
"Requirement already satisfied: six>=1.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\r\n",
"Requirement already satisfied: joblib in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (1.4.2)\r\n",
"Requirement already satisfied: pyarrow in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (17.0.0)\r\n",
"Requirement already satisfied: numpy>=1.16.6 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pyarrow) (1.23.5)\r\n",
"Requirement already satisfied: fastparquet in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (2024.5.0)\r\n",
"Requirement already satisfied: pandas>=1.5.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from fastparquet) (2.2.2)\r\n",
"Requirement already satisfied: numpy in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from fastparquet) (1.23.5)\r\n",
"Requirement already satisfied: cramjam>=2.3 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from fastparquet) (2.8.3)\r\n",
"Requirement already satisfied: fsspec in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from fastparquet) (2024.6.1)\r\n",
"Requirement already satisfied: packaging in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from fastparquet) (24.1)\r\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas>=1.5.0->fastparquet) (2.9.0.post0)\r\n",
"Requirement already satisfied: pytz>=2020.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas>=1.5.0->fastparquet) (2024.1)\r\n",
"Requirement already satisfied: tzdata>=2022.7 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas>=1.5.0->fastparquet) (2023.3)\r\n",
"Requirement already satisfied: six>=1.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas>=1.5.0->fastparquet) (1.16.0)\r\n",
"Requirement already satisfied: scipy in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (1.11.4)\r\n",
"Requirement already satisfied: numpy<1.28.0,>=1.21.6 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from scipy) (1.23.5)\r\n",
"Requirement already satisfied: seaborn in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (0.13.2)\r\n",
"Requirement already satisfied: numpy!=1.24.0,>=1.20 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from seaborn) (1.23.5)\r\n",
"Requirement already satisfied: pandas>=1.2 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from seaborn) (2.2.2)\r\n",
"Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from seaborn) (3.8.4)\r\n",
"Requirement already satisfied: contourpy>=1.0.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.2.0)\r\n",
"Requirement already satisfied: cycler>=0.10 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.11.0)\r\n",
"Requirement already satisfied: fonttools>=4.22.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.51.0)\r\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.4)\r\n",
"Requirement already satisfied: packaging>=20.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (24.1)\r\n",
"Requirement already satisfied: pillow>=8 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (10.4.0)\r\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.0.9)\r\n",
"Requirement already satisfied: python-dateutil>=2.7 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0)\r\n",
"Requirement already satisfied: importlib-resources>=3.2.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (6.4.0)\r\n",
"Requirement already satisfied: pytz>=2020.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas>=1.2->seaborn) (2024.1)\r\n",
"Requirement already satisfied: tzdata>=2022.7 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas>=1.2->seaborn) (2023.3)\r\n",
"Requirement already satisfied: zipp>=3.1.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from importlib-resources>=3.2.0->matplotlib!=3.6.1,>=3.4->seaborn) (3.17.0)\r\n",
"Requirement already satisfied: six>=1.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.16.0)\r\n",
"Collecting tqdm\r\n",
" Downloading tqdm-4.67.0-py3-none-any.whl.metadata (57 kB)\r\n",
"Downloading tqdm-4.67.0-py3-none-any.whl (78 kB)\r\n",
"Installing collected packages: tqdm\r\n",
"Successfully installed tqdm-4.67.0\r\n",
"Collecting pydot\r\n",
" Downloading pydot-3.0.2-py3-none-any.whl.metadata (10 kB)\r\n",
"Requirement already satisfied: pyparsing>=3.0.9 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pydot) (3.0.9)\r\n",
"Downloading pydot-3.0.2-py3-none-any.whl (35 kB)\r\n",
"Installing collected packages: pydot\r\n",
"Successfully installed pydot-3.0.2\r\n",
"Collecting tensorflow-io\r\n",
" Downloading tensorflow_io-0.37.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (14 kB)\r\n",
"Requirement already satisfied: tensorflow-io-gcs-filesystem==0.37.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from tensorflow-io) (0.37.1)\r\n",
"Downloading tensorflow_io-0.37.1-cp39-cp39-macosx_12_0_arm64.whl (31.8 MB)\r\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m31.8/31.8 MB\u001B[0m \u001B[31m1.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m0m\r\n",
"\u001B[?25hInstalling collected packages: tensorflow-io\r\n",
"Successfully installed tensorflow-io-0.37.1\r\n",
"Collecting pvlib\r\n",
" Downloading pvlib-0.11.1-py3-none-any.whl.metadata (2.8 kB)\r\n",
"Requirement already satisfied: numpy>=1.19.3 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pvlib) (1.23.5)\r\n",
"Requirement already satisfied: pandas>=1.3.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pvlib) (2.2.2)\r\n",
"Requirement already satisfied: pytz in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pvlib) (2024.1)\r\n",
"Requirement already satisfied: requests in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pvlib) (2.32.3)\r\n",
"Requirement already satisfied: scipy>=1.6.0 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pvlib) (1.11.4)\r\n",
"Requirement already satisfied: h5py in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pvlib) (3.11.0)\r\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas>=1.3.0->pvlib) (2.9.0.post0)\r\n",
"Requirement already satisfied: tzdata>=2022.7 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from pandas>=1.3.0->pvlib) (2023.3)\r\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests->pvlib) (3.3.2)\r\n",
"Requirement already satisfied: idna<4,>=2.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests->pvlib) (3.7)\r\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests->pvlib) (2.2.2)\r\n",
"Requirement already satisfied: certifi>=2017.4.17 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from requests->pvlib) (2024.8.30)\r\n",
"Requirement already satisfied: six>=1.5 in /opt/homebrew/anaconda3/envs/ml_env/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas>=1.3.0->pvlib) (1.16.0)\r\n",
"Downloading pvlib-0.11.1-py3-none-any.whl (29.5 MB)\r\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m29.5/29.5 MB\u001B[0m \u001B[31m2.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n",
"\u001B[?25hInstalling collected packages: pvlib\r\n",
"Successfully installed pvlib-0.11.1\r\n"
]
}
],
"execution_count": 1
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"import tensorflow as tf\n",
"import keras\n",
"\n",
"print(f\"Keras version: {keras.__version__}\")\n",
"print(f\"TensorFlow version: {tf.__version__}\")\n",
"print(f\"TensorFlow version: {tf.__version__}\")\n",
"print(f\"CUDA available: {tf.test.is_built_with_cuda()}\")\n",
"print(f\"GPU devices: {tf.config.list_physical_devices('GPU')}\")\n",
"\n",
"# GPU configuration\n",
"gpus = tf.config.experimental.list_physical_devices('GPU')\n",
"if gpus:\n",
" try:\n",
" for gpu in gpus:\n",
" tf.config.experimental.set_memory_growth(gpu, True)\n",
" logical_gpus = tf.config.experimental.list_logical_devices('GPU')\n",
" print(len(gpus), \"Physical GPUs,\", len(logical_gpus), \"Logical GPUs\")\n",
" except RuntimeError as e:\n",
" print(e)"
]
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Test semplice per verificare che la GPU funzioni\n",
"def test_gpu():\n",
" print(\"TensorFlow version:\", tf.__version__)\n",
" print(\"\\nDispositivi disponibili:\")\n",
" print(tf.config.list_physical_devices())\n",
"\n",
" # Creiamo e moltiplichiamo due tensori sulla GPU\n",
" with tf.device('/GPU:0'):\n",
" a = tf.random.normal([10000, 10000])\n",
" b = tf.random.normal([10000, 10000])\n",
" c = tf.matmul(a, b)\n",
"\n",
" print(\"\\nShape del risultato:\", c.shape)\n",
" print(\"Device del tensore:\", c.device)\n",
" return \"Test completato con successo!\"\n",
"\n",
"\n",
"test_gpu()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Imports necessari\n",
"from src.data.data_loader import load_weather_data, load_olive_varieties\n",
"from src.data.data_processor import prepare_solar_data, prepare_transformer_data\n",
"from src.features.weather_features import add_solar_features, add_environmental_features\n",
"from src.features.temporal_features import add_time_features\n",
"from src.models.training import train_transformer, setup_transformer_training\n",
"from src.utils.helpers import get_optimal_workers\n",
"from src.visualization.plots import plot_correlation_matrix\n",
"import pandas as pd\n",
"import os"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Caricamento e Preparazione Dati"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"random_state_value = 42\n",
"\n",
"base_dir = './kaggle'\n",
"input_dir = f'{base_dir}/input'\n",
"working_dir = f'{base_dir}/working'\n",
"working_data_dir = f'{working_dir}/data'\n",
"data_models_dir = f'{working_data_dir}/models'\n",
"\n",
"os.makedirs(working_dir, exist_ok=True)\n",
"os.makedirs(working_data_dir, exist_ok=True)\n",
"os.makedirs(data_models_dir, exist_ok=True)\n",
"\n",
"# Carica i dati meteorologici\n",
"weather_data = load_weather_data(\n",
" f'{input_dir}/olive-oil/weather_data.parquet',\n",
" start_year=2010\n",
")\n",
"\n",
"# Carica i dati delle varietà di olive\n",
"olive_varieties = load_olive_varieties(\n",
" f'{input_dir}/olive-oil/variety_olive_oil_production.csv'\n",
")\n",
"\n",
"print(f\"Shape dati meteo: {weather_data.shape}\")\n",
"print(f\"Shape dati olive: {olive_varieties.shape}\")"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Feature Engineering"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Aggiungi feature temporali\n",
"weather_data = add_time_features(weather_data)\n",
"\n",
"# Aggiungi feature solari e ambientali\n",
"weather_data = add_solar_features(weather_data)\n",
"weather_data = add_environmental_features(weather_data)\n",
"\n",
"# Definisci le feature da utilizzare\n",
"features = [\n",
" 'temp', 'tempmin', 'tempmax', 'humidity', 'cloudcover',\n",
" 'windspeed', 'pressure', 'visibility',\n",
" 'hour_sin', 'hour_cos', 'month_sin', 'month_cos',\n",
" 'day_of_year_sin', 'day_of_year_cos',\n",
" 'temp_humidity', 'temp_cloudcover', 'visibility_cloudcover',\n",
" 'clear_sky_factor', 'day_length',\n",
" 'temp_1h_lag', 'cloudcover_1h_lag', 'humidity_1h_lag',\n",
" 'temp_rolling_mean_6h', 'cloudcover_rolling_mean_6h'\n",
" ] + [col for col in weather_data.columns if 'season_' in col or 'time_period_' in col]\n",
"\n",
"print(f\"Numero totale di feature: {len(features)}\")"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Analisi delle Correlazioni"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Analizza correlazioni tra feature\n",
"plot_correlation_matrix(\n",
" weather_data[features + ['solarradiation', 'solarenergy', 'uvindex']],\n",
" title='Correlazioni tra Feature Meteorologiche'\n",
")"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Preparazione Dati per il Training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Prepara i dati per il modello\n",
"X_scaled, scaler_X, y_scaled, scaler_y, data_after_2010 = prepare_solar_data(\n",
" weather_data,\n",
" features\n",
")\n",
"\n",
"# Prepara i dati per il transformer\n",
"(train_data, train_targets), (val_data, val_targets), (test_data, test_targets), scalers = prepare_transformer_data(\n",
" data_after_2010, olive_varieties)\n",
"\n",
"print(\"\\nShape dei dati:\")\n",
"print(f\"Training - Temporal: {train_data['temporal'].shape}, Static: {train_data['static'].shape}\")\n",
"print(f\"Validation - Temporal: {val_data['temporal'].shape}, Static: {val_data['static'].shape}\")\n",
"print(f\"Test - Temporal: {test_data['temporal'].shape}, Static: {test_data['static'].shape}\")"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Training del Modello"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Training del transformer\n",
"model, history = train_transformer(\n",
" train_data=train_data,\n",
" train_targets=train_targets,\n",
" val_data=val_data,\n",
" val_targets=val_targets,\n",
" epochs=150,\n",
" batch_size=64,\n",
" save_name='weather_transformer'\n",
")"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Valutazione del Modello"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"from src.utils.metrics import calculate_real_error, evaluate_model_performance\n",
"\n",
"# Calcola gli errori reali\n",
"percentage_errors, absolute_errors = calculate_real_error(\n",
" model,\n",
" test_data,\n",
" test_targets,\n",
" scaler_y,\n",
" target_names=['solarradiation', 'solarenergy', 'uvindex']\n",
")\n",
"\n",
"# Valuta le performance del modello\n",
"metrics = evaluate_model_performance(\n",
" model,\n",
" test_data,\n",
" test_targets,\n",
" 'test'\n",
")"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Visualizzazione dei Risultati"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"from src.visualization.plots import (\n",
" plot_production_trends,\n",
" plot_correlation_matrix\n",
")\n",
"\n",
"# Plot dei trend di produzione\n",
"predictions = model.predict(test_data)\n",
"predictions_real = scaler_y.inverse_transform(predictions)\n",
"\n",
"# Crea DataFrame con predizioni\n",
"results_df = pd.DataFrame(\n",
" predictions_real,\n",
" columns=['solarradiation', 'solarenergy', 'uvindex']\n",
")\n",
"\n",
"# Plot delle correlazioni tra predizioni\n",
"plot_correlation_matrix(\n",
" results_df,\n",
" title='Correlazioni tra Predizioni'\n",
")\n",
"\n",
"# Plot dei trend temporali\n",
"plot_production_trends(results_df)"
],
"outputs": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

215
src/utils/helpers.py Normal file
View File

@ -0,0 +1,215 @@
import psutil
import multiprocessing
import re
import pandas as pd
from typing import List
def get_optimal_workers() -> int:
"""
Calcola il numero ottimale di workers basandosi sulle risorse del sistema.
Returns
-------
int
Numero ottimale di workers
"""
# Ottiene il numero di CPU logiche (inclusi i thread virtuali)
cpu_count = multiprocessing.cpu_count()
# Ottiene la memoria totale e disponibile in GB
memory = psutil.virtual_memory()
total_memory_gb = memory.total / (1024 ** 3)
available_memory_gb = memory.available / (1024 ** 3)
# Stima della memoria necessaria per worker (esempio: 2GB per worker)
memory_per_worker_gb = 2
# Calcola il numero massimo di workers basato sulla memoria disponibile
max_workers_by_memory = int(available_memory_gb / memory_per_worker_gb)
# Usa il minimo tra:
# - numero di CPU disponibili - 1 (lascia una CPU libera per il sistema)
# - numero massimo di workers basato sulla memoria
# - un limite massimo arbitrario (es. 32) per evitare troppo overhead
optimal_workers = min(
cpu_count - 1,
max_workers_by_memory,
32 # limite massimo arbitrario
)
# Assicura almeno 1 worker
return max(1, optimal_workers)
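# Hypothetical usage sketch (illustrative, not part of the original file): the returned
# worker count could size a process pool for CPU-bound preprocessing.
# `preprocess_chunk` and `chunks` are assumed names only.
#
#   from multiprocessing import Pool
#   with Pool(processes=get_optimal_workers()) as pool:
#       results = pool.map(preprocess_chunk, chunks)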
def clean_column_name(name: str) -> str:
"""
Rimuove caratteri speciali e spazi, converte in snake_case e abbrevia.
Parameters
----------
name : str
Nome della colonna da pulire
Returns
-------
str
Nome della colonna pulito
"""
# Rimuove caratteri speciali
name = re.sub(r'[^a-zA-Z0-9\s]', '', name)
# Converte in snake_case
name = name.lower().replace(' ', '_')
# Abbreviazioni comuni
abbreviations = {
'production': 'prod',
'percentage': 'pct',
'hectare': 'ha',
'tonnes': 't',
'litres': 'l',
'minimum': 'min',
'maximum': 'max',
'average': 'avg'
}
for full, abbr in abbreviations.items():
name = name.replace(full, abbr)
return name
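# Worked example of the rules above (illustrative):
#   clean_column_name("Average Production (tonnes/hectare)")
#   "(", ")" and "/" stripped  -> "Average Production tonneshectare"
#   snake_case                 -> "average_production_tonneshectare"
#   abbreviations applied      -> "avg_prod_tha"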
def clean_column_names(df: pd.DataFrame) -> List[str]:
"""
Pulisce tutti i nomi delle colonne in un DataFrame.
Parameters
----------
df : pd.DataFrame
DataFrame con le colonne da pulire
Returns
-------
list
Lista dei nuovi nomi delle colonne puliti
"""
new_columns = []
for col in df.columns:
# Usa regex per separare le varietà
varieties = re.findall(r'([a-z]+)_([a-z_]+)', col)
if varieties:
new_columns.append(f"{varieties[0][0]}_{varieties[0][1]}")
else:
new_columns.append(col)
return new_columns
def to_camel_case(text: str) -> str:
"""
Converte una stringa in camelCase.
Gestisce stringhe con spazi, trattini o underscore.
Se è una sola parola, la restituisce in minuscolo.
Parameters
----------
text : str
Testo da convertire
Returns
-------
str
Testo convertito in camelCase
"""
# Rimuove eventuali spazi iniziali e finali
text = text.strip()
# Se la stringa è vuota, ritorna stringa vuota
if not text:
return ""
# Sostituisce trattini e underscore con spazi
text = text.replace('-', ' ').replace('_', ' ')
# Divide la stringa in parole
words = text.split()
# Se non ci sono parole dopo lo split, ritorna stringa vuota
if not words:
return ""
# Se c'è una sola parola, ritorna in minuscolo
if len(words) == 1:
return words[0].lower()
# Altrimenti procedi con il camelCase
result = words[0].lower()
for word in words[1:]:
result += word.capitalize()
return result
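# Worked examples (illustrative):
#   to_camel_case("olive_oil_production") -> "oliveOilProduction"
#   to_camel_case("Frantoio")             -> "frantoio"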
def get_full_data(simulated_data: pd.DataFrame,
olive_varieties: pd.DataFrame) -> pd.DataFrame:
"""
Ottiene il dataset completo combinando dati simulati e varietà di olive.
Parameters
----------
simulated_data : pd.DataFrame
DataFrame con i dati simulati
olive_varieties : pd.DataFrame
DataFrame con le informazioni sulle varietà
Returns
-------
pd.DataFrame
DataFrame completo con tutte le informazioni
"""
# Colonne base rilevanti
relevant_columns = [
'year', 'temp_mean', 'precip_sum', 'solar_energy_sum',
'ha', 'zone', 'olive_prod'
]
# Aggiungi colonne specifiche per varietà
all_varieties = olive_varieties['Varietà di Olive'].unique()
varieties = [clean_column_name(variety) for variety in all_varieties]
for variety in varieties:
relevant_columns.extend([
f'{variety}_olive_prod',
f'{variety}_tech'
])
# Seleziona solo le colonne rilevanti
full_data = simulated_data[relevant_columns].copy()
# Aggiungi feature calcolate
for variety in varieties:
# Calcola efficienza produttiva
if f'{variety}_olive_prod' in full_data.columns:
full_data[f'{variety}_efficiency'] = (
full_data[f'{variety}_olive_prod'] / full_data['ha']
)
# Aggiungi indicatori tecnici
if f'{variety}_tech' in full_data.columns:
technique_dummies = pd.get_dummies(
full_data[f'{variety}_tech'],
prefix=f'{variety}_technique'
)
full_data = pd.concat([full_data, technique_dummies], axis=1)
# Aggiungi feature temporali
full_data['month'] = 1 # Assumiamo dati annuali
full_data['day'] = 1 # Assumiamo dati annuali
# Calcola medie mobili
for col in ['temp_mean', 'precip_sum', 'solar_energy_sum']:
full_data[f'{col}_ma3'] = full_data[col].rolling(window=3, min_periods=1).mean()
full_data[f'{col}_ma5'] = full_data[col].rolling(window=5, min_periods=1).mean()
return full_data
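# Illustrative behaviour of the rolling means above (window=3, min_periods=1):
# for yearly temp_mean values [10, 12, 14, 16] the *_ma3 column is
# [10.0, 11.0, 12.0, 14.0]: early rows average over the shorter window that is
# actually available instead of producing NaN.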

282
src/utils/metrics.py Normal file
View File

@ -0,0 +1,282 @@
import numpy as np
from typing import Dict, Tuple, List, Optional
from scipy import stats
def calculate_real_error(
model,
test_data: Dict,
test_targets: np.ndarray,
scaler_y,
target_names: Optional[List[str]] = None
) -> Tuple[List[float], List[float]]:
"""
Calcola l'errore reale denormalizzando le predizioni.
Parameters
----------
model : tf.keras.Model
Modello addestrato
test_data : dict
Dati di test
test_targets : np.ndarray
Target di test
scaler_y : scaler
Scaler utilizzato per normalizzare i target
target_names : list, optional
Nomi dei target
Returns
-------
tuple
(percentage_errors, absolute_errors)
"""
# Predizioni
predictions = model.predict(test_data)
# Denormalizza predizioni e target
predictions_real = scaler_y.inverse_transform(predictions)
targets_real = scaler_y.inverse_transform(test_targets)
# Calcola errori percentuali e assoluti
percentage_errors = []
absolute_errors = []
if target_names is None:
target_names = [f'target_{i}' for i in range(predictions_real.shape[1])]
# Calcola errori per ogni target
for i in range(predictions_real.shape[1]):
mae = np.mean(np.abs(predictions_real[:, i] - targets_real[:, i]))
mape = np.mean(np.abs((predictions_real[:, i] - targets_real[:, i]) / targets_real[:, i])) * 100
percentage_errors.append(mape)
absolute_errors.append(mae)
print(f"\n{target_names[i]}:")
print(f"MAE assoluto: {mae:.2f}")
print(f"Errore percentuale medio: {mape:.2f}%")
print(f"Precisione: {100 - mape:.2f}%")
print("-" * 50)
return percentage_errors, absolute_errors
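# Note: the MAPE computed above divides by the raw target values with no epsilon
# guard, so targets equal to 0 (e.g. night-time solar radiation) yield inf/NaN
# percentage errors; evaluate_model_performance below adds 1e-7 to the denominator
# for this reason.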
def evaluate_model_performance(
    model,
    data: Dict,
    targets: np.ndarray,
    set_name: str = "",
    threshold: Optional[float] = None,
    target_names: Optional[List[str]] = None
) -> Dict:
"""
Valuta le performance del modello su un set di dati.
Parameters
----------
model : tf.keras.Model
Modello da valutare
data : dict
Dati di input
targets : np.ndarray
Target reali
set_name : str
Nome del set di dati
threshold : float, optional
Soglia per calcolare accuracy binaria
target_names : list, optional
Nomi dei target da valutare; se None usa i target di produzione olivicola
Returns
-------
dict
Dizionario con le metriche calcolate
"""
predictions = model.predict(data, verbose=0)
metrics = {}
if target_names is None:
    target_names = ['olive_prod', 'min_oil_prod', 'max_oil_prod', 'avg_oil_prod', 'total_water_need']
for i, name in enumerate(target_names):
# Metriche di base
mae = np.mean(np.abs(targets[:, i] - predictions[:, i]))
mse = np.mean(np.square(targets[:, i] - predictions[:, i]))
rmse = np.sqrt(mse)
mape = np.mean(np.abs((targets[:, i] - predictions[:, i]) / (targets[:, i] + 1e-7))) * 100
# R2 score
ss_res = np.sum(np.square(targets[:, i] - predictions[:, i]))
ss_tot = np.sum(np.square(targets[:, i] - np.mean(targets[:, i])))
r2 = 1 - (ss_res / (ss_tot + 1e-7))
# Salva le metriche
metrics[f"{name}_mae"] = mae
metrics[f"{name}_rmse"] = rmse
metrics[f"{name}_mape"] = mape
metrics[f"{name}_r2"] = r2
# Calcola accuracy binaria se fornita una soglia
if threshold is not None:
binary_acc = np.mean(
(predictions[:, i] > threshold) == (targets[:, i] > threshold)
)
metrics[f"{name}_binary_acc"] = binary_acc
if set_name:
print(f"\nPerformance sul set {set_name}:")
for metric, value in metrics.items():
print(f"{metric}: {value:.4f}")
return metrics
def calculate_efficiency_metrics(
predictions: np.ndarray,
targets: np.ndarray,
resource_usage: np.ndarray
) -> Dict:
"""
Calcola metriche di efficienza basate sull'utilizzo delle risorse.
Parameters
----------
predictions : np.ndarray
Predizioni del modello
targets : np.ndarray
Target reali
resource_usage : np.ndarray
Dati sull'utilizzo delle risorse
Returns
-------
dict
Metriche di efficienza
"""
metrics = {}
# Efficienza di produzione
production_efficiency = predictions / (resource_usage + 1e-7)
target_efficiency = targets / (resource_usage + 1e-7)
# Calcola metriche
metrics['mean_efficiency'] = np.mean(production_efficiency)
metrics['efficiency_error'] = np.mean(np.abs(production_efficiency - target_efficiency))
metrics['efficiency_std'] = np.std(production_efficiency)
# ROI stimato
estimated_roi = (predictions - resource_usage) / (resource_usage + 1e-7)
actual_roi = (targets - resource_usage) / (resource_usage + 1e-7)
metrics['roi_error'] = np.mean(np.abs(estimated_roi - actual_roi))
# Sostenibilità
metrics['resource_utilization'] = np.mean(predictions / resource_usage)
metrics['efficiency_improvement'] = (
np.mean(production_efficiency) - np.mean(target_efficiency)
) / np.mean(target_efficiency) * 100
return metrics
def calculate_forecast_accuracy(
predictions: np.ndarray,
targets: np.ndarray,
horizons: List[int]
) -> Dict:
"""
Calcola l'accuratezza delle previsioni per diversi orizzonti temporali.
Parameters
----------
predictions : np.ndarray
Predizioni del modello
targets : np.ndarray
Target reali
horizons : list
Lista degli orizzonti temporali da valutare
Returns
-------
dict
Accuratezza per ogni orizzonte
"""
accuracy_metrics = {}
for horizon in horizons:
# Seleziona dati per l'orizzonte corrente
pred_horizon = predictions[:-horizon]
target_horizon = targets[horizon:]
# Calcola metriche
mae = np.mean(np.abs(pred_horizon - target_horizon))
mape = np.mean(np.abs((pred_horizon - target_horizon) / (target_horizon + 1e-7))) * 100
rmse = np.sqrt(np.mean(np.square(pred_horizon - target_horizon)))
# Calcola il coefficiente di correlazione
corr = np.corrcoef(pred_horizon.flatten(), target_horizon.flatten())[0, 1]
# Salva le metriche
accuracy_metrics[f'horizon_{horizon}'] = {
'mae': mae,
'mape': mape,
'rmse': rmse,
'correlation': corr
}
print(f"\nMetriche per orizzonte {horizon}:")
print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.2f}%")
print(f"RMSE: {rmse:.4f}")
print(f"Correlazione: {corr:.4f}")
return accuracy_metrics
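# Hypothetical usage sketch (illustrative names, not from the original file):
#   horizon_metrics = calculate_forecast_accuracy(
#       predictions=model.predict(test_data, verbose=0),
#       targets=test_targets,
#       horizons=[1, 3, 7]   # 1-, 3- and 7-step-ahead evaluation
#   )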
def compute_confidence_intervals(
predictions: np.ndarray,
alpha: float = 0.05,
n_bootstrap: int = 1000
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Calcola intervalli di confidenza usando bootstrap.
Parameters
----------
predictions : np.ndarray
Predizioni del modello
alpha : float
Livello di significatività
n_bootstrap : int
Numero di campioni bootstrap
Returns
-------
tuple
(lower_bound, upper_bound, mean_predictions)
"""
n_samples, n_targets = predictions.shape
bootstrap_means = np.zeros((n_bootstrap, n_targets))
# Bootstrap sampling
for i in range(n_bootstrap):
indices = np.random.randint(0, n_samples, size=n_samples)
bootstrap_sample = predictions[indices]
bootstrap_means[i] = np.mean(bootstrap_sample, axis=0)
# Calcola intervalli di confidenza
lower_percentile = alpha / 2 * 100
upper_percentile = (1 - alpha / 2) * 100
lower_bound = np.percentile(bootstrap_means, lower_percentile, axis=0)
upper_bound = np.percentile(bootstrap_means, upper_percentile, axis=0)
mean_predictions = np.mean(predictions, axis=0)
# Calcola intervalli usando t-distribution
std_error = np.std(bootstrap_means, axis=0)
t_value = stats.t.ppf(1 - alpha / 2, df=n_samples - 1)
margin_error = t_value * std_error
print("\nIntervalli di Confidenza:")
for i in range(n_targets):
print(f"\nTarget {i + 1}:")
print(f"Media: {mean_predictions[i]:.4f}")
print(f"Intervallo: [{lower_bound[i]:.4f}, {upper_bound[i]:.4f}]")
print(f"Margine di errore: ±{margin_error[i]:.4f}")
return lower_bound, upper_bound, mean_predictions
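# Hypothetical usage sketch (illustrative): `predictions_real` is any array of
# shape (n_samples, n_targets), e.g. denormalised model predictions.
#   lower, upper, mean_pred = compute_confidence_intervals(predictions_real, alpha=0.05)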

255
src/visualization/plots.py Normal file
View File

@ -0,0 +1,255 @@
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from typing import Optional
def save_plot(plt, title: str, output_dir: str = './kaggle/working/plots'):
"""
Salva il plot corrente con un nome formattato.
Parameters
----------
plt : matplotlib.pyplot
Riferimento a pyplot
title : str
Titolo del plot
output_dir : str
Directory di output per i plot
"""
os.makedirs(output_dir, exist_ok=True)
# Pulisci il nome del file
filename = "".join(x for x in title if x.isalnum() or x in [' ', '-', '_']).rstrip()
filename = filename.replace(' ', '_').lower()
filepath = os.path.join(output_dir, f"{filename}.png")
plt.savefig(filepath, bbox_inches='tight', dpi=300)
print(f"Plot salvato come: {filepath}")
def plot_variety_comparison(comparison_data: pd.DataFrame, metric: str):
"""
Crea un grafico a barre per confrontare le varietà di olive su una metrica specifica.
Parameters
----------
comparison_data : pd.DataFrame
DataFrame contenente i dati di confronto
metric : str
Nome della metrica da visualizzare
"""
plt.figure(figsize=(12, 6))
bars = plt.bar(comparison_data['Variety'], comparison_data[metric])
plt.title(f'Confronto di {metric} tra Varietà di Olive')
plt.xlabel('Varietà')
plt.ylabel(metric)
plt.xticks(rotation=45, ha='right')
# Aggiungi etichette sopra le barre
for bar in bars:
height = bar.get_height()
plt.text(bar.get_x() + bar.get_width() / 2., height,
f'{height:.2f}',
ha='center', va='bottom')
plt.tight_layout()
plt.show()
# Salva il plot
save_plot(plt,
f'variety_comparison_{metric.lower().replace(" ", "_").replace("/", "_").replace("(", "").replace(")", "")}')
plt.close()
def plot_efficiency_vs_production(comparison_data: pd.DataFrame):
"""
Crea uno scatter plot dell'efficienza vs produzione.
Parameters
----------
comparison_data : pd.DataFrame
DataFrame contenente i dati di confronto
"""
plt.figure(figsize=(10, 6))
plt.scatter(comparison_data['Avg Olive Production (kg/ha)'],
comparison_data['Oil Efficiency (L/kg)'],
s=100)
# Aggiungi etichette per ogni punto
for i, row in comparison_data.iterrows():
plt.annotate(row['Variety'],
(row['Avg Olive Production (kg/ha)'], row['Oil Efficiency (L/kg)']),
xytext=(5, 5), textcoords='offset points')
plt.title('Efficienza Olio vs Produzione Olive per Varietà')
plt.xlabel('Produzione Media Olive (kg/ha)')
plt.ylabel('Efficienza Olio (L olio / kg olive)')
plt.tight_layout()
# Salva il plot
save_plot(plt, 'efficiency_vs_production')
plt.close()
def plot_water_efficiency_vs_production(comparison_data: pd.DataFrame):
"""
Crea uno scatter plot dell'efficienza idrica vs produzione.
Parameters
----------
comparison_data : pd.DataFrame
DataFrame contenente i dati di confronto
"""
plt.figure(figsize=(10, 6))
plt.scatter(comparison_data['Avg Olive Production (kg/ha)'],
comparison_data['Water Efficiency (L oil/m³ water)'],
s=100)
# Aggiungi etichette per ogni punto
for i, row in comparison_data.iterrows():
plt.annotate(row['Variety'],
(row['Avg Olive Production (kg/ha)'],
row['Water Efficiency (L oil/m³ water)']),
xytext=(5, 5), textcoords='offset points')
plt.title('Efficienza Idrica vs Produzione Olive per Varietà')
plt.xlabel('Produzione Media Olive (kg/ha)')
plt.ylabel('Efficienza Idrica (L olio / m³ acqua)')
plt.tight_layout()
plt.show()
# Salva il plot
save_plot(plt, 'water_efficiency_vs_production')
plt.close()
def plot_water_need_vs_oil_production(comparison_data: pd.DataFrame):
"""
Crea uno scatter plot del fabbisogno idrico vs produzione di olio.
Parameters
----------
comparison_data : pd.DataFrame
DataFrame contenente i dati di confronto
"""
plt.figure(figsize=(10, 6))
plt.scatter(comparison_data['Avg Water Need (m³/ha)'],
comparison_data['Avg Oil Production (L/ha)'],
s=100)
# Aggiungi etichette per ogni punto
for i, row in comparison_data.iterrows():
plt.annotate(row['Variety'],
(row['Avg Water Need (m³/ha)'],
row['Avg Oil Production (L/ha)']),
xytext=(5, 5), textcoords='offset points')
plt.title('Produzione Olio vs Fabbisogno Idrico per Varietà')
plt.xlabel('Fabbisogno Idrico Medio (m³/ha)')
plt.ylabel('Produzione Media Olio (L/ha)')
plt.tight_layout()
plt.show()
# Salva il plot
save_plot(plt, 'water_need_vs_oil_production')
plt.close()
def plot_production_trends(data: pd.DataFrame,
variety: Optional[str] = None,
metrics: Optional[list] = None):
"""
Crea grafici di trend per le metriche di produzione.
Parameters
----------
data : pd.DataFrame
DataFrame con i dati di produzione
variety : str, optional
Varietà specifica da visualizzare
metrics : list, optional
Lista delle metriche da visualizzare
"""
if metrics is None:
metrics = ['olive_prod', 'oil_prod', 'water_need']
# Filtra per varietà se specificata
if variety:
data = data[data['variety'] == variety]
# Crea subplot per ogni metrica
fig, axes = plt.subplots(len(metrics), 1, figsize=(12, 4 * len(metrics)))
if len(metrics) == 1:
axes = [axes]
for ax, metric in zip(axes, metrics):
sns.lineplot(data=data, x='year', y=metric, ax=ax)
if variety:
ax.set_title(f'{metric} per {variety}')
else:
ax.set_title(f'{metric} - Tutte le varietà')
ax.set_xlabel('Anno')
plt.tight_layout()
# Salva il plot
title = f'production_trends{"_" + variety if variety else ""}'
save_plot(plt, title)
plt.close()
def plot_correlation_matrix(data: pd.DataFrame,
variables: Optional[list] = None,
title: str = "Matrice di Correlazione"):
"""
Crea una matrice di correlazione con heatmap.
Parameters
----------
data : pd.DataFrame
DataFrame con i dati
variables : list, optional
Lista delle variabili da includere
title : str
Titolo del plot
"""
if variables:
corr_matrix = data[variables].corr()
else:
corr_matrix = data.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix,
annot=True,
cmap='coolwarm',
center=0,
fmt='.2f')
plt.title(title)
plt.tight_layout()
# Salva il plot usando il titolo (evita di sovrascrivere chiamate con titoli diversi)
save_plot(plt, title)
plt.close()
def setup_plotting_style():
"""
Configura lo stile dei plot per uniformità.
"""
plt.style.use('seaborn-v0_8')  # 'seaborn' non è più un nome di stile valido nelle versioni recenti di matplotlib
sns.set_palette("husl")
# Impostazioni personalizzate
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10