wip model

This commit is contained in:
Giuseppe Nucifora 2024-11-08 19:43:36 +01:00
parent d092c0967c
commit 4a6f30b4c7
4 changed files with 334 additions and 243 deletions

View File

@ -1,94 +1 @@
src/data/data_loader.py:
- load_weather_data()
- load_olive_varieties()
- read_json_files()
- load_single_model_and_scalers()
- save_single_model_and_scalers()
src/data/data_processor.py:
- preprocess_weather_data()
- prepare_solar_data()
- prepare_transformer_data()
- create_sequences()
- encode_techniques()
- decode_techniques()
src/data/data_simulator.py:
- simulate_zone()
- simulate_olive_production_parallel()
- calculate_weather_effect()
- calculate_water_need()
- add_olive_water_consumption_correlation()
src/features/temporal_features.py:
- add_time_features()
- get_season()
- get_time_period()
- create_time_based_features()
src/features/weather_features.py:
- add_solar_features()
- add_solar_specific_features()
- add_environmental_features()
- calculate_vpd()
- add_weather_indicators()
src/features/olive_features.py:
- create_technique_mapping()
- add_olive_features()
- calculate_stress_index()
- calculate_quality_indicators()
- add_production_features()
src/models/transformer.py:
- create_olive_oil_transformer()
- OliveTransformerBlock
- PositionalEncoding
- DataAugmentation
src/models/layers.py:
- MultiScaleAttention
- TemporalConvBlock
- WeatherEmbedding
- OliveVarietyEmbedding
src/models/callbacks.py:
- CustomCallback
- WarmUpLearningRateSchedule
- MetricLogger
- EarlyStoppingWithBest
src/models/training.py:
- compile_model()
- setup_transformer_training()
- train_transformer()
- retrain_model()
- create_callbacks()
src/visualization/plots.py:
- plot_variety_comparison()
- plot_efficiency_vs_production()
- plot_water_efficiency_vs_production()
- plot_water_need_vs_oil_production()
- save_plot()
src/visualization/dashboard.py:
- create_production_dashboard()
- create_weather_dashboard()
- create_efficiency_dashboard()
- update_dashboard_data()
- create_forecast_view()
src/utils/metrics.py:
- calculate_real_error()
- evaluate_model_performance()
- calculate_efficiency_metrics()
- calculate_forecast_accuracy()
- compute_confidence_intervals()
src/utils/helpers.py:
- get_optimal_workers()
- clean_column_name()
- clean_column_names()
- to_camel_case()
- get_full_data()
python -m model_train.create_train_dataset --random-seed 42 --num-simulations 100000 --batch-size 10000 --max-workers 7

View File

@ -1,21 +1,147 @@
import pandas as pd
import numpy as np
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing
import psutil
from tqdm import tqdm
import os
import argparse
import sys
def get_optimal_workers(memory_per_worker_gb=2, max_workers_cap=32):
    """Return a worker count sized to the machine's CPUs and free memory.

    The result is the smallest of three limits: the CPU count minus one,
    the number of workers that fit in the currently available memory at
    ``memory_per_worker_gb`` each, and ``max_workers_cap``. It is never
    lower than 1.

    Args:
        memory_per_worker_gb: Memory budget assumed for each worker, in GiB.
        max_workers_cap: Hard upper bound on the returned worker count.

    Returns:
        int: Number of workers to use (at least 1).

    Raises:
        ValueError: If ``memory_per_worker_gb`` is not positive.
    """
    if memory_per_worker_gb <= 0:
        raise ValueError("memory_per_worker_gb must be positive")

    cpu_count = multiprocessing.cpu_count()
    memory = psutil.virtual_memory()

    # psutil reports bytes; convert the free amount to GiB.
    available_memory_gb = memory.available / (1024 ** 3)
    max_workers_by_memory = int(available_memory_gb / memory_per_worker_gb)

    optimal_workers = min(
        cpu_count - 1,
        max_workers_by_memory,
        max_workers_cap,
    )

    print(f'CPU count : {cpu_count} - Memory : {memory} = Max Worker by memory : {max_workers_by_memory}')

    # On a single-core or memory-starved machine the minimum above can be
    # zero; always hand back at least one worker.
    return max(1, optimal_workers)
def generate_training_dataset(weather_data, olive_varieties, num_simulations=1000, random_seed=42):
def simulate_single_year(params):
    """Simulate one year of production for a randomly generated olive grove.

    Args:
        params: dict containing:
            - 'weather_annual': DataFrame of per-year weather aggregates;
              one row is sampled as the base year.
            - 'olive_varieties': DataFrame of variety data; the columns
              'Varietà di Olive' and 'Tecnica di Coltivazione' are read
              here, the rest is consumed by ``calculate_production``.
            - 'sim_id': integer simulation identifier.
            - 'random_seed': base seed for reproducibility.

    Returns:
        dict: One flat record with the simulation id, year, hectares,
        number of varieties, production/oil/water totals, the perturbed
        weather values (``weather_<column>``), the per-variety details
        (``variety_<k>_<field>``), per-hectare figures and the
        ``yield_efficiency`` / ``water_efficiency`` ratios.

    Raises:
        ValueError: If ``olive_varieties`` contains no varieties.
    """
    base_seed = params['random_seed'] + params['sim_id']
    np.random.seed(base_seed)

    # Pick the base year, then perturb every weather value.
    weather = params['weather_annual'].sample(n=1, random_state=base_seed).iloc[0].copy()

    # Apply an independent +/-20% variation to each weather column.
    for col in weather.index:
        if col != 'year':
            weather[col] *= np.random.uniform(0.8, 1.2)

    # Generate the grove characteristics.
    available_varieties = params['olive_varieties']['Varietà di Olive'].unique()
    if len(available_varieties) == 0:
        raise ValueError("olive_varieties contains no varieties")

    # Sampling without replacement cannot draw more varieties than exist,
    # so cap the usual 1-3 range at the number actually available.
    max_varieties = min(3, len(available_varieties))
    num_varieties = np.random.randint(1, max_varieties + 1)
    selected_varieties = np.random.choice(
        available_varieties,
        size=num_varieties,
        replace=False
    )
    hectares = np.random.uniform(1, 10)
    percentages = np.random.dirichlet(np.ones(num_varieties))

    annual_results = {
        'simulation_id': params['sim_id'],
        'year': weather['year'],
        'hectares': hectares,
        'num_varieties': num_varieties,
        'total_olive_production': 0,
        'total_oil_production': 0,
        'total_water_need': 0
    }

    # Add the weather data.
    for col in weather.index:
        if col != 'year':
            annual_results[f'weather_{col}'] = weather[col]

    variety_details = []
    for i, variety in enumerate(selected_varieties):
        variety_data = params['olive_varieties'][
            params['olive_varieties']['Varietà di Olive'] == variety
        ]
        technique = np.random.choice(variety_data['Tecnica di Coltivazione'].unique())
        percentage = percentages[i]

        variety_info = variety_data[
            variety_data['Tecnica di Coltivazione'] == technique
        ].iloc[0]

        # Derive a 32-bit seed from (random_seed, sim_id, variety index).
        # A plain `sim_id + i` would ignore random_seed and would give
        # variety i of one simulation the same seed as variety i-1 of the
        # next simulation, producing identical noise draws.
        variety_seed = int(
            np.random.SeedSequence(
                [params['random_seed'], params['sim_id'], i]
            ).generate_state(1)[0]
        )

        # Production figures with random variability.
        production_data = calculate_production(
            variety_info, weather, percentage, hectares,
            variety_seed
        )
        variety_details.append(production_data)

        # Update the totals.
        annual_results['total_olive_production'] += production_data['production']
        annual_results['total_oil_production'] += production_data['oil_production']
        annual_results['total_water_need'] += production_data['water_need']

    # Flatten the per-variety details into prefixed columns.
    for i, detail in enumerate(variety_details):
        prefix = f'variety_{i + 1}'
        for key, value in detail.items():
            annual_results[f'{prefix}_{key}'] = value

    # Per-hectare metrics.
    annual_results['olive_production_ha'] = annual_results['total_olive_production'] / hectares
    annual_results['oil_production_ha'] = annual_results['total_oil_production'] / hectares
    annual_results['water_need_ha'] = annual_results['total_water_need'] / hectares

    # Efficiency ratios, guarded against division by zero.
    if annual_results['total_olive_production'] > 0:
        annual_results['yield_efficiency'] = annual_results['total_oil_production'] / annual_results[
            'total_olive_production']
    else:
        annual_results['yield_efficiency'] = 0

    if annual_results['total_water_need'] > 0:
        annual_results['water_efficiency'] = annual_results['total_olive_production'] / annual_results[
            'total_water_need']
    else:
        annual_results['water_efficiency'] = 0

    return annual_results
def generate_training_dataset_parallel(weather_data, olive_varieties, num_simulations=1000,
random_seed=42, max_workers=None, batch_size=500,
output_path='olive_training_dataset.parquet'):
"""
Genera dataset di training utilizzando multiprocessing.
Args:
weather_data: DataFrame dati meteo
olive_varieties: DataFrame varietà olive
num_simulations: numero di simulazioni
random_seed: seed per riproducibilità
max_workers: numero massimo di workers
batch_size: dimensione batch
output_path: percorso file output
"""
np.random.seed(random_seed)
# Prepara dati meteorologici annuali
# Prepara dati meteo annuali
weather_annual = weather_data.groupby('year').agg({
'temp': ['mean', 'min', 'max', 'std'],
'humidity': ['mean', 'min', 'max'],
@ -24,159 +150,143 @@ def generate_training_dataset(weather_data, olive_varieties, num_simulations=100
'cloudcover': ['mean']
}).reset_index()
# Appiattisci i nomi delle colonne
weather_annual.columns = ['year'] + [
f'{col[0]}_{col[1]}' for col in weather_annual.columns[1:]
]
all_results = []
all_varieties = olive_varieties['Varietà di Olive'].unique()
# Calcola workers ottimali
if max_workers is None:
max_workers = get_optimal_workers()
with tqdm(total=num_simulations, desc="Generazione dataset") as pbar:
for sim in range(num_simulations):
# Seleziona anno base e applica variazioni
selected_year = np.random.choice(weather_annual['year'])
weather = weather_annual[weather_annual['year'] == selected_year].iloc[0].copy()
print(f"Utilizzando {max_workers} workers")
# Applica variazioni meteorologiche (±20%)
for col in weather.index:
if col != 'year':
weather[col] *= np.random.uniform(0.8, 1.2)
# Calcola numero di batch
num_batches = (num_simulations + batch_size - 1) // batch_size
print(f"Elaborazione di {num_simulations} simulazioni in {num_batches} batch")
# Genera caratteristiche dell'oliveto
num_varieties = np.random.randint(1, 4) # 1-3 varietà
selected_varieties = np.random.choice(all_varieties, size=num_varieties, replace=False)
hectares = np.random.uniform(1, 10)
percentages = np.random.dirichlet(np.ones(num_varieties))
# Crea directory output se necessario
os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else '.', exist_ok=True)
# Inizializza contatori per l'anno
annual_results = {
'simulation_id': sim + 1,
'year': selected_year,
'hectares': hectares,
'num_varieties': num_varieties,
'total_olive_production': 0,
'total_oil_production': 0,
'total_water_need': 0,
# Lista per contenere tutti i DataFrame dei batch
all_batches = []
for batch_num in range(num_batches):
start_sim = batch_num * batch_size
end_sim = min((batch_num + 1) * batch_size, num_simulations)
current_batch_size = end_sim - start_sim
batch_results = []
# Preparazione parametri per ogni simulazione
simulation_params = [
{
'weather_annual': weather_annual,
'olive_varieties': olive_varieties,
'sim_id': sim_id,
'random_seed': random_seed
}
for sim_id in range(start_sim, end_sim)
]
# Aggiungi dati meteorologici
for col in weather.index:
if col != 'year':
annual_results[f'weather_{col}'] = weather[col]
# Esegui simulazioni in parallelo
with ProcessPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(simulate_single_year, params)
for params in simulation_params]
# Simula per ogni varietà
variety_details = []
for i, variety in enumerate(selected_varieties):
# Seleziona tecnica di coltivazione
variety_data = olive_varieties[olive_varieties['Varietà di Olive'] == variety]
technique = np.random.choice(variety_data['Tecnica di Coltivazione'].unique())
percentage = percentages[i]
with tqdm(total=current_batch_size,
desc=f"Batch {batch_num + 1}/{num_batches}") as pbar:
for future in as_completed(futures):
try:
result = future.result()
batch_results.append(result)
pbar.update(1)
except Exception as e:
print(f"Errore in simulazione: {str(e)}")
continue
# Ottieni dati specifici varietà
variety_info = variety_data[
variety_data['Tecnica di Coltivazione'] == technique
].iloc[0]
# Converti risultati in DataFrame
batch_df = pd.DataFrame(batch_results)
all_batches.append(batch_df)
# Calcola produzione base con variabilità
base_variation = np.random.uniform(0.8, 1.2)
base_production = variety_info['Produzione (tonnellate/ettaro)'] * base_variation
# Libera memoria
del batch_results
# Applica effetti meteorologici
temp_effect = calculate_temperature_effect(
weather['temp_mean'],
variety_info['Temperatura Ottimale']
)
water_effect = calculate_water_effect(
weather['precip_sum'],
variety_info['Resistenza alla Siccità']
)
solar_effect = calculate_solar_effect(
weather['solarradiation_mean']
)
# Concatena tutti i batch e salva
final_df = pd.concat(all_batches, ignore_index=True)
final_df.to_parquet(output_path)
# Calcola produzione effettiva
actual_production = (
base_production *
temp_effect *
water_effect *
solar_effect *
percentage *
hectares
)
print(f"\nDataset salvato in: {output_path}")
# Calcola resa olio con variabilità
oil_yield = np.random.uniform(
variety_info['Min % Resa'],
variety_info['Max % Resa']
)
oil_production = actual_production * oil_yield
# Statistiche finali
print("\nStatistiche finali:")
print(f"Righe totali: {len(final_df)}")
print("\nAnalisi variabilità:")
for col in ['olive_production_ha', 'oil_production_ha', 'water_need_ha']:
cv = final_df[col].std() / final_df[col].mean()
print(f"{col}: CV = {cv:.2%}")
# Calcola fabbisogno idrico
base_water_need = (
variety_info['Fabbisogno Acqua Primavera (m³/ettaro)'] +
variety_info['Fabbisogno Acqua Estate (m³/ettaro)'] +
variety_info['Fabbisogno Acqua Autunno (m³/ettaro)'] +
variety_info['Fabbisogno Acqua Inverno (m³/ettaro)']
) / 4
# Adatta fabbisogno idrico alle condizioni
actual_water_need = (
base_water_need *
(1 + max(0, (weather['temp_mean'] - 20) / 50)) *
max(0.6, 1 - (weather['precip_sum'] / 1000)) *
percentage *
hectares
)
# Salva dettagli varietà
variety_details.append({
'variety': variety,
'technique': technique,
'percentage': percentage,
'production': actual_production,
'oil_production': oil_production,
'water_need': actual_water_need,
'yield': oil_yield,
'base_production': base_production,
'temp_effect': temp_effect,
'water_effect': water_effect,
'solar_effect': solar_effect
})
# Aggiorna totali annuali
annual_results['total_olive_production'] += actual_production
annual_results['total_oil_production'] += oil_production
annual_results['total_water_need'] += actual_water_need
# Calcola metriche per ettaro
annual_results['olive_production_ha'] = annual_results['total_olive_production'] / hectares
annual_results['oil_production_ha'] = annual_results['total_oil_production'] / hectares
annual_results['water_need_ha'] = annual_results['total_water_need'] / hectares
# Aggiungi KPI di efficienza
annual_results['yield_efficiency'] = annual_results['total_oil_production'] / annual_results[
'total_olive_production']
annual_results['water_efficiency'] = annual_results['total_olive_production'] / annual_results[
'total_water_need']
# Aggiungi dettagli varietà al risultato
for i, detail in enumerate(variety_details):
prefix = f'variety_{i + 1}'
for key, value in detail.items():
annual_results[f'{prefix}_{key}'] = value
all_results.append(annual_results)
pbar.update(1)
# Crea DataFrame finale
df = pd.DataFrame(all_results)
return df
return final_df
def calculate_production(variety_info, weather, percentage, hectares, seed):
    """Compute olive output, oil output and water demand for one variety.

    Reseeds NumPy's global random generator with ``seed`` before drawing
    any random factor, so the global random state is changed as a side
    effect.

    Args:
        variety_info: Row describing one variety/technique combination;
            read by column name (production, optimal temperature, drought
            resistance, oil-yield bounds, seasonal water needs).
        weather: Yearly weather values; 'temp_mean', 'precip_sum' and
            'solarradiation_mean' are read.
        percentage: Share of the grove planted with this variety.
        hectares: Total grove surface.
        seed: Seed passed to ``np.random.seed``.

    Returns:
        dict: variety, technique, percentage, production, oil_production,
        water_need, the three environmental factors and the drawn oil yield.
    """
    np.random.seed(seed)

    # Nominal output scaled to this variety's share of the grove, then
    # jittered by +/-20%.
    nominal_yield = variety_info['Produzione (tonnellate/ettaro)']
    scaled_output = nominal_yield * percentage * hectares
    scaled_output = scaled_output * np.random.uniform(0.8, 1.2)

    # Environmental factors (each helper draws from the global RNG, so the
    # call order below must not change).
    t_factor = calculate_temperature_effect(
        weather['temp_mean'], variety_info['Temperatura Ottimale']
    )
    w_factor = calculate_water_effect(
        weather['precip_sum'], variety_info['Resistenza alla Siccità']
    )
    s_factor = calculate_solar_effect(weather['solarradiation_mean'])
    harvested = scaled_output * t_factor * w_factor * s_factor

    # Oil output.
    # NOTE(review): the drawn yield is used as a plain multiplier; confirm
    # that 'Min % Resa' / 'Max % Resa' are stored as fractions.
    yield_low = variety_info['Min % Resa']
    yield_high = variety_info['Max % Resa']
    oil_fraction = np.random.uniform(yield_low, yield_high)
    oil_output = harvested * oil_fraction

    # Water demand: mean of the four seasonal needs, scaled to the planted
    # share, then adjusted for temperature and rainfall.
    spring = variety_info['Fabbisogno Acqua Primavera (m³/ettaro)']
    summer = variety_info['Fabbisogno Acqua Estate (m³/ettaro)']
    autumn = variety_info['Fabbisogno Acqua Autunno (m³/ettaro)']
    winter = variety_info['Fabbisogno Acqua Inverno (m³/ettaro)']
    share_water = (spring + summer + autumn + winter) / 4 * percentage * hectares

    # Demand grows with mean temperature above 20 and shrinks with
    # rainfall, but never below 60% of the base figure.
    heat_multiplier = 1 + max(0, (weather['temp_mean'] - 20) / 50)
    rain_multiplier = max(0.6, 1 - (weather['precip_sum'] / 1000))
    irrigation = share_water * heat_multiplier * rain_multiplier

    outcome = {}
    outcome['variety'] = variety_info['Varietà di Olive']
    outcome['technique'] = variety_info['Tecnica di Coltivazione']
    outcome['percentage'] = percentage
    outcome['production'] = harvested
    outcome['oil_production'] = oil_output
    outcome['water_need'] = irrigation
    outcome['temp_effect'] = t_factor
    outcome['water_effect'] = w_factor
    outcome['solar_effect'] = s_factor
    outcome['yield'] = oil_fraction
    return outcome
# Funzioni di effetto ambientale rimangono invariate
def calculate_temperature_effect(temp, optimal_temp):
"""Calcola effetto temperatura con variabilità"""
temp_diff = abs(temp - optimal_temp)
if temp_diff <= 5:
return np.random.uniform(0.95, 1.0)
@ -187,7 +297,6 @@ def calculate_temperature_effect(temp, optimal_temp):
def calculate_water_effect(precip, drought_resistance):
"""Calcola effetto precipitazioni con variabilità"""
if 'alta' in str(drought_resistance).lower():
min_precip = 20
elif 'media' in str(drought_resistance).lower():
@ -203,7 +312,6 @@ def calculate_water_effect(precip, drought_resistance):
def calculate_solar_effect(radiation):
"""Calcola effetto radiazione solare con variabilità"""
if radiation >= 200:
return np.random.uniform(0.95, 1.0)
else:
@ -211,16 +319,94 @@ def calculate_solar_effect(radiation):
return base_factor * np.random.uniform(0.8, 1.2)
# Test del codice
def _positive_int(value):
    """argparse type: parse *value* as an integer and reject anything below 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid integer value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def parse_arguments(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: With attributes ``random_seed``,
        ``num_simulations``, ``batch_size``, ``output_path`` and
        ``max_workers``.

    Raises:
        SystemExit: Raised by argparse on invalid arguments, including
            non-positive simulation, batch or worker counts.
    """
    parser = argparse.ArgumentParser(
        description='Generatore dataset di training per produzione olive',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter  # show default values in --help
    )

    parser.add_argument(
        '--random-seed',
        type=int,
        default=42,
        help='Seed per la riproducibilità dei risultati'
    )

    # The counts below are validated as positive: a zero batch size would
    # otherwise cause a division by zero when the number of batches is
    # computed.
    parser.add_argument(
        '--num-simulations',
        type=_positive_int,
        default=1000000,
        help='Numero totale di simulazioni da eseguire'
    )

    parser.add_argument(
        '--batch-size',
        type=_positive_int,
        default=10000,
        help='Dimensione di ogni batch di simulazioni'
    )

    parser.add_argument(
        '--output-path',
        type=str,
        default='./sources/olive_training_dataset.parquet',
        help='Percorso del file di output'
    )

    parser.add_argument(
        '--max-workers',
        type=_positive_int,
        default=2,
        help='Quantità di workers'
    )

    return parser.parse_args(argv)
# Esempio di utilizzo
if __name__ == "__main__":
print("Generazione dataset di training...")
# Parsing argomenti
args = parse_arguments()
# Carica dati
weather_data = pd.read_parquet('./sources/weather_data_complete.parquet')
olive_varieties = pd.read_parquet('./sources/olive_varieties.parquet')
try:
# Carica dati
weather_data = pd.read_parquet('./sources/weather_data_complete.parquet')
olive_varieties = pd.read_parquet('./sources/olive_varieties.parquet')
except Exception as e:
print(f"Errore nel caricamento dei dati: {str(e)}")
sys.exit(1)
# Stampa configurazione
print("\nConfigurazione:")
print(f"Random seed: {args.random_seed}")
print(f"Numero simulazioni: {args.num_simulations:,}")
print(f"Workers: {args.max_workers:,}")
print(f"Dimensione batch: {args.batch_size:,}")
print(f"File output: {args.output_path}")
# Genera dataset
df = generate_training_dataset(weather_data, olive_varieties, 100000)
try:
df = generate_training_dataset_parallel(
weather_data=weather_data,
olive_varieties=olive_varieties,
random_seed=args.random_seed,
num_simulations=args.num_simulations,
batch_size=args.batch_size,
output_path=args.output_path,
max_workers=args.max_workers
)
except Exception as e:
print(f"Errore durante la generazione del dataset: {str(e)}")
sys.exit(1)
print("\nShape dataset:", df.shape)
print("\nColonne disponibili:")
@ -235,6 +421,4 @@ if __name__ == "__main__":
cv = df[col].std() / df[col].mean()
print(f"{col}: {cv:.2%}")
# Salva dataset
df.to_parquet('./sources/olive_training_dataset.parquet', index=False)
print("\nDataset salvato come 'olive_training_dataset.parquet'")
print("\nDataset salvato './sources/olive_training_dataset.parquet'")