tesi-pegaso/olive-oil-production-analysis-notebook_reset.ipynb

{
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.10.14",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"kaggle": {
"accelerator": "gpu",
"dataSources": [
{
"sourceId": 9725208,
"sourceType": "datasetVersion",
"datasetId": 5950719
},
{
"sourceId": 9730815,
"sourceType": "datasetVersion",
"datasetId": 5954901
}
],
"dockerImageVersionId": 30787,
"isInternetEnabled": true,
"language": "python",
"sourceType": "notebook",
"isGpuEnabled": true
}
},
"nbformat_minor": 4,
"nbformat": 4,
"cells": [
{
"cell_type": "markdown",
"source": "# Analisi e Previsione della Produzione di Olio d'Oliva\n\nQuesto notebook esplora la relazione tra i dati meteorologici e la produzione annuale di olio d'oliva, con l'obiettivo di creare un modello predittivo.",
"metadata": {}
},
{
"cell_type": "code",
"source": "import os\n\n# Rimuove il file se esiste\nif os.path.exists('output.zip'):\n os.remove('output.zip')\n \n!zip -r output.zip /kaggle/working/",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T22:02:37.712657Z",
"iopub.execute_input": "2024-10-28T22:02:37.713572Z",
"iopub.status.idle": "2024-10-28T22:05:13.962589Z",
"shell.execute_reply.started": "2024-10-28T22:02:37.713526Z",
"shell.execute_reply": "2024-10-28T22:05:13.961548Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"source": "import tensorflow as tf\n\nprint(f\"Keras version: {tf.keras.__version__}\")\nprint(f\"TensorFlow version: {tf.__version__}\")\n\n# GPU configuration\ngpus = tf.config.experimental.list_physical_devices('GPU')\nif gpus:\n try:\n for gpu in gpus:\n tf.config.experimental.set_memory_growth(gpu, True)\n logical_gpus = tf.config.experimental.list_logical_devices('GPU')\n print(len(gpus), \"Physical GPUs,\", len(logical_gpus), \"Logical GPUs\")\n except RuntimeError as e:\n print(e)",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T18:17:11.580753Z",
"iopub.execute_input": "2024-10-28T18:17:11.581122Z",
"iopub.status.idle": "2024-10-28T18:17:11.883020Z",
"shell.execute_reply.started": "2024-10-28T18:17:11.581083Z",
"shell.execute_reply": "2024-10-28T18:17:11.881838Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"source": "# Test semplice per verificare che la GPU funzioni\ndef test_gpu():\n print(\"TensorFlow version:\", tf.__version__)\n print(\"\\nDispositivi disponibili:\")\n print(tf.config.list_physical_devices())\n\n # Creiamo e moltiplichiamo due tensori sulla GPU\n with tf.device('/GPU:0'):\n a = tf.random.normal([10000, 10000])\n b = tf.random.normal([10000, 10000])\n c = tf.matmul(a, b)\n\n print(\"\\nShape del risultato:\", c.shape)\n print(\"Device del tensore:\", c.device)\n return \"Test completato con successo!\"\n\n\ntest_gpu()",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T18:17:14.607427Z",
"iopub.execute_input": "2024-10-28T18:17:14.608081Z",
"iopub.status.idle": "2024-10-28T18:17:14.758117Z",
"shell.execute_reply.started": "2024-10-28T18:17:14.608043Z",
"shell.execute_reply": "2024-10-28T18:17:14.757247Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"source": "!pip install numpy\n!pip install pandas\n\n!pip install keras\n!pip install scikit-learn\n!pip install matplotlib\n!pip install joblib\n!pip install pyarrow\n!pip install fastparquet\n!pip install scipy\n!pip install seaborn\n!pip install tqdm",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T06:41:20.930792Z",
"iopub.execute_input": "2024-10-28T06:41:20.931152Z",
"iopub.status.idle": "2024-10-28T06:43:20.309151Z",
"shell.execute_reply.started": "2024-10-28T06:41:20.931111Z",
"shell.execute_reply": "2024-10-28T06:43:20.308064Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"source": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import MinMaxScaler, StandardScaler\nfrom tensorflow.keras.layers import Input, Dense, Dropout, Bidirectional, LSTM, LayerNormalization, Add, Activation, BatchNormalization, MultiHeadAttention, MaxPooling1D\nfrom tensorflow.keras.models import Model\nfrom tensorflow.keras.regularizers import l2\nfrom tensorflow.keras.optimizers import Adam\nfrom tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau\nfrom sklearn.metrics import mean_squared_error, mean_absolute_error\nfrom datetime import datetime\nimport os\nimport json\nimport joblib\nimport re\nimport pyarrow as pa\nimport pyarrow.parquet as pq\nfrom tqdm import tqdm\n\nrandom_state_value = 42",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T18:17:19.003246Z",
"iopub.execute_input": "2024-10-28T18:17:19.004112Z",
"iopub.status.idle": "2024-10-28T18:17:19.020415Z",
"shell.execute_reply.started": "2024-10-28T18:17:19.004072Z",
"shell.execute_reply": "2024-10-28T18:17:19.019550Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"source": "## Funzioni di Plot",
"metadata": {}
},
{
"cell_type": "code",
"source": "def save_plot(plt, title, output_dir='/kaggle/working/plots'):\n os.makedirs(output_dir, exist_ok=True)\n filename = \"\".join(x for x in title if x.isalnum() or x in [' ', '-', '_']).rstrip()\n filename = filename.replace(' ', '_').lower()\n filepath = os.path.join(output_dir, f\"{filename}.png\")\n plt.savefig(filepath, bbox_inches='tight', dpi=300)\n print(f\"Plot salvato come: {filepath}\")",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T18:17:22.785465Z",
"iopub.execute_input": "2024-10-28T18:17:22.785841Z",
"iopub.status.idle": "2024-10-28T18:17:22.792190Z",
"shell.execute_reply.started": "2024-10-28T18:17:22.785803Z",
"shell.execute_reply": "2024-10-28T18:17:22.791093Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
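{
"cell_type": "markdown",
"source": "A minimal usage sketch for `save_plot` (the figure contents are invented for illustration). Passing the `plt` module works because `savefig` is exposed at module level; call it before `plt.show()` so the figure is still open when it is written to disk.\n\n```python\nplt.figure(figsize=(6, 4))\nplt.plot([1, 2, 3], [2, 4, 8])\nsave_plot(plt, 'example trend', '/kaggle/working/plots')\nplt.show()\nplt.close()\n```",
"metadata": {}
},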
{
"cell_type": "markdown",
"source": "## 1. Caricamento e preparazione dei Dati Meteo",
"metadata": {}
},
{
"cell_type": "code",
"source": "# Function to convert csv to parquet\ndef csv_to_parquet(csv_file, parquet_file, chunksize=100000):\n writer = None\n\n for chunk in pd.read_csv(csv_file, chunksize=chunksize):\n if writer is None:\n\n table = pa.Table.from_pandas(chunk)\n writer = pq.ParquetWriter(parquet_file, table.schema)\n else:\n table = pa.Table.from_pandas(chunk)\n\n writer.write_table(table)\n\n if writer:\n writer.close()\n\n print(f\"File conversion completed : {csv_file} -> {parquet_file}\")\n\n\ndef read_json_files(folder_path):\n all_data = []\n\n file_list = sorted(os.listdir(folder_path))\n\n for filename in file_list:\n if filename.endswith('.json'):\n file_path = os.path.join(folder_path, filename)\n try:\n with open(file_path, 'r') as file:\n data = json.load(file)\n all_data.extend(data['days'])\n except Exception as e:\n print(f\"Error processing file '{filename}': {str(e)}\")\n\n return all_data\n\n\ndef create_weather_dataset(data):\n dataset = []\n seen_datetimes = set()\n\n for day in data:\n date = day['datetime']\n for hour in day['hours']:\n datetime_str = f\"{date} {hour['datetime']}\"\n\n # Verifico se questo datetime è già stato visto\n if datetime_str in seen_datetimes:\n continue\n\n seen_datetimes.add(datetime_str)\n\n if isinstance(hour['preciptype'], list):\n preciptype = \"__\".join(hour['preciptype'])\n else:\n preciptype = hour['preciptype'] if hour['preciptype'] else \"\"\n\n conditions = hour['conditions'].replace(', ', '__').replace(' ', '_').lower()\n\n row = {\n 'datetime': datetime_str,\n 'temp': hour['temp'],\n 'feelslike': hour['feelslike'],\n 'humidity': hour['humidity'],\n 'dew': hour['dew'],\n 'precip': hour['precip'],\n 'snow': hour['snow'],\n 'preciptype': preciptype.lower(),\n 'windspeed': hour['windspeed'],\n 'winddir': hour['winddir'],\n 'pressure': hour['pressure'],\n 'cloudcover': hour['cloudcover'],\n 'visibility': hour['visibility'],\n 'solarradiation': hour['solarradiation'],\n 'solarenergy': hour['solarenergy'],\n 'uvindex': hour['uvindex'],\n 'conditions': conditions,\n 'tempmax': day['tempmax'],\n 'tempmin': day['tempmin'],\n 'precipprob': day['precipprob'],\n 'precipcover': day['precipcover']\n }\n dataset.append(row)\n\n dataset.sort(key=lambda x: datetime.strptime(x['datetime'], \"%Y-%m-%d %H:%M:%S\"))\n\n return pd.DataFrame(dataset)\n\n\nfolder_path = './data/weather'\n#raw_data = read_json_files(folder_path)\n#weather_data = create_weather_dataset(raw_data)\n#weather_data['datetime'] = pd.to_datetime(weather_data['datetime'], errors='coerce')\n#weather_data['date'] = weather_data['datetime'].dt.date\n#weather_data = weather_data.dropna(subset=['datetime'])\n#weather_data['datetime'] = pd.to_datetime(weather_data['datetime'])\n#weather_data['year'] = weather_data['datetime'].dt.year\n#weather_data['month'] = weather_data['datetime'].dt.month\n#weather_data['day'] = weather_data['datetime'].dt.day\n#weather_data.head()\n\n#weather_data.to_parquet('./data/weather_data.parquet')",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T06:43:20.340958Z",
"iopub.execute_input": "2024-10-28T06:43:20.341378Z",
"iopub.status.idle": "2024-10-28T06:43:20.358973Z",
"shell.execute_reply.started": "2024-10-28T06:43:20.341335Z",
"shell.execute_reply": "2024-10-28T06:43:20.358037Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
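{
"cell_type": "markdown",
"source": "A tiny synthetic payload (all values invented) in the per-day/per-hour layout that `read_json_files` and `create_weather_dataset` expect, useful for checking the parsing logic without the raw weather files:\n\n```python\nsample_days = [{\n    'datetime': '2020-06-01',\n    'tempmax': 28.0, 'tempmin': 16.0, 'precipprob': 10.0, 'precipcover': 0.0,\n    'hours': [{\n        'datetime': '12:00:00', 'temp': 25.0, 'feelslike': 26.0, 'humidity': 55.0,\n        'dew': 15.0, 'precip': 0.0, 'snow': 0.0, 'preciptype': None,\n        'windspeed': 12.0, 'winddir': 180.0, 'pressure': 1015.0, 'cloudcover': 20.0,\n        'visibility': 10.0, 'solarradiation': 700.0, 'solarenergy': 2.5,\n        'uvindex': 7, 'conditions': 'Clear'\n    }]\n}]\n\ndemo_df = create_weather_dataset(sample_days)\nprint(demo_df[['datetime', 'temp', 'conditions']])\n```",
"metadata": {}
},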
{
"cell_type": "markdown",
"source": "",
"metadata": {}
},
{
"cell_type": "code",
"source": "# Crea le sequenze per LSTM\ndef create_sequences(timesteps, X, y=None):\n \"\"\"\n Crea sequenze temporali dai dati.\n \n Parameters:\n -----------\n X : array-like\n Dati di input\n timesteps : int\n Numero di timestep per ogni sequenza\n y : array-like, optional\n Target values. Se None, crea sequenze solo per X\n \n Returns:\n --------\n tuple o array\n Se y è fornito: (X_sequences, y_sequences)\n Se y è None: X_sequences\n \"\"\"\n Xs = []\n for i in range(len(X) - timesteps):\n Xs.append(X[i:i + timesteps])\n\n if y is not None:\n ys = []\n for i in range(len(X) - timesteps):\n ys.append(y[i + timesteps])\n return np.array(Xs), np.array(ys)\n\n return np.array(Xs)\n\n\n# Funzioni per costruire il modello LSTM avanzato\ndef create_residual_lstm_layer(x, units, dropout_rate, l2_reg=0.01, return_sequences=True):\n residual = x\n x = Bidirectional(LSTM(units, return_sequences=return_sequences, kernel_regularizer=l2(l2_reg)))(x)\n x = LayerNormalization()(x)\n x = Dropout(dropout_rate)(x)\n # Adjust residual dimension and handle return_sequences\n if return_sequences:\n if int(residual.shape[-1]) != 2 * units:\n residual = Dense(2 * units, activation='linear')(residual)\n x = Add()([x, residual])\n return x\n\n\ndef attention_block(x, units, num_heads=8):\n attention = MultiHeadAttention(num_heads=num_heads, key_dim=units)(x, x)\n x = Add()([x, attention])\n x = LayerNormalization()(x)\n return x\n\n\ndef build_advanced_model(input_shape, l2_lambda=0.005):\n inputs = Input(shape=input_shape)\n\n # Primi due layer LSTM con sequenze\n x = create_residual_lstm_layer(inputs, 64, 0.2, l2_lambda, return_sequences=True)\n x = create_residual_lstm_layer(x, 32, 0.2, l2_lambda, return_sequences=True)\n\n # Attention e MaxPooling mentre abbiamo ancora la sequenza\n x = attention_block(x, 32, num_heads=8)\n x = MaxPooling1D()(x)\n\n # Ultimo layer LSTM senza sequenze\n x = create_residual_lstm_layer(x, 16, 0.1, l2_lambda, return_sequences=False)\n\n # Dense layers\n x = Dense(32, kernel_regularizer=l2(l2_lambda))(x)\n x = BatchNormalization()(x)\n x = Activation('swish')(x)\n x = Dropout(0.1)(x)\n\n x = Dense(16, kernel_regularizer=l2(l2_lambda))(x)\n x = BatchNormalization()(x)\n x = Activation('swish')(x)\n x = Dropout(0.1)(x)\n\n outputs = Dense(1, kernel_regularizer=l2(l2_lambda))(x)\n\n model = Model(inputs=inputs, outputs=outputs)\n return model\n\n\ndef get_season(date):\n month = date.month\n day = date.day\n if (month == 12 and day >= 21) or (month <= 3 and day < 20):\n return 'Winter'\n elif (month == 3 and day >= 20) or (month <= 6 and day < 21):\n return 'Spring'\n elif (month == 6 and day >= 21) or (month <= 9 and day < 23):\n return 'Summer'\n elif (month == 9 and day >= 23) or (month <= 12 and day < 21):\n return 'Autumn'\n else:\n return 'Unknown'\n\n\ndef get_time_period(hour):\n if 5 <= hour < 12:\n return 'Morning'\n elif 12 <= hour < 17:\n return 'Afternoon'\n elif 17 <= hour < 21:\n return 'Evening'\n else:\n return 'Night'\n\n\ndef add_time_features(df):\n df['datetime'] = pd.to_datetime(df['datetime'])\n df['timestamp'] = df['datetime'].astype(np.int64) // 10 ** 9\n df['year'] = df['datetime'].dt.year\n df['month'] = df['datetime'].dt.month\n df['day'] = df['datetime'].dt.day\n df['hour'] = df['datetime'].dt.hour\n df['minute'] = df['datetime'].dt.minute\n df['hour_sin'] = np.sin(df['hour'] * (2 * np.pi / 24))\n df['hour_cos'] = np.cos(df['hour'] * (2 * np.pi / 24))\n df['day_of_week'] = df['datetime'].dt.dayofweek\n df['day_of_year'] = df['datetime'].dt.dayofyear\n 
df['week_of_year'] = df['datetime'].dt.isocalendar().week.astype(int)\n df['quarter'] = df['datetime'].dt.quarter\n df['is_month_end'] = df['datetime'].dt.is_month_end.astype(int)\n df['is_quarter_end'] = df['datetime'].dt.is_quarter_end.astype(int)\n df['is_year_end'] = df['datetime'].dt.is_year_end.astype(int)\n df['month_sin'] = np.sin(df['month'] * (2 * np.pi / 12))\n df['month_cos'] = np.cos(df['month'] * (2 * np.pi / 12))\n df['day_of_year_sin'] = np.sin(df['day_of_year'] * (2 * np.pi / 365.25))\n df['day_of_year_cos'] = np.cos(df['day_of_year'] * (2 * np.pi / 365.25))\n df['season'] = df['datetime'].apply(get_season)\n df['time_period'] = df['hour'].apply(get_time_period)\n return df\n\n\n# Carica il dataset\nweather_data = pd.read_parquet('/kaggle/input/olive-oil/weather_data.parquet')\n\n# Aggiungi le caratteristiche temporali\nweather_data = add_time_features(weather_data)\n\n# Encoding delle variabili categoriali\nweather_data = pd.get_dummies(weather_data, columns=['season', 'time_period'], drop_first=True)\n\nweather_data.to_parquet('/kaggle/working/weather_data_extended.parquet')\n\n# Dividi i dati in quelli dopo il 2010 e quelli prima del 2010\ndata_after_2010 = weather_data[weather_data['year'] >= 2010].copy()\ndata_before_2010 = weather_data[weather_data['year'] < 2010].copy()\n\n# Aggiorna le target variables se necessario\ntarget_variables = ['solarradiation', 'solarenergy', 'uvindex']\n\n# Seleziona le features\nfeatures = [\n 'temp', 'tempmin', 'tempmax', 'humidity', 'cloudcover', 'windspeed', 'pressure', 'visibility',\n 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'day_of_year_sin', 'day_of_year_cos',\n ] + [col for col in weather_data.columns if 'season_' in col or 'time_period_' in col]\n\n# Prepara data_after_2010\ndata_after_2010 = data_after_2010.sort_values('datetime')\ndata_after_2010.set_index('datetime', inplace=True)\n\n# Interpola eventuali valori mancanti nelle variabili target\ncolumns_to_interpolate = target_variables\nfor column in columns_to_interpolate:\n data_after_2010[column] = data_after_2010[column].interpolate(method='time')\n\n# Rimuovi eventuali valori mancanti residui\ndata_after_2010.dropna(subset=features + target_variables, inplace=True)\n\n# Crea X e y\nX = data_after_2010[features].values\ny = data_after_2010[target_variables].values\n\n# Normalizza le features\nscaler_X = MinMaxScaler()\nX_scaled = scaler_X.fit_transform(X)\n\n\ndef prepare_multi_target_datasets(X_scaled, y, target_variables):\n \"\"\"\n Prepara dataset separati per ogni target variable e restituisce anche gli scaler\n per un uso successivo in fase di predizione.\n \n Parameters:\n -----------\n X_scaled : numpy.ndarray\n Features già scalate\n y : numpy.ndarray\n Target variables (matrice con una colonna per ogni target)\n target_variables : list\n Lista dei nomi delle variabili target\n \n Returns:\n --------\n tuple (dict, dict)\n - Primo dict: contiene i dataset per ogni target\n - Secondo dict: contiene gli scaler per ogni target\n \"\"\"\n\n # Inizializza i dizionari per contenere i dataset e gli scaler\n train_datasets = {}\n scalers_dict = {}\n\n # Scala e splitta i dati per ogni target\n for i, target in enumerate(target_variables):\n # Scala il target corrente\n scaler = MinMaxScaler()\n y_scaled_current = scaler.fit_transform(y[:, i].reshape(-1, 1)).flatten()\n scalers_dict[target] = scaler\n\n # Split dei dati per il target corrente\n X_train_full, X_test, y_train_full, y_test = train_test_split(\n X_scaled,\n y_scaled_current,\n test_size=0.2,\n 
shuffle=False\n )\n\n # Ulteriore split per validation\n X_train, X_val, y_train, y_val = train_test_split(\n X_train_full,\n y_train_full,\n test_size=0.2,\n shuffle=False\n )\n\n # Salva i dataset per questo target\n train_datasets[target] = {\n 'X_train': X_train,\n 'X_val': X_val,\n 'X_test': X_test,\n 'y_train': y_train.reshape(-1, 1),\n 'y_val': y_val.reshape(-1, 1),\n 'y_test': y_test.reshape(-1, 1)\n }\n\n return train_datasets, scalers_dict\n\n\ndatasets, scalers = prepare_multi_target_datasets(X_scaled, y, target_variables)",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T06:43:20.360560Z",
"iopub.execute_input": "2024-10-28T06:43:20.361353Z",
"iopub.status.idle": "2024-10-28T06:43:23.932182Z",
"shell.execute_reply.started": "2024-10-28T06:43:20.361319Z",
"shell.execute_reply": "2024-10-28T06:43:23.931121Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
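{
"cell_type": "markdown",
"source": "A quick shape check for `create_sequences` on synthetic arrays (sizes are illustrative): with 10 rows and `timesteps=3` it returns 7 sliding windows of shape `(3, n_features)` plus the 7 targets aligned to the step that follows each window.\n\n```python\nimport numpy as np\n\nX_demo = np.arange(20).reshape(10, 2)  # 10 timesteps, 2 features\ny_demo = np.arange(10)                 # one target per timestep\nX_seq, y_seq = create_sequences(3, X_demo, y_demo)\nprint(X_seq.shape, y_seq.shape)        # (7, 3, 2) (7,)\n```",
"metadata": {}
},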
{
"cell_type": "code",
"source": "# numero di timesteps (utilizziamo le ultime 24 ore)\ntimesteps = 24\n\n# Costruisci il modello per ogni variabile target\nmodels = {}\nhistories = {}\nfor i, target in enumerate(target_variables):\n target_data = datasets[target]\n target_scaler = scalers[target]\n\n X_train = target_data['X_train']\n y_train = target_data['y_train']\n X_val = target_data['X_val']\n y_val = target_data['y_val']\n X_test = target_data['X_test']\n y_test = target_data['y_test']\n\n num_features = X_train.shape[1]\n\n X_train_seq, y_train_seq = create_sequences(timesteps, X_train, y_train)\n X_val_seq, y_val_seq = create_sequences(timesteps, X_val, y_val)\n X_test_seq, y_test_seq = create_sequences(timesteps, X_test, y_test)\n\n print(X_train_seq.shape, y_train_seq.shape)\n print(X_val_seq.shape, y_val_seq.shape)\n print(X_test_seq.shape, y_test_seq.shape)\n\n print(f\"Addestramento del modello per: {target}\")\n model = build_advanced_model((timesteps, num_features), l2_lambda=0.001)\n optimizer = Adam(learning_rate=0.001, clipnorm=1.0)\n model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])\n early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)\n\n reduce_lr = ReduceLROnPlateau(\n monitor='val_loss',\n factor=0.5,\n patience=5,\n min_lr=1e-6\n )\n \n \n history = model.fit(\n X_train_seq, y_train_seq,\n validation_data=(X_val_seq, y_val_seq),\n epochs=50,\n batch_size=180,\n callbacks=[\n early_stopping,\n reduce_lr,\n # Model Checkpoint\n tf.keras.callbacks.ModelCheckpoint(\n filepath='/kaggle/working/{target}/best_model_{epoch:02d}_{val_loss:.4f}.keras',\n monitor='val_loss',\n save_best_only=True,\n mode='min'\n ),\n # TensorBoard logging\n tf.keras.callbacks.TensorBoard(\n log_dir='/kaggle/working/{target}/logs',\n histogram_freq=1,\n write_graph=True,\n update_freq='epoch'\n )],\n verbose=1\n )\n test_loss = model.evaluate(X_test_seq, y_test_seq)\n mse, mae = test_loss\n print(f'Test MSE per {target}: {mse:.4f}')\n print(f'Test MAE per {target}: {mae:.4f}')\n models[target] = model\n histories[target] = history\n",
"metadata": {
"jupyter": {
"is_executing": true
},
"execution": {
"iopub.status.busy": "2024-10-28T06:43:23.933505Z",
"iopub.execute_input": "2024-10-28T06:43:23.933839Z",
"iopub.status.idle": "2024-10-28T07:14:03.118634Z",
"shell.execute_reply.started": "2024-10-28T06:43:23.933806Z",
"shell.execute_reply": "2024-10-28T07:14:03.117747Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
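{
"cell_type": "markdown",
"source": "A minimal sketch, assuming the training cell above has populated `histories`, for inspecting the training and validation loss curves of each target's model:\n\n```python\nfor target, history in histories.items():\n    plt.figure(figsize=(8, 4))\n    plt.plot(history.history['loss'], label='train')\n    plt.plot(history.history['val_loss'], label='validation')\n    plt.title(f'Loss - {target}')\n    plt.xlabel('epoch')\n    plt.ylabel('mse')\n    plt.legend()\n    save_plot(plt, f'loss_{target}', '/kaggle/working/plots')\n    plt.show()\n    plt.close()\n```",
"metadata": {}
},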
{
"cell_type": "code",
"source": "def save_models_and_scalers(models, scaler_X, scalers_y, target_variables, base_path='/kaggle/working/models'):\n \"\"\"\n Salva i modelli e gli scaler nella cartella models.\n \n Parameters:\n -----------\n models : dict\n Dizionario contenente i modelli per ogni variabile target\n scaler_X : MinMaxScaler\n Scaler unico per tutte le feature di input\n scalers_y : dict\n Dizionario contenente gli scaler per le variabili target\n target_variables : list\n Lista delle variabili target\n base_path : str\n Percorso base dove salvare i modelli (default: 'models')\n \"\"\"\n\n # Crea la cartella se non esiste\n os.makedirs(base_path, exist_ok=True)\n\n # Salva lo scaler X generale\n scaler_x_path = os.path.join(base_path, 'scaler_x.joblib')\n joblib.dump(scaler_X, scaler_x_path)\n\n # Salva i modelli e gli scaler Y per ogni variabile target\n for target in target_variables:\n # Crea una sottocartella per ogni target\n target_path = os.path.join(base_path, target)\n os.makedirs(target_path, exist_ok=True)\n\n # Salva il modello\n model_path = os.path.join(target_path, 'model.joblib')\n joblib.dump(models[target], model_path)\n\n # Salva lo scaler Y\n scaler_y_path = os.path.join(target_path, 'scaler_y.joblib')\n joblib.dump(scalers_y[target], scaler_y_path)\n\n # Salva la lista delle variabili target\n target_vars_path = os.path.join(base_path, 'target_variables.joblib')\n joblib.dump(target_variables, target_vars_path)\n\n print(f\"Modelli e scaler salvati in: {base_path}\")\n\n\ndef load_models_and_scalers(base_path='/kaggle/working/models'):\n \"\"\"\n Carica i modelli e gli scaler dalla cartella models.\n \n Parameters:\n -----------\n base_path : str\n Percorso della cartella contenente i modelli salvati (default: 'models')\n \n Returns:\n --------\n tuple\n (models, scaler_X, scalers_y, target_variables)\n \"\"\"\n\n # Carica la lista delle variabili target\n target_vars_path = os.path.join(base_path, 'target_variables.joblib')\n target_variables = joblib.load(target_vars_path)\n\n # Carica lo scaler X generale\n scaler_x_path = os.path.join(base_path, 'scaler_x.joblib')\n scaler_X = joblib.load(scaler_x_path)\n\n # Inizializza i dizionari\n models = {}\n scalers_y = {}\n\n # Carica i modelli e gli scaler per ogni variabile target\n for target in target_variables:\n target_path = os.path.join(base_path, target)\n\n # Carica il modello\n model_path = os.path.join(target_path, 'model.joblib')\n models[target] = joblib.load(model_path)\n\n # Carica lo scaler Y\n scaler_y_path = os.path.join(target_path, 'scaler_y.joblib')\n scalers_y[target] = joblib.load(scaler_y_path)\n\n print(f\"Modelli e scaler caricati da: {base_path}\")\n return models, scaler_X, scalers_y, target_variables\n\n\n",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T18:17:50.886746Z",
"iopub.execute_input": "2024-10-28T18:17:50.887388Z",
"iopub.status.idle": "2024-10-28T18:17:50.899266Z",
"shell.execute_reply.started": "2024-10-28T18:17:50.887349Z",
"shell.execute_reply": "2024-10-28T18:17:50.898193Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"source": "save_models_and_scalers(models, scaler_X, scalers, target_variables)",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T07:14:03.136051Z",
"iopub.execute_input": "2024-10-28T07:14:03.136333Z",
"iopub.status.idle": "2024-10-28T07:14:03.558701Z",
"shell.execute_reply.started": "2024-10-28T07:14:03.136303Z",
"shell.execute_reply": "2024-10-28T07:14:03.557717Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"source": "# Previsione delle variabili mancanti per data_before_2010\n# Prepara data_before_2010\ndata_before_2010 = data_before_2010.sort_values('datetime')\ndata_before_2010.set_index('datetime', inplace=True)\n\n# Assicurati che le features non abbiano valori mancanti\ndata_before_2010[features] = data_before_2010[features].ffill()\ndata_before_2010[features] = data_before_2010[features].bfill()",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T07:14:03.559889Z",
"iopub.execute_input": "2024-10-28T07:14:03.560236Z",
"iopub.status.idle": "2024-10-28T07:14:03.690402Z",
"shell.execute_reply.started": "2024-10-28T07:14:03.560193Z",
"shell.execute_reply": "2024-10-28T07:14:03.689615Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"source": "models, scaler_X, scalers_y, target_variables = load_models_and_scalers()\n\ntimesteps = 24\n# Crea X per data_before_2010\nX_before = data_before_2010[features].values\nX_before_scaled = scaler_X.transform(X_before)\n\n# Crea le sequenze per LSTM\nX_before_seq = create_sequences(timesteps, X_before_scaled)\n\n# Prevedi le variabili mancanti\nfor i, target in enumerate(target_variables):\n print(\"Shape di X_before_seq:\", X_before_seq.shape)\n print(f\"Previsione di {target} per data_before_2010\")\n y_pred_scaled = models[target].predict(X_before_seq)\n print(\"Shape delle predizioni:\", y_pred_scaled.shape)\n # Ricostruisci i valori originali\n scaler = scalers_y[target]\n y_pred = scaler.inverse_transform(y_pred_scaled)\n\n # Allinea le previsioni con le date corrette\n dates = data_before_2010.index[timesteps:]\n data_before_2010.loc[dates, target] = y_pred\n\n# Gestisci eventuali valori iniziali mancanti\ndata_before_2010[target_variables] = data_before_2010[target_variables].bfill()\n\n# Combina data_before_2010 e data_after_2010\nweather_data_complete = pd.concat([data_before_2010, data_after_2010], axis=0)\nweather_data_complete = weather_data_complete.sort_index()\n\n# Salva il dataset completo\nweather_data_complete.reset_index(inplace=True)\nweather_data_complete.to_parquet('/kaggle/working/weather_data_complete.parquet', index=False)\n",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T07:14:03.691544Z",
"iopub.execute_input": "2024-10-28T07:14:03.691857Z",
"iopub.status.idle": "2024-10-28T07:16:07.983275Z",
"shell.execute_reply.started": "2024-10-28T07:14:03.691824Z",
"shell.execute_reply": "2024-10-28T07:16:07.981975Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"source": "## 2. Esplorazione dei Dati Meteo",
"metadata": {}
},
{
"cell_type": "code",
"source": "weather_data = pd.read_parquet('/kaggle/working/weather_data_complete.parquet')",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T07:16:07.984920Z",
"iopub.execute_input": "2024-10-28T07:16:07.985367Z",
"iopub.status.idle": "2024-10-28T07:16:08.113149Z",
"shell.execute_reply.started": "2024-10-28T07:16:07.985319Z",
"shell.execute_reply": "2024-10-28T07:16:08.112118Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"source": "# Visualizzazione delle tendenze temporali\nfig, axes = plt.subplots(5, 1, figsize=(15, 20))\nweather_data.set_index('date')['temp'].plot(ax=axes[0], title='Temperatura Media Giornaliera')\nweather_data.set_index('date')['humidity'].plot(ax=axes[1], title='Umidità Media Giornaliera')\nweather_data.set_index('date')['solarradiation'].plot(ax=axes[2], title='Radiazione Solare Giornaliera')\nweather_data.set_index('date')['solarenergy'].plot(ax=axes[3], title='Radiazione Solare Giornaliera')\nweather_data.set_index('date')['precip'].plot(ax=axes[4], title='Precipitazioni Giornaliere')\nplt.tight_layout()\nplt.show()\nsave_plot(plt, 'weather_trends', '/kaggle/working/plots')\nplt.close()",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T07:16:08.114826Z",
"iopub.execute_input": "2024-10-28T07:16:08.115235Z",
"iopub.status.idle": "2024-10-28T07:16:13.275984Z",
"shell.execute_reply.started": "2024-10-28T07:16:08.115192Z",
"shell.execute_reply": "2024-10-28T07:16:13.275040Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"source": "## 3. Simulazione dei Dati di Produzione Annuale",
"metadata": {}
},
{
"cell_type": "code",
"source": "\n# Esempio di utilizzo\nolive_varieties = pd.read_csv('/kaggle/input/olive-oil/variety_olive_oil_production.csv')\n\n\ndef add_olive_water_consumption_correlation(dataset):\n # Dati simulati per il fabbisogno d'acqua e la correlazione con la temperatura\n fabbisogno_acqua = {\n \"Nocellara dell'Etna\": {\"Primavera\": 1200, \"Estate\": 2000, \"Autunno\": 1000, \"Inverno\": 500, \"Temperatura Ottimale\": 18, \"Resistenza\": \"Media\"},\n \"Leccino\": {\"Primavera\": 1000, \"Estate\": 1800, \"Autunno\": 800, \"Inverno\": 400, \"Temperatura Ottimale\": 20, \"Resistenza\": \"Alta\"},\n \"Frantoio\": {\"Primavera\": 1100, \"Estate\": 1900, \"Autunno\": 900, \"Inverno\": 450, \"Temperatura Ottimale\": 19, \"Resistenza\": \"Alta\"},\n \"Coratina\": {\"Primavera\": 1300, \"Estate\": 2200, \"Autunno\": 1100, \"Inverno\": 550, \"Temperatura Ottimale\": 17, \"Resistenza\": \"Media\"},\n \"Moraiolo\": {\"Primavera\": 1150, \"Estate\": 2100, \"Autunno\": 900, \"Inverno\": 480, \"Temperatura Ottimale\": 18, \"Resistenza\": \"Media\"},\n \"Pendolino\": {\"Primavera\": 1050, \"Estate\": 1850, \"Autunno\": 850, \"Inverno\": 430, \"Temperatura Ottimale\": 20, \"Resistenza\": \"Alta\"},\n \"Taggiasca\": {\"Primavera\": 1000, \"Estate\": 1750, \"Autunno\": 800, \"Inverno\": 400, \"Temperatura Ottimale\": 19, \"Resistenza\": \"Alta\"},\n \"Canino\": {\"Primavera\": 1100, \"Estate\": 1900, \"Autunno\": 900, \"Inverno\": 450, \"Temperatura Ottimale\": 18, \"Resistenza\": \"Media\"},\n \"Itrana\": {\"Primavera\": 1200, \"Estate\": 2000, \"Autunno\": 1000, \"Inverno\": 500, \"Temperatura Ottimale\": 17, \"Resistenza\": \"Media\"},\n \"Ogliarola\": {\"Primavera\": 1150, \"Estate\": 1950, \"Autunno\": 900, \"Inverno\": 480, \"Temperatura Ottimale\": 18, \"Resistenza\": \"Media\"},\n \"Biancolilla\": {\"Primavera\": 1050, \"Estate\": 1800, \"Autunno\": 850, \"Inverno\": 430, \"Temperatura Ottimale\": 19, \"Resistenza\": \"Alta\"}\n }\n\n # Calcola il fabbisogno idrico annuale per ogni varietà\n for varieta in fabbisogno_acqua:\n fabbisogno_acqua[varieta][\"Annuale\"] = sum([fabbisogno_acqua[varieta][stagione] for stagione in [\"Primavera\", \"Estate\", \"Autunno\", \"Inverno\"]])\n\n # Aggiungiamo le nuove colonne al dataset\n dataset[\"Fabbisogno Acqua Primavera (m³/ettaro)\"] = dataset[\"Varietà di Olive\"].apply(lambda x: fabbisogno_acqua[x][\"Primavera\"])\n dataset[\"Fabbisogno Acqua Estate (m³/ettaro)\"] = dataset[\"Varietà di Olive\"].apply(lambda x: fabbisogno_acqua[x][\"Estate\"])\n dataset[\"Fabbisogno Acqua Autunno (m³/ettaro)\"] = dataset[\"Varietà di Olive\"].apply(lambda x: fabbisogno_acqua[x][\"Autunno\"])\n dataset[\"Fabbisogno Acqua Inverno (m³/ettaro)\"] = dataset[\"Varietà di Olive\"].apply(lambda x: fabbisogno_acqua[x][\"Inverno\"])\n dataset[\"Fabbisogno Idrico Annuale (m³/ettaro)\"] = dataset[\"Varietà di Olive\"].apply(lambda x: fabbisogno_acqua[x][\"Annuale\"])\n dataset[\"Temperatura Ottimale\"] = dataset[\"Varietà di Olive\"].apply(lambda x: fabbisogno_acqua[x][\"Temperatura Ottimale\"])\n dataset[\"Resistenza alla Siccità\"] = dataset[\"Varietà di Olive\"].apply(lambda x: fabbisogno_acqua[x][\"Resistenza\"])\n\n return dataset\n\n\nolive_varieties = add_olive_water_consumption_correlation(olive_varieties)\n\nolive_varieties.to_parquet(\"/kaggle/working/olive_varieties.parquet\")",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T07:16:13.277678Z",
"iopub.execute_input": "2024-10-28T07:16:13.278023Z",
"iopub.status.idle": "2024-10-28T07:16:13.321438Z",
"shell.execute_reply.started": "2024-10-28T07:16:13.277988Z",
"shell.execute_reply": "2024-10-28T07:16:13.320689Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
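{
"cell_type": "markdown",
"source": "A quick sanity check (a sketch, assuming the cell above has just run) of the water-requirement columns added to `olive_varieties`:\n\n```python\ncols = ['Varietà di Olive', 'Fabbisogno Idrico Annuale (m³/ettaro)',\n        'Temperatura Ottimale', 'Resistenza alla Siccità']\nprint(olive_varieties[cols].drop_duplicates().to_string(index=False))\n```",
"metadata": {}
},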
{
"cell_type": "code",
"source": "def preprocess_weather_data(weather_df):\n # Calcola statistiche mensili per ogni anno\n monthly_weather = weather_df.groupby(['year', 'month']).agg({\n 'temp': ['mean', 'min', 'max'],\n 'humidity': 'mean',\n 'precip': 'sum',\n 'windspeed': 'mean',\n 'cloudcover': 'mean',\n 'solarradiation': 'sum',\n 'solarenergy': 'sum',\n 'uvindex': 'max'\n }).reset_index()\n\n monthly_weather.columns = ['year', 'month'] + [f'{col[0]}_{col[1]}' for col in monthly_weather.columns[2:]]\n return monthly_weather\n\n\ndef get_growth_phase(month):\n if month in [12, 1, 2]:\n return 'dormancy'\n elif month in [3, 4, 5]:\n return 'flowering'\n elif month in [6, 7, 8]:\n return 'fruit_set'\n else:\n return 'ripening'\n\n\ndef calculate_weather_effect(row, optimal_temp):\n # Effetti base\n temp_effect = -0.1 * (row['temp_mean'] - optimal_temp) ** 2\n rain_effect = -0.05 * (row['precip_sum'] - 600) ** 2 / 10000\n sun_effect = 0.1 * row['solarenergy_sum'] / 1000\n\n # Fattori di scala basati sulla fase di crescita\n if row['growth_phase'] == 'dormancy':\n temp_scale = 0.5\n rain_scale = 0.2\n sun_scale = 0.1\n elif row['growth_phase'] == 'flowering':\n temp_scale = 2.0\n rain_scale = 1.5\n sun_scale = 1.0\n elif row['growth_phase'] == 'fruit_set':\n temp_scale = 1.5\n rain_scale = 1.0\n sun_scale = 0.8\n else: # ripening\n temp_scale = 1.0\n rain_scale = 0.5\n sun_scale = 1.2\n\n # Calcolo dell'effetto combinato\n combined_effect = (\n temp_scale * temp_effect +\n rain_scale * rain_effect +\n sun_scale * sun_effect\n )\n\n # Aggiustamenti specifici per fase\n if row['growth_phase'] == 'flowering':\n combined_effect -= 0.5 * max(0, row['precip_sum'] - 50) # Penalità per pioggia eccessiva durante la fioritura\n elif row['growth_phase'] == 'fruit_set':\n combined_effect += 0.3 * max(0, row['temp_mean'] - (optimal_temp + 5)) # Bonus per temperature più alte durante la formazione dei frutti\n\n return combined_effect\n\n\ndef calculate_water_need(weather_data, base_need, optimal_temp):\n # Calcola il fabbisogno idrico basato su temperatura e precipitazioni\n temp_factor = 1 + 0.05 * (weather_data['temp_mean'] - optimal_temp) # Aumenta del 5% per ogni grado sopra l'ottimale\n rain_factor = 1 - 0.001 * weather_data['precip_sum'] # Diminuisce leggermente con l'aumentare delle precipitazioni\n return base_need * temp_factor * rain_factor\n\n\ndef clean_column_name(name):\n # Rimuove caratteri speciali e spazi, converte in snake_case e abbrevia\n name = re.sub(r'[^a-zA-Z0-9\\s]', '', name) # Rimuove caratteri speciali\n name = name.lower().replace(' ', '_') # Converte in snake_case\n\n # Abbreviazioni comuni\n abbreviations = {\n 'production': 'prod',\n 'percentage': 'pct',\n 'hectare': 'ha',\n 'tonnes': 't',\n 'litres': 'l',\n 'minimum': 'min',\n 'maximum': 'max',\n 'average': 'avg'\n }\n\n for full, abbr in abbreviations.items():\n name = name.replace(full, abbr)\n\n return name\n\n\ndef create_technique_mapping(olive_varieties, mapping_path='models/technique_mapping.joblib'):\n # Estrai tutte le tecniche uniche dal dataset e convertile in lowercase\n all_techniques = olive_varieties['Tecnica di Coltivazione'].str.lower().unique()\n\n # Crea il mapping partendo da 1\n technique_mapping = {tech: i + 1 for i, tech in enumerate(sorted(all_techniques))}\n\n # Salva il mapping\n os.makedirs(os.path.dirname(mapping_path), exist_ok=True)\n joblib.dump(technique_mapping, mapping_path)\n\n return technique_mapping\n\n\ndef encode_techniques(df, mapping_path='models/technique_mapping.joblib'):\n if not 
os.path.exists(mapping_path):\n raise FileNotFoundError(f\"Mapping not found at {mapping_path}. Run create_technique_mapping first.\")\n\n technique_mapping = joblib.load(mapping_path)\n\n # Trova tutte le colonne delle tecniche\n tech_columns = [col for col in df.columns if col.endswith('_tech')]\n\n # Applica il mapping a tutte le colonne delle tecniche\n for col in tech_columns:\n df[col] = df[col].str.lower().map(technique_mapping).fillna(0).astype(int)\n\n return df\n\n\ndef decode_techniques(df, mapping_path='models/technique_mapping.joblib'):\n if not os.path.exists(mapping_path):\n raise FileNotFoundError(f\"Mapping not found at {mapping_path}\")\n\n technique_mapping = joblib.load(mapping_path)\n reverse_mapping = {v: k for k, v in technique_mapping.items()}\n reverse_mapping[0] = '' # Aggiungi un mapping per 0 a stringa vuota\n\n # Trova tutte le colonne delle tecniche\n tech_columns = [col for col in df.columns if col.endswith('_tech')]\n\n # Applica il reverse mapping a tutte le colonne delle tecniche\n for col in tech_columns:\n df[col] = df[col].map(reverse_mapping)\n\n return df\n\n\ndef decode_single_technique(technique_value, mapping_path='models/technique_mapping.joblib'):\n if not os.path.exists(mapping_path):\n raise FileNotFoundError(f\"Mapping not found at {mapping_path}\")\n\n technique_mapping = joblib.load(mapping_path)\n reverse_mapping = {v: k for k, v in technique_mapping.items()}\n reverse_mapping[0] = ''\n\n return reverse_mapping.get(technique_value, '')",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T16:45:14.154883Z",
"iopub.execute_input": "2024-10-28T16:45:14.155261Z",
"iopub.status.idle": "2024-10-28T16:45:14.179174Z",
"shell.execute_reply.started": "2024-10-28T16:45:14.155223Z",
"shell.execute_reply": "2024-10-28T16:45:14.178223Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
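{
"cell_type": "markdown",
"source": "A small worked example of `calculate_weather_effect` on a hand-built monthly row (values invented for illustration): during flowering the temperature and rain terms are scaled up, and rainfall above 50 mm incurs an extra penalty, so the combined effect here is strongly negative (about -20.2).\n\n```python\nsample_row = pd.Series({\n    'temp_mean': 22.0,\n    'precip_sum': 80.0,\n    'solarenergy_sum': 500.0,\n    'growth_phase': 'flowering'\n})\n\neffect = calculate_weather_effect(sample_row, optimal_temp=18)\nprint(f'Combined weather effect: {effect:.2f}')  # about -20.2 for these inputs\n```",
"metadata": {}
},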
{
"cell_type": "code",
"source": "def simulate_olive_production(weather_data, olive_varieties, num_simulations=5, random_seed=None):\n \"\"\"\n Simula la produzione di olive per diverse zone e varietà, considerando variazioni meteo specifiche per zona.\n Include barre di avanzamento per monitorare il progresso.\n \n Args:\n weather_data: DataFrame con dati meteorologici storici\n olive_varieties: DataFrame con informazioni sulle varietà di olive\n num_simulations: Numero di simulazioni/zone da generare\n random_seed: Seme per la riproducibilità dei risultati\n \n Returns:\n DataFrame con i risultati delle simulazioni per tutte le zone\n \"\"\"\n if random_seed is not None:\n np.random.seed(random_seed)\n\n create_technique_mapping(olive_varieties)\n monthly_weather = preprocess_weather_data(weather_data)\n all_results = []\n\n # Preparazione dati varietà\n all_varieties = olive_varieties['Varietà di Olive'].unique()\n variety_techniques = {\n variety: olive_varieties[olive_varieties['Varietà di Olive'] == variety]['Tecnica di Coltivazione'].unique()\n for variety in all_varieties\n }\n\n # Barra di avanzamento principale per le simulazioni\n with tqdm(total=num_simulations*num_simulations, desc=\"Simulazioni completate\") as sim_pbar:\n # Per ogni simulazione (anno)\n for sim in range(num_simulations):\n # Seleziona anno di base per questa simulazione\n selected_year = np.random.choice(monthly_weather['year'].unique())\n base_weather = monthly_weather[monthly_weather['year'] == selected_year].copy()\n base_weather.loc[:, 'growth_phase'] = base_weather['month'].apply(get_growth_phase)\n\n # Per ogni zona nella simulazione\n for zone in range(num_simulations):\n # Crea una copia dei dati meteo per questa zona specifica\n zone_weather = base_weather.copy()\n\n # Genera variazioni meteorologiche specifiche per questa zona\n zone_weather['temp_mean'] *= np.random.uniform(0.95, 1.05, len(zone_weather))\n zone_weather['precip_sum'] *= np.random.uniform(0.9, 1.1, len(zone_weather))\n zone_weather['solarenergy_sum'] *= np.random.uniform(0.95, 1.05, len(zone_weather))\n\n # Genera caratteristiche specifiche della zona\n num_varieties = np.random.randint(1, 4) # 1-3 varietà per zona\n selected_varieties = np.random.choice(all_varieties, size=num_varieties, replace=False)\n hectares = np.random.uniform(1, 10) # Dimensione del terreno\n percentages = np.random.dirichlet(np.ones(num_varieties)) # Distribuzione delle varietà\n\n # Inizializzazione contatori annuali\n annual_production = 0\n annual_min_oil = 0\n annual_max_oil = 0\n annual_avg_oil = 0\n annual_water_need = 0\n\n # Inizializzazione dizionario dati varietà\n variety_data = {clean_column_name(variety): {\n 'tech': '',\n 'pct': 0,\n 'prod_t_ha': 0,\n 'oil_prod_t_ha': 0,\n 'oil_prod_l_ha': 0,\n 'min_yield_pct': 0,\n 'max_yield_pct': 0,\n 'min_oil_prod_l_ha': 0,\n 'max_oil_prod_l_ha': 0,\n 'avg_oil_prod_l_ha': 0,\n 'l_per_t': 0,\n 'min_l_per_t': 0,\n 'max_l_per_t': 0,\n 'avg_l_per_t': 0,\n 'olive_prod': 0,\n 'min_oil_prod': 0,\n 'max_oil_prod': 0,\n 'avg_oil_prod': 0,\n 'water_need': 0\n } for variety in all_varieties}\n\n # Simula produzione per ogni varietà selezionata\n for i, variety in enumerate(selected_varieties):\n # Seleziona tecnica di coltivazione casuale per questa varietà\n technique = np.random.choice(variety_techniques[variety])\n percentage = percentages[i]\n\n # Ottieni informazioni specifiche della varietà\n variety_info = olive_varieties[\n (olive_varieties['Varietà di Olive'] == variety) &\n (olive_varieties['Tecnica di Coltivazione'] == 
technique)\n ].iloc[0]\n\n # Calcola produzione base con variabilità\n base_production = variety_info['Produzione (tonnellate/ettaro)'] * 1000 * percentage * hectares / 12\n base_production *= np.random.uniform(0.9, 1.1) # Aggiungi variabilità alla produzione base\n\n # Calcola effetti meteo sulla produzione\n weather_effect = zone_weather.apply(\n lambda row: calculate_weather_effect(row, variety_info['Temperatura Ottimale']),\n axis=1\n )\n monthly_production = base_production * (1 + weather_effect / 10000)\n monthly_production *= np.random.uniform(0.95, 1.05, len(zone_weather))\n\n # Calcola produzione annuale per questa varietà\n annual_variety_production = monthly_production.sum()\n\n # Calcola rese di olio con variabilità\n min_yield_factor = np.random.uniform(0.95, 1.05)\n max_yield_factor = np.random.uniform(0.95, 1.05)\n avg_yield_factor = (min_yield_factor + max_yield_factor) / 2\n\n min_oil_production = annual_variety_production * variety_info['Min Litri per Tonnellata'] / 1000 * min_yield_factor\n max_oil_production = annual_variety_production * variety_info['Max Litri per Tonnellata'] / 1000 * max_yield_factor\n avg_oil_production = annual_variety_production * variety_info['Media Litri per Tonnellata'] / 1000 * avg_yield_factor\n\n # Calcola fabbisogno idrico\n base_water_need = (\n variety_info['Fabbisogno Acqua Primavera (m³/ettaro)'] +\n variety_info['Fabbisogno Acqua Estate (m³/ettaro)'] +\n variety_info['Fabbisogno Acqua Autunno (m³/ettaro)'] +\n variety_info['Fabbisogno Acqua Inverno (m³/ettaro)']\n ) / 4 # Media stagionale\n\n monthly_water_need = zone_weather.apply(\n lambda row: calculate_water_need(row, base_water_need, variety_info['Temperatura Ottimale']),\n axis=1\n )\n monthly_water_need *= np.random.uniform(0.95, 1.05, len(monthly_water_need))\n annual_variety_water_need = monthly_water_need.sum() * percentage * hectares\n\n # Aggiorna totali annuali\n annual_production += annual_variety_production\n annual_min_oil += min_oil_production\n annual_max_oil += max_oil_production\n annual_avg_oil += avg_oil_production\n annual_water_need += annual_variety_water_need\n\n # Aggiorna dati varietà\n clean_variety = clean_column_name(variety)\n variety_data[clean_variety].update({\n 'tech': clean_column_name(technique),\n 'pct': percentage,\n 'prod_t_ha': variety_info['Produzione (tonnellate/ettaro)'] * np.random.uniform(0.95, 1.05),\n 'oil_prod_t_ha': variety_info['Produzione Olio (tonnellate/ettaro)'] * np.random.uniform(0.95, 1.05),\n 'oil_prod_l_ha': variety_info['Produzione Olio (litri/ettaro)'] * np.random.uniform(0.95, 1.05),\n 'min_yield_pct': variety_info['Min % Resa'] * min_yield_factor,\n 'max_yield_pct': variety_info['Max % Resa'] * max_yield_factor,\n 'min_oil_prod_l_ha': variety_info['Min Produzione Olio (litri/ettaro)'] * min_yield_factor,\n 'max_oil_prod_l_ha': variety_info['Max Produzione Olio (litri/ettaro)'] * max_yield_factor,\n 'avg_oil_prod_l_ha': variety_info['Media Produzione Olio (litri/ettaro)'] * avg_yield_factor,\n 'l_per_t': variety_info['Litri per Tonnellata'] * np.random.uniform(0.98, 1.02),\n 'min_l_per_t': variety_info['Min Litri per Tonnellata'] * min_yield_factor,\n 'max_l_per_t': variety_info['Max Litri per Tonnellata'] * max_yield_factor,\n 'avg_l_per_t': variety_info['Media Litri per Tonnellata'] * avg_yield_factor,\n 'olive_prod': annual_variety_production,\n 'min_oil_prod': min_oil_production,\n 'max_oil_prod': max_oil_production,\n 'avg_oil_prod': avg_oil_production,\n 'water_need': annual_variety_water_need\n })\n\n # Appiattisci 
i dati delle varietà per il DataFrame finale\n flattened_variety_data = {\n f'{variety}_{key}': value\n for variety, data in variety_data.items()\n for key, value in data.items()\n }\n\n # Aggiungi il risultato con tutti i dati della zona\n all_results.append({\n 'simulation_id': sim + 1,\n 'zone_id': zone + 1,\n 'year': selected_year,\n 'temp_mean': zone_weather['temp_mean'].mean(),\n 'precip_sum': zone_weather['precip_sum'].sum(),\n 'solar_energy_sum': zone_weather['solarenergy_sum'].sum(),\n 'ha': hectares,\n 'zone': f\"zone_{zone + 1}\",\n 'olive_prod': annual_production,\n 'min_oil_prod': annual_min_oil,\n 'max_oil_prod': annual_max_oil,\n 'avg_oil_prod': annual_avg_oil,\n 'total_water_need': annual_water_need,\n **flattened_variety_data\n })\n # Aggiorna la barra di avanzamento principale\n sim_pbar.update(1)\n \n\n # Crea DataFrame finale con tutti i risultati\n df_results = pd.DataFrame(all_results)\n return df_results\n\n\nolive_varieties = pd.read_parquet(\"/kaggle/working/olive_varieties.parquet\")\n\nweather_data = pd.read_parquet('/kaggle/working/weather_data_complete.parquet')\n\nsimulated_data = simulate_olive_production(weather_data, olive_varieties, 1000, random_state_value)\n\nsimulated_data.to_parquet(\"/kaggle/working/simulated_data.parquet\")\n\n\n# Funzione per visualizzare il mapping delle tecniche\ndef print_technique_mapping(mapping_path='/kaggle/working/models/technique_mapping.joblib'):\n if not os.path.exists(mapping_path):\n print(\"Mapping file not found.\")\n return\n\n mapping = joblib.load(mapping_path)\n print(\"Technique Mapping:\")\n for technique, code in mapping.items():\n print(f\"{technique}: {code}\")\n\n\n# Visualizza il mapping delle tecniche\nprint_technique_mapping()",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T07:16:13.322975Z",
"iopub.execute_input": "2024-10-28T07:16:13.323323Z",
"iopub.status.idle": "2024-10-28T09:33:50.129328Z",
"shell.execute_reply.started": "2024-10-28T07:16:13.323289Z",
"shell.execute_reply": "2024-10-28T09:33:50.127974Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"source": "simulated_data = pd.read_parquet(\"/kaggle/working/simulated_data.parquet\")\n\n\ndef clean_column_names(df):\n # Funzione per pulire i nomi delle colonne\n new_columns = []\n for col in df.columns:\n # Usa regex per separare le varietà\n varieties = re.findall(r'([a-z]+)_([a-z_]+)', col)\n if varieties:\n new_columns.append(f\"{varieties[0][0]}_{varieties[0][1]}\")\n else:\n new_columns.append(col)\n return new_columns\n\n\ndef prepare_comparison_data(simulated_data, olive_varieties):\n # Pulisci i nomi delle colonne\n df = simulated_data.copy()\n\n df.columns = clean_column_names(df)\n df = encode_techniques(df)\n\n all_varieties = olive_varieties['Varietà di Olive'].unique()\n varieties = [clean_column_name(variety) for variety in all_varieties]\n comparison_data = []\n\n for variety in varieties:\n olive_prod_col = next((col for col in df.columns if col.startswith(f'{variety}_') and col.endswith('_olive_prod')), None)\n oil_prod_col = next((col for col in df.columns if col.startswith(f'{variety}_') and col.endswith('_avg_oil_prod')), None)\n tech_col = next((col for col in df.columns if col.startswith(f'{variety}_') and col.endswith('_tech')), None)\n water_need_col = next((col for col in df.columns if col.startswith(f'{variety}_') and col.endswith('_water_need')), None)\n\n if olive_prod_col and oil_prod_col and tech_col and water_need_col:\n variety_data = df[[olive_prod_col, oil_prod_col, tech_col, water_need_col]]\n variety_data = variety_data[variety_data[tech_col] != 0] # Esclude le righe dove la tecnica è 0\n\n if not variety_data.empty:\n avg_olive_prod = pd.to_numeric(variety_data[olive_prod_col], errors='coerce').mean()\n avg_oil_prod = pd.to_numeric(variety_data[oil_prod_col], errors='coerce').mean()\n avg_water_need = pd.to_numeric(variety_data[water_need_col], errors='coerce').mean()\n efficiency = avg_oil_prod / avg_olive_prod if avg_olive_prod > 0 else 0\n water_efficiency = avg_oil_prod / avg_water_need if avg_water_need > 0 else 0\n\n comparison_data.append({\n 'Variety': variety,\n 'Avg Olive Production (kg/ha)': avg_olive_prod,\n 'Avg Oil Production (L/ha)': avg_oil_prod,\n 'Avg Water Need (m³/ha)': avg_water_need,\n 'Oil Efficiency (L/kg)': efficiency,\n 'Water Efficiency (L oil/m³ water)': water_efficiency\n })\n\n return pd.DataFrame(comparison_data)\n\n\ndef plot_variety_comparison(comparison_data, metric):\n plt.figure(figsize=(12, 6))\n bars = plt.bar(comparison_data['Variety'], comparison_data[metric])\n plt.title(f'Comparison of {metric} across Olive Varieties')\n plt.xlabel('Variety')\n plt.ylabel(metric)\n plt.xticks(rotation=45, ha='right')\n\n for bar in bars:\n height = bar.get_height()\n plt.text(bar.get_x() + bar.get_width() / 2., height,\n f'{height:.2f}',\n ha='center', va='bottom')\n\n plt.tight_layout()\n plt.show()\n save_plot(plt, f'variety_comparison_{metric.lower().replace(\" \", \"_\").replace(\"/\", \"_\").replace(\"(\", \"\").replace(\")\", \"\")}', '/kaggle/working/plots')\n plt.close()\n\n\ndef plot_efficiency_vs_production(comparison_data):\n plt.figure(figsize=(10, 6))\n\n plt.scatter(comparison_data['Avg Olive Production (kg/ha)'],\n comparison_data['Oil Efficiency (L/kg)'],\n s=100)\n\n for i, row in comparison_data.iterrows():\n plt.annotate(row['Variety'],\n (row['Avg Olive Production (kg/ha)'], row['Oil Efficiency (L/kg)']),\n xytext=(5, 5), textcoords='offset points')\n\n plt.title('Oil Efficiency vs Olive Production by Variety')\n plt.xlabel('Average Olive Production (kg/ha)')\n plt.ylabel('Oil Efficiency (L oil / kg 
olives)')\n plt.tight_layout()\n save_plot(plt, 'efficiency_vs_production', '/kaggle/working/plots')\n plt.close()\n\n\ndef plot_water_efficiency_vs_production(comparison_data):\n plt.figure(figsize=(10, 6))\n\n plt.scatter(comparison_data['Avg Olive Production (kg/ha)'],\n comparison_data['Water Efficiency (L oil/m³ water)'],\n s=100)\n\n for i, row in comparison_data.iterrows():\n plt.annotate(row['Variety'],\n (row['Avg Olive Production (kg/ha)'], row['Water Efficiency (L oil/m³ water)']),\n xytext=(5, 5), textcoords='offset points')\n\n plt.title('Water Efficiency vs Olive Production by Variety')\n plt.xlabel('Average Olive Production (kg/ha)')\n plt.ylabel('Water Efficiency (L oil / m³ water)')\n plt.tight_layout()\n plt.show()\n save_plot(plt, 'water_efficiency_vs_production', '/kaggle/working/plots')\n plt.close()\n\n\ndef plot_water_need_vs_oil_production(comparison_data):\n plt.figure(figsize=(10, 6))\n\n plt.scatter(comparison_data['Avg Water Need (m³/ha)'],\n comparison_data['Avg Oil Production (L/ha)'],\n s=100)\n\n for i, row in comparison_data.iterrows():\n plt.annotate(row['Variety'],\n (row['Avg Water Need (m³/ha)'], row['Avg Oil Production (L/ha)']),\n xytext=(5, 5), textcoords='offset points')\n\n plt.title('Oil Production vs Water Need by Variety')\n plt.xlabel('Average Water Need (m³/ha)')\n plt.ylabel('Average Oil Production (L/ha)')\n plt.tight_layout()\n plt.show()\n save_plot(plt, 'water_need_vs_oil_production', '/kaggle/working/plots')\n plt.close()\n\n\ndef analyze_by_technique(simulated_data, olive_varieties):\n # Pulisci i nomi delle colonne\n df = simulated_data.copy()\n\n df.columns = clean_column_names(df)\n df = encode_techniques(df)\n all_varieties = olive_varieties['Varietà di Olive'].unique()\n varieties = [clean_column_name(variety) for variety in all_varieties]\n\n technique_data = []\n\n for variety in varieties:\n olive_prod_col = next((col for col in df.columns if col.startswith(f'{variety}_') and col.endswith('_olive_prod')), None)\n oil_prod_col = next((col for col in df.columns if col.startswith(f'{variety}_') and col.endswith('_avg_oil_prod')), None)\n tech_col = next((col for col in df.columns if col.startswith(f'{variety}_') and col.endswith('_tech')), None)\n water_need_col = next((col for col in df.columns if col.startswith(f'{variety}_') and col.endswith('_water_need')), None)\n\n if olive_prod_col and oil_prod_col and tech_col and water_need_col:\n variety_data = df[[olive_prod_col, oil_prod_col, tech_col, water_need_col]]\n variety_data = variety_data[variety_data[tech_col] != 0]\n\n if not variety_data.empty:\n for tech in variety_data[tech_col].unique():\n tech_data = variety_data[variety_data[tech_col] == tech]\n\n avg_olive_prod = pd.to_numeric(tech_data[olive_prod_col], errors='coerce').mean()\n avg_oil_prod = pd.to_numeric(tech_data[oil_prod_col], errors='coerce').mean()\n avg_water_need = pd.to_numeric(tech_data[water_need_col], errors='coerce').mean()\n\n efficiency = avg_oil_prod / avg_olive_prod if avg_olive_prod > 0 else 0\n water_efficiency = avg_oil_prod / avg_water_need if avg_water_need > 0 else 0\n\n technique_data.append({\n 'Variety': variety,\n 'Technique': tech,\n 'Technique String': decode_single_technique(tech),\n 'Avg Olive Production (kg/ha)': avg_olive_prod,\n 'Avg Oil Production (L/ha)': avg_oil_prod,\n 'Avg Water Need (m³/ha)': avg_water_need,\n 'Oil Efficiency (L/kg)': efficiency,\n 'Water Efficiency (L oil/m³ water)': water_efficiency\n })\n\n return pd.DataFrame(technique_data)\n\n\n# Esecuzione 
dell'analisi\ncomparison_data = prepare_comparison_data(simulated_data, olive_varieties)\n\n# Genera i grafici\nplot_variety_comparison(comparison_data, 'Avg Olive Production (kg/ha)')\nplot_variety_comparison(comparison_data, 'Avg Oil Production (L/ha)')\nplot_variety_comparison(comparison_data, 'Avg Water Need (m³/ha)')\nplot_variety_comparison(comparison_data, 'Oil Efficiency (L/kg)')\nplot_variety_comparison(comparison_data, 'Water Efficiency (L oil/m³ water)')\nplot_efficiency_vs_production(comparison_data)\nplot_water_efficiency_vs_production(comparison_data)\nplot_water_need_vs_oil_production(comparison_data)\n\n# Analisi per tecnica\ntechnique_data = analyze_by_technique(simulated_data, olive_varieties)\n\nprint(technique_data)\n\n# Stampa un sommario statistico\nprint(\"Comparison by Variety:\")\nprint(comparison_data.set_index('Variety'))\nprint(\"\\nBest Varieties by Water Efficiency:\")\nprint(comparison_data.sort_values('Water Efficiency (L oil/m³ water)', ascending=False).head())",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T18:18:34.945646Z",
"iopub.execute_input": "2024-10-28T18:18:34.946022Z",
"iopub.status.idle": "2024-10-28T18:18:47.185796Z",
"shell.execute_reply.started": "2024-10-28T18:18:34.945985Z",
"shell.execute_reply": "2024-10-28T18:18:47.184864Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"source": "## 4. Analisi della Relazione tra Meteo e Produzione",
"metadata": {}
},
{
"cell_type": "code",
"source": "def get_full_data(simulated_data, olive_varieties):\n # Assumiamo che simulated_data contenga già tutti i dati necessari\n # Includiamo solo le colonne rilevanti\n relevant_columns = ['year', 'temp_mean', 'precip_sum', 'solar_energy_sum', 'ha', 'zone', 'olive_prod']\n\n # Aggiungiamo le colonne specifiche per varietà\n all_varieties = olive_varieties['Varietà di Olive'].unique()\n varieties = [clean_column_name(variety) for variety in all_varieties]\n for variety in varieties:\n relevant_columns.extend([f'{variety}_olive_prod', f'{variety}_tech'])\n\n return simulated_data[relevant_columns].copy()\n\n\ndef analyze_correlations(full_data, variety):\n # Filtra i dati per la varietà specifica\n variety_data = full_data[[col for col in full_data.columns if not col.startswith('_') or col.startswith(f'{variety}_')]]\n\n # Rinomina le colonne per chiarezza\n variety_data = variety_data.rename(columns={\n f'{variety}_olive_prod': 'olive_production',\n f'{variety}_tech': 'technique'\n })\n\n # Matrice di correlazione\n plt.figure(figsize=(12, 10))\n corr_matrix = variety_data[['temp_mean', 'precip_sum', 'solar_energy_sum', 'olive_production']].corr()\n sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')\n plt.title(f'Matrice di Correlazione - {variety}')\n plt.tight_layout()\n plt.show()\n save_plot(plt, f'correlation_matrix_{variety}', '/kaggle/working/plots')\n plt.close()\n\n # Scatter plots\n fig, axes = plt.subplots(2, 2, figsize=(20, 20))\n fig.suptitle(f'Relazione tra Fattori Meteorologici e Produzione di Olive - {variety}', fontsize=16)\n\n for ax, var in zip(axes.flat, ['temp_mean', 'precip_sum', 'solar_energy_sum', 'ha']):\n sns.scatterplot(data=variety_data, x=var, y='olive_production', hue='technique', ax=ax)\n ax.set_title(f'{var.capitalize()} vs Produzione Olive')\n ax.set_xlabel(var.capitalize())\n ax.set_ylabel('Produzione Olive (kg/ettaro)')\n\n plt.tight_layout()\n plt.show()\n save_plot(plt, f'meteorological_factors_{variety}', '/kaggle/working/plots')\n plt.close()\n\n\n# Uso delle funzioni\nfull_data = get_full_data(simulated_data, olive_varieties)\n\n# Assumiamo che 'selected_variety' sia definito altrove nel codice\n# Per esempio:\nselected_variety = 'nocellara_delletna'\n\nanalyze_correlations(full_data, selected_variety)",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T09:34:03.394039Z",
"iopub.execute_input": "2024-10-28T09:34:03.394321Z",
"iopub.status.idle": "2024-10-28T09:37:17.482691Z",
"shell.execute_reply.started": "2024-10-28T09:34:03.394290Z",
"shell.execute_reply": "2024-10-28T09:37:17.481712Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
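{
"cell_type": "markdown",
"source": "As a complement to the heatmap above, the following added sketch computes Pearson's r with its p-value for each weather variable against the selected variety's production; it assumes `full_data` and `selected_variety` from the previous cell.",
"metadata": {}
},
{
"cell_type": "code",
"source": "import pandas as pd\nfrom scipy import stats\n\n\ndef correlation_table(full_data, variety):\n    # Pearson correlation and p-value of each weather variable vs. olive production\n    target_col = f'{variety}_olive_prod'\n    rows = []\n    for var in ['temp_mean', 'precip_sum', 'solar_energy_sum']:\n        valid = full_data[[var, target_col]].dropna()\n        r, p = stats.pearsonr(valid[var], valid[target_col])\n        rows.append({'variable': var, 'pearson_r': round(r, 3), 'p_value': p})\n    return pd.DataFrame(rows)\n\n\nprint(correlation_table(full_data, selected_variety))",
"metadata": {
"trusted": true
},
"outputs": [],
"execution_count": null
},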
{
"cell_type": "markdown",
"source": "## 5. Preparazione del Modello di Machine Learning",
"metadata": {}
},
{
"cell_type": "code",
"source": "def prepare_data(df, olive_varieties_df):\n # Crea una copia del DataFrame per evitare modifiche all'originale\n df = df.copy()\n\n # Ordina per zona e anno\n df = df.sort_values(['zone', 'year'])\n\n # Definisci le feature\n temporal_features = ['temp_mean', 'precip_sum', 'solar_energy_sum']\n static_features = ['ha'] # Feature statiche base\n target_features = ['olive_prod', 'min_oil_prod', 'max_oil_prod', 'avg_oil_prod', 'total_water_need']\n\n # Ottieni le varietà pulite\n varieties = [clean_column_name(variety) for variety in olive_varieties_df['Varietà di Olive']]\n\n # Crea la struttura delle feature per ogni varietà\n variety_features = [\n 'tech', 'pct', 'prod_t_ha', 'oil_prod_t_ha', 'oil_prod_l_ha',\n 'min_yield_pct', 'max_yield_pct', 'min_oil_prod_l_ha', 'max_oil_prod_l_ha',\n 'avg_oil_prod_l_ha', 'l_per_t', 'min_l_per_t', 'max_l_per_t', 'avg_l_per_t'\n ]\n\n # Prepara dizionari per le nuove colonne\n new_columns = {}\n\n # Prepara le feature per ogni varietà\n for variety in varieties:\n # Feature esistenti\n for feature in variety_features:\n col_name = f\"{variety}_{feature}\"\n if col_name in df.columns:\n if feature != 'tech': # Non includere la colonna tech direttamente\n static_features.append(col_name)\n\n # Feature binarie per le tecniche di coltivazione\n for technique in ['tradizionale', 'intensiva', 'superintensiva']:\n col_name = f\"{variety}_{technique}\"\n new_columns[col_name] = df[f\"{variety}_tech\"].notna() & (\n df[f\"{variety}_tech\"].str.lower() == technique\n ).fillna(False)\n static_features.append(col_name)\n\n # Aggiungi tutte le nuove colonne in una volta sola\n new_df = pd.concat([df] + [pd.Series(v, name=k) for k, v in new_columns.items()], axis=1)\n\n # Ordiniamo per zona e anno per mantenere la continuità temporale\n df_sorted = new_df.sort_values(['zone', 'year'])\n\n # Definiamo la dimensione della finestra temporale\n window_size = 41\n\n # Liste per raccogliere i dati\n temporal_sequences = []\n static_features_list = []\n targets_list = []\n\n # Iteriamo per ogni zona\n for zone in df_sorted['zone'].unique():\n zone_data = df_sorted[df_sorted['zone'] == zone].reset_index(drop=True)\n\n if len(zone_data) >= window_size: # Verifichiamo che ci siano abbastanza dati\n # Creiamo sequenze temporali scorrevoli\n for i in range(len(zone_data) - window_size + 1):\n # Sequenza temporale\n temporal_window = zone_data.iloc[i:i + window_size][temporal_features].values\n # Verifichiamo che non ci siano valori NaN\n if not np.isnan(temporal_window).any():\n temporal_sequences.append(temporal_window)\n\n # Feature statiche (prendiamo quelle dell'ultimo timestep della finestra)\n static_features_list.append(zone_data.iloc[i + window_size - 1][static_features].values)\n\n # Target (prendiamo quelli dell'ultimo timestep della finestra)\n targets_list.append(zone_data.iloc[i + window_size - 1][target_features].values)\n\n # Convertiamo in array numpy\n X_temporal = np.array(temporal_sequences)\n X_static = np.array(static_features_list)\n y = np.array(targets_list)\n\n print(f\"Dataset completo - Temporal: {X_temporal.shape}, Static: {X_static.shape}, Target: {y.shape}\")\n\n # Split dei dati (usando indici casuali per una migliore distribuzione)\n indices = np.random.permutation(len(X_temporal))\n #train_idx = int(len(indices) * 0.7)\n #val_idx = int(len(indices) * 0.85)\n \n train_idx = int(len(indices) * 0.65) # 65% training\n val_idx = int(len(indices) * 0.85) # 20% validation\n # Il resto rimane 15% test\n\n # Oppure versione con 25% validation:\n 
#train_idx = int(len(indices) * 0.60) # 60% training\n #val_idx = int(len(indices) * 0.85) # 25% validation\n\n train_indices = indices[:train_idx]\n val_indices = indices[train_idx:val_idx]\n test_indices = indices[val_idx:]\n\n # Split dei dati\n X_temporal_train = X_temporal[train_indices]\n X_temporal_val = X_temporal[val_indices]\n X_temporal_test = X_temporal[test_indices]\n\n X_static_train = X_static[train_indices]\n X_static_val = X_static[val_indices]\n X_static_test = X_static[test_indices]\n\n y_train = y[train_indices]\n y_val = y[val_indices]\n y_test = y[test_indices]\n\n # Standardizzazione\n scaler_temporal = StandardScaler()\n scaler_static = StandardScaler()\n scaler_y = StandardScaler()\n\n # Standardizzazione dei dati temporali\n X_temporal_train = scaler_temporal.fit_transform(X_temporal_train.reshape(-1, len(temporal_features))).reshape(X_temporal_train.shape)\n X_temporal_val = scaler_temporal.transform(X_temporal_val.reshape(-1, len(temporal_features))).reshape(X_temporal_val.shape)\n X_temporal_test = scaler_temporal.transform(X_temporal_test.reshape(-1, len(temporal_features))).reshape(X_temporal_test.shape)\n\n # Standardizzazione dei dati statici\n X_static_train = scaler_static.fit_transform(X_static_train)\n X_static_val = scaler_static.transform(X_static_val)\n X_static_test = scaler_static.transform(X_static_test)\n\n # Standardizzazione dei target\n y_train = scaler_y.fit_transform(y_train)\n y_val = scaler_y.transform(y_val)\n y_test = scaler_y.transform(y_test)\n\n print(\"\\nShape dopo lo split e standardizzazione:\")\n print(f\"Train - Temporal: {X_temporal_train.shape}, Static: {X_static_train.shape}, Target: {y_train.shape}\")\n print(f\"Val - Temporal: {X_temporal_val.shape}, Static: {X_static_val.shape}, Target: {y_val.shape}\")\n print(f\"Test - Temporal: {X_temporal_test.shape}, Static: {X_static_test.shape}, Target: {y_test.shape}\")\n\n # Prepara i dizionari di input\n train_data = {'temporal': X_temporal_train, 'static': X_static_train}\n val_data = {'temporal': X_temporal_val, 'static': X_static_val}\n test_data = {'temporal': X_temporal_test, 'static': X_static_test}\n\n return (train_data, y_train), (val_data, y_val), (test_data, y_test), (scaler_temporal, scaler_static, scaler_y)",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T18:19:04.089809Z",
"iopub.execute_input": "2024-10-28T18:19:04.090188Z",
"iopub.status.idle": "2024-10-28T18:19:04.139332Z",
"shell.execute_reply.started": "2024-10-28T18:19:04.090151Z",
"shell.execute_reply": "2024-10-28T18:19:04.138344Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
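{
"cell_type": "markdown",
"source": "A small self-contained illustration (synthetic numbers, not real data) of the sliding-window logic used inside `prepare_data`: with a window of consecutive years per zone, each window is paired with the static features and targets of its final year.",
"metadata": {}
},
{
"cell_type": "code",
"source": "import numpy as np\n\n# Toy example: 5 yearly rows and a window of 3 -> 5 - 3 + 1 = 3 sequences\nyears = np.arange(2000, 2005)\ntemporal = np.random.rand(5, 3)  # (years, temporal features)\nwindow_size = 3\n\nsequences = [temporal[i:i + window_size] for i in range(len(years) - window_size + 1)]\ntarget_years = [years[i + window_size - 1] for i in range(len(sequences))]\n\nprint(f'Number of sequences: {len(sequences)}')             # 3\nprint(f'Shape of each sequence: {sequences[0].shape}')      # (3, 3)\nprint(f'Year associated with each window: {target_years}')  # [2002, 2003, 2004]",
"metadata": {
"trusted": true
},
"outputs": [],
"execution_count": null
},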
{
"cell_type": "markdown",
"source": "## Divisione train/validation/test:\n",
"metadata": {}
},
{
"cell_type": "code",
"source": "simulated_data = pd.read_parquet(\"/kaggle/working/simulated_data.parquet\")\nolive_varieties = pd.read_parquet(\"/kaggle/working/olive_varieties.parquet\")\n\n(train_data, train_targets), (val_data, val_targets), (test_data, test_targets), scalers = prepare_data(simulated_data, olive_varieties)\n\nscaler_temporal, scaler_static, scaler_y = scalers\n\nprint(\"Temporal data shape:\", train_data['temporal'].shape)\nprint(\"Static data shape:\", train_data['static'].shape)\nprint(\"Target shape:\", train_targets.shape)",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T18:19:10.442984Z",
"iopub.execute_input": "2024-10-28T18:19:10.443717Z",
"iopub.status.idle": "2024-10-28T18:53:11.893811Z",
"shell.execute_reply.started": "2024-10-28T18:19:10.443670Z",
"shell.execute_reply": "2024-10-28T18:53:11.892633Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
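{
"cell_type": "markdown",
"source": "Optional added sketch: since the data preparation above is slow, the prepared splits can be cached on disk and reloaded in a later session; the file name is an assumption, not something used elsewhere in the notebook.",
"metadata": {}
},
{
"cell_type": "code",
"source": "import numpy as np\n\nnp.savez_compressed(\n    '/kaggle/working/prepared_splits.npz',\n    X_temporal_train=train_data['temporal'], X_static_train=train_data['static'], y_train=train_targets,\n    X_temporal_val=val_data['temporal'], X_static_val=val_data['static'], y_val=val_targets,\n    X_temporal_test=test_data['temporal'], X_static_test=test_data['static'], y_test=test_targets,\n)\n\n# To reload later:\n# cached = np.load('/kaggle/working/prepared_splits.npz')\n# train_data = {'temporal': cached['X_temporal_train'], 'static': cached['X_static_train']}\n# train_targets = cached['y_train']",
"metadata": {
"trusted": true
},
"outputs": [],
"execution_count": null
},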
{
"cell_type": "markdown",
"source": "## OliveOilTransformer",
"metadata": {}
},
{
"cell_type": "code",
"source": "import tensorflow as tf\nimport numpy as np\n\nclass PositionalEncoding(tf.keras.layers.Layer):\n def __init__(self, position, d_model):\n super(PositionalEncoding, self).__init__()\n self.pos_encoding = self.positional_encoding(position, d_model)\n\n def get_angles(self, position, i, d_model):\n angles = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))\n return position * angles\n\n def positional_encoding(self, position, d_model):\n angle_rads = self.get_angles(\n position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],\n i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],\n d_model=d_model)\n\n sines = tf.math.sin(angle_rads[:, 0::2])\n cosines = tf.math.cos(angle_rads[:, 1::2])\n\n pos_encoding = tf.concat([sines, cosines], axis=-1)\n pos_encoding = pos_encoding[tf.newaxis, ...]\n return tf.cast(pos_encoding, tf.float32)\n\n def call(self, inputs):\n return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]\n\nclass TemporalAugmentation(tf.keras.layers.Layer):\n def __init__(self, noise_factor=0.03, **kwargs):\n super().__init__(**kwargs)\n self.noise_factor = noise_factor\n\n def call(self, inputs, training=None):\n if training:\n noise = tf.random.normal(\n shape=tf.shape(inputs), \n mean=0.0, \n stddev=self.noise_factor\n )\n return inputs + noise\n return inputs\n\nclass EnhancedTransformerBlock(tf.keras.layers.Layer):\n def __init__(self, d_model, num_heads, ff_dim, dropout=0.1):\n super().__init__()\n self.att = tf.keras.layers.MultiHeadAttention(\n num_heads=num_heads, \n key_dim=d_model // num_heads,\n value_dim=d_model // num_heads\n )\n self.ffn = tf.keras.Sequential([\n tf.keras.layers.Dense(ff_dim, activation=\"gelu\"),\n tf.keras.layers.Dropout(dropout),\n tf.keras.layers.Dense(d_model)\n ])\n self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n self.dropout1 = tf.keras.layers.Dropout(dropout)\n self.dropout2 = tf.keras.layers.Dropout(dropout)\n self.residual_attention = tf.keras.layers.Dense(d_model, activation='sigmoid')\n\n def call(self, inputs, training):\n # Self-attention con residual connection\n attn_output = self.att(inputs, inputs)\n attn_output = self.dropout1(attn_output, training=training)\n residual_weights = self.residual_attention(inputs)\n out1 = self.layernorm1(inputs + residual_weights * attn_output)\n \n # Feed-forward con residual connection\n ffn_output = self.ffn(out1)\n ffn_output = self.dropout2(ffn_output, training=training)\n return self.layernorm2(out1 + ffn_output)\n\nclass TemporalPoolingLayer(tf.keras.layers.Layer):\n def __init__(self, num_heads, key_dim, **kwargs):\n super().__init__(**kwargs)\n self.attention_pooling = tf.keras.layers.MultiHeadAttention(\n num_heads=num_heads, \n key_dim=key_dim\n )\n self.temporal_pooling = tf.keras.layers.GlobalAveragePooling1D()\n self.max_pooling = tf.keras.layers.GlobalMaxPooling1D()\n self.concat = tf.keras.layers.Concatenate(axis=-1)\n \n def call(self, inputs, training=None):\n # Attention pooling\n att_output = self.attention_pooling(inputs, inputs)\n \n # Global average e max pooling\n avg_output = self.temporal_pooling(inputs)\n max_output = self.max_pooling(inputs)\n \n # Reshape attention output\n att_output = tf.reduce_mean(att_output, axis=1)\n \n # Concatena tutti i tipi di pooling\n return self.concat([att_output, avg_output, max_output])\n\nclass OliveOilTransformer(tf.keras.Model):\n def __init__(self, temporal_shape, static_shape, num_outputs,\n d_model=128, 
num_heads=8, ff_dim=256, num_transformer_blocks=4,\n mlp_units=[256, 128, 64], dropout=0.2):\n super(OliveOilTransformer, self).__init__()\n \n # Input layers\n self.temporal_input = tf.keras.layers.Input(shape=temporal_shape, name='temporal_input')\n self.static_input = tf.keras.layers.Input(shape=static_shape, name='static_input')\n \n # Input normalization\n self.temporal_normalization = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n self.static_normalization = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n \n # Data Augmentation\n self.temporal_augmentation = TemporalAugmentation(noise_factor=0.03)\n \n # Temporal path\n self.temporal_projection = tf.keras.Sequential([\n tf.keras.layers.Dense(d_model//2, activation='gelu',\n kernel_regularizer=tf.keras.regularizers.l2(1e-5)),\n tf.keras.layers.Dropout(dropout),\n tf.keras.layers.Dense(d_model, activation='gelu',\n kernel_regularizer=tf.keras.regularizers.l2(1e-5))\n ])\n \n self.pos_encoding = PositionalEncoding(position=temporal_shape[0], d_model=d_model)\n \n # Transformer blocks\n self.transformer_blocks = [\n EnhancedTransformerBlock(d_model, num_heads, ff_dim, dropout)\n for _ in range(num_transformer_blocks)\n ]\n \n # Temporal pooling\n self.temporal_pooling = TemporalPoolingLayer(\n num_heads=num_heads,\n key_dim=d_model//4\n )\n \n # Static path\n self.static_encoder = tf.keras.Sequential([\n tf.keras.layers.Dense(256, activation='gelu',\n kernel_regularizer=tf.keras.regularizers.l2(1e-5)),\n tf.keras.layers.Dropout(dropout),\n tf.keras.layers.Dense(128, activation='gelu',\n kernel_regularizer=tf.keras.regularizers.l2(1e-5)),\n tf.keras.layers.Dropout(dropout),\n tf.keras.layers.Dense(64, activation='gelu',\n kernel_regularizer=tf.keras.regularizers.l2(1e-5))\n ])\n \n # Feature fusion\n self.fusion_layer = tf.keras.layers.Concatenate()\n \n # MLP head\n self.mlp_layers = []\n for units in mlp_units:\n self.mlp_layers.extend([\n tf.keras.layers.BatchNormalization(),\n tf.keras.layers.Dense(units, activation=\"gelu\",\n kernel_regularizer=tf.keras.regularizers.l2(1e-5)),\n tf.keras.layers.Dropout(dropout)\n ])\n \n # Output layer\n self.final_layer = tf.keras.layers.Dense(\n num_outputs,\n activation='linear',\n kernel_regularizer=tf.keras.regularizers.l2(1e-5)\n )\n \n # Build model\n temporal_encoded = self.encode_temporal(self.temporal_input, training=True)\n static_encoded = self.encode_static(self.static_input)\n combined = self.fusion_layer([temporal_encoded, static_encoded])\n \n x = combined\n for layer in self.mlp_layers:\n x = layer(x)\n \n outputs = self.final_layer(x)\n \n self._model = tf.keras.Model(\n inputs={'temporal': self.temporal_input, 'static': self.static_input},\n outputs=outputs\n )\n\n def encode_temporal(self, x, training=None):\n # Normalization e augmentation\n x = self.temporal_normalization(x)\n x = self.temporal_augmentation(x, training=training)\n \n # Projection e positional encoding\n x = self.temporal_projection(x)\n x = self.pos_encoding(x)\n \n # Transformer blocks\n skip_connection = x\n for transformer in self.transformer_blocks:\n x = transformer(x, training=training)\n x = tf.keras.layers.Add()([x, skip_connection])\n \n # Pooling\n return self.temporal_pooling(x)\n\n def encode_static(self, x):\n x = self.static_normalization(x)\n return self.static_encoder(x)\n\n def call(self, inputs, training=None):\n temporal_input = inputs['temporal']\n static_input = inputs['static']\n \n temporal_encoded = self.encode_temporal(temporal_input, training)\n static_encoded = 
self.encode_static(static_input)\n \n combined = self.fusion_layer([temporal_encoded, static_encoded])\n \n x = combined\n for layer in self.mlp_layers:\n x = layer(x, training=training)\n \n return self.final_layer(x)\n\n def model(self):\n return self._model\n\n\n# Configurazione del modello e training\ndef create_and_compile_model(temporal_shape, static_shape, num_outputs):\n model = OliveOilTransformer(\n temporal_shape=temporal_shape,\n static_shape=static_shape,\n num_outputs=num_outputs,\n d_model=128,\n num_heads=8,\n ff_dim=256,\n num_transformer_blocks=4,\n mlp_units=[256, 128, 64],\n dropout=0.2\n )\n\n class WarmUpLearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):\n def __init__(self, initial_learning_rate=1e-3, warmup_steps=1000, decay_steps=10000):\n super().__init__()\n self.initial_learning_rate = initial_learning_rate\n self.warmup_steps = warmup_steps\n self.decay_steps = decay_steps\n\n def __call__(self, step):\n warmup_pct = tf.cast(step, tf.float32) / self.warmup_steps\n warmup_lr = self.initial_learning_rate * warmup_pct\n\n decay_factor = tf.pow(0.1, tf.cast(step, tf.float32) / self.decay_steps)\n decayed_lr = self.initial_learning_rate * decay_factor\n\n final_lr = tf.where(step < self.warmup_steps, warmup_lr, decayed_lr)\n return final_lr\n\n def get_config(self):\n return {\n 'initial_learning_rate': self.initial_learning_rate,\n 'warmup_steps': self.warmup_steps,\n 'decay_steps': self.decay_steps\n }\n\n # Custom Metric per target\n class TargetSpecificMetric(tf.keras.callbacks.Callback):\n def __init__(self, validation_data, target_names):\n super().__init__()\n self.validation_data = validation_data\n self.target_names = target_names\n\n def on_epoch_end(self, epoch, logs={}):\n x_val, y_val = self.validation_data\n y_pred = self.model.predict(x_val)\n\n for i, name in enumerate(self.target_names):\n # Calcola MAE usando numpy\n mae = np.mean(np.abs(y_val[:, i] - y_pred[:, i]))\n logs[f'val_{name}_mae'] = mae\n\n # Target names per il monitoraggio specifico\n target_names = ['olive_prod', 'min_oil_prod', 'max_oil_prod', \n 'avg_oil_prod', 'total_water_need']\n\n # Learning rate schedule\n lr_schedule = WarmUpLearningRateSchedule(\n initial_learning_rate=1e-3,\n warmup_steps=500,\n decay_steps=5000\n )\n\n\n\n # Callbacks setup\n callbacks = [\n # Early Stopping migliorato\n tf.keras.callbacks.EarlyStopping(\n monitor='val_loss',\n patience=20,\n restore_best_weights=True,\n min_delta=0.0005,\n mode='min'\n ),\n\n # Model Checkpoint\n tf.keras.callbacks.ModelCheckpoint(\n filepath='/kaggle/working/transformer/best_model_{epoch:02d}_{val_loss:.4f}.keras',\n monitor='val_loss',\n save_best_only=True,\n mode='min'\n ),\n\n # Metric per target specifici\n TargetSpecificMetric(\n validation_data=(val_data, val_targets),\n target_names=target_names\n ),\n\n # TensorBoard logging\n tf.keras.callbacks.TensorBoard(\n log_dir='/kaggle/working/transformer/logs',\n histogram_freq=1,\n write_graph=True,\n update_freq='epoch'\n )\n ]\n\n # Ricompila il modello con il nuovo optimizer\n model.compile(\n optimizer=tf.keras.optimizers.AdamW(\n learning_rate=lr_schedule,\n weight_decay=0.01\n ),\n loss=tf.keras.losses.Huber(),\n metrics=['mae']\n )\n\n return model, callbacks\n\n\n# Creazione e compilazione del modello\nmodel, callbacks = create_and_compile_model(\n temporal_shape=(train_data['temporal'].shape[1], train_data['temporal'].shape[2]),\n static_shape=(train_data['static'].shape[1],),\n num_outputs=train_targets.shape[1]\n)\n\n# Mostra il 
summary\nmodel.model().summary()\n\n#tf.keras.utils.plot_model(model.model, \"/kaggle/working/models/oil_transformer_model.png\", show_shapes=True, show_layer_names=True)",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T20:35:41.836445Z",
"iopub.execute_input": "2024-10-28T20:35:41.836806Z",
"iopub.status.idle": "2024-10-28T20:35:43.296587Z",
"shell.execute_reply.started": "2024-10-28T20:35:41.836772Z",
"shell.execute_reply": "2024-10-28T20:35:43.295751Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
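{
"cell_type": "markdown",
"source": "A quick added check of the `PositionalEncoding` layer defined above: it builds the encoding for the 41-step temporal window and `d_model=128` used by the model and visualises it as a heatmap.",
"metadata": {}
},
{
"cell_type": "code",
"source": "import matplotlib.pyplot as plt\n\n# Build the sinusoidal encoding for a 41-year window with d_model = 128\npe_layer = PositionalEncoding(position=41, d_model=128)\npe = pe_layer.pos_encoding[0].numpy()  # shape: (41, 128)\n\nplt.figure(figsize=(10, 4))\nplt.imshow(pe, aspect='auto', cmap='coolwarm')\nplt.colorbar(label='encoding value')\nplt.xlabel('d_model dimension')\nplt.ylabel('time step (year in the window)')\nplt.title('Sinusoidal positional encoding (41 x 128)')\nplt.show()",
"metadata": {
"trusted": true
},
"outputs": [],
"execution_count": null
},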
{
"cell_type": "markdown",
"source": "## Model Training",
"metadata": {}
},
{
"cell_type": "code",
"source": "# Esegui il training\nhistory = model.fit(\n x=train_data,\n y=train_targets,\n validation_data=(val_data, val_targets),\n epochs=150,\n batch_size=64,\n callbacks=callbacks,\n verbose=1,\n shuffle=True\n)",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T20:36:16.203049Z",
"iopub.execute_input": "2024-10-28T20:36:16.203987Z",
"iopub.status.idle": "2024-10-28T21:38:49.845072Z",
"shell.execute_reply.started": "2024-10-28T20:36:16.203943Z",
"shell.execute_reply": "2024-10-28T21:38:49.844267Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
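{
"cell_type": "markdown",
"source": "Added sketch: plot the curves recorded in `history` to check training/validation behaviour (Huber loss and MAE on the standardised targets).",
"metadata": {}
},
{
"cell_type": "code",
"source": "import matplotlib.pyplot as plt\n\nfig, axes = plt.subplots(1, 2, figsize=(14, 5))\n\naxes[0].plot(history.history['loss'], label='train')\naxes[0].plot(history.history['val_loss'], label='validation')\naxes[0].set_title('Huber loss')\naxes[0].set_xlabel('epoch')\naxes[0].legend()\n\naxes[1].plot(history.history['mae'], label='train')\naxes[1].plot(history.history['val_mae'], label='validation')\naxes[1].set_title('MAE (standardised targets)')\naxes[1].set_xlabel('epoch')\naxes[1].legend()\n\nplt.tight_layout()\nplt.show()",
"metadata": {
"trusted": true
},
"outputs": [],
"execution_count": null
},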
{
"cell_type": "code",
"source": "# Per denormalizzare e calcolare l'errore reale\ndef calculate_real_error(model, test_data, test_targets, scaler_y):\n # Fare predizioni\n predictions = model.predict(test_data)\n \n # Denormalizzare predizioni e target\n predictions_real = scaler_y.inverse_transform(predictions)\n targets_real = scaler_y.inverse_transform(test_targets)\n \n # Calcolare errore percentuale per ogni target\n percentage_errors = []\n absolute_errors = []\n \n for i in range(predictions_real.shape[1]):\n mae = np.mean(np.abs(predictions_real[:, i] - targets_real[:, i]))\n mape = np.mean(np.abs((predictions_real[:, i] - targets_real[:, i]) / targets_real[:, i])) * 100\n percentage_errors.append(mape)\n absolute_errors.append(mae)\n \n # Stampa risultati per ogni target\n target_names = ['olive_prod', 'min_oil_prod', 'max_oil_prod', 'avg_oil_prod', 'total_water_need']\n \n print(\"\\nErrori per target:\")\n print(\"-\" * 50)\n for i, target in enumerate(target_names):\n print(f\"{target}:\")\n print(f\"MAE assoluto: {absolute_errors[i]:.2f}\")\n print(f\"Errore percentuale medio: {percentage_errors[i]:.2f}%\")\n print(f\"Precisione: {100 - percentage_errors[i]:.2f}%\")\n print(\"-\" * 50)\n \n return percentage_errors, absolute_errors\n\n# Calcola gli errori reali\npercentage_errors, absolute_errors = calculate_real_error(model, val_data, val_targets, scaler_y)",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T21:38:49.848936Z",
"iopub.execute_input": "2024-10-28T21:38:49.849275Z",
"iopub.status.idle": "2024-10-28T21:39:06.761178Z",
"shell.execute_reply.started": "2024-10-28T21:38:49.849237Z",
"shell.execute_reply": "2024-10-28T21:39:06.759965Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
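{
"cell_type": "markdown",
"source": "To complement the aggregate errors above, this added sketch plots predicted versus actual values on the denormalised scale for each target, using the same validation data.",
"metadata": {}
},
{
"cell_type": "code",
"source": "import numpy as np\nimport matplotlib.pyplot as plt\n\npreds_real = scaler_y.inverse_transform(model.predict(val_data))\ntargets_real = scaler_y.inverse_transform(val_targets)\ntarget_names = ['olive_prod', 'min_oil_prod', 'max_oil_prod', 'avg_oil_prod', 'total_water_need']\n\nfig, axes = plt.subplots(1, 5, figsize=(25, 5))\nfor i, (ax, name) in enumerate(zip(axes, target_names)):\n    ax.scatter(targets_real[:, i], preds_real[:, i], alpha=0.3, s=10)\n    lims = [targets_real[:, i].min(), targets_real[:, i].max()]\n    ax.plot(lims, lims, 'r--', linewidth=1)  # ideal-prediction line\n    ax.set_title(name)\n    ax.set_xlabel('actual')\n    ax.set_ylabel('predicted')\nplt.tight_layout()\nplt.show()",
"metadata": {
"trusted": true
},
"outputs": [],
"execution_count": null
},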
{
"cell_type": "code",
"source": "def save_transformer_model(model, scaler_temporal, scaler_static, scaler_y, base_path='/kaggle/working/models/oli_transformer'):\n \"\"\"\n Salva il modello transformer e i suoi scaler.\n \n Parameters:\n -----------\n model : OliveOilTransformer\n Il modello transformer\n scaler_temporal : StandardScaler\n Scaler per i dati temporali\n scaler_static : StandardScaler\n Scaler per i dati statici\n scaler_y : StandardScaler\n Scaler per i target\n base_path : str\n Percorso base dove salvare il modello e gli scaler\n \"\"\"\n # Crea la cartella se non esiste\n os.makedirs(base_path, exist_ok=True)\n \n # Salva il modello\n model_path = os.path.join(base_path, 'olive_transformer.keras')\n model.save(model_path)\n \n # Salva gli scaler\n joblib.dump(scaler_temporal, os.path.join(base_path, 'scaler_temporal.joblib'))\n joblib.dump(scaler_static, os.path.join(base_path, 'scaler_static.joblib'))\n joblib.dump(scaler_y, os.path.join(base_path, 'scaler_y.joblib'))\n \n print(f\"Modello transformer e scaler salvati in: {base_path}\")\n\ndef load_transformer_model(base_path='/kaggle/working/models/oli_transformer'):\n \"\"\"\n Carica il modello transformer e i suoi scaler.\n \n Parameters:\n -----------\n base_path : str\n Percorso dove sono salvati il modello e gli scaler\n \n Returns:\n --------\n tuple\n (model, scaler_temporal, scaler_static, scaler_y)\n \"\"\"\n # Carica il modello\n model_path = os.path.join(base_path, 'olive_transformer.keras')\n model = tf.keras.models.load_model(model_path, custom_objects={\n 'WarmUpLearningRateSchedule': WarmUpLearningRateSchedule\n })\n \n # Carica gli scaler\n scaler_temporal = joblib.load(os.path.join(base_path, 'scaler_temporal.joblib'))\n scaler_static = joblib.load(os.path.join(base_path, 'scaler_static.joblib'))\n scaler_y = joblib.load(os.path.join(base_path, 'scaler_y.joblib'))\n \n print(f\"Modello transformer e scaler caricati da: {base_path}\")\n return model, scaler_temporal, scaler_static, scaler_y\n\n# Esempio di utilizzo:\n\n# Per salvare:\nsave_transformer_model(\n model=model,\n scaler_temporal=scaler_temporal,\n scaler_static=scaler_static,\n scaler_y=scaler_y,\n)\n\n# Per caricare:\n#model, scaler_temporal, scaler_static, scaler_y = load_transformer_model()\n",
"metadata": {
"execution": {
"iopub.status.busy": "2024-10-28T21:39:06.762613Z",
"iopub.execute_input": "2024-10-28T21:39:06.763155Z",
"iopub.status.idle": "2024-10-28T21:39:07.109425Z",
"shell.execute_reply.started": "2024-10-28T21:39:06.763103Z",
"shell.execute_reply": "2024-10-28T21:39:07.108352Z"
},
"trusted": true
},
"outputs": [],
"execution_count": null
},
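{
"cell_type": "markdown",
"source": "Finally, an added sketch of how the in-memory model and scalers can be used for inference on a new observation; the input arrays below are placeholders with the expected shapes (41-year temporal window, same static feature vector), not real data.",
"metadata": {}
},
{
"cell_type": "code",
"source": "import numpy as np\n\n# Placeholder inputs with the shapes expected by the model (not real observations)\nnew_temporal = np.random.rand(1, 41, 3)                        # (batch, window, temporal features)\nnew_static = np.random.rand(1, train_data['static'].shape[1])  # (batch, static features)\n\n# Apply the same scalers fitted during training\nnew_temporal_scaled = scaler_temporal.transform(new_temporal.reshape(-1, 3)).reshape(new_temporal.shape)\nnew_static_scaled = scaler_static.transform(new_static)\n\n# Predict and bring the outputs back to the original units\npred_scaled = model.predict({'temporal': new_temporal_scaled, 'static': new_static_scaled})\npred = scaler_y.inverse_transform(pred_scaled)\nprint('Predicted [olive_prod, min_oil_prod, max_oil_prod, avg_oil_prod, total_water_need]:')\nprint(pred[0])",
"metadata": {
"trusted": true
},
"outputs": [],
"execution_count": null
},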
{
"cell_type": "markdown",
"source": "## 8. Conclusioni e Prossimi Passi\n\nIn questo notebook, abbiamo:\n1. Caricato e analizzato i dati meteorologici\n2. Simulato la produzione annuale di olive basata sui dati meteo\n3. Esplorato le relazioni tra variabili meteorologiche e produzione di olive\n4. Creato e valutato un modello di machine learning per prevedere la produzione\n5. Utilizzato ARIMA per fare previsioni meteo\n6. Previsto la produzione di olive per il prossimo anno\n\nProssimi passi:\n- Raccogliere dati reali sulla produzione di olive per sostituire i dati simulati\n- Esplorare modelli più avanzati, come le reti neurali o i modelli di ensemble\n- Incorporare altri fattori che potrebbero influenzare la produzione, come le pratiche agricole o l'età degli alberi\n- Sviluppare una dashboard interattiva basata su questo modello",
"metadata": {}
}
]
}