2422 lines
139 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "8adcbe0819b88578",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n",
"Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 InRelease\n",
"Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease \n",
"Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]\n",
"Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease\n",
"Get:6 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2738 kB]\n",
"Get:7 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1513 kB]\n",
"Fetched 4508 kB in 2s (2961 kB/s) \n",
"Reading package lists... Done\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"graphviz is already the newest version (2.42.2-6ubuntu0.1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 121 not upgraded.\n",
"Requirement already satisfied: tensorflow in /usr/local/lib/python3.11/dist-packages (2.14.0)\n",
"Requirement already satisfied: absl-py>=1.0.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (2.0.0)\n",
"Requirement already satisfied: astunparse>=1.6.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (1.6.3)\n",
"Requirement already satisfied: flatbuffers>=23.5.26 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (23.5.26)\n",
"Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (0.5.4)\n",
"Requirement already satisfied: google-pasta>=0.1.1 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (0.2.0)\n",
"Requirement already satisfied: h5py>=2.9.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (3.9.0)\n",
"Requirement already satisfied: libclang>=13.0.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (16.0.6)\n",
"Requirement already satisfied: ml-dtypes==0.2.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (0.2.0)\n",
"Requirement already satisfied: numpy>=1.23.5 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (1.26.0)\n",
"Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (3.3.0)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from tensorflow) (23.1)\n",
"Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (4.24.3)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.11/dist-packages (from tensorflow) (68.2.2)\n",
"Requirement already satisfied: six>=1.12.0 in /usr/lib/python3/dist-packages (from tensorflow) (1.16.0)\n",
"Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (2.3.0)\n",
"Requirement already satisfied: typing-extensions>=3.6.6 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (4.8.0)\n",
"Requirement already satisfied: wrapt<1.15,>=1.11.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (1.14.1)\n",
"Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (0.37.1)\n",
"Requirement already satisfied: grpcio<2.0,>=1.24.3 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (1.58.0)\n",
"Requirement already satisfied: tensorboard<2.15,>=2.14 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (2.14.0)\n",
"Requirement already satisfied: tensorflow-estimator<2.15,>=2.14.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (2.14.0)\n",
"Requirement already satisfied: keras<2.15,>=2.14.0 in /usr/local/lib/python3.11/dist-packages (from tensorflow) (2.14.0)\n",
"Requirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from astunparse>=1.6.0->tensorflow) (0.41.2)\n",
"Requirement already satisfied: google-auth<3,>=1.6.3 in /usr/local/lib/python3.11/dist-packages (from tensorboard<2.15,>=2.14->tensorflow) (2.23.1)\n",
"Requirement already satisfied: google-auth-oauthlib<1.1,>=0.5 in /usr/local/lib/python3.11/dist-packages (from tensorboard<2.15,>=2.14->tensorflow) (1.0.0)\n",
"Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.11/dist-packages (from tensorboard<2.15,>=2.14->tensorflow) (3.4.4)\n",
"Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.11/dist-packages (from tensorboard<2.15,>=2.14->tensorflow) (2.31.0)\n",
"Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from tensorboard<2.15,>=2.14->tensorflow) (0.7.1)\n",
"Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from tensorboard<2.15,>=2.14->tensorflow) (2.3.7)\n",
"Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.11/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.15,>=2.14->tensorflow) (5.3.1)\n",
"Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.11/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.15,>=2.14->tensorflow) (0.3.0)\n",
"Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.11/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.15,>=2.14->tensorflow) (4.9)\n",
"Requirement already satisfied: urllib3>=2.0.5 in /usr/local/lib/python3.11/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.15,>=2.14->tensorflow) (2.0.5)\n",
"Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from google-auth-oauthlib<1.1,>=0.5->tensorboard<2.15,>=2.14->tensorflow) (1.3.1)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests<3,>=2.21.0->tensorboard<2.15,>=2.14->tensorflow) (3.2.0)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests<3,>=2.21.0->tensorboard<2.15,>=2.14->tensorflow) (3.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests<3,>=2.21.0->tensorboard<2.15,>=2.14->tensorflow) (2023.7.22)\n",
"Requirement already satisfied: MarkupSafe>=2.1.1 in /usr/local/lib/python3.11/dist-packages (from werkzeug>=1.0.1->tensorboard<2.15,>=2.14->tensorflow) (2.1.3)\n",
"Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /usr/local/lib/python3.11/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard<2.15,>=2.14->tensorflow) (0.5.0)\n",
"Requirement already satisfied: oauthlib>=3.0.0 in /usr/lib/python3/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<1.1,>=0.5->tensorboard<2.15,>=2.14->tensorflow) (3.2.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (1.26.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (2.2.3)\n",
"Requirement already satisfied: numpy>=1.23.2 in /usr/local/lib/python3.11/dist-packages (from pandas) (1.26.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas) (2024.2)\n",
"Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Requirement already satisfied: keras in /usr/local/lib/python3.11/dist-packages (2.14.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.11/dist-packages (1.5.2)\n",
"Requirement already satisfied: numpy>=1.19.5 in /usr/local/lib/python3.11/dist-packages (from scikit-learn) (1.26.0)\n",
"Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn) (1.14.1)\n",
"Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn) (1.4.2)\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn) (3.5.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.11/dist-packages (3.8.0)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.1.1)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (0.11.0)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (4.42.1)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.4.5)\n",
"Requirement already satisfied: numpy<2,>=1.21 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.26.0)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (23.1)\n",
"Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (10.0.1)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (3.2.0)\n",
"Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (2.8.2)\n",
"Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (1.4.2)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Requirement already satisfied: pyarrow in /usr/local/lib/python3.11/dist-packages (18.1.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Requirement already satisfied: fastparquet in /usr/local/lib/python3.11/dist-packages (2024.11.0)\n",
"Requirement already satisfied: pandas>=1.5.0 in /usr/local/lib/python3.11/dist-packages (from fastparquet) (2.2.3)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from fastparquet) (1.26.0)\n",
"Requirement already satisfied: cramjam>=2.3 in /usr/local/lib/python3.11/dist-packages (from fastparquet) (2.9.0)\n",
"Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from fastparquet) (2024.10.0)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from fastparquet) (23.1)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.5.0->fastparquet) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.5.0->fastparquet) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.5.0->fastparquet) (2024.2)\n",
"Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas>=1.5.0->fastparquet) (1.16.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (1.14.1)\n",
"Requirement already satisfied: numpy<2.3,>=1.23.5 in /usr/local/lib/python3.11/dist-packages (from scipy) (1.26.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Requirement already satisfied: seaborn in /usr/local/lib/python3.11/dist-packages (0.13.2)\n",
"Requirement already satisfied: numpy!=1.24.0,>=1.20 in /usr/local/lib/python3.11/dist-packages (from seaborn) (1.26.0)\n",
"Requirement already satisfied: pandas>=1.2 in /usr/local/lib/python3.11/dist-packages (from seaborn) (2.2.3)\n",
"Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in /usr/local/lib/python3.11/dist-packages (from seaborn) (3.8.0)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.1.1)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.11.0)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.42.1)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.5)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (23.1)\n",
"Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (10.0.1)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.2.0)\n",
"Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.11/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.2->seaborn) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.2->seaborn) (2024.2)\n",
"Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.16.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (4.67.1)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Requirement already satisfied: pydot in /usr/local/lib/python3.11/dist-packages (3.0.2)\n",
"Requirement already satisfied: pyparsing>=3.0.9 in /usr/local/lib/python3.11/dist-packages (from pydot) (3.2.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Requirement already satisfied: tensorflow-io in /usr/local/lib/python3.11/dist-packages (0.37.1)\n",
"Requirement already satisfied: tensorflow-io-gcs-filesystem==0.37.1 in /usr/local/lib/python3.11/dist-packages (from tensorflow-io) (0.37.1)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Requirement already satisfied: tensorflow-addons in /usr/local/lib/python3.11/dist-packages (0.23.0)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from tensorflow-addons) (23.1)\n",
"Requirement already satisfied: typeguard<3.0.0,>=2.7 in /usr/local/lib/python3.11/dist-packages (from tensorflow-addons) (2.13.3)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n"
]
}
],
"source": [
"# from opt_einsum.paths import branch_1\n",
"!apt-get update\n",
"!apt-get install graphviz -y\n",
"\n",
"!pip install tensorflow\n",
"!pip install numpy\n",
"!pip install pandas\n",
"\n",
"!pip install keras\n",
"!pip install scikit-learn\n",
"!pip install matplotlib\n",
"!pip install joblib\n",
"!pip install pyarrow\n",
"!pip install fastparquet\n",
"!pip install scipy\n",
"!pip install seaborn\n",
"!pip install tqdm\n",
"!pip install pydot\n",
"!pip install tensorflow-io\n",
"!pip install tensorflow-addons"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e6fe6bb613168a8a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-11-27 13:56:39.957016: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2024-11-27 13:56:39.957067: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2024-11-27 13:56:39.957117: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2024-11-27 13:56:39.966205: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"/usr/local/lib/python3.11/dist-packages/tensorflow_addons/utils/tfa_eol_msg.py:23: UserWarning: \n",
"\n",
"TensorFlow Addons (TFA) has ended development and introduction of new features.\n",
"TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.\n",
"Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). \n",
"\n",
"For more information see: https://github.com/tensorflow/addons/issues/2807 \n",
"\n",
" warnings.warn(\n"
]
}
],
"source": [
"import tensorflow as tf\n",
"from tensorflow.keras.layers import (\n",
" Dense, LSTM, MultiHeadAttention, Dropout, BatchNormalization, \n",
" LayerNormalization, Input, Activation, Lambda, Bidirectional, \n",
" Add, MaxPooling1D, SpatialDropout1D, GlobalAveragePooling1D,\n",
" GlobalMaxPooling1D, Concatenate, ThresholdedReLU, Average,\n",
" Conv1D, Multiply\n",
")\n",
"from tensorflow.keras import regularizers\n",
"from tensorflow.keras.models import Model\n",
"from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau\n",
"from tensorflow.keras.optimizers import AdamW\n",
"from tensorflow.keras.metrics import AUC\n",
"from tensorflow.keras.utils import plot_model\n",
"\n",
"# Data processing and analysis\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import RobustScaler\n",
"from sklearn.metrics import (\n",
" mean_absolute_error, mean_squared_error, r2_score, \n",
" confusion_matrix, classification_report, roc_auc_score\n",
")\n",
"\n",
"# Visualization\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Additional utilities\n",
"import tensorflow_addons as tfa\n",
"from scipy import stats\n",
"import json\n",
"from datetime import datetime\n",
"import os\n",
"import joblib\n",
"\n",
"folder_name = datetime.now().strftime(\"%Y-%m-%d_%H-%M\")\n",
"\n",
"random_state_value = None"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3da8b15c7eb9833f",
"metadata": {},
"outputs": [],
"source": [
"def get_season(date):\n",
" month = date.month\n",
" day = date.day\n",
" if (month == 12 and day >= 21) or (month <= 3 and day < 20):\n",
" return 'Winter'\n",
" elif (month == 3 and day >= 20) or (month <= 6 and day < 21):\n",
" return 'Spring'\n",
" elif (month == 6 and day >= 21) or (month <= 9 and day < 23):\n",
" return 'Summer'\n",
" elif (month == 9 and day >= 23) or (month <= 12 and day < 21):\n",
" return 'Autumn'\n",
" else:\n",
" return 'Unknown'\n",
"\n",
"\n",
"def get_time_period(hour):\n",
" if 5 <= hour < 12:\n",
" return 'Morning'\n",
" elif 12 <= hour < 17:\n",
" return 'Afternoon'\n",
" elif 17 <= hour < 21:\n",
" return 'Evening'\n",
" else:\n",
" return 'Night'\n",
"\n",
"\n",
"def add_time_features(df):\n",
" df['datetime'] = pd.to_datetime(df['datetime'])\n",
" df['timestamp'] = df['datetime'].astype(np.int64) // 10 ** 9\n",
" df['year'] = df['datetime'].dt.year\n",
" df['month'] = df['datetime'].dt.month\n",
" df['day'] = df['datetime'].dt.day\n",
" df['hour'] = df['datetime'].dt.hour\n",
" df['minute'] = df['datetime'].dt.minute\n",
" df['hour_sin'] = np.sin(df['hour'] * (2 * np.pi / 24))\n",
" df['hour_cos'] = np.cos(df['hour'] * (2 * np.pi / 24))\n",
" df['day_of_week'] = df['datetime'].dt.dayofweek\n",
" df['day_of_year'] = df['datetime'].dt.dayofyear\n",
" df['week_of_year'] = df['datetime'].dt.isocalendar().week.astype(int)\n",
" df['quarter'] = df['datetime'].dt.quarter\n",
" df['is_month_end'] = df['datetime'].dt.is_month_end.astype(int)\n",
" df['is_quarter_end'] = df['datetime'].dt.is_quarter_end.astype(int)\n",
" df['is_year_end'] = df['datetime'].dt.is_year_end.astype(int)\n",
" df['month_sin'] = np.sin(df['month'] * (2 * np.pi / 12))\n",
" df['month_cos'] = np.cos(df['month'] * (2 * np.pi / 12))\n",
" df['day_of_year_sin'] = np.sin(df['day_of_year'] * (2 * np.pi / 365.25))\n",
" df['day_of_year_cos'] = np.cos(df['day_of_year'] * (2 * np.pi / 365.25))\n",
" df['season'] = df['datetime'].apply(get_season)\n",
" df['time_period'] = df['hour'].apply(get_time_period)\n",
" return df\n",
"\n",
"\n",
"def add_solar_features(df):\n",
" # Features based only on radiation and other available variables\n",
" df['solar_elevation'] = np.sin(df['day_of_year'] * (2 * np.pi / 365.25)) * np.sin(df['hour'] * (2 * np.pi / 24))\n",
"\n",
" # Energy-specific features\n",
" df['radiation_clearsky'] = df['solarradiation'] * (100 - df['cloudcover']) / 100\n",
"\n",
" # Temperature impact on theoretical efficiency\n",
" df['temp_efficiency_factor'] = 1 - 0.004 * (df['temp'] - 25) # Typical temperature coefficient\n",
"\n",
" # Combined features\n",
" df['cloud_impact'] = df['cloudcover'] * df['solarradiation']\n",
" df['visibility_radiation'] = df['visibility'] * df['solarradiation']\n",
" df['clear_sky_index'] = (100 - df['cloudcover']) / 100\n",
" df['temp_effect'] = df['temp'] - df['tempmin']\n",
"\n",
" return df\n",
"\n",
"def add_solar_specific_features(df):\n",
"    \"\"\"\n",
"    Add features aimed at solar radiation prediction by combining\n",
"    astronomical proxies with the meteorological columns.\n",
"\n",
"    ``df`` is modified in place and returned. Columns read: ``day_of_year``,\n",
"    ``hour``, ``hour_sin``, ``day_of_year_sin``, ``cloudcover``, ``visibility``,\n",
"    ``uvindex``, ``pressure``, ``humidity`` and ``temp``.\n",
"    \"\"\"\n",
"    day_number = df['day_of_year']\n",
"    cloud = df['cloudcover']\n",
"    visibility = df['visibility']\n",
"    uv = df['uvindex']\n",
"    temperature = df['temp']\n",
"\n",
"    # Astronomical proxies\n",
"    df['day_length'] = 12 + 3 * np.sin(2 * np.pi * (day_number - 81) / 365.25)\n",
"    df['solar_noon'] = np.abs(12 - df['hour'])  # absolute distance of the hour from 12\n",
"    elevation = np.sin(2 * np.pi * day_number / 365.25) * np.cos(2 * np.pi * df['solar_noon'] / 24)\n",
"    df['solar_elevation'] = elevation\n",
"\n",
"    # Theoretical solar angle built from the cyclical encodings\n",
"    df['solar_angle'] = np.sin(df['hour_sin']) * np.sin(df['day_of_year_sin'])\n",
"\n",
"    # Interactions with atmospheric conditions\n",
"    df['cloud_elevation'] = cloud * elevation\n",
"    df['visibility_elevation'] = visibility * elevation\n",
"    df['uv_cloud_interaction'] = uv * (100 - cloud) / 100\n",
"\n",
"    # Clearness and transmission indices\n",
"    df['clearness_index'] = (100 - cloud) * visibility / 10000\n",
"    pressure_ratio = df['pressure'] / 1013.25\n",
"    df['atmospheric_attenuation'] = pressure_ratio * (1 - (df['humidity'] / 100) * 0.6)\n",
"\n",
"    # Theoretical radiation and its attenuated counterpart\n",
"    df['theoretical_radiation'] = df['solar_angle'].clip(0, 1) * 1000\n",
"    df['expected_radiation'] = df['theoretical_radiation'] * df['clearness_index']\n",
"\n",
"    # 12-row rolling means (the first 11 rows are NaN)\n",
"    rolling_sources = (\n",
"        ('cloud_rolling_12h', cloud),\n",
"        ('temp_rolling_12h', temperature),\n",
"        ('uv_rolling_12h', uv),\n",
"    )\n",
"    for target_name, series in rolling_sources:\n",
"        df[target_name] = series.rolling(window=12).mean()\n",
"\n",
"    # Temperature-radiation interaction\n",
"    df['temp_radiation_potential'] = temperature * elevation\n",
"\n",
"    return df\n",
"\n",
"def add_radiation_energy_features(df):\n",
"    \"\"\"Derive rolling, lagged and interaction features from ``solarenergy`` and ``uvindex``.\n",
"\n",
"    ``df`` is modified in place and returned. It needs a datetime-typed\n",
"    ``datetime`` column plus ``solarenergy``, ``uvindex``, ``cloudcover`` and\n",
"    ``temp``. Window and lag sizes are row counts; the column names label them\n",
"    as hours.\n",
"    \"\"\"\n",
"    energy = df['solarenergy']\n",
"    uv = df['uvindex']\n",
"\n",
"    # Energy-to-UV ratio; the epsilon keeps the denominator away from zero\n",
"    df['energy_uv_ratio'] = energy / (uv + 1e-6)\n",
"\n",
"    # Moving averages\n",
"    for span in (3, 6, 12, 24):\n",
"        df[f'energy_rolling_mean_{span}h'] = energy.rolling(window=span).mean()\n",
"        df[f'uv_rolling_mean_{span}h'] = uv.rolling(window=span).mean()\n",
"\n",
"    # Per-calendar-day aggregates, broadcast back onto every row of that day\n",
"    calendar_day = df['datetime'].dt.date\n",
"    df['energy_daily_sum'] = df.groupby(calendar_day)['solarenergy'].transform('sum')\n",
"    df['uv_daily_max'] = df.groupby(calendar_day)['uvindex'].transform('max')\n",
"\n",
"    # Row-to-row changes\n",
"    df['energy_change'] = energy.diff()\n",
"    df['uv_change'] = uv.diff()\n",
"\n",
"    # Lagged copies\n",
"    for step in (1, 2, 3, 6, 12, 24):\n",
"        df[f'energy_lag_{step}h'] = energy.shift(step)\n",
"        df[f'uv_lag_{step}h'] = uv.shift(step)\n",
"\n",
"    # Peak flags: value exceeds 1.2 times its 6-row rolling mean\n",
"    df['is_energy_peak'] = (energy > df['energy_rolling_mean_6h'] * 1.2).astype(int)\n",
"    df['is_uv_peak'] = (uv > df['uv_rolling_mean_6h'] * 1.2).astype(int)\n",
"\n",
"    # Volatility: 24-row standard deviation of the changes\n",
"    df['energy_volatility'] = df['energy_change'].rolling(window=24).std()\n",
"    df['uv_volatility'] = df['uv_change'].rolling(window=24).std()\n",
"\n",
"    cloud = df['cloudcover']\n",
"\n",
"    # Composite solar intensity index\n",
"    df['solar_intensity_index'] = (energy * uv) / (cloud + 1e-6)\n",
"\n",
"    # Interactions\n",
"    df['uv_cloud_interaction'] = uv * (100 - cloud) / 100\n",
"    df['energy_temp_interaction'] = energy * df['temp']\n",
"\n",
"    return df\n",
"\n",
"def add_atmospheric_features(df):\n",
"    \"\"\"Add air-mass, stability and vapour-pressure columns to ``df`` (in place) and return it.\n",
"\n",
"    Columns read: ``solar_elevation``, ``temp``, ``humidity`` and ``pressure``.\n",
"    \"\"\"\n",
"    temperature = df['temp']\n",
"    humidity = df['humidity']\n",
"\n",
"    # Air mass index: relative optical path of sunlight through the atmosphere.\n",
"    # NOTE(review): ``solar_elevation`` is passed through np.radians as if it were\n",
"    # in degrees - confirm this matches how the column is produced upstream.\n",
"    zenith = 90 - df['solar_elevation']\n",
"    df['air_mass_index'] = 1 / (np.cos(np.radians(zenith)) + 0.50572 *\n",
"                                (96.07995 - zenith)**-1.6364)\n",
"\n",
"    # Atmospheric stability index combining temperature, humidity and pressure\n",
"    df['atmospheric_stability'] = (temperature * (100 - humidity)) / df['pressure']\n",
"\n",
"    # Vapour pressure deficit (relevant for diffuse radiation)\n",
"    saturation = 0.6108 * np.exp(17.27 * temperature / (temperature + 237.3))\n",
"    df['saturation_vapor_pressure'] = saturation\n",
"    df['actual_vapor_pressure'] = saturation * (humidity / 100)\n",
"    df['vapor_pressure_deficit'] = saturation - df['actual_vapor_pressure']\n",
"\n",
"    return df\n",
"\n",
"def add_diffusion_features(df):\n",
"    \"\"\"Add diffusion-related radiation columns to ``df`` (in place) and return it.\n",
"\n",
"    Columns read: ``cloudcover``, ``humidity``, ``solarradiation`` and ``visibility``.\n",
"    \"\"\"\n",
"    cloud = df['cloudcover']\n",
"    humidity = df['humidity']\n",
"    radiation = df['solarradiation']\n",
"\n",
"    # Diffusion index: product of the two percentages rescaled by 10000\n",
"    diffuse_share = (cloud * humidity) / 10000\n",
"    df['diffusion_index'] = diffuse_share\n",
"\n",
"    # Split of the measured radiation into a direct and a diffuse part\n",
"    df['direct_radiation'] = radiation * (1 - diffuse_share)\n",
"    df['diffuse_radiation'] = radiation * diffuse_share\n",
"\n",
"    # Atmospheric transparency factor\n",
"    cloud_term = 1 - cloud/100\n",
"    visibility_term = df['visibility']/10\n",
"    humidity_term = 1 - humidity/200\n",
"    df['atmospheric_transmittance'] = cloud_term * visibility_term * humidity_term\n",
"\n",
"    return df\n",
"\n",
"def calculate_trend(x):\n",
"    \"\"\"Return the slope of the least-squares line through ``x`` plotted against 0..len(x)-1.\n",
"\n",
"    Returns ``np.nan`` when the fit cannot be computed (for example an empty\n",
"    window or a linear-algebra failure inside ``np.polyfit``).\n",
"    \"\"\"\n",
"    try:\n",
"        return np.polyfit(np.arange(len(x)), x, 1)[0]\n",
"    except (TypeError, ValueError, np.linalg.LinAlgError):\n",
"        # Only fit failures become NaN; KeyboardInterrupt and SystemExit propagate\n",
"        return np.nan\n",
"\n",
"def add_persistence_features(df):\n",
"    \"\"\"Return a copy of ``df`` extended with radiation trend and volatility columns.\n",
"\n",
"    The input frame is left untouched. Only ``solarradiation`` is read.\n",
"    \"\"\"\n",
"    result = df.copy()\n",
"    radiation = result['solarradiation']\n",
"\n",
"    # Linear-fit slope over each fully populated trailing window\n",
"    for span in (3, 6, 12, 24):\n",
"        full_windows = radiation.rolling(window=span, min_periods=span)\n",
"        result[f'radiation_trend_{span}h'] = full_windows.apply(calculate_trend, raw=True)\n",
"\n",
"    # Standard deviation relative to the mean over up to 24 rows; the mean is\n",
"    # floored at 1e-10 before dividing\n",
"    day_window = radiation.rolling(24, min_periods=1)\n",
"    floored_mean = day_window.mean().clip(lower=1e-10)\n",
"    result['radiation_volatility'] = day_window.std() / floored_mean\n",
"\n",
"    return result\n",
"\n",
"def add_weather_pattern_features(df):\n",
"    \"\"\"Add daily clear-sky counts and 12-row stability ratios to ``df`` (in place).\n",
"\n",
"    Columns read: a datetime-typed ``datetime`` column, ``temp``, ``humidity``\n",
"    and ``cloudcover``. Returns the same frame.\n",
"\n",
"    Each ``<col>_stability`` is rolling std divided by rolling mean. When the\n",
"    rolling mean is exactly 0 that ratio is +/-inf; such values are stored as\n",
"    NaN so that later NaN handling (interpolation / fillna) also covers them.\n",
"    \"\"\"\n",
"    # Daily pattern: number of rows per calendar day with cloud cover below 30\n",
"    df['clear_sky_duration'] = df.groupby(df['datetime'].dt.date)['cloudcover'].transform(\n",
"        lambda x: (x < 30).sum()\n",
"    )\n",
"\n",
"    # Stability of conditions over the trailing 12 rows\n",
"    for col in ['temp', 'humidity', 'cloudcover']:\n",
"        window = df[col].rolling(12)\n",
"        ratio = window.std() / window.mean()\n",
"        # A zero mean with non-zero spread would otherwise leave infinities behind\n",
"        df[f'{col}_stability'] = ratio.replace([np.inf, -np.inf], np.nan)\n",
"\n",
"    # Weather variability index: plain average of the three ratios\n",
"    df['weather_variability_index'] = (df['temp_stability'] +\n",
"                                       df['humidity_stability'] +\n",
"                                       df['cloudcover_stability']) / 3\n",
"\n",
"    return df\n",
"\n",
"def add_efficiency_features(df):\n",
"    \"\"\"Add loss and efficiency estimates to ``df`` (in place) and return it.\n",
"\n",
"    Columns read: ``temp``, ``humidity``, ``pressure``, ``solarradiation`` and\n",
"    the previously computed ``atmospheric_transmittance``.\n",
"    \"\"\"\n",
"    # Temperature losses: 0.4% per degree above 25 C, none below\n",
"    heat_loss = 0.004 * (df['temp'] - 25).clip(lower=0)\n",
"    df['temp_losses'] = heat_loss\n",
"\n",
"    # Dust/soiling losses (estimate based on humidity and pressure)\n",
"    soiling = 0.002 * (df['humidity']/100) * (df['pressure']/1013.25)\n",
"    df['soiling_loss_factor'] = soiling\n",
"\n",
"    # Overall estimated efficiency\n",
"    df['estimated_efficiency'] = (\n",
"        (1 - heat_loss) * (1 - soiling) * df['atmospheric_transmittance']\n",
"    )\n",
"\n",
"    # Production potential\n",
"    df['production_potential'] = df['solarradiation'] * df['estimated_efficiency']\n",
"\n",
"    return df\n",
"\n",
"def add_advanced_seasonal_features(df):\n",
"    \"\"\"Add seasonal deviation/intensity columns and a daylight flag to ``df`` (in place).\n",
"\n",
"    Columns read: ``day_length`` (hours), ``day_of_year``, ``solar_elevation``\n",
"    and ``hour``. Returns the same frame.\n",
"\n",
"    ``daylight_correction`` is 1 for hours strictly between sunrise and sunset\n",
"    and 0 otherwise, where the daylight window of ``day_length`` hours is\n",
"    assumed to be centred on 12:00.\n",
"    \"\"\"\n",
"    # Difference from the average day length\n",
"    avg_day_length = 12\n",
"    df['day_length_deviation'] = df['day_length'] - avg_day_length\n",
"\n",
"    # Seasonal intensity\n",
"    df['seasonal_intensity'] = np.sin(2 * np.pi * (df['day_of_year'] - 172) / 365.25)\n",
"\n",
"    # Seasonality index\n",
"    df['seasonality_index'] = df['seasonal_intensity'] * df['solar_elevation']\n",
"\n",
"    # Sunrise/sunset correction. Comparing the clock hour with the day length\n",
"    # itself marks every hour as night whenever day_length <= 12, so the hour is\n",
"    # compared with the sunrise/sunset times derived from the day length instead.\n",
"    half_day = df['day_length'] / 2\n",
"    sunrise = 12 - half_day\n",
"    sunset = 12 + half_day\n",
"    df['daylight_correction'] = np.where(\n",
"        (df['hour'] > sunrise) & (df['hour'] < sunset),\n",
"        1,\n",
"        0\n",
"    )\n",
"\n",
"    return df\n",
"\n",
"def add_basic_interactions(df):\n",
"    \"\"\"\n",
"    Add the basic pairwise interactions between the weather variables.\n",
"\n",
"    ``df`` is modified in place and returned. Columns read: ``temp``,\n",
"    ``humidity``, ``cloudcover`` and ``visibility``.\n",
"    \"\"\"\n",
"    temperature = df['temp']\n",
"    humidity = df['humidity']\n",
"    cloud = df['cloudcover']\n",
"    visibility = df['visibility']\n",
"\n",
"    # Plain products\n",
"    df['temp_humidity'] = temperature * humidity\n",
"    df['temp_cloudcover'] = temperature * cloud\n",
"    df['visibility_cloudcover'] = visibility * cloud\n",
"    df['temp_humidity_interaction'] = temperature * humidity / 100\n",
"\n",
"    # Clear sky and atmospheric transparency\n",
"    cloud_free_percent = 100 - cloud\n",
"    df['clear_sky_factor'] = cloud_free_percent / 100\n",
"    df['atmospheric_transparency'] = cloud_free_percent * (visibility / 10)\n",
"\n",
"    return df\n",
"\n",
"def add_rolling_and_lag_features(df):\n",
"    \"\"\"\n",
"    Add 6-row rolling means and one-row lags of the weather columns.\n",
"\n",
"    ``df`` is modified in place and returned. Columns read: ``temp``,\n",
"    ``cloudcover`` and ``humidity``.\n",
"    \"\"\"\n",
"    # Rolling means (NaN until six rows are available)\n",
"    for column in ('temp', 'cloudcover'):\n",
"        df[f'{column}_rolling_mean_6h'] = df[column].rolling(window=6).mean()\n",
"\n",
"    # Previous-row values (first row is NaN)\n",
"    for column in ('temp', 'cloudcover', 'humidity'):\n",
"        df[f'{column}_1h_lag'] = df[column].shift(1)\n",
"\n",
"    return df\n",
"\n",
"def add_condition_indicators(df):\n",
"    \"\"\"\n",
"    Add a 0/1 flag for rows that are both hot and dry.\n",
"\n",
"    A row is flagged when ``temp`` is above the column's 75th percentile and\n",
"    ``humidity`` is below the column's 25th percentile. ``df`` is modified in\n",
"    place and returned.\n",
"    \"\"\"\n",
"    hot_cutoff = df['temp'].quantile(0.75)\n",
"    dry_cutoff = df['humidity'].quantile(0.25)\n",
"\n",
"    is_hot = df['temp'] > hot_cutoff\n",
"    is_dry = df['humidity'] < dry_cutoff\n",
"    df['extreme_conditions'] = (is_hot & is_dry).astype(int)\n",
"\n",
"    return df\n",
"\n",
"def add_physics_based_conversion_features(df):\n",
"    \"\"\"\n",
"    Add features relating measured radiation to measured energy.\n",
"\n",
"    ``df`` is modified in place and returned. Columns read: ``solarradiation``\n",
"    and ``solarenergy``.\n",
"    \"\"\"\n",
"    measured_energy = df['solarenergy']\n",
"\n",
"    # Radiation expressed as energy per hour (1 W = 1 J/s = 0.0036 MJ/h)\n",
"    theoretical_energy = df['solarradiation'] * 0.0036\n",
"    df['radiation_to_energy'] = theoretical_energy\n",
"\n",
"    # Measured vs theoretical energy; the denominator is floored at 1e-6\n",
"    df['conversion_efficiency_ratio'] = measured_energy / theoretical_energy.clip(lower=1e-6)\n",
"\n",
"    # Energy accumulated over the trailing 24 rows\n",
"    df['energy_integral'] = theoretical_energy.rolling(window=24).sum()\n",
"\n",
"    # Gap between theoretical and measured energy\n",
"    df['energy_conversion_gap'] = theoretical_energy - measured_energy\n",
"\n",
"    # System performance index (same formula as conversion_efficiency_ratio)\n",
"    df['system_performance_ratio'] = measured_energy / theoretical_energy.clip(lower=1e-6)\n",
"\n",
"    return df\n",
"\n",
"def add_advanced_features(df):\n",
"    \"\"\"\n",
"    Run every feature-engineering step on ``df`` and one-hot encode the\n",
"    ``season`` and ``time_period`` columns.\n",
"\n",
"    The steps run in the order listed below; later steps read columns that\n",
"    earlier ones create. Returns the resulting DataFrame.\n",
"    \"\"\"\n",
"    steps = (\n",
"        # 1. Basic temporal features\n",
"        add_time_features,\n",
"        # 2. Solar and meteorological features\n",
"        add_solar_features,\n",
"        add_solar_specific_features,\n",
"        add_radiation_energy_features,\n",
"        # 3. Atmospheric and diffusion features\n",
"        add_atmospheric_features,\n",
"        add_diffusion_features,\n",
"        # 4. Persistence and pattern features\n",
"        add_persistence_features,\n",
"        add_weather_pattern_features,\n",
"        # 5. Efficiency and seasonality features\n",
"        add_efficiency_features,\n",
"        add_advanced_seasonal_features,\n",
"        # 6. Interactions and derived features\n",
"        add_basic_interactions,\n",
"        add_rolling_and_lag_features,\n",
"        add_condition_indicators,\n",
"        # 7. Physical conversion features\n",
"        add_physics_based_conversion_features,\n",
"    )\n",
"    for step in steps:\n",
"        df = step(df)\n",
"\n",
"    # 8. One-hot encoding of the categorical features\n",
"    return pd.get_dummies(df, columns=['season', 'time_period'])\n",
"\n",
"\n",
"def prepare_advanced_data(df):\n",
"    \"\"\"\n",
"    Prepare scaled train/test matrices for the solar-energy model.\n",
"\n",
"    Steps: chronological ordering, feature engineering, selection of a fixed\n",
"    feature list, datetime indexing, gap filling, split at year 2010,\n",
"    unshuffled train/test split and robust scaling.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Input data. Needs a ``datetime`` column or a DatetimeIndex, and, after\n",
"        feature engineering, the columns ``year`` and ``solarenergy``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple\n",
"        ``(X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled,\n",
"        scaler_X, scaler_y, final_features, X_to_predict_scaled)`` where the\n",
"        training data are rows with ``year >= 2010`` and ``X_to_predict_scaled``\n",
"        holds the rows with ``year < 2010``.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If neither a ``datetime`` column nor a DatetimeIndex is available.\n",
"    KeyError\n",
"        If an engineered (non one-hot) feature is missing from the frame.\n",
"    \"\"\"\n",
"    # Work on a copy so the caller's frame is not modified\n",
"    df = df.copy()\n",
"\n",
"    # Rolling, lag and diff features are positional, so rows must already be in\n",
"    # chronological order when the feature functions run\n",
"    if 'datetime' in df.columns:\n",
"        df['datetime'] = pd.to_datetime(df['datetime'])\n",
"        df = df.sort_values('datetime', kind='mergesort')\n",
"    elif isinstance(df.index, pd.DatetimeIndex):\n",
"        df = df.sort_index(kind='mergesort')\n",
"\n",
"    # Apply feature engineering functions\n",
"    df = add_advanced_features(df)\n",
"\n",
"    # NOTE(review): several selected features (energy_rolling_mean_6h,\n",
"    # energy_volatility, energy_lag_1h, system_performance_ratio,\n",
"    # conversion_efficiency_ratio) are computed from 'solarenergy', which is\n",
"    # also the target below - confirm this leakage is intended.\n",
"    features = {\n",
"        # Primary Features (strong direct correlation)\n",
"        'primary_features': [\n",
"            'uvindex', 'cloudcover', 'visibility', 'temp',\n",
"            'pressure', 'humidity', 'solarradiation'\n",
"        ],\n",
"\n",
"        # Astronomical and Temporal Features\n",
"        'astronomical_features': [\n",
"            'solar_elevation', 'solar_angle', 'day_length',\n",
"            'hour_sin', 'hour_cos',\n",
"            'day_of_year_sin', 'day_of_year_cos',\n",
"            'month_sin', 'month_cos',\n",
"            'solar_noon', 'daylight_correction'\n",
"        ],\n",
"\n",
"        # Key Indices and Interactions\n",
"        'key_interactions': [\n",
"            'clear_sky_index', 'atmospheric_attenuation',\n",
"            'theoretical_radiation', 'expected_radiation',\n",
"            'cloud_elevation', 'visibility_elevation',\n",
"            'uv_cloud_interaction', 'temp_radiation_potential',\n",
"            'air_mass_index', 'atmospheric_stability',\n",
"            'vapor_pressure_deficit', 'diffusion_index',\n",
"            'atmospheric_transmittance', 'temp_humidity_interaction',\n",
"            'clear_sky_factor'\n",
"        ],\n",
"\n",
"        # Rolling Features (temporal trends)\n",
"        'rolling_features': [\n",
"            'cloud_rolling_12h', 'temp_rolling_12h', 'uv_rolling_12h',\n",
"            'cloudcover_rolling_mean_6h', 'temp_rolling_mean_6h',\n",
"            'energy_rolling_mean_6h', 'uv_rolling_mean_6h',\n",
"            'energy_volatility', 'uv_volatility'\n",
"        ],\n",
"\n",
"        # Lag Features\n",
"        'lag_features': [\n",
"            'temp_1h_lag', 'cloudcover_1h_lag', 'humidity_1h_lag',\n",
"            'energy_lag_1h', 'uv_lag_1h'\n",
"        ],\n",
"\n",
"        # Efficiency and Performance Features\n",
"        'efficiency_features': [\n",
"            'temp_losses', 'soiling_loss_factor', 'estimated_efficiency',\n",
"            'production_potential', 'system_performance_ratio',\n",
"            'conversion_efficiency_ratio'\n",
"        ],\n",
"\n",
"        # Weather Pattern Features\n",
"        'weather_pattern_features': [\n",
"            'clear_sky_duration', 'weather_variability_index',\n",
"            'temp_stability', 'humidity_stability', 'cloudcover_stability'\n",
"        ],\n",
"\n",
"        # Categorical Features\n",
"        'categorical_features': [\n",
"            'season_Spring', 'season_Summer', 'season_Autumn', 'season_Winter',\n",
"            'time_period_Morning', 'time_period_Afternoon',\n",
"            'time_period_Evening', 'time_period_Night'\n",
"        ]\n",
"    }\n",
"\n",
"    final_features = [feature for group in features.values() for feature in group]\n",
"\n",
"    # get_dummies only creates columns for categories present in the data;\n",
"    # add the absent ones as all-zero so the feature layout stays fixed\n",
"    for dummy in features['categorical_features']:\n",
"        if dummy not in df.columns:\n",
"            df[dummy] = 0\n",
"\n",
"    missing = [name for name in final_features if name not in df.columns]\n",
"    if missing:\n",
"        raise KeyError(f\"Engineered features missing from DataFrame: {missing}\")\n",
"\n",
"    if not isinstance(df.index, pd.DatetimeIndex):\n",
"        if 'datetime' in df.columns:\n",
"            df['datetime'] = pd.to_datetime(df['datetime'])\n",
"            df = df.set_index('datetime')\n",
"        else:\n",
"            raise ValueError(\"No datetime column or index found in DataFrame\")\n",
"\n",
"    # Sort the DataFrame by datetime\n",
"    df = df.sort_index()\n",
"\n",
"    # Handle missing values: time-weighted interpolation (the index is a\n",
"    # DatetimeIndex at this point), then zeros for whatever is still missing\n",
"    target_variables = ['solarradiation', 'solarenergy', 'uvindex']\n",
"    for column in dict.fromkeys(final_features + target_variables):\n",
"        if column in df.columns and df[column].isna().any():\n",
"            df[column] = df[column].interpolate(method='time')\n",
"\n",
"    df = df.fillna(0)\n",
"\n",
"    # Temporal split\n",
"    data_after_2010 = df[df['year'] >= 2010].copy()\n",
"    data_before_2010 = df[df['year'] < 2010].copy()\n",
"\n",
"    X = data_after_2010[final_features]\n",
"    y = data_after_2010['solarenergy']\n",
"    X_to_predict = data_before_2010[final_features]\n",
"\n",
"    # Train-test split; shuffle=False keeps the last 13% of rows as the test set\n",
"    X_train, X_test, y_train, y_test = train_test_split(\n",
"        X, y, test_size=0.13, random_state=random_state_value, shuffle=False\n",
"    )\n",
"\n",
"    # Scaling: both scalers are fitted on the training portion only\n",
"    scaler_X = RobustScaler()\n",
"    X_train_scaled = scaler_X.fit_transform(X_train)\n",
"    X_test_scaled = scaler_X.transform(X_test)\n",
"    X_to_predict_scaled = scaler_X.transform(X_to_predict)\n",
"\n",
"    scaler_y = RobustScaler()\n",
"    y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))\n",
"    y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))\n",
"\n",
"    # Print info about selected features\n",
"    print(\"\\nSelected features:\")\n",
"    print(f\"Number of features: {len(final_features)}\")\n",
"    print(\"Features list:\", final_features)\n",
"\n",
"    return X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled, scaler_X, scaler_y, final_features, X_to_predict_scaled\n",
"\n",
"\n",
"def create_sequence_data(X, sequence_length=24):\n",
"    \"\"\"\n",
"    Convert row-wise data into overlapping windows for LSTM input.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    X : array-like\n",
"        Data whose first axis is the row (time step) axis.\n",
"    sequence_length : int\n",
"        Number of consecutive rows per window (how many previous hours to consider).\n",
"\n",
"    Returns\n",
"    -------\n",
"    numpy.ndarray\n",
"        Array of shape ``(len(X) - sequence_length + 1, sequence_length, ...)``.\n",
"        When ``X`` has fewer rows than ``sequence_length`` the result has zero\n",
"        windows but keeps the trailing dimensions, i.e. ``(0, sequence_length, ...)``.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If ``sequence_length`` is smaller than 1.\n",
"    \"\"\"\n",
"    if sequence_length < 1:\n",
"        raise ValueError(\"sequence_length must be at least 1\")\n",
"\n",
"    X = np.asarray(X)\n",
"    n_windows = len(X) - sequence_length + 1\n",
"    if n_windows <= 0:\n",
"        # Not enough rows for a single window: keep the dimensionality anyway\n",
"        return np.empty((0, sequence_length) + X.shape[1:], dtype=X.dtype)\n",
"\n",
"    return np.stack([X[start:start + sequence_length] for start in range(n_windows)])\n",
"\n",
"\n",
"def prepare_hybrid_data(df):\n",
"    \"\"\"Prepare windowed (sequence) inputs and aligned targets for the hybrid model.\n",
"\n",
"    Runs ``prepare_advanced_data`` and turns each scaled feature matrix into\n",
"    overlapping 24-row windows. The targets drop their first 23 entries so that\n",
"    each window is paired with the target of its last row.\n",
"\n",
"    Returns ``(X_train_seq, X_test_seq, y_train, y_test, scaler_X, scaler_y,\n",
"    features, X_to_predict_seq)``.\n",
"    \"\"\"\n",
"    (train_matrix, test_matrix, y_train_scaled, y_test_scaled,\n",
"     scaler_X, scaler_y, features, predict_matrix) = prepare_advanced_data(df)\n",
"\n",
"    # 24 hours of historical data per window\n",
"    window = 24\n",
"\n",
"    X_train_seq = create_sequence_data(train_matrix, window)\n",
"    X_test_seq = create_sequence_data(test_matrix, window)\n",
"\n",
"    # The first complete window ends at row index window - 1\n",
"    first_target = window - 1\n",
"    y_train = y_train_scaled[first_target:]\n",
"    y_test = y_test_scaled[first_target:]\n",
"\n",
"    X_to_predict_seq = create_sequence_data(predict_matrix, window)\n",
"\n",
"    return X_train_seq, X_test_seq, y_train, y_test, scaler_X, scaler_y, features, X_to_predict_seq"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "570b18f2caa3e0db",
"metadata": {},
"outputs": [],
"source": [
"def create_solarenergy_model(input_shape, folder_name, l2_lambda=0.005, min_output=0, max_output=4.0):\n",
"    \"\"\"\n",
"    Build and compile the three-output hybrid network for solar-energy prediction.\n",
"\n",
"    The input features are sliced into groups by column index, each group goes\n",
"    through a residual Conv1D block, the groups are mixed with self-attention,\n",
"    the sequence is summarised by stacked bidirectional LSTMs, and three heads\n",
"    are produced:\n",
"\n",
"    * ``classification_output`` - sigmoid unit\n",
"    * ``regression_output``     - softmax-weighted blend of three regression\n",
"      pathways, shape ``(batch, 1)``\n",
"    * ``final_output``          - regression clipped to ``[min_output, max_output]``\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    input_shape : tuple\n",
"        Shape of one sample; the last entry is the number of features.\n",
"    folder_name : str\n",
"        Prefix of the PNG file the architecture diagram is written to.\n",
"    l2_lambda : float\n",
"        L2 regularisation factor for the regularised Dense/Conv1D kernels.\n",
"    min_output, max_output : float\n",
"        Clipping bounds of ``final_output`` (also applied to its targets in the loss).\n",
"\n",
"    Returns\n",
"    -------\n",
"    keras Model\n",
"        The compiled model.\n",
"    \"\"\"\n",
"    from tensorflow import keras\n",
"    from keras.models import Model\n",
"    from keras.layers import (\n",
"        Input, Dense, Conv1D, BatchNormalization, Dropout,\n",
"        MultiHeadAttention, LayerNormalization, Lambda,\n",
"        Concatenate, Activation, Bidirectional, LSTM, Add\n",
"    )\n",
"    from keras.regularizers import l2\n",
"    from keras.optimizers import AdamW\n",
"    import tensorflow as tf\n",
"    from tensorflow.keras.optimizers.schedules import CosineDecayRestarts\n",
"\n",
"    # Input layer\n",
"    inputs = Input(shape=input_shape)\n",
"\n",
"    # Feature groups definition (positions along the last input axis).\n",
"    # NOTE(review): these indices assume a specific feature ordering - verify\n",
"    # them against the feature list produced by the data-preparation step.\n",
"    feature_dims = {\n",
"        'solar': [6, 7, 8, 9, 16, 18, 19, 20, 21],\n",
"        'weather': [0, 1, 2, 3, 4, 5],\n",
"        'temporal': [10, 11, 12, 13, 14, 15],\n",
"        'derived': [22, 23, 24, 25, 26, 27, 28, 29, 30, 31],\n",
"        'rolling': [33, 34, 35, 36, 37, 38, 39],\n",
"        'lag': [40, 41, 42, 43, 44],\n",
"        'performance': [45, 46, 47, 48, 49, 50]\n",
"    }\n",
"\n",
"    # Feature extraction: indices beyond the available feature count are dropped\n",
"    feature_tensors = {}\n",
"    for name, indices in feature_dims.items():\n",
"        valid_indices = [i for i in indices if i < input_shape[-1]]\n",
"        if valid_indices:\n",
"            # idx is bound as a default argument so each Lambda keeps its own list\n",
"            feature_tensors[name] = Lambda(\n",
"                lambda x, idx=valid_indices: tf.gather(x, idx, axis=-1)\n",
"            )(inputs)\n",
"\n",
"    # Feature processing with residual connections\n",
"    def process_feature_group(tensor, units, name):\n",
"        x = Conv1D(units, kernel_size=3, padding='same', activation='swish',\n",
"                   kernel_regularizer=l2(l2_lambda))(tensor)\n",
"        x = BatchNormalization()(x)\n",
"        x = Dropout(0.2)(x)\n",
"\n",
"        # 1x1 convolution projects the input to `units` channels for the skip path\n",
"        residual = Conv1D(units, kernel_size=1, padding='same')(tensor)\n",
"        x = Add()([x, residual])\n",
"        x = LayerNormalization()(x)\n",
"\n",
"        return x\n",
"\n",
"    # Process each feature group\n",
"    processed_features = {}\n",
"    for name, tensor in feature_tensors.items():\n",
"        units = 64 if name == 'solar' else 32 if name == 'weather' else 16\n",
"        processed_features[name] = process_feature_group(tensor, units, name)\n",
"\n",
"    # Enhanced attention mechanism: self-attention followed by a feed-forward block\n",
"    def attention_block(x, num_heads=4):\n",
"        attention_output = MultiHeadAttention(\n",
"            num_heads=num_heads,\n",
"            key_dim=x.shape[-1] // num_heads\n",
"        )(x, x)\n",
"        x = LayerNormalization()(x + attention_output)\n",
"\n",
"        ffn = Dense(x.shape[-1] * 2, activation='swish')(x)\n",
"        ffn = Dropout(0.1)(ffn)\n",
"        ffn = Dense(x.shape[-1])(ffn)\n",
"\n",
"        return LayerNormalization()(x + ffn)\n",
"\n",
"    # Merge primary features with attention\n",
"    primary_features = [\n",
"        processed_features['solar'],\n",
"        processed_features['weather'],\n",
"        processed_features['performance']\n",
"    ]\n",
"    primary_context = Concatenate(axis=-1)(primary_features)\n",
"    primary_context = attention_block(primary_context)\n",
"\n",
"    # Merge secondary features\n",
"    secondary_features = [\n",
"        processed_features[name] for name in ['temporal', 'rolling', 'lag']\n",
"        if name in processed_features\n",
"    ]\n",
"    if secondary_features:\n",
"        secondary_context = Concatenate(axis=-1)(secondary_features)\n",
"        secondary_context = attention_block(secondary_context)\n",
"    else:\n",
"        secondary_context = primary_context\n",
"\n",
"    # Final feature merge\n",
"    combined = Concatenate(axis=-1)([\n",
"        primary_context,\n",
"        secondary_context,\n",
"        processed_features['derived']\n",
"    ])\n",
"\n",
"    # Sequential processing with residual LSTM\n",
"    def residual_lstm_block(x, units):\n",
"        lstm_out = Bidirectional(LSTM(units, return_sequences=True))(x)\n",
"        # The bidirectional output has 2 * units channels, so the skip path matches it\n",
"        residual = Conv1D(units * 2, kernel_size=1, padding='same')(x)\n",
"        x = Add()([lstm_out, residual])\n",
"        x = LayerNormalization()(x)\n",
"        return x\n",
"\n",
"    x = residual_lstm_block(combined, 128)\n",
"    x = residual_lstm_block(x, 64)\n",
"    x = Bidirectional(LSTM(64))(x)\n",
"    x = Dropout(0.2)(x)\n",
"\n",
"    # Classification branch\n",
"    class_x = Dense(128, activation='swish', kernel_regularizer=l2(l2_lambda))(x)\n",
"    class_x = BatchNormalization()(class_x)\n",
"    class_x = Dropout(0.2)(class_x)\n",
"    class_x = Dense(64, activation='swish', kernel_regularizer=l2(l2_lambda))(class_x)\n",
"    class_output = Dense(1, activation='sigmoid', name='classification_output')(class_x)\n",
"\n",
"    # Enhanced regression branch with multiple pathways\n",
"    def create_regression_pathway(x, name):\n",
"        x = Dense(128, activation='swish', kernel_regularizer=l2(l2_lambda))(x)\n",
"        x = BatchNormalization()(x)\n",
"        x = Dropout(0.2)(x)\n",
"\n",
"        residual = x\n",
"        x = Dense(128, activation='swish', kernel_regularizer=l2(l2_lambda))(x)\n",
"        x = BatchNormalization()(x)\n",
"        x = Dense(128, activation='swish', kernel_regularizer=l2(l2_lambda))(x)\n",
"        x = Add()([x, residual])\n",
"\n",
"        x = Dense(64, activation='swish', kernel_regularizer=l2(l2_lambda))(x)\n",
"        return Dense(1, name=f'{name}_output')(x)\n",
"\n",
"    # Create specialized regression pathways, each of shape (batch, 1)\n",
"    low_range = create_regression_pathway(x, 'low_range')\n",
"    mid_range = create_regression_pathway(x, 'mid_range')\n",
"    high_range = create_regression_pathway(x, 'high_range')\n",
"\n",
"    # Create feature representation for attention\n",
"    feature_vector = Dense(32, activation='swish')(x)\n",
"\n",
"    # Create attention mechanism\n",
"    attention_context = Dense(32, activation='swish')(feature_vector)\n",
"\n",
"    # Calculate attention weights using the context: (batch, 3), rows sum to 1\n",
"    attention_weights = Dense(3, activation='softmax')(attention_context)\n",
"\n",
"    # Put the three pathway predictions side by side as (batch, 3) so they line\n",
"    # up element-wise with the weights. Stacking them to (batch, 3, 1) and\n",
"    # multiplying by (batch, 3) does not broadcast for general batch sizes.\n",
"    range_predictions = Concatenate(axis=-1)([low_range, mid_range, high_range])\n",
"\n",
"    # Apply attention weights to combine predictions into one value per sample\n",
"    reg_output = Lambda(\n",
"        lambda inputs: tf.reduce_sum(inputs[0] * inputs[1], axis=-1, keepdims=True),\n",
"        name='regression_output'\n",
"    )([attention_weights, range_predictions])\n",
"\n",
"    # Final output with enhanced processing\n",
"    final_x = Dense(256, activation='swish', kernel_regularizer=l2(l2_lambda))(x)\n",
"    final_x = BatchNormalization()(final_x)\n",
"    final_x = Dropout(0.2)(final_x)\n",
"\n",
"    residual = final_x\n",
"    final_x = Dense(256, activation='swish', kernel_regularizer=l2(l2_lambda))(final_x)\n",
"    final_x = BatchNormalization()(final_x)\n",
"    final_x = Dense(256, activation='swish', kernel_regularizer=l2(l2_lambda))(final_x)\n",
"    final_x = Add()([final_x, residual])\n",
"\n",
"    final_x = Dense(128, activation='swish', kernel_regularizer=l2(l2_lambda))(final_x)\n",
"    final_x = Dense(1)(final_x)\n",
"    final_output = Lambda(\n",
"        lambda x: tf.clip_by_value(x, min_output, max_output),\n",
"        name='final_output'\n",
"    )(final_x)\n",
"\n",
"    # Build model\n",
"    model = Model(inputs=inputs, outputs=[class_output, reg_output, final_output])\n",
"\n",
"    # Enhanced loss functions\n",
"    def enhanced_regression_loss(y_true, y_pred):\n",
"        mae = tf.abs(y_true - y_pred)\n",
"        mse = tf.square(y_true - y_pred)\n",
"\n",
"        # Per-sample weight: 1.5 above 2.0, 1.2 in (1.0, 2.0], 1.0 otherwise\n",
"        value_ranges = (\n",
"            tf.cast(y_true > 2.0, tf.float32) * 1.5\n",
"            + tf.cast(tf.logical_and(y_true <= 2.0, y_true > 1.0), tf.float32) * 1.2\n",
"            + tf.cast(y_true <= 1.0, tf.float32)\n",
"        )\n",
"\n",
"        weighted_loss = (0.5 * mae + 0.5 * mse) * value_ranges\n",
"        return tf.reduce_mean(weighted_loss)\n",
"\n",
"    def final_loss(y_true, y_pred):\n",
"        # Targets are clipped to the same range as the clipped output\n",
"        y_true = tf.clip_by_value(y_true, min_output, max_output)\n",
"        mae = tf.reduce_mean(tf.abs(y_true - y_pred))\n",
"        mse = tf.reduce_mean(tf.square(y_true - y_pred))\n",
"        return 0.5 * mae + 0.5 * mse\n",
"\n",
"    # Learning rate schedule\n",
"    clr = CosineDecayRestarts(\n",
"        initial_learning_rate=2e-4,\n",
"        first_decay_steps=1000,\n",
"        t_mul=2.0,\n",
"        m_mul=0.9,\n",
"        alpha=1e-7\n",
"    )\n",
"\n",
"    # Optimizer\n",
"    optimizer = AdamW(\n",
"        learning_rate=clr,\n",
"        weight_decay=0.01,\n",
"        clipnorm=1.0\n",
"    )\n",
"\n",
"    # Compile model\n",
"    model.compile(\n",
"        optimizer=optimizer,\n",
"        loss={\n",
"            'classification_output': 'binary_crossentropy',\n",
"            'regression_output': enhanced_regression_loss,\n",
"            'final_output': final_loss\n",
"        },\n",
"        loss_weights={\n",
"            'classification_output': 0.2,\n",
"            'regression_output': 0.4,\n",
"            'final_output': 0.4\n",
"        }\n",
"    )\n",
"\n",
"    # Plot model architecture (best effort: a failure only prints a warning)\n",
"    try:\n",
"        plot_model(\n",
"            model,\n",
"            to_file=f'{folder_name}_model_architecture.png',\n",
"            show_shapes=True,\n",
"            show_layer_names=True,\n",
"            dpi=150,\n",
"            show_layer_activations=True\n",
"        )\n",
"    except Exception as e:\n",
"        print(f\"Warning: Could not plot model architecture: {e}\")\n",
"\n",
"    return model\n",
"\n",
"\n",
"def evaluate_solarenergy_predictions(y_true, y_pred, hour=None, folder_name=None):\n",
"    \"\"\"\n",
"    Evaluate solar energy predictions: print metrics, optionally save plots,\n",
"    and return the metrics as a dictionary.\n",
"\n",
"    Parameters:\n",
"    -----------\n",
"    y_true : array-like\n",
"        Actual solar energy values (kWh)\n",
"    y_pred : array-like\n",
"        Predicted solar energy values (kWh)\n",
"    hour : array-like, optional\n",
"        Hour of day for each prediction, for the per-period analysis.\n",
"        Lists are accepted; the values are converted to a NumPy array.\n",
"    folder_name : str, optional\n",
"        Prefix of the PNG file the analysis figure is saved to\n",
"\n",
"    Returns:\n",
"    --------\n",
"    dict\n",
"        Nested dictionary with the groups 'absolute', 'accuracy',\n",
"        'categorical' and 'error_stats'\n",
"    \"\"\"\n",
"\n",
"    # Data preparation\n",
"    y_true = np.array(y_true).ravel()\n",
"    y_pred = np.array(y_pred).ravel()\n",
"    errors = y_pred - y_true\n",
"\n",
"    # Basic metrics calculation\n",
"    mae_raw = mean_absolute_error(y_true, y_pred)\n",
"    rmse_raw = np.sqrt(mean_squared_error(y_true, y_pred))\n",
"    r2_raw = r2_score(y_true, y_pred)\n",
"\n",
"    # MAPE over values above 10 only.\n",
"    # NOTE(review): this threshold and the 5/10/20 margins below are large next\n",
"    # to the level bands used further down (0.5 to 8) - confirm the intended units.\n",
"    mape_mask = y_true > 10\n",
"    if np.any(mape_mask):\n",
"        mape = np.mean(np.abs((y_true[mape_mask] - y_pred[mape_mask]) / y_true[mape_mask])) * 100\n",
"    else:\n",
"        mape = np.nan\n",
"\n",
"    # Share of predictions whose absolute error is within a fixed margin\n",
"    abs_errors = np.abs(errors)\n",
"    within_5_percent = np.mean(abs_errors <= 5) * 100  # Within 5 kWh\n",
"    within_10_percent = np.mean(abs_errors <= 10) * 100  # Within 10 kWh\n",
"    within_20_percent = np.mean(abs_errors <= 20) * 100  # Within 20 kWh\n",
"\n",
"    # Energy level classification\n",
"    def get_energy_level(value):\n",
"        if value <= 0.5:\n",
"            return 'Very Low'\n",
"        elif value <= 2.0:\n",
"            return 'Low'\n",
"        elif value <= 4.0:\n",
"            return 'Moderate'\n",
"        elif value <= 6.0:\n",
"            return 'High'\n",
"        elif value <= 8.0:\n",
"            return 'Very High'\n",
"        else:\n",
"            return 'Extreme'\n",
"\n",
"    # Calculate energy levels\n",
"    y_true_levels = [get_energy_level(v) for v in y_true]\n",
"    y_pred_levels = [get_energy_level(v) for v in y_pred]\n",
"    level_accuracy = np.mean([t == p for t, p in zip(y_true_levels, y_pred_levels)])\n",
"\n",
"    # Keep the levels in ascending order of magnitude (not alphabetical) so the\n",
"    # confusion matrix reads from lowest to highest\n",
"    level_order = ['Very Low', 'Low', 'Moderate', 'High', 'Very High', 'Extreme']\n",
"    present_levels = set(y_true_levels) | set(y_pred_levels)\n",
"    unique_levels = [level for level in level_order if level in present_levels]\n",
"\n",
"    # Print main metrics\n",
"    print(\"\\nSolar Energy Prediction Metrics:\")\n",
"    print(\"\\nAbsolute Metrics:\")\n",
"    print(f\"MAE: {mae_raw:.2f} kWh\")\n",
"    print(f\"RMSE: {rmse_raw:.2f} kWh\")\n",
"    print(f\"R² Score: {r2_raw:.3f}\")\n",
"    print(f\"MAPE: {mape:.2f}%\" if not np.isnan(mape) else \"MAPE: N/A (insufficient data)\")\n",
"\n",
"    print(\"\\nAccuracy Metrics:\")\n",
"    print(f\"Within ±5 kWh: {within_5_percent:.1f}%\")\n",
"    print(f\"Within ±10 kWh: {within_10_percent:.1f}%\")\n",
"    print(f\"Within ±20 kWh: {within_20_percent:.1f}%\")\n",
"\n",
"    print(\"\\nLevel Accuracy:\")\n",
"    print(f\"Level Accuracy: {level_accuracy * 100:.1f}%\")\n",
"\n",
"    # Confusion matrix for energy levels\n",
"    cm = confusion_matrix(y_true_levels, y_pred_levels, labels=unique_levels)\n",
"    print(\"\\nConfusion Matrix for Energy Levels:\")\n",
"    cm_df = pd.DataFrame(\n",
"        cm,\n",
"        columns=unique_levels,\n",
"        index=unique_levels\n",
"    )\n",
"    print(cm_df)\n",
"\n",
"    # Time period analysis\n",
"    if hour is not None:\n",
"        # Element-wise comparisons below need an array; a plain list would raise\n",
"        hour = np.asarray(hour).ravel()\n",
"\n",
"        day_periods = {\n",
"            'Morning (5-11)': (5, 11),\n",
"            'Noon (11-13)': (11, 13),\n",
"            'Afternoon (13-17)': (13, 17),\n",
"            'Evening (17-21)': (17, 21),\n",
"            'Night (21-5)': (21, 5)\n",
"        }\n",
"\n",
"        print(\"\\nAnalysis by Time Period:\")\n",
"        for period, (start, end) in day_periods.items():\n",
"            if start < end:\n",
"                in_period = (hour >= start) & (hour < end)\n",
"            else:\n",
"                # Period wraps around midnight\n",
"                in_period = (hour >= start) | (hour < end)\n",
"\n",
"            if not np.any(in_period):\n",
"                continue\n",
"\n",
"            period_mae = mean_absolute_error(y_true[in_period], y_pred[in_period])\n",
"            print(f\"\\n{period}:\")\n",
"            print(f\"MAE: {period_mae:.2f} kWh\")\n",
"\n",
"            # Period MAPE uses the same value threshold as the overall MAPE\n",
"            period_mask = in_period & (y_true > 10)\n",
"            if np.any(period_mask):\n",
"                period_mape = np.mean(np.abs((y_true[period_mask] - y_pred[period_mask]) / y_true[period_mask])) * 100\n",
"                print(f\"MAPE: {period_mape:.2f}%\")\n",
"            else:\n",
"                print(\"MAPE: N/A (insufficient data)\")\n",
"\n",
"    # Visualizations\n",
"    if folder_name is not None:\n",
"        fig = None\n",
"        try:\n",
"            # Figure 1: Main analysis plots\n",
"            fig = plt.figure(figsize=(20, 15))\n",
"\n",
"            # Plot 1: Scatter plot of actual vs predicted values\n",
"            plt.subplot(3, 2, 1)\n",
"            plt.scatter(y_true, y_pred, alpha=0.5)\n",
"            plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)\n",
"            plt.xlabel('Actual Energy (kWh)')\n",
"            plt.ylabel('Predicted Energy (kWh)')\n",
"            plt.title('Actual vs Predicted Values')\n",
"            plt.grid(True)\n",
"\n",
"            # Plot 2: Absolute error distribution\n",
"            plt.subplot(3, 2, 2)\n",
"            plt.hist(errors, bins=50, alpha=0.7)\n",
"            plt.xlabel('Prediction Error (kWh)')\n",
"            plt.ylabel('Frequency')\n",
"            plt.title('Error Distribution')\n",
"            plt.grid(True)\n",
"\n",
"            # Plot 3: Percentage error distribution (only for values > 0.5 kWh)\n",
"            plt.subplot(3, 2, 3)\n",
"            pct_mask = y_true > 0.5\n",
"            if np.any(pct_mask):\n",
"                percentage_errors = ((y_pred[pct_mask] - y_true[pct_mask]) / y_true[pct_mask]) * 100\n",
"                plt.hist(np.clip(percentage_errors, -100, 100), bins=50, alpha=0.7)\n",
"            plt.xlabel('Percentage Error (%)')\n",
"            plt.ylabel('Frequency')\n",
"            plt.title('Percentage Error Distribution (for values > 0.5 kWh)')\n",
"            plt.grid(True)\n",
"\n",
"            # Plot 4: Errors vs actual values\n",
"            plt.subplot(3, 2, 4)\n",
"            plt.scatter(y_true, errors, alpha=0.5)\n",
"            plt.axhline(y=0, color='r', linestyle='--')\n",
"            plt.xlabel('Actual Energy (kWh)')\n",
"            plt.ylabel('Error (kWh)')\n",
"            plt.title('Errors vs Actual Values')\n",
"            plt.grid(True)\n",
"\n",
"            # Plot 5: Error boxplot by Energy level\n",
"            plt.subplot(3, 2, 5)\n",
"            sns.boxplot(x=y_true_levels, y=errors)\n",
"            plt.xticks(rotation=45)\n",
"            plt.xlabel('Energy Level')\n",
"            plt.ylabel('Error (kWh)')\n",
"            plt.title('Error Distribution by Level')\n",
"\n",
"            # Plot 6: Confusion matrix heatmap\n",
"            plt.subplot(3, 2, 6)\n",
"            sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')\n",
"            plt.title('Confusion Matrix')\n",
"            plt.xticks(rotation=45)\n",
"            plt.yticks(rotation=45)\n",
"\n",
"            plt.tight_layout()\n",
"            filename = f'{folder_name}_energy_analysis.png'\n",
"            plt.savefig(filename, dpi=300, bbox_inches='tight')\n",
"            print(f\"\\nPlot saved as: {filename}\")\n",
"\n",
"        except Exception as e:\n",
"            print(f\"\\nError saving plots: {str(e)}\")\n",
"        finally:\n",
"            # Release the figure even when plotting or saving failed\n",
"            if fig is not None:\n",
"                plt.close(fig)\n",
"\n",
"    # Additional error statistics\n",
"    print(\"\\nError Statistics:\")\n",
"    print(f\"Mean error: {np.mean(errors):.3f}\")\n",
"    print(f\"Error standard deviation: {np.std(errors):.3f}\")\n",
"    print(f\"Median error: {np.median(errors):.3f}\")\n",
"    print(f\"95th percentile absolute error: {np.percentile(np.abs(errors), 95):.3f}\")\n",
"\n",
"    # Return structured metrics (the 'within_*_wm2' key names are kept for callers)\n",
"    metrics = {\n",
"        'absolute': {\n",
"            'mae': mae_raw,\n",
"            'rmse': rmse_raw,\n",
"            'r2': r2_raw,\n",
"            'mape': float(mape) if not np.isnan(mape) else None\n",
"        },\n",
"        'accuracy': {\n",
"            'within_5_wm2': float(within_5_percent),\n",
"            'within_10_wm2': float(within_10_percent),\n",
"            'within_20_wm2': float(within_20_percent)\n",
"        },\n",
"        'categorical': {\n",
"            'level_accuracy': float(level_accuracy)\n",
"        },\n",
"        'error_stats': {\n",
"            'mean': float(np.mean(errors)),\n",
"            'std': float(np.std(errors)),\n",
"            'median': float(np.median(errors)),\n",
"            'p95_abs': float(np.percentile(np.abs(errors), 95))\n",
"        }\n",
"    }\n",
"\n",
"    return metrics\n",
"\n",
"\n",
"def plot_training_history(history, folder_name=None):\n",
"    \"\"\"\n",
"    Plot the training curves of the hybrid model and optionally save them.\n",
"\n",
"    Draws a 2x2 figure (per-output losses, classification accuracy/AUC,\n",
"    regression MAE, final-output MAE), each training curve next to its\n",
"    ``val_`` counterpart, and displays it with ``plt.show()``.\n",
"\n",
"    Parameters:\n",
"    -----------\n",
"    history : object with a ``history`` attribute\n",
"        ``history.history`` must map metric names to per-epoch value lists.\n",
"    folder_name : str, optional\n",
"        File prefix. When given, the figure is written to\n",
"        ``{folder_name}_training_history.png`` and the curves to\n",
"        ``{folder_name}_training_history.json``.\n",
"\n",
"    Returns:\n",
"    --------\n",
"    None\n",
"    \"\"\"\n",
"    curves = history.history\n",
"\n",
"    # (title, y-label, [(history key, legend label), ...]) for each panel,\n",
"    # listed in subplot order.\n",
"    panels = [\n",
"        ('Model Losses', 'Loss', [\n",
"            ('classification_output_loss', 'Class Loss'),\n",
"            ('regression_output_loss', 'Reg Loss'),\n",
"            ('final_output_loss', 'Final Loss'),\n",
"            ('val_classification_output_loss', 'Val Class Loss'),\n",
"            ('val_regression_output_loss', 'Val Reg Loss'),\n",
"            ('val_final_output_loss', 'Val Final Loss'),\n",
"        ]),\n",
"        ('Classification Metrics', 'Metric Value', [\n",
"            ('classification_output_accuracy', 'Class Acc'),\n",
"            ('val_classification_output_accuracy', 'Val Class Acc'),\n",
"            ('classification_output_auc', 'Class AUC'),\n",
"            ('val_classification_output_auc', 'Val Class AUC'),\n",
"        ]),\n",
"        ('Regression MAE', 'MAE', [\n",
"            ('regression_output_mae', 'Reg MAE'),\n",
"            ('val_regression_output_mae', 'Val Reg MAE'),\n",
"        ]),\n",
"        ('Final Output MAE', 'MAE', [\n",
"            ('final_output_mae', 'Final MAE'),\n",
"            ('val_final_output_mae', 'Val Final MAE'),\n",
"        ]),\n",
"    ]\n",
"\n",
"    plt.figure(figsize=(15, 10))\n",
"    for position, (title, ylabel, series) in enumerate(panels, start=1):\n",
"        plt.subplot(2, 2, position)\n",
"        for key, label in series:\n",
"            if key in curves:\n",
"                plt.plot(curves[key], label=label)\n",
"            else:\n",
"                # Report and skip a curve that was not logged rather than\n",
"                # losing the whole figure to a KeyError.\n",
"                print(f\"Warning: '{key}' not found in training history, skipping\")\n",
"        plt.title(title)\n",
"        plt.xlabel('Epoch')\n",
"        plt.ylabel(ylabel)\n",
"        plt.legend()\n",
"        plt.grid(True)\n",
"\n",
"    plt.tight_layout()\n",
"\n",
"    if folder_name is not None:\n",
"        filename = f'{folder_name}_training_history.png'\n",
"        plt.savefig(filename, dpi=300, bbox_inches='tight')\n",
"        print(f\"\\nTraining history plot saved as: {filename}\")\n",
"\n",
"        # Save history to JSON. Values are cast to built-in floats because\n",
"        # json cannot encode NumPy scalar types.\n",
"        history_dict = {\n",
"            key: [float(value) for value in values]\n",
"            for key, values in curves.items()\n",
"        }\n",
"        json_filename = f'{folder_name}_training_history.json'\n",
"        with open(json_filename, 'w') as f:\n",
"            json.dump(history_dict, f)\n",
"        print(f\"Training history saved as: {json_filename}\")\n",
"\n",
"    plt.show()\n",
"\n",
"def calculate_metrics(y_true, y_class, y_reg, y_final, min_output, max_output):\n",
"    \"\"\"\n",
"    Prints and returns comprehensive metrics for the solar energy prediction model.\n",
"\n",
"    Parameters:\n",
"    -----------\n",
"    y_true : array-like\n",
"        Ground truth values\n",
"    y_class : array-like\n",
"        Classification predictions (probability of non-zero values)\n",
"    y_reg : array-like\n",
"        Regression predictions (unrestricted values)\n",
"    y_final : array-like\n",
"        Final clipped predictions\n",
"    min_output : float\n",
"        Minimum allowed output value\n",
"    max_output : float\n",
"        Maximum allowed output value\n",
"\n",
"    Returns:\n",
"    --------\n",
"    dict\n",
"        ``{'classification': {...}, 'regression': {...}, 'final': {...}}``.\n",
"        A section is left empty when it could not be computed (the error is\n",
"        printed) or, for 'regression', when ``y_true`` has no positive value.\n",
"\n",
"    Raises:\n",
"    -------\n",
"    AssertionError\n",
"        If the flattened inputs do not all have the same length.\n",
"    \"\"\"\n",
"    from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix\n",
"\n",
"    # Ensure proper array formatting and dimensionality\n",
"    y_true = np.array(y_true).flatten()\n",
"    y_class = np.array(y_class).flatten()\n",
"    y_reg = np.array(y_reg).flatten()\n",
"    y_final = np.array(y_final).flatten()\n",
"\n",
"    # Validate input dimensions\n",
"    assert len(y_true) == len(y_class) == len(y_reg) == len(y_final), \\\n",
"        \"All input arrays must have the same length\"\n",
"\n",
"    def relative_error(actual, predicted):\n",
"        # |actual - predicted| / actual; epsilon avoids dividing by a zero\n",
"        # target and the result is capped at 1 (i.e. 100%).\n",
"        epsilon = 1e-7\n",
"        return np.clip(np.abs((actual - predicted) / (actual + epsilon)), 0, 1)\n",
"\n",
"    metrics = {'classification': {}, 'regression': {}, 'final': {}}\n",
"\n",
"    # Classification metrics with error handling\n",
"    print(\"\\nClassification Metrics:\")\n",
"    try:\n",
"        y_true_binary = (y_true > 0).astype(int)\n",
"        y_pred_binary = (y_class > 0.5).astype(int)\n",
"\n",
"        accuracy = np.mean(y_pred_binary == y_true_binary) * 100\n",
"        print(f\"Accuracy: {accuracy:.2f}%\")\n",
"        metrics['classification']['accuracy'] = float(accuracy)\n",
"\n",
"        # roc_auc_score raises when only one class is present; handle that\n",
"        # case here so the confusion matrix and report are still produced.\n",
"        if np.unique(y_true_binary).size > 1:\n",
"            auc_roc = roc_auc_score(y_true_binary, y_class)\n",
"            print(f\"AUC-ROC: {auc_roc:.4f}\")\n",
"            metrics['classification']['auc_roc'] = float(auc_roc)\n",
"        else:\n",
"            print(\"AUC-ROC: undefined (only one class present in y_true)\")\n",
"\n",
"        print(\"\\nConfusion Matrix:\")\n",
"        # Explicit labels keep the matrix 2x2 even if a class is absent.\n",
"        conf_matrix = confusion_matrix(y_true_binary, y_pred_binary, labels=[0, 1])\n",
"        print(conf_matrix)\n",
"        metrics['classification']['confusion_matrix'] = conf_matrix.tolist()\n",
"\n",
"        print(\"\\nClassification Report:\")\n",
"        class_report = classification_report(\n",
"            y_true_binary,\n",
"            y_pred_binary,\n",
"            labels=[0, 1],\n",
"            target_names=['Zero', 'Non-Zero'],\n",
"            digits=4,\n",
"            zero_division=0\n",
"        )\n",
"        print(class_report)\n",
"    except Exception as e:\n",
"        print(f\"Error in classification metrics calculation: {str(e)}\")\n",
"\n",
"    # Regression metrics with error handling\n",
"    print(\"\\nRegression Metrics (non-zero values):\")\n",
"    mask_nonzero = y_true > 0\n",
"    if np.any(mask_nonzero):\n",
"        try:\n",
"            y_true_nonzero = y_true[mask_nonzero]\n",
"            y_reg_nonzero = y_reg[mask_nonzero]\n",
"\n",
"            # Range validation\n",
"            out_of_range = np.sum(\n",
"                (y_reg_nonzero < min_output) |\n",
"                (y_reg_nonzero > max_output)\n",
"            )\n",
"\n",
"            diff = relative_error(y_true_nonzero, y_reg_nonzero)\n",
"\n",
"            # Calculate metrics\n",
"            mape = np.mean(diff) * 100\n",
"            within_10_percent = np.mean(diff <= 0.10) * 100\n",
"            mae = np.mean(np.abs(y_true_nonzero - y_reg_nonzero))\n",
"            rmse = np.sqrt(np.mean(np.square(y_true_nonzero - y_reg_nonzero)))\n",
"\n",
"            print(f\"Out of range: {out_of_range} predictions\")\n",
"            print(f\"MAPE: {mape:.2f}%\")\n",
"            print(f\"Within ±10%: {within_10_percent:.2f}%\")\n",
"            print(f\"MAE: {mae:.2f}\")\n",
"            print(f\"RMSE: {rmse:.2f}\")\n",
"\n",
"            metrics['regression'] = {\n",
"                'out_of_range': int(out_of_range),\n",
"                'mape': float(mape),\n",
"                'within_10_percent': float(within_10_percent),\n",
"                'mae': float(mae),\n",
"                'rmse': float(rmse)\n",
"            }\n",
"        except Exception as e:\n",
"            print(f\"Error in regression metrics calculation: {str(e)}\")\n",
"    else:\n",
"        print(\"No non-zero values in this batch\")\n",
"\n",
"    # Final output metrics with error handling\n",
"    print(\"\\nFinal Combined Output Metrics:\")\n",
"    try:\n",
"        # Count outputs that fall outside the allowed bounds\n",
"        out_of_range = np.sum((y_final < min_output) | (y_final > max_output))\n",
"\n",
"        diff = relative_error(y_true, y_final)\n",
"\n",
"        mape = np.mean(diff) * 100\n",
"        within_2_percent = np.mean(diff <= 0.02) * 100\n",
"        within_5_percent = np.mean(diff <= 0.05) * 100\n",
"        within_10_percent = np.mean(diff <= 0.10) * 100\n",
"        within_20_percent = np.mean(diff <= 0.20) * 100\n",
"        mae = np.mean(np.abs(y_true - y_final))\n",
"        rmse = np.sqrt(np.mean(np.square(y_true - y_final)))\n",
"\n",
"        print(f\"Out of range: {out_of_range} predictions\")\n",
"        print(f\"MAPE: {mape:.2f}%\")\n",
"        print(f\"Within ±2%: {within_2_percent:.2f}%\")\n",
"        print(f\"Within ±5%: {within_5_percent:.2f}%\")\n",
"        print(f\"Within ±10%: {within_10_percent:.2f}%\")\n",
"        print(f\"Within ±20%: {within_20_percent:.2f}%\")\n",
"        print(f\"MAE: {mae:.2f}\")\n",
"        print(f\"RMSE: {rmse:.2f}\")\n",
"\n",
"        metrics['final'] = {\n",
"            'out_of_range': int(out_of_range),\n",
"            'mape': float(mape),\n",
"            'within_2_percent': float(within_2_percent),\n",
"            'within_5_percent': float(within_5_percent),\n",
"            'within_10_percent': float(within_10_percent),\n",
"            'within_20_percent': float(within_20_percent),\n",
"            'mae': float(mae),\n",
"            'rmse': float(rmse)\n",
"        }\n",
"    except Exception as e:\n",
"        print(f\"Error in final output metrics calculation: {str(e)}\")\n",
"\n",
"    return metrics\n",
"\n",
"def train_hybrid_model(model, X_train, y_train, X_test, y_test, epochs=100, batch_size=32, folder_name='solarenergy', min_output=0, max_output=1):\n",
"    \"\"\"\n",
"    Fit the three-output hybrid solar energy model and report test metrics.\n",
"\n",
"    The targets are keyed by output name ('classification_output',\n",
"    'regression_output', 'final_output'); the classification target is the\n",
"    indicator ``y > 0``. The test split doubles as validation data.\n",
"    Detailed metrics are printed every 10th epoch and once after training.\n",
"\n",
"    Side effects: best weights are checkpointed to\n",
"    ``{folder_name}_best_model.h5``, TensorBoard logs go to\n",
"    ``./{folder_name}_logs`` and the Keras session is cleared on exit.\n",
"\n",
"    Returns the object returned by ``model.fit``. Any exception raised\n",
"    during training is reported and re-raised.\n",
"    \"\"\"\n",
"    def build_targets(y):\n",
"        # One entry per model output; the keys must match the output names.\n",
"        return {\n",
"            'classification_output': (y > 0).astype(float),\n",
"            'regression_output': y,\n",
"            'final_output': y\n",
"        }\n",
"\n",
"    train_targets = build_targets(y_train)\n",
"    test_targets = build_targets(y_test)\n",
"\n",
"    def report_test_metrics():\n",
"        # The model's outputs are unpacked positionally into calculate_metrics.\n",
"        outputs = model.predict(X_test, verbose=0)\n",
"        calculate_metrics(y_test, *outputs, min_output, max_output)\n",
"\n",
"    def evaluate_epoch(epoch, logs):\n",
"        if epoch % 10 != 0:\n",
"            return\n",
"        print(f\"\\nEpoch {epoch + 1} Detailed Metrics:\")\n",
"        report_test_metrics()\n",
"\n",
"    monitored = 'val_final_output_loss'\n",
"\n",
"    early_stopping = tf.keras.callbacks.EarlyStopping(\n",
"        monitor=monitored,\n",
"        patience=35,\n",
"        restore_best_weights=True,\n",
"        mode='min',\n",
"        verbose=1,\n",
"        min_delta=1e-5\n",
"    )\n",
"    checkpoint = tf.keras.callbacks.ModelCheckpoint(\n",
"        filepath=f'{folder_name}_best_model.h5',\n",
"        monitor=monitored,\n",
"        save_best_only=True,\n",
"        mode='min',\n",
"        save_weights_only=True  # set to True to avoid serialization problems\n",
"    )\n",
"    tensorboard = tf.keras.callbacks.TensorBoard(\n",
"        log_dir=f'./{folder_name}_logs',\n",
"        histogram_freq=1,\n",
"        write_graph=True,\n",
"        update_freq='epoch'\n",
"    )\n",
"    # A ReduceLROnPlateau callback (monitor='val_final_output_loss',\n",
"    # factor=0.8, patience=10, min_delta=1e-4, cooldown=2, min_lr=1e-7)\n",
"    # is intentionally left disabled.\n",
"    callbacks = [\n",
"        early_stopping,\n",
"        checkpoint,\n",
"        tensorboard,\n",
"        tf.keras.callbacks.LambdaCallback(on_epoch_end=evaluate_epoch),\n",
"        tf.keras.callbacks.TerminateOnNaN()\n",
"    ]\n",
"\n",
"    try:\n",
"        history = model.fit(\n",
"            X_train,\n",
"            train_targets,\n",
"            validation_data=(X_test, test_targets),\n",
"            epochs=epochs,\n",
"            batch_size=batch_size,\n",
"            callbacks=callbacks,\n",
"            verbose=1,\n",
"            shuffle=False\n",
"        )\n",
"\n",
"        print(\"\\nTraining completed successfully!\")\n",
"\n",
"        # Final evaluation on the test split\n",
"        report_test_metrics()\n",
"\n",
"        return history\n",
"\n",
"    except Exception as e:\n",
"        print(f\"\\nError during training: {str(e)}\")\n",
"        print(\"\\nModel output names:\", [output.name for output in model.outputs])\n",
"        print(\"Training targets keys:\", train_targets.keys())\n",
"        raise\n",
"\n",
"    finally:\n",
"        tf.keras.backend.clear_session()\n",
"\n",
"\n",
"def integrate_predictions(df, predictions, sequence_length=24):\n",
"    \"\"\"\n",
"    Integrates solar energy predictions into the original dataset for pre-2010 data.\n",
"\n",
"    Predictions are matched to the pre-2010 rows starting at position\n",
"    ``sequence_length - 1`` and are only used where 'solarenergy' is missing.\n",
"    The input DataFrame is not modified; a new one is returned.\n",
"\n",
"    Parameters:\n",
"    -----------\n",
"    df : pandas.DataFrame\n",
"        Original dataset; needs 'datetime' and 'solarenergy' columns\n",
"    predictions : tuple\n",
"        Tuple containing (classification_pred, regression_pred, final_pred)\n",
"        - classification_pred: probability of non-zero values\n",
"        - regression_pred: predicted values (used for non-zero cases)\n",
"        - final_pred: final combined predictions\n",
"    sequence_length : int\n",
"        Sequence length used for predictions\n",
"\n",
"    Returns:\n",
"    --------\n",
"    pandas.DataFrame\n",
"        Copy of the dataset with missing 'solarenergy' values filled from the\n",
"        predictions; the intermediate prediction columns are dropped.\n",
"\n",
"    Raises:\n",
"    -------\n",
"    ValueError\n",
"        If the number of predictions differs from the number of target rows.\n",
"    \"\"\"\n",
"    # assign() returns a new frame, so the caller's DataFrame keeps its\n",
"    # original 'datetime' column.\n",
"    df = df.assign(datetime=pd.to_datetime(df['datetime']))\n",
"\n",
"    # Identify pre-2010 rows\n",
"    mask_pre_2010 = df['datetime'].dt.year < 2010\n",
"\n",
"    # Unpack predictions as flat arrays\n",
"    classification_pred, regression_pred, final_pred = (\n",
"        np.asarray(pred).flatten() for pred in predictions\n",
"    )\n",
"\n",
"    # The first sequence_length - 1 pre-2010 rows have no prediction\n",
"    dates_pre_2010 = df.loc[mask_pre_2010, 'datetime'].iloc[sequence_length - 1:]\n",
"    if not (len(dates_pre_2010) == len(final_pred) == len(classification_pred) == len(regression_pred)):\n",
"        raise ValueError(\n",
"            f\"Got {len(final_pred)} predictions for {len(dates_pre_2010)} pre-2010 rows \"\n",
"            f\"(sequence_length={sequence_length})\"\n",
"        )\n",
"\n",
"    # Create temporary DataFrame with all predictions\n",
"    predictions_df = pd.DataFrame({\n",
"        'datetime': dates_pre_2010,\n",
"        'solarenergy_predicted': final_pred,\n",
"        'solarenergy_classification': classification_pred,\n",
"        'solarenergy_regression': regression_pred\n",
"    })\n",
"\n",
"    # Merge with original dataset\n",
"    df = df.merge(predictions_df, on='datetime', how='left')\n",
"\n",
"    # Record which rows get filled BEFORE filling: comparing the columns\n",
"    # afterwards would also count measured values that merely equal the\n",
"    # prediction (e.g. 0 == 0).\n",
"    mask_filled = df['solarenergy'].isna() & df['solarenergy_predicted'].notna()\n",
"\n",
"    # Update solar energy column where missing\n",
"    df['solarenergy'] = df['solarenergy'].fillna(df['solarenergy_predicted'])\n",
"\n",
"    # Print detailed statistics\n",
"    print(\"\\nPrediction Integration Statistics:\")\n",
"    print(f\"Added {len(final_pred)} predictions to dataset\")\n",
"    print(f\"Missing values filled with predictions: {mask_filled.sum()}\")\n",
"    print(f\"Rows with solar energy after integration: {df['solarenergy'].notna().sum()}\")\n",
"\n",
"    # Analyze prediction components for the filled values\n",
"    if mask_filled.any():\n",
"        filled_data = df[mask_filled]\n",
"\n",
"        print(\"\\nFilled Values Analysis:\")\n",
"        print(f\"Zero predictions (classification < 0.5): {(filled_data['solarenergy_classification'] < 0.5).sum()}\")\n",
"        print(f\"Non-zero predictions (classification >= 0.5): {(filled_data['solarenergy_classification'] >= 0.5).sum()}\")\n",
"\n",
"        # Distribution of predicted values\n",
"        non_zero_pred = filled_data[filled_data['solarenergy_predicted'] > 0]\n",
"        if len(non_zero_pred) > 0:\n",
"            print(\"\\nNon-zero predictions statistics:\")\n",
"            print(f\"Mean: {non_zero_pred['solarenergy_predicted'].mean():.2f}\")\n",
"            print(f\"Median: {non_zero_pred['solarenergy_predicted'].median():.2f}\")\n",
"            print(f\"Std: {non_zero_pred['solarenergy_predicted'].std():.2f}\")\n",
"\n",
"    # Remove the intermediate prediction columns from the result\n",
"    columns_to_drop = ['solarenergy_predicted', 'solarenergy_classification',\n",
"                       'solarenergy_regression']\n",
"    df = df.drop(columns=columns_to_drop)\n",
"\n",
"    return df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "b3b0c2e65ddf484",
"metadata": {},
"outputs": [],
"source": [
"def analyze_distribution(data, solar_column='solarenergy', name = 'Solar Energy'):\n",
"    \"\"\"\n",
"    Analyse in detail the distribution of a numeric column (solarenergy by default).\n",
"\n",
"    Shows histogram, box plot, Q-Q plot and log-transformed histogram panels,\n",
"    prints summary statistics and normalisation hints and, when ``data`` has\n",
"    a 'timestamp' or 'datetime' column, also shows the time series and the\n",
"    monthly mean. ``data`` itself is not modified.\n",
"\n",
"    Parameters:\n",
"    -----------\n",
"    data : pandas.DataFrame\n",
"        DataFrame containing the column to analyse\n",
"    solar_column : str, default='solarenergy'\n",
"        Name of the column to analyse\n",
"    name : str, default='Solar Energy'\n",
"        Label used for the column in plot titles and printed output\n",
"\n",
"    Returns:\n",
"    --------\n",
"    dict\n",
"        Dictionary containing the main statistics\n",
"    \"\"\"\n",
"    values = data[solar_column]\n",
"    observed = values.dropna()\n",
"\n",
"    # 1. Basic statistics\n",
"    stats_dict = {\n",
"        'count': len(values),\n",
"        'missing': values.isnull().sum(),\n",
"        'zeros': (values == 0).sum(),\n",
"        'mean': values.mean(),\n",
"        'median': values.median(),\n",
"        'std': values.std(),\n",
"        'min': values.min(),\n",
"        'max': values.max(),\n",
"        'skewness': stats.skew(observed),\n",
"        'kurtosis': stats.kurtosis(observed)\n",
"    }\n",
"\n",
"    # Percentiles\n",
"    for p in [1, 5, 10, 25, 50, 75, 90, 95, 99]:\n",
"        stats_dict[f'percentile_{p}'] = np.percentile(observed, p)\n",
"\n",
"    # 2. Visualisations: one figure with four panels\n",
"    plt.figure(figsize=(20, 12))\n",
"\n",
"    # 2.1 Distribution\n",
"    plt.subplot(2, 2, 1)\n",
"    sns.histplot(data=data, x=solar_column, kde=True)\n",
"    plt.title(f'Distribuzione di {name}')\n",
"    plt.xlabel(f'{name}')\n",
"    plt.ylabel('Frequenza')\n",
"\n",
"    # 2.2 Box plot\n",
"    plt.subplot(2, 2, 2)\n",
"    sns.boxplot(y=values)\n",
"    plt.title(f'Box Plot di {name}')\n",
"\n",
"    # 2.3 Q-Q plot against the normal distribution\n",
"    plt.subplot(2, 2, 3)\n",
"    stats.probplot(observed, dist=\"norm\", plot=plt)\n",
"    plt.title(f'Q-Q Plot di {name}')\n",
"\n",
"    # 2.4 Log-transformed distribution\n",
"    plt.subplot(2, 2, 4)\n",
"    sns.histplot(data=np.log1p(values), kde=True)\n",
"    plt.title(f'Distribuzione Log-trasformata di {name}')\n",
"    plt.xlabel(f'Log({name} + 1)')\n",
"    plt.ylabel('Frequenza')\n",
"\n",
"    plt.tight_layout()\n",
"    plt.show()\n",
"\n",
"    # 3. Temporal analysis, if a time column is available\n",
"    time_col = next((col for col in ('timestamp', 'datetime') if col in data.columns), None)\n",
"    if time_col is not None:\n",
"        raw_time = data[time_col]\n",
"        # Numeric values are read as epoch seconds. The dtype check also\n",
"        # covers NumPy integers, which are not instances of Python int.\n",
"        is_epoch = pd.api.types.is_numeric_dtype(raw_time) or (\n",
"            len(raw_time) > 0 and isinstance(raw_time.iloc[0], (int, float))\n",
"        )\n",
"        # Kept in a local Series so no helper column is written into `data`\n",
"        if is_epoch:\n",
"            timestamps = pd.to_datetime(raw_time, unit='s')\n",
"        else:\n",
"            timestamps = pd.to_datetime(raw_time)\n",
"\n",
"        # Time-series plot\n",
"        plt.figure(figsize=(15, 6))\n",
"        plt.plot(timestamps, values)\n",
"        plt.title(f'Serie Temporale di {name}')\n",
"        plt.xlabel('Data')\n",
"        plt.ylabel(f'{name}')\n",
"        plt.xticks(rotation=45)\n",
"        plt.tight_layout()\n",
"        plt.show()\n",
"\n",
"        # Seasonal analysis: mean per calendar month\n",
"        monthly_mean = values.groupby(timestamps.dt.month.to_numpy()).mean()\n",
"\n",
"        plt.figure(figsize=(12, 6))\n",
"        monthly_mean.plot(kind='bar')\n",
"        plt.title(f'Media Mensile di {name}')\n",
"        plt.xlabel('Mese')\n",
"        plt.ylabel(f'{name} Media')\n",
"        plt.tight_layout()\n",
"        plt.show()\n",
"\n",
"    # 4. Print the main statistics\n",
"    print(f\"\\nStatistiche principali di {name}:\")\n",
"    print(\"-\" * 50)\n",
"    for key, value in stats_dict.items():\n",
"        print(f\"{key:15}: {value:,.4f}\")\n",
"\n",
"    # 5. Normalisation suggestions\n",
"    print(\"\\nSuggerimenti per la normalizzazione:\")\n",
"    print(\"-\" * 50)\n",
"\n",
"    skewness = abs(stats_dict['skewness'])\n",
"    if skewness > 1:\n",
"        print(\"- La distribuzione è fortemente asimmetrica (skewness > 1)\")\n",
"        print(\"- Considerare una trasformazione logaritmica: np.log1p(x)\")\n",
"\n",
"    # Skip the ratio when std is zero or NaN (constant or empty column)\n",
"    if stats_dict['std'] > 0:\n",
"        range_ratio = stats_dict['max'] / stats_dict['std']\n",
"        if range_ratio > 10:\n",
"            print(\"- La variabile ha una scala molto ampia\")\n",
"            print(\"- Considerare RobustScaler o StandardScaler per la normalizzazione\")\n",
"\n",
"    # Skip the ratio for an empty column\n",
"    if stats_dict['count'] > 0:\n",
"        zero_ratio = stats_dict['zeros'] / stats_dict['count']\n",
"        if zero_ratio > 0.1:\n",
"            print(f\"- Alta presenza di zeri ({zero_ratio:.2%})\")\n",
"            print(\"- Considerare un modello in due parti: classificazione degli zeri + regressione sui valori non-zero\")\n",
"\n",
"    return stats_dict"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1b1ee91d1573ec66",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initializing solar energy model training...\n",
"\n",
"1. Preparing data...\n",
"\n",
"Selected features:\n",
"Number of features: 66\n",
"Features list: ['uvindex', 'cloudcover', 'visibility', 'temp', 'pressure', 'humidity', 'solarradiation', 'solar_elevation', 'solar_angle', 'day_length', 'hour_sin', 'hour_cos', 'day_of_year_sin', 'day_of_year_cos', 'month_sin', 'month_cos', 'solar_noon', 'daylight_correction', 'clear_sky_index', 'atmospheric_attenuation', 'theoretical_radiation', 'expected_radiation', 'cloud_elevation', 'visibility_elevation', 'uv_cloud_interaction', 'temp_radiation_potential', 'air_mass_index', 'atmospheric_stability', 'vapor_pressure_deficit', 'diffusion_index', 'atmospheric_transmittance', 'temp_humidity_interaction', 'clear_sky_factor', 'cloud_rolling_12h', 'temp_rolling_12h', 'uv_rolling_12h', 'cloudcover_rolling_mean_6h', 'temp_rolling_mean_6h', 'energy_rolling_mean_6h', 'uv_rolling_mean_6h', 'energy_volatility', 'uv_volatility', 'temp_1h_lag', 'cloudcover_1h_lag', 'humidity_1h_lag', 'energy_lag_1h', 'uv_lag_1h', 'temp_losses', 'soiling_loss_factor', 'estimated_efficiency', 'production_potential', 'system_performance_ratio', 'conversion_efficiency_ratio', 'clear_sky_duration', 'weather_variability_index', 'temp_stability', 'humidity_stability', 'cloudcover_stability', 'season_Spring', 'season_Summer', 'season_Autumn', 'season_Winter', 'time_period_Morning', 'time_period_Afternoon', 'time_period_Evening', 'time_period_Night']\n",
"Training data shape: (112882, 24, 66)\n",
"Test data shape: (16849, 24, 66)\n",
"Saving scaler X to: 2024-11-27_13-56_scale_X.joblib\n",
"Saving scaler X to: 2024-11-27_13-56_scale_y.joblib\n",
"Saving features to: 2024-11-27_13-56_features.json\n"
]
}
],
"source": [
"df = pd.read_parquet('../../sources/weather_data_solarradiation.parquet')\n",
"\n",
"print(\"Initializing solar energy model training...\")\n",
"\n",
"# Data preparation\n",
"print(\"\\n1. Preparing data...\")\n",
"X_train_seq, X_test_seq, y_train, y_test, scaler_X, scaler_y, features, X_to_predict_seq = prepare_hybrid_data(df)\n",
"\n",
"print(f\"Training data shape: {X_train_seq.shape}\")\n",
"print(f\"Test data shape: {X_test_seq.shape}\")\n",
"\n",
"# Paths of the artifacts belonging to this run (all prefixed by folder_name)\n",
"scaler_X_path = f'{folder_name}_scale_X.joblib'\n",
"scaler_y_path = f'{folder_name}_scale_y.joblib'\n",
"features_path = f'{folder_name}_features.json'\n",
"model_path = f'{folder_name}_best_model.h5'\n",
"history_path = f'{folder_name}_training_history.json'\n",
"\n",
"# Save or load scalers and features: an artifact already on disk replaces the\n",
"# freshly computed object, otherwise the fresh object is persisted.\n",
"# NOTE(review): a loaded scaler replaces the one returned by\n",
"# prepare_hybrid_data above; that is only consistent if both were fitted on\n",
"# the same data - confirm before reusing a folder_name.\n",
"if os.path.exists(scaler_X_path):\n",
"    print(f\"Loading existing scaler X from: {scaler_X_path}\")\n",
"    scaler_X = joblib.load(scaler_X_path)\n",
"else:\n",
"    print(f\"Saving scaler X to: {scaler_X_path}\")\n",
"    joblib.dump(scaler_X, scaler_X_path)\n",
"\n",
"if os.path.exists(scaler_y_path):\n",
"    print(f\"Loading existing scaler y from: {scaler_y_path}\")\n",
"    scaler_y = joblib.load(scaler_y_path)\n",
"else:\n",
"    print(f\"Saving scaler y to: {scaler_y_path}\")\n",
"    joblib.dump(scaler_y, scaler_y_path)\n",
"\n",
"if os.path.exists(features_path):\n",
"    print(f\"Loading existing features from: {features_path}\")\n",
"    with open(features_path, 'r') as f:\n",
"        features = json.load(f)\n",
"else:\n",
"    print(f\"Saving features to: {features_path}\")\n",
"    with open(features_path, 'w') as f:\n",
"        json.dump(features, f)\n",
"\n",
"# Data quality verification\n",
"if np.isnan(X_train_seq).any() or np.isnan(y_train).any():\n",
"    raise ValueError(\"Found NaN values in training data\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "096e79e3-7a3d-4e17-9a30-4d0747ee2d40",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"2. Creating model...\n",
"\\Min dataset solar energy : 0.0 - Scaled Version : 0.0\n",
"\n",
"Max dataset solar energy : 4.0 - Scaled Version : 3.3333333333333335\n",
"Max dataset solar energy increased by 15% : 4.6 - Scaled Version : 3.833333333333333\n",
"\n",
"Class distribution in training set:\n",
"Zeros: 56899 (50.41%)\n",
"Non-zeros: 55983 (49.59%)\n",
"\n",
"Class distribution in test set:\n",
"Zeros: 8576 (50.90%)\n",
"Non-zeros: 8273 (49.10%)\n",
"\n",
"Model output names: ['classification_output', 'regression_output', 'final_output']\n",
"\n",
"4. Starting training...\n",
"Epoch 1/150\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-11-27 14:02:24.816496: W tensorflow/core/framework/op_kernel.cc:1827] INVALID_ARGUMENT: required broadcastable shapes\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Error during training: Graph execution error:\n",
"\n",
"Detected at node model/regression_output/mul defined at (most recent call last):\n",
" File \"<frozen runpy>\", line 198, in _run_module_as_main\n",
"\n",
" File \"<frozen runpy>\", line 88, in _run_code\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/ipykernel_launcher.py\", line 17, in <module>\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py\", line 1046, in launch_instance\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py\", line 736, in start\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py\", line 195, in start\n",
"\n",
" File \"/usr/lib/python3.11/asyncio/base_events.py\", line 604, in run_forever\n",
"\n",
" File \"/usr/lib/python3.11/asyncio/base_events.py\", line 1909, in _run_once\n",
"\n",
" File \"/usr/lib/python3.11/asyncio/events.py\", line 80, in _run\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py\", line 516, in dispatch_queue\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py\", line 505, in process_one\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py\", line 412, in dispatch_shell\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py\", line 740, in execute_request\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py\", line 422, in do_execute\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py\", line 546, in run_cell\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py\", line 3024, in run_cell\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py\", line 3079, in _run_cell\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py\", line 129, in _pseudo_sync_runner\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py\", line 3284, in run_cell_async\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py\", line 3466, in run_ast_nodes\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py\", line 3526, in run_code\n",
"\n",
" File \"/tmp/ipykernel_341907/1713792660.py\", line 47, in <module>\n",
"\n",
" File \"/tmp/ipykernel_341907/594795021.py\", line 730, in train_hybrid_model\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 65, in error_handler\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 1783, in fit\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 1377, in train_function\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 1360, in step_function\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 1349, in run_step\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 1126, in train_step\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 65, in error_handler\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 589, in __call__\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 65, in error_handler\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/base_layer.py\", line 1149, in __call__\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 96, in error_handler\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/functional.py\", line 515, in call\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/functional.py\", line 672, in _run_internal_graph\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 65, in error_handler\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/base_layer.py\", line 1149, in __call__\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 96, in error_handler\n",
"\n",
" File \"/usr/local/lib/python3.11/dist-packages/keras/src/layers/core/lambda_layer.py\", line 212, in call\n",
"\n",
" File \"/tmp/ipykernel_341907/594795021.py\", line 153, in <lambda>\n",
"\n",
"required broadcastable shapes\n",
"\t [[{{node model/regression_output/mul}}]] [Op:__inference_train_function_106117]\n",
"\n",
"Model output names: ['classification_output/Sigmoid:0', 'regression_output/Sum:0', 'final_output/clip_by_value:0']\n",
"Training targets keys: dict_keys(['classification_output', 'regression_output', 'final_output'])\n"
]
},
{
"ename": "InvalidArgumentError",
"evalue": "Graph execution error:\n\nDetected at node model/regression_output/mul defined at (most recent call last):\n File \"<frozen runpy>\", line 198, in _run_module_as_main\n\n File \"<frozen runpy>\", line 88, in _run_code\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel_launcher.py\", line 17, in <module>\n\n File \"/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py\", line 1046, in launch_instance\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py\", line 736, in start\n\n File \"/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py\", line 195, in start\n\n File \"/usr/lib/python3.11/asyncio/base_events.py\", line 604, in run_forever\n\n File \"/usr/lib/python3.11/asyncio/base_events.py\", line 1909, in _run_once\n\n File \"/usr/lib/python3.11/asyncio/events.py\", line 80, in _run\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py\", line 516, in dispatch_queue\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py\", line 505, in process_one\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py\", line 412, in dispatch_shell\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py\", line 740, in execute_request\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py\", line 422, in do_execute\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py\", line 546, in run_cell\n\n File \"/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py\", line 3024, in run_cell\n\n File \"/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py\", line 3079, in _run_cell\n\n File \"/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py\", line 129, in _pseudo_sync_runner\n\n File \"/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py\", line 3284, in run_cell_async\n\n File 
\"/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py\", line 3466, in run_ast_nodes\n\n File \"/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py\", line 3526, in run_code\n\n File \"/tmp/ipykernel_341907/1713792660.py\", line 47, in <module>\n\n File \"/tmp/ipykernel_341907/594795021.py\", line 730, in train_hybrid_model\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 65, in error_handler\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 1783, in fit\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 1377, in train_function\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 1360, in step_function\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 1349, in run_step\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 1126, in train_step\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 65, in error_handler\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 589, in __call__\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 65, in error_handler\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/base_layer.py\", line 1149, in __call__\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 96, in error_handler\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/functional.py\", line 515, in call\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/functional.py\", line 672, in _run_internal_graph\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 65, in error_handler\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/base_layer.py\", 
line 1149, in __call__\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 96, in error_handler\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/layers/core/lambda_layer.py\", line 212, in call\n\n File \"/tmp/ipykernel_341907/594795021.py\", line 153, in <lambda>\n\nrequired broadcastable shapes\n\t [[{{node model/regression_output/mul}}]] [Op:__inference_train_function_106117]",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mInvalidArgumentError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[11], line 47\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mModel output names:\u001b[39m\u001b[38;5;124m\"\u001b[39m, output_names)\n\u001b[1;32m 46\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m4. Starting training...\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 47\u001b[0m history \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_hybrid_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 48\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 49\u001b[0m \u001b[43m \u001b[49m\u001b[43mX_train\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mX_train_seq\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 50\u001b[0m \u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my_train\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 51\u001b[0m \u001b[43m \u001b[49m\u001b[43mX_test\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mX_test_seq\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 52\u001b[0m \u001b[43m \u001b[49m\u001b[43my_test\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my_test\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 53\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m150\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 54\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m512\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[43m \u001b[49m\u001b[43mfolder_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfolder_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 56\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mmin_output\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmin_val_scaled\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 57\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_output\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_val_scaled\u001b[49m\n\u001b[1;32m 58\u001b[0m \u001b[43m)\u001b[49m\n",
"Cell \u001b[0;32mIn[8], line 730\u001b[0m, in \u001b[0;36mtrain_hybrid_model\u001b[0;34m(model, X_train, y_train, X_test, y_test, epochs, batch_size, folder_name, min_output, max_output)\u001b[0m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m'''\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;124;03mtf.keras.callbacks.ReduceLROnPlateau(\u001b[39;00m\n\u001b[1;32m 719\u001b[0m \u001b[38;5;124;03m monitor='val_final_output_loss',\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 727\u001b[0m \u001b[38;5;124;03m ),\u001b[39;00m\n\u001b[1;32m 728\u001b[0m \u001b[38;5;124;03m'''\u001b[39;00m\n\u001b[1;32m 729\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 730\u001b[0m history \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 731\u001b[0m \u001b[43m \u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 732\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrain_targets\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 733\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidation_data\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mX_test\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_targets\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 734\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mepochs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 735\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbatch_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 736\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallbacks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 737\u001b[0m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 
738\u001b[0m \u001b[43m \u001b[49m\u001b[43mshuffle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[1;32m 739\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 741\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mTraining completed successfully!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 743\u001b[0m \u001b[38;5;66;03m# Final evaluation\u001b[39;00m\n",
"File \u001b[0;32m/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py:70\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 67\u001b[0m filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[1;32m 68\u001b[0m \u001b[38;5;66;03m# To get the full stack trace, call:\u001b[39;00m\n\u001b[1;32m 69\u001b[0m \u001b[38;5;66;03m# `tf.debugging.disable_traceback_filtering()`\u001b[39;00m\n\u001b[0;32m---> 70\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 71\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 72\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n",
"File \u001b[0;32m/usr/local/lib/python3.11/dist-packages/tensorflow/python/eager/execute.py:60\u001b[0m, in \u001b[0;36mquick_execute\u001b[0;34m(op_name, num_outputs, inputs, attrs, ctx, name)\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# Convert any objects of type core_types.Tensor to Tensor.\u001b[39;00m\n\u001b[1;32m 54\u001b[0m inputs \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 55\u001b[0m tensor_conversion_registry\u001b[38;5;241m.\u001b[39mconvert(t)\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(t, core_types\u001b[38;5;241m.\u001b[39mTensor)\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m t\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m t \u001b[38;5;129;01min\u001b[39;00m inputs\n\u001b[1;32m 59\u001b[0m ]\n\u001b[0;32m---> 60\u001b[0m tensors \u001b[38;5;241m=\u001b[39m pywrap_tfe\u001b[38;5;241m.\u001b[39mTFE_Py_Execute(ctx\u001b[38;5;241m.\u001b[39m_handle, device_name, op_name,\n\u001b[1;32m 61\u001b[0m inputs, attrs, num_outputs)\n\u001b[1;32m 62\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m core\u001b[38;5;241m.\u001b[39m_NotOkStatusException \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
"\u001b[0;31mInvalidArgumentError\u001b[0m: Graph execution error:\n\nDetected at node model/regression_output/mul defined at (most recent call last):\n File \"<frozen runpy>\", line 198, in _run_module_as_main\n\n File \"<frozen runpy>\", line 88, in _run_code\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel_launcher.py\", line 17, in <module>\n\n File \"/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py\", line 1046, in launch_instance\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py\", line 736, in start\n\n File \"/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py\", line 195, in start\n\n File \"/usr/lib/python3.11/asyncio/base_events.py\", line 604, in run_forever\n\n File \"/usr/lib/python3.11/asyncio/base_events.py\", line 1909, in _run_once\n\n File \"/usr/lib/python3.11/asyncio/events.py\", line 80, in _run\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py\", line 516, in dispatch_queue\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py\", line 505, in process_one\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py\", line 412, in dispatch_shell\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py\", line 740, in execute_request\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py\", line 422, in do_execute\n\n File \"/usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py\", line 546, in run_cell\n\n File \"/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py\", line 3024, in run_cell\n\n File \"/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py\", line 3079, in _run_cell\n\n File \"/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py\", line 129, in _pseudo_sync_runner\n\n File \"/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py\", line 3284, in run_cell_async\n\n File 
\"/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py\", line 3466, in run_ast_nodes\n\n File \"/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py\", line 3526, in run_code\n\n File \"/tmp/ipykernel_341907/1713792660.py\", line 47, in <module>\n\n File \"/tmp/ipykernel_341907/594795021.py\", line 730, in train_hybrid_model\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 65, in error_handler\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 1783, in fit\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 1377, in train_function\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 1360, in step_function\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 1349, in run_step\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 1126, in train_step\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 65, in error_handler\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/training.py\", line 589, in __call__\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 65, in error_handler\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/base_layer.py\", line 1149, in __call__\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 96, in error_handler\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/functional.py\", line 515, in call\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/functional.py\", line 672, in _run_internal_graph\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 65, in error_handler\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/engine/base_layer.py\", 
line 1149, in __call__\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py\", line 96, in error_handler\n\n File \"/usr/local/lib/python3.11/dist-packages/keras/src/layers/core/lambda_layer.py\", line 212, in call\n\n File \"/tmp/ipykernel_341907/594795021.py\", line 153, in <lambda>\n\nrequired broadcastable shapes\n\t [[{{node model/regression_output/mul}}]] [Op:__inference_train_function_106117]"
]
}
],
"source": [
"#Model creation\n",
"print(\"\\n2. Creating model...\")\n",
"input_shape = (X_train_seq.shape[1], X_train_seq.shape[2])\n",
"\n",
"min_val = df['solarenergy'].min()\n",
"min_val_scaled = scaler_y.transform([[0]])[0][0]\n",
"\n",
"max_val = df['solarenergy'].max()\n",
"max_val_scaled = scaler_y.transform([[max_val]])[0][0]\n",
"\n",
"print(f\"\\Min dataset solar energy : {min_val} - Scaled Version : {min_val_scaled}\")\n",
"\n",
"print(f\"\\nMax dataset solar energy : {max_val} - Scaled Version : {max_val_scaled}\")\n",
"\n",
"increase_percentage = 15\n",
"\n",
"max_val = max_val * (1 + increase_percentage / 100)\n",
"max_val_scaled = max_val_scaled * (1 + increase_percentage / 100)\n",
"\n",
"print(f\"Max dataset solar energy increased by {increase_percentage}% : {max_val} - Scaled Version : {max_val_scaled}\")\n",
"\n",
"# Create the hybrid model\n",
"model = create_solarenergy_model(\n",
" input_shape=input_shape, \n",
" folder_name=folder_name, \n",
" min_output=min_val_scaled, \n",
" max_output=max_val_scaled\n",
")\n",
"\n",
"# Prepare binary targets for classification\n",
"y_train_binary = (y_train > 0).astype(float)\n",
"y_test_binary = (y_test > 0).astype(float)\n",
"\n",
"print(\"\\nClass distribution in training set:\")\n",
"print(f\"Zeros: {np.sum(y_train_binary == 0)} ({np.mean(y_train_binary == 0)*100:.2f}%)\")\n",
"print(f\"Non-zeros: {np.sum(y_train_binary == 1)} ({np.mean(y_train_binary == 1)*100:.2f}%)\")\n",
"\n",
"print(\"\\nClass distribution in test set:\")\n",
"print(f\"Zeros: {np.sum(y_test_binary == 0)} ({np.mean(y_test_binary == 0)*100:.2f}%)\")\n",
"print(f\"Non-zeros: {np.sum(y_test_binary == 1)} ({np.mean(y_test_binary == 1)*100:.2f}%)\")\n",
"\n",
"# Get the exact output names from the model\n",
"output_names = [output.name.split('/')[0] for output in model.outputs]\n",
"print(\"\\nModel output names:\", output_names)\n",
"\n",
"print(\"\\n4. Starting training...\")\n",
"history = train_hybrid_model(\n",
" model=model,\n",
" X_train=X_train_seq,\n",
" y_train=y_train,\n",
" X_test=X_test_seq,\n",
" y_test=y_test,\n",
" epochs=150,\n",
" batch_size=512,\n",
" folder_name=folder_name,\n",
" min_output=min_val_scaled,\n",
" max_output=max_val_scaled\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "958d78b99e8898d6",
"metadata": {},
"outputs": [],
"source": [
"print(\"\\n5. Generating predictions...\")\n",
"predictions = model.predict(X_test_seq)\n",
"classification_pred, regression_pred, final_pred = predictions\n",
"\n",
"# Clip solo le predizioni di regressione e finali\n",
"regression_pred = np.clip(regression_pred, min_val_scaled, max_val_scaled)\n",
"final_pred = np.clip(final_pred, min_val_scaled, max_val_scaled)\n",
"\n",
"# Inverse transform per tornare ai valori originali\n",
"regression_pred_original = scaler_y.inverse_transform(regression_pred)\n",
"final_pred_original = scaler_y.inverse_transform(final_pred)\n",
"y_test_original = scaler_y.inverse_transform(y_test)\n",
"\n",
"print(\"\\n6. Evaluating model...\")\n",
"# Valutazione delle predizioni finali\n",
"metrics = evaluate_solarenergy_predictions(y_test_original, final_pred_original, folder_name=folder_name)\n",
"\n",
"# Create results dictionary con metriche aggiuntive per il modello ibrido\n",
"training_results = {\n",
" 'model_params': {\n",
" 'input_shape': input_shape,\n",
" 'n_features': len(features),\n",
" 'sequence_length': X_train_seq.shape[1]\n",
" },\n",
" 'training_params': {\n",
" 'batch_size': 192,\n",
" 'total_epochs': len(history.history['loss']),\n",
" 'best_epoch': np.argmin(history.history['val_final_output_loss']) + 1\n",
" },\n",
" 'performance_metrics': {\n",
" 'classification': {\n",
" 'final_loss': float(history.history['val_classification_output_loss'][-1]),\n",
" 'final_auc': float(history.history['val_classification_output_auc'][-1])\n",
" },\n",
" 'regression': {\n",
" 'final_loss': float(history.history['val_regression_output_loss'][-1]),\n",
" 'final_mae': float(history.history['val_regression_output_mae'][-1]),\n",
" 'out_of_range_predictions': int(np.sum((regression_pred < 0) | (regression_pred > max_val_scaled)))\n",
" },\n",
" 'final_output': {\n",
" 'final_loss': float(history.history['val_final_output_loss'][-1]),\n",
" 'final_mae': float(history.history['val_final_output_mae'][-1]),\n",
" 'best_val_loss': float(min(history.history['val_final_output_loss'])),\n",
" 'out_of_range_predictions': int(np.sum((final_pred < 0) | (final_pred > max_val_scaled)))\n",
" }\n",
" }\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5c05d1d03336b1e4",
"metadata": {},
"outputs": [],
"source": [
"print(\"\\n7. Predicting missing data...\")\n",
"to_predict_predictions = model.predict(X_to_predict_seq)\n",
"classification_pred, regression_pred, final_pred = to_predict_predictions\n",
"\n",
"# Clip solo le predizioni finali che useremo per l'integrazione\n",
"final_pred = np.clip(final_pred, min_val_scaled, max_val_scaled)\n",
"final_pred_original = scaler_y.inverse_transform(final_pred)\n",
"\n",
"print(\"\\n8. Integrating predictions into original dataset...\")\n",
"df_updated = integrate_predictions(df.copy(), predictions=(classification_pred, regression_pred, final_pred_original))\n",
"\n",
"df_updated.to_parquet('../../sources/weather_data_solarenergy.parquet')\n",
"\n",
"# Add prediction statistics to training_results\n",
"training_results['prediction_stats'] = {\n",
" 'n_predictions_added': len(final_pred_original),\n",
" 'classification_stats': {\n",
" 'predicted_zeros': int(np.sum(classification_pred < 0.5)),\n",
" 'predicted_non_zeros': int(np.sum(classification_pred >= 0.5)),\n",
" 'mean_confidence': float(classification_pred.mean()),\n",
" },\n",
" 'regression_stats': {\n",
" 'mean_predicted_value': float(regression_pred.mean()),\n",
" 'min_predicted_value': float(regression_pred.min()),\n",
" 'max_predicted_value': float(regression_pred.max()),\n",
" },\n",
" 'final_predictions': {\n",
" 'mean_predicted_solarenergy': float(final_pred_original.mean()),\n",
" 'min_predicted_solarenergy': float(final_pred_original.min()),\n",
" 'max_predicted_solarenergy': float(final_pred_original.max()),\n",
" 'zero_predictions': int(np.sum(final_pred_original == 0)),\n",
" 'non_zero_predictions': int(np.sum(final_pred_original > 0)),\n",
" }\n",
"}\n",
"\n",
"print(\"\\nPrediction Statistics:\")\n",
"print(f\"Total predictions added: {training_results['prediction_stats']['n_predictions_added']}\")\n",
"print(\"\\nClassification Statistics:\")\n",
"print(f\"Predicted zeros: {training_results['prediction_stats']['classification_stats']['predicted_zeros']} \"\n",
" f\"({training_results['prediction_stats']['classification_stats']['predicted_zeros']/len(final_pred_original)*100:.2f}%)\")\n",
"print(f\"Predicted non-zeros: {training_results['prediction_stats']['classification_stats']['predicted_non_zeros']} \"\n",
" f\"({training_results['prediction_stats']['classification_stats']['predicted_non_zeros']/len(final_pred_original)*100:.2f}%)\")\n",
"print(f\"Mean classification confidence: {training_results['prediction_stats']['classification_stats']['mean_confidence']:.4f}\")\n",
"\n",
"print(\"\\nFinal Predictions Statistics:\")\n",
"print(f\"Mean solar energy: {training_results['prediction_stats']['final_predictions']['mean_predicted_solarenergy']:.2f}\")\n",
"print(f\"Min solar energy: {training_results['prediction_stats']['final_predictions']['min_predicted_solarenergy']:.2f}\")\n",
"print(f\"Max solar energy: {training_results['prediction_stats']['final_predictions']['max_predicted_solarenergy']:.2f}\")\n",
"print(f\"Zero predictions: {training_results['prediction_stats']['final_predictions']['zero_predictions']} \"\n",
" f\"({training_results['prediction_stats']['final_predictions']['zero_predictions']/len(final_pred_original)*100:.2f}%)\")\n",
"\n",
"print(\"\\nTraining completed successfully!\")\n",
"\n",
"tf.keras.backend.clear_session()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ef29b3ecdf12c6db",
"metadata": {},
"outputs": [],
"source": [
"analyze_distribution(df_updated, 'solarenergy', 'Solar Energy')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e884cc287364c4ed",
"metadata": {},
"outputs": [],
"source": [
"def plot_error_analysis(y_true, predictions, folder_name=None):\n",
" \"\"\"\n",
" Function to visualize prediction error analysis for the hybrid model\n",
"\n",
" Parameters:\n",
" -----------\n",
" y_true : array-like\n",
" Actual values\n",
" predictions : tuple\n",
" Tuple containing (classification_pred, regression_pred, final_pred)\n",
" folder_name : str, optional\n",
" Directory to save plots. If None, plots are only displayed\n",
"\n",
" Generates:\n",
" ----------\n",
" - Classification analysis plots\n",
" - Regression error analysis plots\n",
" - Final prediction error analysis plots\n",
" \"\"\"\n",
" from sklearn.metrics import roc_curve\n",
"\n",
" # Unpack predictions\n",
" classification_pred, regression_pred, final_pred = predictions\n",
"\n",
" # Convert to 1D numpy arrays if needed\n",
" y_true = np.ravel(y_true)\n",
" classification_pred = np.ravel(classification_pred)\n",
" regression_pred = np.ravel(regression_pred)\n",
" final_pred = np.ravel(final_pred)\n",
"\n",
" # Create binary ground truth\n",
" y_true_binary = (y_true > 0).astype(float)\n",
"\n",
" # Calculate errors for regression and final predictions\n",
" regression_errors = regression_pred - y_true\n",
" final_errors = final_pred - y_true\n",
"\n",
" # Create main figure\n",
" plt.figure(figsize=(20, 15))\n",
"\n",
" # Classification Analysis (Top Row)\n",
" # Plot 1: Classification Distribution\n",
" plt.subplot(3, 3, 1)\n",
" plt.hist(classification_pred, bins=50, alpha=0.7)\n",
" plt.axvline(x=0.5, color='r', linestyle='--')\n",
" plt.title('Classification Probability Distribution')\n",
" plt.xlabel('Classification Probability')\n",
" plt.ylabel('Frequency')\n",
"\n",
" # Plot 2: ROC Curve\n",
" plt.subplot(3, 3, 2)\n",
" fpr, tpr, _ = roc_curve(y_true_binary, classification_pred)\n",
" plt.plot(fpr, tpr)\n",
" plt.plot([0, 1], [0, 1], 'r--')\n",
" plt.title(f'ROC Curve (AUC = {roc_auc_score(y_true_binary, classification_pred):.4f})')\n",
" plt.xlabel('False Positive Rate')\n",
" plt.ylabel('True Positive Rate')\n",
"\n",
" # Plot 3: Classification Confusion Matrix\n",
" plt.subplot(3, 3, 3)\n",
" cm = confusion_matrix(y_true_binary, classification_pred > 0.5)\n",
" sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')\n",
" plt.title('Classification Confusion Matrix')\n",
" plt.xlabel('Predicted')\n",
" plt.ylabel('Actual')\n",
"\n",
" # Regression Analysis (Middle Row)\n",
" # Plot 4: Regression Error Distribution\n",
" plt.subplot(3, 3, 4)\n",
" plt.hist(regression_errors[y_true > 0], bins=50, alpha=0.7)\n",
" plt.title('Regression Error Distribution (Non-zero Values)')\n",
" plt.xlabel('Error')\n",
" plt.ylabel('Frequency')\n",
"\n",
" # Plot 5: Actual vs Predicted (Regression)\n",
" plt.subplot(3, 3, 5)\n",
" mask_nonzero = y_true > 0\n",
" plt.scatter(y_true[mask_nonzero], regression_pred[mask_nonzero], alpha=0.5)\n",
" plt.plot([y_true[mask_nonzero].min(), y_true[mask_nonzero].max()],\n",
" [y_true[mask_nonzero].min(), y_true[mask_nonzero].max()], 'r--', lw=2)\n",
" plt.title('Actual vs Predicted (Regression, Non-zero Values)')\n",
" plt.xlabel('Actual Values')\n",
" plt.ylabel('Predicted Values')\n",
"\n",
" # Plot 6: Regression Errors vs Actual Values\n",
" plt.subplot(3, 3, 6)\n",
" plt.scatter(y_true[mask_nonzero], regression_errors[mask_nonzero], alpha=0.5)\n",
" plt.axhline(y=0, color='r', linestyle='--')\n",
" plt.title('Regression Errors vs Actual Values (Non-zero Values)')\n",
" plt.xlabel('Actual Values')\n",
" plt.ylabel('Error')\n",
"\n",
" # Final Predictions Analysis (Bottom Row)\n",
" # Plot 7: Final Error Distribution\n",
" plt.subplot(3, 3, 7)\n",
" plt.hist(final_errors, bins=50, alpha=0.7)\n",
" plt.title('Final Prediction Error Distribution')\n",
" plt.xlabel('Error')\n",
" plt.ylabel('Frequency')\n",
"\n",
" # Plot 8: Actual vs Predicted (Final)\n",
" plt.subplot(3, 3, 8)\n",
" plt.scatter(y_true, final_pred, alpha=0.5)\n",
" plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)\n",
" plt.title('Actual vs Predicted (Final)')\n",
" plt.xlabel('Actual Values')\n",
" plt.ylabel('Predicted Values')\n",
"\n",
" # Plot 9: Final Errors vs Actual Values\n",
" plt.subplot(3, 3, 9)\n",
" plt.scatter(y_true, final_errors, alpha=0.5)\n",
" plt.axhline(y=0, color='r', linestyle='--')\n",
" plt.title('Final Errors vs Actual Values')\n",
" plt.xlabel('Actual Values')\n",
" plt.ylabel('Error')\n",
"\n",
" plt.tight_layout()\n",
"\n",
" # Save plot if directory is specified\n",
" if folder_name is not None:\n",
" try:\n",
" filename = f'{folder_name}_error_analysis.png'\n",
" plt.savefig(filename, dpi=300, bbox_inches='tight')\n",
" print(f\"\\nPlot saved as: {filename}\")\n",
" except Exception as e:\n",
" print(f\"\\nError saving plot: {str(e)}\")\n",
"\n",
" plt.show()\n",
"\n",
" # Print comprehensive statistics\n",
" print(\"\\nClassification Statistics:\")\n",
" print(classification_report(y_true_binary, classification_pred > 0.5))\n",
" print(f\"AUC-ROC: {roc_auc_score(y_true_binary, classification_pred):.4f}\")\n",
"\n",
" print(\"\\nRegression Statistics (Non-zero values):\")\n",
" mask_nonzero = y_true > 0\n",
" if np.any(mask_nonzero):\n",
" print(f\"MAE: {np.mean(np.abs(regression_errors[mask_nonzero])):.4f}\")\n",
" print(f\"RMSE: {np.sqrt(np.mean(regression_errors[mask_nonzero] ** 2)):.4f}\")\n",
" print(f\"Mean error: {np.mean(regression_errors[mask_nonzero]):.4f}\")\n",
" print(f\"Error std: {np.std(regression_errors[mask_nonzero]):.4f}\")\n",
"\n",
" print(\"\\nFinal Prediction Statistics:\")\n",
" print(f\"MAE: {np.mean(np.abs(final_errors)):.4f}\")\n",
" print(f\"RMSE: {np.sqrt(np.mean(final_errors ** 2)):.4f}\")\n",
" print(f\"Mean error: {np.mean(final_errors):.4f}\")\n",
" print(f\"Error std: {np.std(final_errors):.4f}\")\n",
"\n",
" # Calculate percentage of errors within thresholds\n",
" thresholds = [0.5, 1.0, 1.5, 2.0]\n",
" print(\"\\nError Thresholds (Final Predictions):\")\n",
" for threshold in thresholds:\n",
" within_threshold = np.mean(np.abs(final_errors) <= threshold) * 100\n",
" print(f\"Predictions within ±{threshold}: {within_threshold:.1f}%\")\n",
"\n",
"# Example usage\n",
"plot_error_analysis(y_test, predictions, folder_name=folder_name)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0rc1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}