{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": { "name": "multivariate_regression.ipynb", "provenance": [], "collapsed_sections": [] },
    "kernelspec": { "name": "python3", "display_name": "Python 3" }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": { "id": "gW5Zcf_Ro_hC", "colab_type": "text" },
      "source": [ "Environment Setup" ]
    },
    {
      "cell_type": "code",
      "metadata": { "id": "-NfV9x-NlTmr", "colab_type": "code", "colab": {} },
      "source": [
        "# Uncomment on a fresh environment. Keras ships inside TensorFlow >= 2,\n",
        "# so no separate keras install is needed.\n",
        "#!pip3 install tensorflow\n",
        "#!pip3 install tensorflow-gpu\n",
        "#!pip3 install pandas\n",
        "#!pip3 install numpy\n",
        "#!pip3 install sklearn"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": { "id": "GiYW1SM2pLYI", "colab_type": "text" },
      "source": [ "Library Imports" ]
    },
    {
      "cell_type": "code",
      "metadata": { "id": "55GRLjrPoqsP", "colab_type": "code", "colab": {} },
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "import tensorflow as tf\n",
        "from datetime import datetime\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
        "from sklearn.metrics import mean_squared_error\n",
        "from sklearn.decomposition import PCA\n",
        "# Use the Keras bundled with TensorFlow rather than standalone keras, so the\n",
        "# layers/models come from the same backend as the `tf` imported above.\n",
        "from tensorflow.keras.layers import LSTM, Dense\n",
        "from tensorflow.keras.models import Sequential\n",
        "from matplotlib import pyplot"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": { "id": "xJ0zQrOhpe-3", "colab_type": "code", "colab": {} },
      "source": [
        "df = pd.read_csv(\"jena_climate_2009_2016.csv\")\n",
        "# Rename to short, unit-suffixed names: a timestamp plus 14 weather columns.\n",
        "df.columns = ['dt', 'p_mbar', 'T_C', 'T_K', 'Tdew_C', 'rh', 'VPmax_mbar', 'VPact_mbar', \n",
        "              'VPdef_mbar', 'sh', 'h2o_c', 'rho', 'wv', 'max_wv', 'wd_deg']\n",
        "# 14 column data\n",
        "# Parse the 'dd.mm.yyyy HH:MM:SS' timestamps into datetime64.\n",
        "df['dt'] = pd.to_datetime(df['dt'], format=\"%d.%m.%Y %X\")"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": { "id": "WanIv6QdzDV4", "colab_type": "code", "colab": {} },
      "source": [
        "def check_sequence_time(df):\n",
        "    \"\"\"Return True when the 'dt' column is a complete, evenly spaced series.\n",
        "\n",
        "    The expected spacing is inferred from the first two rows; any row whose\n",
        "    timestamp differs from start + i*interval marks the data as incomplete.\n",
        "    \"\"\"\n",
        "    start = df['dt'][0]\n",
        "    interval = df['dt'][1] - start\n",
        "    for i in range(len(df['dt'])):\n",
        "        # BUGFIX: the original returned True on a mismatch and False on a\n",
        "        # complete series, so the caller below reported broken data as\n",
        "        # \"Good Data\" and complete data as incomplete.\n",
        "        if (start + i*interval) != df['dt'][i]:\n",
        "            return False\n",
        "    return True\n",
        "\n",
        "if check_sequence_time(df):\n",
        "    # Timestamps are regular: drop them, plus the Celsius duplicate of T_K.\n",
        "    df = df.drop([\"dt\", \"T_C\"], axis=1)\n",
        "    print(\"Good Data. On to next cell, mate.\")\n",
        "else:\n",
        "    print(\"Data incomplete\")"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": { "id": "ORhdaXtdXUAa", "colab_type": "text" },
      "source": [ "Principal Component Analysis" ]
    },
    {
      "cell_type": "code",
      "metadata": { "id": "3oFshWmVeDdb", "colab_type": "code", "colab": {} },
      "source": [
        "# Target: temperature in Kelvin, shifted one row back so that features at\n",
        "# time t are paired with T_K at time t+1 (next-step prediction).\n",
        "y_val= df[\"T_K\"]\n",
        "y_val = y_val.drop(y_val.index[0])\n",
        "# x_val = df\n",
        "X_train, X_eval, y_train, y_eval = train_test_split(df.drop(df.index[-1]), y_val, test_size=0.2, shuffle=False)\n",
        "# Standardize before PCA (PCA is scale sensitive); fit on the training\n",
        "# portion only, then apply the same transform to the eval portion.\n",
        "sc = StandardScaler()\n",
        "X_train_pca = sc.fit_transform(X_train)\n",
        "X_test_pca = sc.transform(X_eval)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": { "id": "v1qkrqhGXR_S", "colab_type": "code", "colab": {} },
      "source": [
        "pca = PCA()\n",
        "some_var = pca.fit_transform(X_train_pca)\n",
        "some_test_var = pca.transform(X_test_pca)\n",
        "n_pcs= pca.components_.shape[0]\n",
        "# For each principal component, the index of the original feature with the\n",
        "# largest absolute loading.\n",
        "most_important = [np.abs(pca.components_[i]).argmax() for i in range(n_pcs)]\n",
        "initial_feature_names = ['p_mbar', 'T_K', 'Tdew_C', 'rh', 'VPmax_mbar', 'VPact_mbar', 'VPdef_mbar', 'sh', 'h2o_c', 'rho', 'wv', 'max_wv', 'wd_deg']\n",
        "most_important_names = [initial_feature_names[most_important[i]] for i in range(n_pcs)]\n",
        "\n",
        "# Map 'PC1'..'PCn' to the name of each component's dominant feature.\n",
        "dic = {'PC{}'.format(i+1): most_important_names[i] for i in range(n_pcs)}"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": { "id": "glwNVV02gI4u", "colab_type": "code", "colab": {} },
      "source": [
        "# BUGFIX: sort components by their numeric index. The original plain\n",
        "# sorted() ordered the keys lexicographically ('PC1', 'PC10', 'PC11', ...,\n",
        "# 'PC2'), which misaligned explained_variance_ratio_ (ordered PC1..PCn)\n",
        "# with the rows and made the slice below pick the wrong components.\n",
        "pca_out = pd.DataFrame(sorted(dic.items(), key=lambda kv: int(kv[0][2:])))\n",
        "pca_out['3'] = pca.explained_variance_ratio_\n",
        "# Keep the dominant features of components 2-8; index 0 (PC1) is skipped --\n",
        "# presumably to drop the component dominated by the target column T_K,\n",
        "# which is still present in df. TODO confirm intent.\n",
        "chosen_columns = (pca_out[1].tolist())[1:8]\n",
        "# NOTE(review): if the same feature dominates several components, filter()\n",
        "# deduplicates and df ends up with fewer than 7 columns, which breaks the\n",
        "# reshape to (n, 1, 7) in the preprocessing cell below.\n",
        "df = df.filter(items=chosen_columns)\n",
        "# Drop the last row so X rows stay aligned with the one-step-shifted target.\n",
        "df = df.drop(df.index[-1])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": { "id": "yBfZWfEspSS1", "colab_type": "text" },
      "source": [ "Data Preprocessing" ]
    },
    {
      "cell_type": "code",
      "metadata": { "id": "CqtbT8HgAUPC", "colab_type": "code", "colab": {} },
      "source": [
        "# NOTE(review): both scalers are fit on the FULL dataset before the split,\n",
        "# leaking eval-set statistics into training. Consider fitting on the\n",
        "# training portion only and transforming the eval portion with it.\n",
        "scaler_x = MinMaxScaler(feature_range=(0, 1))\n",
        "scaled_x = scaler_x.fit_transform(df)\n",
        "\n",
        "scaler_y = MinMaxScaler(feature_range=(0, 1))\n",
        "scaled_y = scaler_y.fit_transform(y_val.values.reshape(-1, 1))\n",
        "\n",
        "X_train, X_eval, y_train, y_eval = train_test_split(scaled_x, scaled_y, test_size=0.2, shuffle=False)\n",
        "\n",
        "# The LSTM expects (samples, timesteps, features): one timestep, 7 features.\n",
        "X_train = X_train.reshape([X_train.shape[0], 1, 7])\n",
        "X_eval = X_eval.reshape([X_eval.shape[0], 1, 7])\n",
        "# Targets become shape (1, n); the fit() call below indexes [0] to recover\n",
        "# the flat 1-D vector.\n",
        "y_train = y_train.reshape((-1, y_train.shape[0]))\n",
        "y_eval = y_eval.reshape((-1, y_eval.shape[0]))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": { "id": "dDX2DH7hsmOg", "colab_type": "code", "colab": {} },
      "source": [
        "# LSTM encoder followed by a shrinking dense head down to a single\n",
        "# regression output (next-step temperature, min-max scaled).\n",
        "model = Sequential()\n",
        "\n",
        "model.add(LSTM(256, input_shape=(X_train.shape[1:]), activation='relu'))\n",
        "model.add(Dense(256, activation='relu'))\n",
        "model.add(Dense(128, activation='relu'))\n",
        "model.add(Dense(64, activation='relu'))\n",
        "model.add(Dense(1))\n",
        "\n",
        "model.compile(loss='mae', optimizer='adam', metrics=['mae'])\n",
        "\n",
        "# y_train/y_eval were reshaped to (1, n) above; [0] yields the 1-D targets.\n",
        "history = model.fit(X_train, y_train[0], epochs=5, validation_data=(X_eval, y_eval[0]))\n",
        "\n",
        "pyplot.plot(history.history['loss'], label='train')\n",
        "pyplot.plot(history.history['val_loss'], label='test')\n",
        "pyplot.legend()\n",
        "pyplot.show()"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}