Linear Regression Pipeline

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f70cba96",
   "metadata": {},
   "source": [
    "### Configuration Variables."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "84c02166",
   "metadata": {},
   "outputs": [],
   "source": [
    "# We Need To Provide All These Variables Before Proceeding.\n",
    "# All Column Names Of Your DataSet.\n",
    "all_columns = [\"Meter\",\"dt\",\"Global_reactive_power\",\"Voltage\",\"Global_intensity\",\"Sub_metering_1\",\"Sub_metering_2\",\"Sub_metering_3\",\"Power_Consumption\"] \n",
    "# Columns You Want To Drop From Your DataSet.\n",
    "dropped_columns = [\"dt\"]\n",
    "# Columns You Want To Train Your Model.\n",
    "training_columns = [\"Global_reactive_power\",\"Voltage\",\"Sub_metering_1\",\"Sub_metering_2\",\"Sub_metering_3\"]\n",
    "# Target Column You Are Predicting.\n",
    "target_column = \"Power_Consumption\"\n",
    "# Name Of The Column You Want To Do Iteration.\n",
    "iter_column = 'Meter'\n",
    "# Input File Location\n",
    "file_location= \"C:/Subrat Documents/Honeywell Project Work/1- Multiple Model Performance/datasets/Power_Consumption_Meters.csv\"\n",
    "# All Model Performance File\n",
    "all_model_performance_location = \"C:/Subrat Documents/Honeywell Project Work/1- Multiple Model Performance/datasets/Model_Performance.csv\"\n",
    "# Model Coefficients File\n",
    "coeff_file_location = \"C:/Subrat Documents/Honeywell Project Work/1- Multiple Model Performance/datasets/Model_Coefficients.csv\"\n",
    "#Final Output File\n",
    "final_output_file_location = \"C:/Subrat Documents/Honeywell Project Work/1- Multiple Model Performance/datasets/Final_Output.csv\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "00a65583",
   "metadata": {},
   "source": [
    "### Required Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "565e1479",
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'xgboost'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "Input \u001b[1;32mIn [2]\u001b[0m, in \u001b[0;36m<cell line: 14>\u001b[1;34m()\u001b[0m\n\u001b[0;32m     12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlinear_model\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LinearRegression, Lasso,Ridge,ElasticNet\n\u001b[0;32m     13\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtree\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DecisionTreeRegressor\n\u001b[1;32m---> 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mxgboost\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m XGBRegressor\n\u001b[0;32m     15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mensemble\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m GradientBoostingRegressor\n\u001b[0;32m     16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocessing\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LabelEncoder\n",
      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'xgboost'"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.tree import DecisionTreeRegressor, plot_tree\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split # Hyperparameter tuning\n",
    "from category_encoders import OneHotEncoder\n",
    "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
    "from sklearn.linear_model import LinearRegression, Lasso,Ridge,ElasticNet\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from xgboost import XGBRegressor\n",
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn import preprocessing\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a933bdc9",
   "metadata": {},
   "source": [
    "### Input Data Reading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d8769e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv(file_location)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20b92ba2",
   "metadata": {},
   "source": [
    "### Dropping The Not Required Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b9747df",
   "metadata": {},
   "outputs": [],
   "source": [
    "required_data = data.drop(dropped_columns , axis = 1, inplace=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7ef08092",
   "metadata": {},
   "source": [
    "### Auto Encodes Any Dataframe Column Of Type Category Or Object."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc8d2d12",
   "metadata": {},
   "outputs": [],
   "source": [
    "def dummyEncode(data):\n",
    "    \"\"\"Label-encode every ``category``/``object`` column of a DataFrame.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : pandas.DataFrame\n",
    "        Frame whose non-numeric columns should become integer codes.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        A copy of ``data`` in which each encodable column is replaced by its\n",
    "        ``LabelEncoder`` codes. A column that fails to encode is left as it\n",
    "        was and the failure is reported on stdout.\n",
    "    \"\"\"\n",
    "    # Work on a copy: the caller passes a column slice (data[training_columns]),\n",
    "    # and writing into the argument would mutate or warn on the caller's frame.\n",
    "    encoded = data.copy()\n",
    "    columnsToEncode = list(encoded.select_dtypes(include=['category', 'object']))\n",
    "    for feature in columnsToEncode:\n",
    "        try:\n",
    "            # One fresh encoder per column so fitted classes never carry over.\n",
    "            encoded[feature] = LabelEncoder().fit_transform(encoded[feature])\n",
    "        except Exception as exc:\n",
    "            # Still best-effort, but KeyboardInterrupt/SystemExit are no longer\n",
    "            # swallowed and the reason for the failure is shown.\n",
    "            print('Error encoding ' + str(feature) + ': ' + repr(exc))\n",
    "    return encoded"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1d59ce20",
   "metadata": {},
   "source": [
    "### Training & Testing The Model & Predicting Performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "403bf37f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_test_splitting(data):\n",
    "    \"\"\"Encode, standardise and split one dataset into train/test arrays.\n",
    "\n",
    "    Reads the notebook-level ``training_columns`` and ``target_column``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : pandas.DataFrame\n",
    "        Frame containing every training column plus the target column.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays, split 70/30\n",
    "        with ``random_state=48``.\n",
    "    \"\"\"\n",
    "    # Label-encode the feature columns, then attach the target column.\n",
    "    encoded_data = dummyEncode(data[training_columns])\n",
    "    encoded_data[target_column] = data[target_column]\n",
    "    # Use the `columns=` keyword: the positional axis argument of\n",
    "    # DataFrame.drop was deprecated in pandas 1.3 and fails on pandas 2.x.\n",
    "    X = np.array(encoded_data.drop(columns=[target_column]))\n",
    "    y = np.array(encoded_data[target_column])\n",
    "    # Standardise every feature to zero mean / unit variance (not a -1..1 range).\n",
    "    # This runs on the full dataset before the split; main() applies the same\n",
    "    # preprocessing.scale call per column when it rebuilds predictions.\n",
    "    X = preprocessing.scale(X)\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=48)\n",
    "    return (X_train, X_test, y_train, y_test)\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5acf269b",
   "metadata": {},
   "source": [
    "### Creating Base Model Pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4cb031a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def model_pipeline(data):\n",
    "    \"\"\"Fit the five untuned baseline regressors on the training split of ``data``.\n",
    "\n",
    "    Side effect: removes 'Global_intensity' from the notebook-level\n",
    "    ``training_columns`` list when it is present.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        Fitted single-step pipelines in the order (linear regression, lasso,\n",
    "        XGB gblinear, ridge, elastic net).\n",
    "    \"\"\"\n",
    "    # Dataset-specific switch kept from the original notebook: the baselines\n",
    "    # are trained without the Global_intensity column.\n",
    "    if 'Global_intensity' in all_columns and 'Global_intensity' in training_columns:\n",
    "        training_columns.remove('Global_intensity')\n",
    "\n",
    "    X_train, _X_test, y_train, _y_test = train_test_splitting(data)\n",
    "\n",
    "    # Order matters: callers unpack the result positionally.\n",
    "    estimators = (\n",
    "        LinearRegression(),\n",
    "        Lasso(alpha=1),\n",
    "        XGBRegressor(booster='gblinear'),\n",
    "        Ridge(),\n",
    "        ElasticNet(),\n",
    "    )\n",
    "\n",
    "    fitted = []\n",
    "    for estimator in estimators:\n",
    "        pipe = make_pipeline(estimator)\n",
    "        pipe.fit(X_train, y_train)\n",
    "        fitted.append(pipe)\n",
    "\n",
    "    return tuple(fitted)\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b7320dc4",
   "metadata": {},
   "source": [
    "### Evaluating Base Model Performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dfb91a52",
   "metadata": {},
   "outputs": [],
   "source": [
    "def base_model_evaluation(data, trained_model_list, model_names):\n",
    "    \"\"\"Score the fitted baseline pipelines on the test split of ``data``.\n",
    "\n",
    "    Side effect: removes 'Global_intensity' from the notebook-level\n",
    "    ``training_columns`` list when it is present, so the split matches the\n",
    "    columns the baselines were trained on in ``model_pipeline``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : pandas.DataFrame\n",
    "        Dataset the models were trained on (it is re-split here).\n",
    "    trained_model_list : list\n",
    "        Fitted pipelines whose last step exposes ``intercept_``.\n",
    "    model_names : list of str\n",
    "        Display name for each entry of ``trained_model_list``, same order.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per model with columns 'Algorithm', 'Accuracy_Before'\n",
    "        (R2 as a rounded percentage string), 'RMSE_Before' and\n",
    "        'Intercept_Before'.\n",
    "    \"\"\"\n",
    "    # Dataset-specific switch kept from the original notebook.\n",
    "    if 'Global_intensity' in all_columns and 'Global_intensity' in training_columns:\n",
    "        training_columns.remove('Global_intensity')\n",
    "\n",
    "    _X_train, X_test, _y_train, y_test = train_test_splitting(data)\n",
    "\n",
    "    col_names = ['Algorithm', 'Accuracy_Before', 'RMSE_Before', 'Intercept_Before']\n",
    "\n",
    "    model_list = []\n",
    "    for i, model_i in enumerate(trained_model_list):\n",
    "        # Predict once and reuse the result for both metrics.\n",
    "        predictions = model_i.predict(X_test)\n",
    "        model_list.append([\n",
    "            model_names[i],\n",
    "            str(round(r2_score(y_test, predictions) * 100)) + '%',\n",
    "            mean_squared_error(y_test, predictions) ** 0.5,\n",
    "            # Public pipeline indexing instead of the private _final_estimator.\n",
    "            model_i[-1].intercept_,\n",
    "        ])\n",
    "\n",
    "    return pd.DataFrame(model_list, columns=col_names)\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ee51ee0d",
   "metadata": {},
   "source": [
    "### Calling Base Model Evaluation Method"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08e8769f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Declare A Method That Will Call The \"base_model_evaluation\" Method And Get The Scores\n",
    "def evaluate_base_model(data):\n",
    "    \"\"\"Train the baseline models on ``data`` and return their score table.\n",
    "\n",
    "    Thin wrapper: fits the models with ``model_pipeline`` and hands them,\n",
    "    together with their display names, to ``base_model_evaluation``.\n",
    "    \"\"\"\n",
    "    # Display names, in the same order model_pipeline returns the models.\n",
    "    modelnams = ['linear_regression','lasso', 'xgb_boost_regression','Ridge regression','Elastic net']\n",
    "\n",
    "    models = list(model_pipeline(data))\n",
    "\n",
    "    return base_model_evaluation(data, models, modelnams)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8ddcd17a",
   "metadata": {},
   "source": [
    "### Tuning Our Base Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0384ad4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Declare A Method That Will Tune The Base Model\n",
    "def tuning_model(data):\n",
    "    \"\"\"Grid-search each of the five regressors on the training split of ``data``.\n",
    "\n",
    "    Side effect: adds 'Global_intensity' to the notebook-level\n",
    "    ``training_columns`` before splitting; the nested ``model_pipeline`` call\n",
    "    removes it again before this function returns.\n",
    "\n",
    "    Prints the best parameters and the training-split R2 of every search.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        Fitted ``GridSearchCV`` objects in the order (linear regression,\n",
    "        lasso, XGB gblinear, ridge, elastic net).\n",
    "    \"\"\"\n",
    "    # Dataset-specific switch kept from the original notebook: tuned models\n",
    "    # are trained with the Global_intensity column included.\n",
    "    if 'Global_intensity' in all_columns and 'Global_intensity' not in training_columns:\n",
    "        training_columns.append('Global_intensity')\n",
    "\n",
    "    # Split before calling model_pipeline(): that call drops Global_intensity\n",
    "    # from training_columns again.\n",
    "    X_train, _X_test, y_train, _y_test = train_test_splitting(data)\n",
    "\n",
    "    # Only the estimator inside each baseline pipeline is used below.\n",
    "    model_lr, model_ls, model_xgb, model_R, model_Er = model_pipeline(data)\n",
    "\n",
    "    y_flat = np.ravel(y_train)\n",
    "\n",
    "    def _report(search):\n",
    "        # Progress lines printed after every search (R2 on the training split).\n",
    "        print('best params:- ', search.best_params_)\n",
    "        print('R2 Score:', str(round(r2_score(y_train, search.predict(X_train)) * 100)) + '%')\n",
    "\n",
    "    # (1) Linear regression\n",
    "    model_lr_t = GridSearchCV(\n",
    "        estimator=model_lr[0],\n",
    "        param_grid={'n_jobs': [0, 1, 2, 3, 4, 5], 'positive': [True, False]},\n",
    "        n_jobs=-1,\n",
    "        cv=5,\n",
    "        verbose=0,\n",
    "    )\n",
    "    model_lr_t.fit(X_train, y_flat)\n",
    "    _report(model_lr_t)\n",
    "\n",
    "    # (2) Lasso\n",
    "    model_ls_t = GridSearchCV(\n",
    "        estimator=model_ls[0],\n",
    "        param_grid={'alpha': [0.1, 0.3, 0.5, 0.7, 0.9, 1, 1.5, 2, 2.5]},\n",
    "        n_jobs=-1,\n",
    "        cv=5,\n",
    "        verbose=0,\n",
    "    )\n",
    "    model_ls_t.fit(X_train, y_flat)\n",
    "    _report(model_ls_t)\n",
    "\n",
    "    # (3) XGB (gblinear booster)\n",
    "    model_xgb_t = GridSearchCV(\n",
    "        estimator=model_xgb[0],\n",
    "        param_grid={'learning_rate': [0.1, 0.3, 0.5, 0.7, 0.9, 1, 1]},\n",
    "        n_jobs=-1,\n",
    "        cv=5,\n",
    "        verbose=0,\n",
    "    )\n",
    "    model_xgb_t.fit(X_train, y_flat)\n",
    "    _report(model_xgb_t)\n",
    "\n",
    "    # (4) Ridge\n",
    "    model_R_t = GridSearchCV(\n",
    "        estimator=model_R[0],\n",
    "        param_grid={'alpha': np.logspace(-3, 3, 10)},\n",
    "        cv=5,\n",
    "    )\n",
    "    model_R_t.fit(X_train, y_train)\n",
    "    _report(model_R_t)\n",
    "\n",
    "    # (5) Elastic net\n",
    "    model_E_t = GridSearchCV(\n",
    "        estimator=model_Er[0],\n",
    "        param_grid={'alpha': np.logspace(-0.5, 2, 3)},\n",
    "        cv=5,\n",
    "    )\n",
    "    model_E_t.fit(X_train, y_train)\n",
    "    _report(model_E_t)\n",
    "\n",
    "    return (model_lr_t, model_ls_t, model_xgb_t, model_R_t, model_E_t)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9e2604d",
   "metadata": {},
   "source": [
    "### Evaluating Tuned Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4feb2ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define A Method To Evaluate The Tuned Model\n",
    "def evaluate_tuned_model(data):\n",
    "    \"\"\"Tune the five regressors on ``data`` and score them on the test split.\n",
    "\n",
    "    Side effect: adds 'Global_intensity' to the notebook-level\n",
    "    ``training_columns`` before splitting; the nested ``tuning_model`` call\n",
    "    ends up removing it again (through ``model_pipeline``).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per model with columns 'Algorithm', 'Accuracy_After'\n",
    "        (R2 as a rounded percentage string), 'RMSE_After' and\n",
    "        'Intercept_After'.\n",
    "    \"\"\"\n",
    "    # Dataset-specific switch kept from the original notebook.\n",
    "    if 'Global_intensity' in all_columns and 'Global_intensity' not in training_columns:\n",
    "        training_columns.append('Global_intensity')\n",
    "\n",
    "    # Take the test split now, while training_columns still includes\n",
    "    # Global_intensity: tuning_model() leaves the list without it.\n",
    "    _X_train, X_test, _y_train, y_test = train_test_splitting(data)\n",
    "\n",
    "    tuned_model_list = list(tuning_model(data))\n",
    "\n",
    "    model_names = ['linear_regression','lasso', 'xgb_boost_regression','Ridge regression','Elastic net']\n",
    "    col_names = ['Algorithm','Accuracy_After','RMSE_After','Intercept_After']\n",
    "\n",
    "    model_list = []\n",
    "    for i, model_i in enumerate(tuned_model_list):\n",
    "        # Predict once and reuse the result for both metrics.\n",
    "        predictions = model_i.predict(X_test)\n",
    "        model_list.append([\n",
    "            model_names[i],\n",
    "            str(round(r2_score(y_test, predictions) * 100)) + ' %',\n",
    "            mean_squared_error(y_test, predictions) ** 0.5,\n",
    "            model_i.best_estimator_.intercept_,\n",
    "        ])\n",
    "\n",
    "    tuned_model = pd.DataFrame(model_list, columns=col_names)\n",
    "    return tuned_model\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1efd8649",
   "metadata": {},
   "source": [
    "### Combining Base & Tuned Model Performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9340eaba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# This Method Will Combine The BASE Model & TUNED Model Results.\n",
    "def combine_base_tuned_model(data):\n",
    "    \"\"\"Join baseline and tuned scores into one table, best model first.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per algorithm holding the *_Before and *_After columns,\n",
    "        sorted by 'Accuracy_Before' then 'Accuracy_After', highest first.\n",
    "    \"\"\"\n",
    "    base_model = evaluate_base_model(data)\n",
    "    tuned_model = evaluate_tuned_model(data)\n",
    "\n",
    "    # 'Algorithm' is the only column the two tables share; name it explicitly.\n",
    "    merged = pd.merge(base_model, tuned_model, on='Algorithm')\n",
    "\n",
    "    # The accuracy columns hold strings such as '95%' / '95 %'. Sorting them\n",
    "    # as text ranks '100%' below '95%', so compare their numeric value instead.\n",
    "    base_tuned_result = merged.sort_values(\n",
    "        by=['Accuracy_Before', 'Accuracy_After'],\n",
    "        ascending=False,\n",
    "        key=lambda column: column.str.rstrip('% ').astype(float),\n",
    "    )\n",
    "\n",
    "    return base_tuned_result\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6d08a8de",
   "metadata": {},
   "source": [
    "### Flattening List Function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c58f8e82",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flatten Method To Flatten The List Into Individual Values\n",
    "def flatten(test_list):\n",
    "    \"\"\"Recursively flatten nested lists into a single flat list.\n",
    "\n",
    "    A value that is not a list is returned wrapped in a one-element list.\n",
    "    \"\"\"\n",
    "    if not isinstance(test_list, list):\n",
    "        return [test_list]\n",
    "    return [leaf for ele in test_list for leaf in flatten(ele)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ac9031b",
   "metadata": {},
   "source": [
    "### Generating Model Coefficients Values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5569200b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def model_coefficients(data):\n",
    "    \"\"\"Build the per-algorithm table of scores, intercepts and coefficients.\n",
    "\n",
    "    Side effect: leaves 'Global_intensity' appended to the notebook-level\n",
    "    ``training_columns`` list.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Columns: 'Algorithm', 'Intercept_After', one coefficient column per\n",
    "        training column, then 'Accuracy_Before', 'Accuracy_After',\n",
    "        'RMSE_Before', 'RMSE_After'.\n",
    "    \"\"\"\n",
    "    # Tuned searches first (coefficients come from these), then the score table.\n",
    "    tuned_searches = tuning_model(data)\n",
    "    base_tuned_result = combine_base_tuned_model(data)\n",
    "\n",
    "    algorithm_labels = ['linear_regression', 'lasso', 'xgb_boost_regression', 'Ridge regression', 'Elastic net']\n",
    "\n",
    "    # One row per algorithm: [name, coef_1, coef_2, ...]\n",
    "    coefficient_arrays = [search.best_estimator_.coef_ for search in tuned_searches]\n",
    "    coeffs = [\n",
    "        flatten([label, coef.tolist()])\n",
    "        for label, coef in zip(algorithm_labels, coefficient_arrays)\n",
    "    ]\n",
    "\n",
    "    # Dataset-specific switch kept from the original notebook: the tuned models\n",
    "    # were fitted with Global_intensity, so the column list must include it.\n",
    "    if 'Global_intensity' in all_columns and 'Global_intensity' not in training_columns:\n",
    "        training_columns.append('Global_intensity')\n",
    "\n",
    "    needed_columns = ['Algorithm', *training_columns]\n",
    "    coeffs_df = pd.DataFrame(coeffs, columns=needed_columns)\n",
    "\n",
    "    # Scores and intercepts joined with the coefficients.\n",
    "    model_coefficient = pd.merge(base_tuned_result, coeffs_df)\n",
    "\n",
    "    required_columns = (\n",
    "        ['Algorithm', 'Intercept_After']\n",
    "        + list(training_columns)\n",
    "        + ['Accuracy_Before', 'Accuracy_After', 'RMSE_Before', 'RMSE_After']\n",
    "    )\n",
    "\n",
    "    return model_coefficient[required_columns]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "544c2741",
   "metadata": {},
   "source": [
    "### Main Function Definition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f415c67d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def main():\n",
    "    \"\"\"Run the per-meter workflow and write the three result CSV files.\n",
    "\n",
    "    For every distinct value of ``iter_column`` in ``required_data``:\n",
    "\n",
    "    1. build the model comparison table with ``model_coefficients``;\n",
    "    2. write all comparison tables to ``all_model_performance_location``;\n",
    "    3. write the first (top-ranked) row per meter to ``coeff_file_location``;\n",
    "    4. rebuild predictions from that row's intercept and coefficients and\n",
    "       write the annotated rows of ``data`` to ``final_output_file_location``.\n",
    "    \"\"\"\n",
    "    pred_column_name = 'Pred_' + target_column\n",
    "\n",
    "    # Position at which the summary columns are inserted.\n",
    "    # NOTE(review): assumes ``data`` has exactly the columns in all_columns.\n",
    "    insert_loc = len(all_columns) - 1\n",
    "\n",
    "    meter_ids = required_data[iter_column].unique()\n",
    "\n",
    "    # Step 1: one comparison table per meter, tagged with the meter id.\n",
    "    meter_results = {}\n",
    "    for meter_id in meter_ids:\n",
    "        meter_rows = required_data[required_data[iter_column] == meter_id].reset_index(drop=True)\n",
    "        result = model_coefficients(meter_rows)\n",
    "        result.insert(loc=0, column='MeterDetails', value=meter_id)\n",
    "        meter_results[meter_id] = result\n",
    "\n",
    "    # Step 2: write every table in a single call so the CSV has one header row.\n",
    "    # Appending each table separately repeated the header in the middle of the\n",
    "    # file. The default write mode replaces any previous file.\n",
    "    all_results = pd.concat(list(meter_results.values()), ignore_index=True)\n",
    "    all_results.to_csv(all_model_performance_location, index=False)\n",
    "\n",
    "    # Step 3: the first row of each table is that meter's selected model.\n",
    "    model_coeff = pd.concat(\n",
    "        [meter_results[meter_id].iloc[:1] for meter_id in meter_ids],\n",
    "        ignore_index=True,\n",
    "    )\n",
    "    model_coeff.to_csv(coeff_file_location, index=False)\n",
    "\n",
    "    # Step 4: prediction = intercept + sum(coefficient * standardised feature).\n",
    "    summary_columns = ['Algorithm', 'Accuracy_Before', 'Accuracy_After', 'RMSE_Before', 'RMSE_After']\n",
    "    final_frames = []\n",
    "    for row_idx, meter_id in enumerate(meter_ids):\n",
    "        # Copy so new columns are written to an independent frame, not a slice.\n",
    "        meter_frame = data[data[iter_column] == meter_id].copy()\n",
    "\n",
    "        weighted_sum = 0\n",
    "        for column in training_columns:\n",
    "            coefficient = model_coeff.loc[row_idx, column]\n",
    "            if meter_frame[column].dtype == 'O':\n",
    "                # Object columns contribute the bare coefficient (value 1).\n",
    "                feature = 1\n",
    "            else:\n",
    "                # Same standardisation that train_test_splitting applies.\n",
    "                feature = preprocessing.scale(meter_frame[column])\n",
    "            weighted_sum = weighted_sum + (coefficient * feature)\n",
    "\n",
    "        meter_frame[pred_column_name] = model_coeff.loc[row_idx, 'Intercept_After'] + weighted_sum\n",
    "        meter_frame['Error_Difference'] = meter_frame[target_column] - meter_frame[pred_column_name]\n",
    "\n",
    "        for offset, name in enumerate(summary_columns):\n",
    "            meter_frame.insert(loc=insert_loc + offset, column=name, value=model_coeff.loc[row_idx, name])\n",
    "\n",
    "        final_frames.append(meter_frame)\n",
    "\n",
    "    # Stack all meters and write the final prediction table.\n",
    "    final_table = pd.concat(final_frames, ignore_index=True)\n",
    "    final_table.to_csv(final_output_file_location, index=False)\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff57190e",
   "metadata": {},
   "source": [
    "### Calling Main Function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79e8fdd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "main()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5742ec4",
   "metadata": {},
   "source": [
    "### All Model Performance Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1e518a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_model_perform = pd.read_csv(all_model_performance_location)\n",
    "all_model_perform"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d24099b8",
   "metadata": {},
   "source": [
    "### Model Coefficients Matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a63a8e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_coeff = pd.read_csv(coeff_file_location)\n",
    "model_coeff"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7395402",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_coeff[[\"MeterDetails\",\"Algorithm\",\"Accuracy_Before\",\"Accuracy_After\",\"RMSE_Before\",\"RMSE_After\"]]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "54661746",
   "metadata": {},
   "source": [
    "### Final Prediction Table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f378fa0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "final_output = pd.read_csv(final_output_file_location)\n",
    "final_output.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d137427",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d69cae07",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b47fa548",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f8c8a61",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03b29293",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
Praudyog

Linear Regression Pipeline

Leave a Reply Cancel reply