### Configuration Variables.

In [1]:
# Notebook configuration: set every value below before running the remaining cells.

# Names of all columns present in the input dataset.
all_columns = [
    "Meter",
    "dt",
    "Global_reactive_power",
    "Voltage",
    "Global_intensity",
    "Sub_metering_1",
    "Sub_metering_2",
    "Sub_metering_3",
    "Power_Consumption",
]

# Columns removed from the dataset right after it is loaded.
dropped_columns = ["dt"]

# Feature columns the models are trained on.
training_columns = [
    "Global_reactive_power",
    "Voltage",
    "Sub_metering_1",
    "Sub_metering_2",
    "Sub_metering_3",
]

# Column the models predict.
target_column = "Power_Consumption"

# Column whose distinct values split the data into separately modelled groups.
iter_column = "Meter"

# Input CSV.
file_location = "C:/Subrat Documents/Honeywell Project Work/1- Multiple Model Performance/datasets/Power_Consumption_Meters.csv"

# Output CSV: performance of every model for every group.
all_model_performance_location = "C:/Subrat Documents/Honeywell Project Work/1- Multiple Model Performance/datasets/Model_Performance.csv"

# Output CSV: coefficients of the top-ranked model per group.
coeff_file_location = "C:/Subrat Documents/Honeywell Project Work/1- Multiple Model Performance/datasets/Model_Coefficients.csv"

# Output CSV: input rows enriched with predictions and model metadata.
final_output_file_location = "C:/Subrat Documents/Honeywell Project Work/1- Multiple Model Performance/datasets/Final_Output.csv"

### Required Libraries

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split # Hyperparameter tuning
from category_encoders import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso,Ridge,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'xgboost'

### Input Data Reading

In [None]:
# Read the input CSV found at `file_location` into the `data` DataFrame.
data = pd.read_csv(file_location)

### Dropping The Not Required Columns

In [None]:
# New frame without the configured `dropped_columns`; `data` itself is left untouched.
required_data = data.drop(columns=dropped_columns)

### Auto Encodes Any Dataframe Column Of Type Category Or Object.

In [None]:
def dummyEncode(data):
    """Label-encode every ``category``/``object`` column of ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``category``/``object`` columns are replaced by the
        integer codes produced by ``LabelEncoder``. The frame is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.

    Notes
    -----
    A column that cannot be encoded is left unchanged and the failure is
    reported on stdout (best effort, matching the previous behavior). Only
    ``Exception`` subclasses are caught, so ``KeyboardInterrupt`` and
    ``SystemExit`` propagate.
    """
    columnsToEncode = list(data.select_dtypes(include=['category', 'object']))
    for feature in columnsToEncode:
        try:
            # The encoder is created per column, so nothing is instantiated
            # when the frame has no columns to encode.
            data[feature] = LabelEncoder().fit_transform(data[feature])
        except Exception as exc:
            # f-string instead of '+' so non-string column labels cannot
            # raise a second error while reporting the first one.
            print(f'Error encoding {feature}: {exc}')
    return data

### Training & Testing The Model & Predicting Performance

In [None]:
def train_test_splitting(data):
    """Encode, standardise and split ``data`` into train and test arrays.

    The feature matrix is built from the module-level ``training_columns``
    and the target vector from ``target_column``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least ``training_columns`` and ``target_column``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays, using a
        70/30 split with ``random_state=48``.
    """
    # Explicit copy: the encoder and the target assignment below write into
    # this frame, which must never be a view of the caller's data.
    encoded_data = dummyEncode(data[training_columns].copy())
    # Carry the target alongside the encoded features.
    encoded_data[target_column] = data[target_column]
    # Keyword form: the positional `axis` argument of DataFrame.drop was
    # removed in pandas 2.0.
    X = np.array(encoded_data.drop(columns=[target_column]))
    y = np.array(encoded_data[target_column])
    # Standardise each feature to zero mean and unit variance. This is done
    # over all rows, before the split.
    X = preprocessing.scale(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=48
    )
    return (X_train, X_test, y_train, y_test)
 

### Creating Base Model Pipeline.

In [None]:
def model_pipeline(data):
    """Fit the five untuned ("base") regressors on the training split of ``data``.

    Side effect: removes ``"Global_intensity"`` from the module-level
    ``training_columns`` list when it is present there and in ``all_columns``.

    Returns
    -------
    tuple
        Fitted single-step pipelines, in this order: linear regression,
        lasso, XGBoost (``gblinear`` booster), ridge, elastic net.
    """
    # Dataset-specific hack: the base models are trained without
    # "Global_intensity".
    if "Global_intensity" in all_columns and "Global_intensity" in training_columns:
        training_columns.remove("Global_intensity")

    # Only the training half of the split is needed here.
    X_train, _, y_train, _ = train_test_splitting(data)

    base_estimators = (
        LinearRegression(),
        Lasso(alpha=1),
        XGBRegressor(booster='gblinear'),
        Ridge(),
        ElasticNet(),
    )

    fitted_pipelines = []
    for estimator in base_estimators:
        pipeline = make_pipeline(estimator)
        pipeline.fit(X_train, y_train)
        fitted_pipelines.append(pipeline)

    return tuple(fitted_pipelines)
 

### Evaluating Base Model Performance

In [None]:
def base_model_evaluation(data, trained_model_list, model_names):
    """Score already-fitted base pipelines on the test split of ``data``.

    Side effect: removes ``"Global_intensity"`` from the module-level
    ``training_columns`` list when it is present there and in ``all_columns``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame the models were trained on; it is split again here to obtain
        the test portion.
    trained_model_list : sequence
        Fitted pipelines whose last step exposes ``intercept_``.
    model_names : sequence of str
        Display name for each entry of ``trained_model_list`` (same order).

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``Algorithm``, ``Accuracy_Before``
        (R2 x 100, rounded, as a ``'NN%'`` string), ``RMSE_Before`` and
        ``Intercept_Before``.
    """
    # Dataset-specific hack: the base models are evaluated without
    # "Global_intensity".
    if "Global_intensity" in all_columns and "Global_intensity" in training_columns:
        training_columns.remove("Global_intensity")

    _, X_test, _, y_test = train_test_splitting(data)

    col_names = ['Algorithm', 'Accuracy_Before', 'RMSE_Before', 'Intercept_Before']
    model_list = []

    for i, model_i in enumerate(trained_model_list):
        # Predict once and reuse the result for both metrics.
        predictions = model_i.predict(X_test)

        # Public pipeline indexing instead of the private `_final_estimator`.
        final_estimator = model_i[-1]
        raw_intercept = final_estimator.intercept_
        flat_intercept = np.ravel(raw_intercept)
        # Some estimators report a one-element array; store it as a plain
        # number so the column stays numeric and serialises cleanly.
        intercept = flat_intercept.item() if flat_intercept.size == 1 else raw_intercept

        model_list.append([
            model_names[i],
            str(round(r2_score(y_test, predictions) * 100)) + '%',
            mean_squared_error(y_test, predictions) ** 0.5,
            intercept,
        ])

    return pd.DataFrame(model_list, columns=col_names)
 
 

### Calling Base Model Evaluation Method

In [None]:
# Declare A Method That Will Call The "base_model_evaluation" Method And Get The Scores
def evaluate_base_model(data):
    """Fit the base models on ``data`` and return their performance table.

    Thin wrapper: the pipelines come from ``model_pipeline`` and the scoring
    from ``base_model_evaluation``.
    """
    # Display names, in the same order model_pipeline returns its pipelines.
    algorithm_labels = ['linear_regression','lasso', 'xgb_boost_regression','Ridge regression','Elastic net']

    linear, lasso, xgb, ridge, elastic = model_pipeline(data)
    fitted_pipelines = [linear, lasso, xgb, ridge, elastic]

    return base_model_evaluation(data, fitted_pipelines, algorithm_labels)

### Tuning Our Base Model

In [None]:
# Declare A Method That Will Tune The Base Model
def _grid_search_and_report(estimator, param_grid, X_train, y_train, n_jobs=None):
    """Run a 5-fold grid search, print the best params and training R2, return the search."""
    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        n_jobs=n_jobs,
        cv=5,
        verbose=0,
    )
    search.fit(X_train, np.ravel(y_train))
    print('best params:- ', search.best_params_)
    # Note: this score is computed on the training split.
    print('R2 Score:', str(round(r2_score(y_train, search.predict(X_train)) * 100)) + '%')
    return search


def tuning_model(data):
    """Grid-search hyperparameters for the five regressors on ``data``.

    Side effects: ``"Global_intensity"`` is appended to the module-level
    ``training_columns`` before the split (when listed in ``all_columns``),
    and the later call to ``model_pipeline`` removes it again. The best
    parameters and training R2 of every search are printed.

    Returns
    -------
    tuple
        Fitted ``GridSearchCV`` objects in this order: linear regression,
        lasso, XGBoost, ridge, elastic net.
    """
    # Dataset-specific hack: the tuned models are trained with
    # "Global_intensity" included.
    if "Global_intensity" in all_columns and "Global_intensity" not in training_columns:
        training_columns.append("Global_intensity")

    # The split must happen before model_pipeline() is called, because that
    # call removes "Global_intensity" from training_columns again.
    X_train, _, y_train, _ = train_test_splitting(data)

    # Only the estimator configuration of the base pipelines is reused;
    # GridSearchCV clones each estimator before fitting it.
    model_lr, model_ls, model_xgb, model_R, model_Er = model_pipeline(data)

    # (1) Linear regression. `n_jobs` was dropped from the grid: it is a
    # parallelism setting, not a hyperparameter, so searching it only
    # multiplied the number of identical fits.
    model_lr_t = _grid_search_and_report(
        model_lr[0], {'positive': [True, False]}, X_train, y_train, n_jobs=-1
    )

    # (2) Lasso.
    model_ls_t = _grid_search_and_report(
        model_ls[0],
        {'alpha': [0.1, 0.3, 0.5, 0.7, 0.9, 1, 1.5, 2, 2.5]},
        X_train, y_train, n_jobs=-1,
    )

    # (3) XGBoost. The duplicated candidate `1` was removed from the grid.
    model_xgb_t = _grid_search_and_report(
        model_xgb[0],
        {'learning_rate': [0.1, 0.3, 0.5, 0.7, 0.9, 1]},
        X_train, y_train, n_jobs=-1,
    )

    # (4) Ridge.
    model_R_t = _grid_search_and_report(
        model_R[0], {'alpha': np.logspace(-3, 3, 10)}, X_train, y_train
    )

    # (5) Elastic net.
    model_E_t = _grid_search_and_report(
        model_Er[0], {'alpha': np.logspace(-0.5, 2, 3)}, X_train, y_train
    )

    return (model_lr_t, model_ls_t, model_xgb_t, model_R_t, model_E_t)

### Evaluating Tuned Model

In [None]:
# Define A Method To Evaluate The Tuned Model
def evaluate_tuned_model(data):
    """Tune the five regressors on ``data`` and score them on the test split.

    Side effect: ``"Global_intensity"`` is appended to the module-level
    ``training_columns`` before the split when it is listed in
    ``all_columns`` and not already present.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``Algorithm``, ``Accuracy_After``
        (R2 x 100, rounded, as a ``'NN %'`` string), ``RMSE_After`` and
        ``Intercept_After``.
    """
    # Dataset-specific hack: the tuned models use "Global_intensity".
    if "Global_intensity" in all_columns and "Global_intensity" not in training_columns:
        training_columns.append("Global_intensity")

    # Split before tuning_model() runs, while training_columns still holds
    # the feature set the tuned models are fitted on.
    _, X_test, _, y_test = train_test_splitting(data)

    tuned_model_list = list(tuning_model(data))

    model_names = ['linear_regression','lasso', 'xgb_boost_regression','Ridge regression','Elastic net']
    col_names = ['Algorithm', 'Accuracy_After', 'RMSE_After', 'Intercept_After']

    model_list = []
    for i, model_i in enumerate(tuned_model_list):
        # Predict once and reuse the result for both metrics.
        predictions = model_i.predict(X_test)

        raw_intercept = model_i.best_estimator_.intercept_
        flat_intercept = np.ravel(raw_intercept)
        # Some estimators report a one-element array; store it as a plain
        # number so the column stays numeric and serialises cleanly.
        intercept = flat_intercept.item() if flat_intercept.size == 1 else raw_intercept

        model_list.append([
            model_names[i],
            # The ' %' suffix (with a space) is kept as-is for output compatibility.
            str(round(r2_score(y_test, predictions) * 100)) + ' %',
            mean_squared_error(y_test, predictions) ** 0.5,
            intercept,
        ])

    return pd.DataFrame(model_list, columns=col_names)


### Combining Base & Tuned Model Performance

In [None]:
# This Method Will Combine The BASE Model & TUNED Model Results.
def _accuracy_as_number(column):
    """Turn percentage strings such as ``'85%'`` or ``'85 %'`` into floats for sorting."""
    return column.astype(str).str.rstrip('% ').astype(float)


def combine_base_tuned_model(data):
    """Join base and tuned performance tables and rank the algorithms.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm holding the ``*_Before`` and ``*_After``
        columns, sorted by ``Accuracy_Before`` then ``Accuracy_After``,
        best first.
    """
    base_model = evaluate_base_model(data)
    tuned_model = evaluate_tuned_model(data)

    # 'Algorithm' is the only column the two tables share; naming it keeps
    # the join key explicit.
    merged = pd.merge(base_model, tuned_model, on='Algorithm')

    # The accuracy columns hold strings, so a plain sort is lexicographic
    # ('9%' would rank above '85%'). Sort on the numeric value instead; the
    # stored strings are left unchanged.
    base_tuned_result = merged.sort_values(
        by=['Accuracy_Before', 'Accuracy_After'],
        ascending=False,
        key=_accuracy_as_number,
    )

    return base_tuned_result
 

### Flattening List Function

In [None]:
# Flatten Method To Flatten A Nested List Into Individual Values
def flatten(test_list):
    """Return the leaf values of an arbitrarily nested ``list`` as one flat list.

    Only ``list`` instances are descended into. Any other value, including
    tuples, is treated as a leaf and returned wrapped in a one-element list.
    """
    if not isinstance(test_list, list):
        return [test_list]

    flat = []
    for item in test_list:
        flat += flatten(item)
    return flat

### Generating Model Coefficients Values

In [None]:
def model_coefficients(data):
    """Build one table of scores, intercept and coefficients per algorithm.

    The coefficients come from a ``tuning_model`` run made here; the scores
    come from ``combine_base_tuned_model``, which tunes again internally.

    Side effect: ``"Global_intensity"`` is appended to the module-level
    ``training_columns`` when it is listed in ``all_columns`` and missing.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``Algorithm``, ``Intercept_After``, one column
        per entry of ``training_columns``, ``Accuracy_Before``,
        ``Accuracy_After``, ``RMSE_Before``, ``RMSE_After``.
    """
    # Keep this order: both calls mutate training_columns as a side effect.
    tuned_searches = tuning_model(data)
    base_tuned_result = combine_base_tuned_model(data)

    algorithm_labels = (
        'linear_regression',
        'lasso',
        'xgb_boost_regression',
        'Ridge regression',
        'Elastic net',
    )

    # One flat row per algorithm: [name, coef_1, coef_2, ...].
    coeffs = []
    for label, search in zip(algorithm_labels, tuned_searches):
        coefficient_values = search.best_estimator_.coef_.tolist()
        coeffs.append(flatten([label, coefficient_values]))

    # Dataset-specific hack: the tuned models were fitted with
    # "Global_intensity", so it needs a coefficient column as well.
    if "Global_intensity" in all_columns and "Global_intensity" not in training_columns:
        training_columns.append("Global_intensity")

    # Coefficient table: the algorithm name plus one column per feature.
    coeffs_df = pd.DataFrame(coeffs, columns=['Algorithm'] + list(training_columns))

    # Attach the coefficients to the score table.
    model_coefficient = pd.merge(base_tuned_result, coeffs_df)

    output_columns = (
        ['Algorithm', 'Intercept_After']
        + list(training_columns)
        + ['Accuracy_Before', 'Accuracy_After', 'RMSE_Before', 'RMSE_After']
    )

    return model_coefficient[output_columns]


### Main Function Definition

In [None]:
def main():
    """Model every group of ``iter_column`` separately and write the three output CSVs.

    Reads the module-level ``data`` and ``required_data`` frames and writes:

    * ``all_model_performance_location`` - all algorithms for every group;
    * ``coeff_file_location`` - the top-ranked algorithm of every group;
    * ``final_output_file_location`` - the input rows of every group with
      the prediction, the error and the chosen model's metadata added.

    Raises
    ------
    ValueError
        If ``required_data`` contains no group to model.
    """
    # Metadata columns are inserted just before the last configured column.
    insert_loc = len(all_columns) - 1
    pred_column_name = 'Pred_' + target_column

    meter_ids = list(required_data[iter_column].unique())
    if not meter_ids:
        raise ValueError("No values found in column %r; nothing to model." % iter_column)

    # 1) Fit and evaluate all models, one group at a time.
    meter_dict = {}
    for meter_id in meter_ids:
        meter_rows = required_data[required_data[iter_column] == meter_id].reset_index(drop=True)
        performance = model_coefficients(meter_rows)
        performance.insert(loc=0, column="MeterDetails", value=meter_id)
        meter_dict[meter_id] = performance

    # 2) Write the performance table in one go. Appending group by group
    #    repeated the header line inside the file for every group.
    all_performance = pd.concat(list(meter_dict.values()), ignore_index=True)
    all_performance.to_csv(all_model_performance_location, index=False)

    # 3) Keep the first (top-ranked) row of every group.
    model_coeff = pd.concat(
        [meter_dict[meter_id].iloc[:1] for meter_id in meter_ids],
        ignore_index=True,
    )
    model_coeff.to_csv(coeff_file_location, index=False)

    # 4) Re-create the chosen model's prediction for every row.
    final_frames = []
    for row_idx, meter_id in enumerate(meter_ids):
        # Explicit copy: new columns are added below, which must not write
        # into a filtered slice of `data`.
        meter_frame = data[data[iter_column] == meter_id].copy()

        # Apply the same transform as in training: label-encode non-numeric
        # features, then standardise each feature over the group's rows.
        # (Previously non-numeric features were replaced by the constant 1,
        # which does not match what the model was fitted on.)
        features = dummyEncode(meter_frame[training_columns].copy())

        sum_coeff = 0
        for feature_name in training_columns:
            coefficient = model_coeff.loc[row_idx, feature_name]
            sum_coeff = sum_coeff + coefficient * preprocessing.scale(features[feature_name])

        meter_frame[pred_column_name] = model_coeff.loc[row_idx, 'Intercept_After'] + sum_coeff
        meter_frame['Error_Difference'] = meter_frame[target_column] - meter_frame[pred_column_name]

        metadata_columns = ('Algorithm', 'Accuracy_Before', 'Accuracy_After', 'RMSE_Before', 'RMSE_After')
        for offset, column_name in enumerate(metadata_columns):
            meter_frame.insert(
                loc=insert_loc + offset,
                column=column_name,
                value=model_coeff.loc[row_idx, column_name],
            )

        final_frames.append(meter_frame)

    # 5) Stack all groups and write the final table.
    final_table = pd.concat(final_frames, ignore_index=True)
    final_table.to_csv(final_output_file_location, index=False)
 

### Calling Main Function

In [None]:
# Run the full workflow; this writes the three output CSV files configured at the top.
main()

### All Model Performance Metrics

In [None]:
# Load the per-meter performance table written by main() and display it.
all_model_perform = pd.read_csv(all_model_performance_location)
all_model_perform

### Model Coefficients Matrix

In [None]:
# Load the coefficient table written by main() and display it.
model_coeff = pd.read_csv(coeff_file_location)
model_coeff

In [None]:
# Show only the meter, algorithm and score columns of the coefficient table.
model_coeff[["MeterDetails","Algorithm","Accuracy_Before","Accuracy_After","RMSE_Before","RMSE_After"]]

### Final Prediction Table

In [None]:
# Load the final prediction table written by main() and preview its first rows.
final_output = pd.read_csv(final_output_file_location)
final_output.head()