# Loading packages
library(data.table) # used for reading and manipulation of data
library(dplyr) # used for data manipulation and joining
library(ggplot2) # used for plotting
library(caret) # used for modeling
library(e1071) # used for removing skewness
library(corrplot) # used for making correlation plot
library(xgboost) # used for building XGBoost model
library(cowplot) # used for combining multiple plots
# Importing datasets
# Both CSV files are read relative to the current working directory.
train <- fread("Train_UWu5bXk.csv")
test <- fread("Test_u94Q5KV.csv")

# Structure of dataset
str(train)

# Combining datasets
# The test table receives an all-NA Item_Outlet_Sales column and is then
# stacked underneath train (columns matched by name), so the preprocessing
# steps that follow are applied to both sets at once.
set(test, j = "Item_Outlet_Sales", value = NA)
combi <- rbindlist(list(train, test), use.names = TRUE)
# Missing Value Treatment
# Fill each missing Item_Weight with the mean weight of the rows that share
# the same Item_Identifier (NA values are ignored when averaging).
#
# The per-item means are computed once for all rows and the missing cells are
# then assigned in a single by-reference update. This yields the same values
# as filling row by row (replacing an NA by its group mean does not change
# that mean) without copying the whole table on every iteration.
# An item whose weights are all missing gets NaN, since there is nothing to
# average.
missing_index <- which(is.na(combi$Item_Weight))
item_mean_weight <- ave(
  combi$Item_Weight,
  combi$Item_Identifier,
  FUN = function(w) mean(w, na.rm = TRUE)
)
combi[missing_index, Item_Weight := item_mean_weight[missing_index]]
# Feature Engineering
# Feature Transformation
# Replacing 0 in Item_Visibility with the item's mean visibility.
#
# The mean is taken over the item's non-zero, non-missing visibilities only:
# the zeros are the very values being replaced, so letting them into the
# average would pull every replacement towards zero. All per-item means are
# computed up front, which also makes the result independent of row order.
# If an item has no non-zero visibility at all there is nothing to average,
# and its zeros are left as they are.
zero_index <- which(combi$Item_Visibility == 0)
item_mean_visibility <- ave(
  combi$Item_Visibility,
  combi$Item_Identifier,
  FUN = function(v) {
    observed <- v[!is.na(v) & v != 0]
    if (length(observed) > 0) mean(observed) else 0
  }
)
combi[zero_index, Item_Visibility := item_mean_visibility[zero_index]]
# Feature Construction
# Item_Type_new: coarse perishability class derived from Item_Type.
perishable <- c(
  "Breads", "Breakfast", "Dairy",
  "Fruits and Vegetables", "Meat", "Seafood"
)
non_perishable <- c(
  "Baking Goods", "Canned", "Frozen Foods",
  "Hard Drinks", "Health and Hygiene",
  "Household", "Soft Drinks"
)
# Start every row at the fallback label, then overwrite the rows whose
# Item_Type appears in one of the two lists. "perishable" is applied last so
# it takes precedence, as it is the first condition checked.
combi[, Item_Type_new := "not_sure"]
combi[Item_Type %in% non_perishable, Item_Type_new := "non_perishable"]
combi[Item_Type %in% perishable, Item_Type_new := "perishable"]

# Item_category: the first two characters of Item_Identifier.
combi[, Item_category := substr(Item_Identifier, 1, 2)]

# Rows in the "NC" category get their fat content relabelled "Non-Edible".
combi[Item_category == "NC", Item_Fat_Content := "Non-Edible"]
# Years of operation of Outlets
# Outlet age measured against the year 2013. This has to be derived while
# Outlet_Establishment_Year is still numeric, i.e. before the factor
# conversion on the next statement.
combi[, `:=`(Outlet_Years = 2013 - Outlet_Establishment_Year)]
combi[, Outlet_Establishment_Year := as.factor(Outlet_Establishment_Year)]

# Price per unit weight
combi[, `:=`(price_per_unit_wt = Item_MRP / Item_Weight)]
# Label Encoding
# Outlet_Size:          "Small"  -> 0, "Medium" -> 1, anything else -> 2
# Outlet_Location_Type: "Tier 3" -> 0, "Tier 2" -> 1, anything else -> 2
# NOTE(review): every value other than the two named ones is coded 2, which
# would include blank/unknown strings if the data contains any -- confirm
# that this is intended.
combi[, `:=`(
  Outlet_Size_num = fifelse(
    Outlet_Size == "Small", 0,
    fifelse(Outlet_Size == "Medium", 1, 2)
  ),
  Outlet_Location_Type_num = fifelse(
    Outlet_Location_Type == "Tier 3", 0,
    fifelse(Outlet_Location_Type == "Tier 2", 1, 2)
  )
)]

# The original text columns are dropped once their numeric codes exist.
combi[, c("Outlet_Size", "Outlet_Location_Type") := NULL]
# One-hot Encoding
# Item_Identifier, Outlet_Establishment_Year and Item_Type are kept out of
# the encoder; every other column is passed through dummyVars with
# fullRank = TRUE.
ohe_excluded <- c("Item_Identifier", "Outlet_Establishment_Year", "Item_Type")
ohe_input <- combi[, setdiff(names(combi), ohe_excluded), with = FALSE]

ohe <- dummyVars("~.", data = ohe_input, fullRank = TRUE)
ohe_df <- data.table(predict(ohe, ohe_input))

# Rebuild combi as the identifier column followed by the encoded columns.
combi <- cbind(combi[, "Item_Identifier"], ohe_df)
# Removing Skewness
# Skewness of the two columns before they are transformed.
skewness(combi[["Item_Visibility"]])
skewness(combi[["price_per_unit_wt"]])

# Replace both columns by log(x + 1).
for (skewed_col in c("Item_Visibility", "price_per_unit_wt")) {
  set(combi, j = skewed_col, value = log(combi[[skewed_col]] + 1))
}
# Scaling and Centering data
# Positions (named by column) of all numeric columns in combi.
num_vars <- which(vapply(combi, is.numeric, logical(1)))
num_vars_names <- names(num_vars)

# Every numeric column except the target is centered and scaled.
scaled_cols <- setdiff(num_vars_names, "Item_Outlet_Sales")
combi_numeric <- combi[, scaled_cols, with = FALSE]
prep_num <- preProcess(combi_numeric, method = c("center", "scale"))
combi_numeric_norm <- predict(prep_num, combi_numeric)

# Transforming Features
# Drop the unscaled columns and append their normalised versions.
combi[, (scaled_cols) := NULL]
combi <- cbind(combi, combi_numeric_norm)
# Splitting data
# combi holds the train rows first, followed by the test rows, so the two
# sets are recovered by position. The row count of train is captured before
# train is overwritten.
# seq_len() is used instead of `a:b` so that an empty train or test portion
# yields an empty table rather than a sequence that counts downwards.
n_train <- nrow(train)
train <- combi[seq_len(n_train)]
test <- combi[seq_len(nrow(combi) - n_train) + n_train]

# Removing Item_Outlet_Sales
# Dropped from test only; train keeps it as the target.
test[, Item_Outlet_Sales := NULL]
# Model Building - xgboost
# "reg:squarederror" is the current name of the squared-error regression
# objective; the older alias "reg:linear" is deprecated.
para_list <- list(
  objective = "reg:squarederror",
  eta = 0.01,
  gamma = 1,
  max_depth = 6,
  subsample = 0.8,
  colsample_bytree = 0.5
)

# D Matrix
# The same feature columns, in the same order, are used for both matrices:
# everything in train except the identifier and the target.
feature_cols <- setdiff(
  names(train),
  c("Item_Identifier", "Item_Outlet_Sales")
)
d_train <- xgb.DMatrix(
  data = as.matrix(train[, feature_cols, with = FALSE]),
  label = train$Item_Outlet_Sales
)
d_test <- xgb.DMatrix(data = as.matrix(test[, feature_cols, with = FALSE]))
# K-fold cross validation
# 5-fold CV for up to 1000 rounds, stopping once the evaluation metric has
# not improved for 30 consecutive rounds (lower is better: maximize = FALSE).
set.seed(123) # Setting seed
xgb_cv <- xgb.cv(
  params = para_list,
  data = d_train,
  nrounds = 1000,
  nfold = 5,
  print_every_n = 10,
  early_stopping_rounds = 30,
  maximize = FALSE
)

# Number of boosting rounds for the final model: the best iteration found by
# the cross validation above, instead of a number copied from an earlier run.
# NOTE(review): the field holding the best iteration depends on the installed
# xgboost version (`best_iteration` at the top level, or inside `early_stop`
# in newer releases) -- confirm for the version in use. If neither is present
# the previously hard-coded value of 428 is used.
best_nrounds <- xgb_cv$best_iteration
if (is.null(best_nrounds)) {
  best_nrounds <- xgb_cv$early_stop$best_iteration
}
if (is.null(best_nrounds)) {
  best_nrounds <- 428
}

# Training model
model_xgb <- xgb.train(
  data = d_train,
  params = para_list,
  nrounds = best_nrounds
)
model_xgb
# Variable Importance Plot
# Feature names are the train columns minus the identifier and the target,
# i.e. the columns the training matrix was built from.
importance_features <- setdiff(
  names(train),
  c("Item_Identifier", "Item_Outlet_Sales")
)
variable_imp <- xgb.importance(
  feature_names = importance_features,
  model = model_xgb
)
xgb.plot.importance(variable_imp)