top of page

Decision Tree Regressor And Support Vector Regression With K-fold Cross Validation

realcode4you

Here we implement two data mining techniques on the Boston house pricing data; the basic steps used are listed below.


Data Mining Basics

1) Data pre-processing: Data categorical feature transformation, Data numerical feature normalization, Missing values imputation, and Cross-Validation.


Techniques

1) Decision Tree Regression

2) Support Vector Methodology

3) K-NN prediction modelling

4) K-means clustering

5) Naïve Bayesian


Import Libraries

#import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import mean_absolute_error, make_scorer
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import itertools 
import numpy as np
import matplotlib.pyplot as plt

Read CSV

# Load the Boston real-estate CSV file into a DataFrame and display it
csv_path = "Boston Real Est.csv"
df = pd.read_csv(csv_path)
df

Output:












Checking Missing Values

# Build a boolean mask of the missing cells and draw it as a heatmap
# so that null values can be spotted visually
check_null_value = df.isna()
sns.heatmap(
    check_null_value,
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

Output:












Show Dataset Columns:

# Show all column labels of the data frame
df.columns

Output:

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT', 'MEDV'], dtype='object')


In the heatmap above we can see that there are two missing values in RM. Now we need to replace them using the median.


Replace Missing Values with the Median

# Replace the missing RM values with the column median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df['RM'] selection: that chained in-place
# form is deprecated in recent pandas and does not update df when
# Copy-on-Write is enabled.
df['RM'] = df['RM'].fillna(df['RM'].median())

Checking Again Missing Values

# Re-check for NaN values after the median fill:
# rebuild the missing-cell mask and plot it as a heatmap again
check_null_value = df.isna()
sns.heatmap(
    check_null_value,
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

Output:












In the heatmap above we can see that there are no missing values; all of them have been replaced by the median.


# Check the shape of the dataset as (rows, columns)
df.shape

Output:

(511, 13)
# Print dataset information: column names, non-null counts and dtypes
df.info()

Output:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511 entries, 0 to 510
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     511 non-null    float64
 1   ZN       511 non-null    float64
 2   INDUS    511 non-null    float64
 3   CHAS     511 non-null    int64  
 4   NOX      511 non-null    float64
 5   RM       511 non-null    float64
 6   AGE      511 non-null    float64
 7   DIS      511 non-null    float64
 8   RAD      511 non-null    int64  
 9   TAX      511 non-null    int64  
 10  PTRATIO  511 non-null    float64
 11  LSTAT    511 non-null    float64
 12  MEDV     511 non-null    float64
dtypes: float64(10), int64(3)
memory usage: 52.0 KB


Summary Statistics

# Show summary statistics of the dataset
df.describe()

Output:


Feature Selection

# Separate the target column (MEDV) from the feature columns
y = df['MEDV']
X = df.drop(columns='MEDV')

Normalize Features

from sklearn import preprocessing

# Rescale every feature column with min-max scaling.
# NOTE(review): the scaler is fitted on all rows before the train/test
# split, so test-set statistics influence the scaling -- confirm whether
# this is intended.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(X)
X_scaled = min_max_scaler.transform(X)


Split Dataset

# Import the splitting and cross-validation helpers
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Shuffle the data and hold out 20% of it as the testing subset;
# random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    random_state=2,
    test_size=0.2,
)

# Success
print("Training and testing split was successful.")

K-fold Cross Validation

import numpy as np
from sklearn.model_selection import KFold, cross_val_score

# 10-fold splitter passed as `cv` to the cross-validation runs
kf = KFold(n_splits=10)


SVR(Support Vector Regression)

from sklearn.svm import SVR

# Support vector regressor with an RBF kernel, fitted on the training split
svr_rbf = SVR(C=1e3, gamma=0.1, kernel='rbf')
svr_rbf.fit(X_train, y_train)

# K-fold cross-validation on the training data. No `scoring` is passed, so
# each fold uses the estimator's default scorer (R^2 for scikit-learn
# regressors); the printed "Accuracy" is therefore an R^2 value.
scores = cross_val_score(estimator=svr_rbf, X=X_train, y=y_train, cv=kf)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

Output:

Accuracy: 0.77 (+/- 0.12)

Decision Tree Regressor

from sklearn.tree import DecisionTreeRegressor

# Depth-limited regression tree, fitted on the training split
desc_tr = DecisionTreeRegressor(max_depth=3)
desc_tr.fit(X_train, y_train)

# K-fold cross-validation on the training data. No `scoring` is passed, so
# each fold uses the estimator's default scorer (R^2 for scikit-learn
# regressors); the printed "Accuracy" is therefore an R^2 value.
scores = cross_val_score(estimator=desc_tr, X=X_train, y=y_train, cv=kf)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

Output:

Accuracy: 0.62 (+/- 0.12)

Draw the Graph

!pip install graphviz
!pip install pydotplus
import six
import sys
sys.modules['sklearn.externals.six'] = six
#draw tree
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
feature_cols = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']
dot_data = StringIO()
export_graphviz(desc_tr, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('boston.png')
Image(graph.create_png())

Output:



KNN

from sklearn.neighbors import KNeighborsRegressor

# 3-nearest-neighbours regressor, fitted on the training split
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X_train, y_train)

# K-fold cross-validation on the training data. No `scoring` is passed, so
# each fold uses the estimator's default scorer (R^2 for scikit-learn
# regressors); the printed "Accuracy" is therefore an R^2 value.
scores = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=kf)
print("KNN Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

Output:

KNN Accuracy: 0.64 (+/- 0.12)

Comments


bottom of page