Commit 8ca3e9e2 authored by Zdenek Svaton's avatar Zdenek Svaton

HW 5 initial commit

parent 9adef5d9
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import math
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
# --- Experiment 1: linear regression on raw columns, scaled vs. non-scaled ---
# Load the pre-processed data set; the first CSV column is the row index.
data = pd.read_csv('dataHW05.csv', index_col=0)
# Drop the first two remaining columns (not used as predictors here).
data = data.drop([data.columns[0], data.columns[1]], axis=1)
# Center every column on its mean.
data = data - data.mean()
# Hold out 25 % of the rows for evaluation; fixed seed for reproducibility.
dtrain, dtest = train_test_split(data, test_size=0.25, random_state=458)
X = dtrain.drop(['SalePrice'], axis=1, errors='ignore')
y = dtrain.SalePrice
Xtest = dtest.drop(['SalePrice'], axis=1, errors='ignore')
ytest = dtest.SalePrice
# Number of feature-count steps evaluated by the loop below.
q = len(data.columns) - 70
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# to_numpy() is the supported replacement and returns the same ndarray.
X = X.to_numpy()
Xtest1 = Xtest.to_numpy()
# NOTE(review): scale() here standardizes the *test* set with its own
# mean/std — test-set statistics leak into the evaluation. Scaling with
# training statistics would be the statistically clean choice; kept as-is
# to preserve the experiment's behavior.
Xtest2 = scale(Xtest.to_numpy())
# Scaled variant of the training matrix. The original code fitted the
# "scaled" model on the *unscaled* X but predicted on the scaled Xtest2,
# so model and inputs lived in different spaces and the green curve was
# meaningless — the model must be fitted on data scaled the same way.
X_scaled = scale(X)
for n in range(1, q):
    # Non-scaled: fit and evaluate on the first n raw columns.
    clf_raw = LinearRegression()
    clf_raw.fit(X[:, :n], y)
    # Root mean squared error on the held-out set (columns already
    # log-transformed upstream, hence "RMSLE").
    RMSLE.append(np.sqrt(mean_squared_error(clf_raw.predict(Xtest1[:, :n]), ytest)))
    # Scaled: fit on scaled training columns, evaluate on scaled test columns.
    clf_scaled = LinearRegression()
    clf_scaled.fit(X_scaled[:, :n], y)
    RMSLE_scale.append(np.sqrt(mean_squared_error(clf_scaled.predict(Xtest2[:, :n]), ytest)))
# Plot both RMSLE curves against the number of leading columns used.
plt.subplots(1, 1, figsize=(15, 8))
ns = plt.scatter(range(1, q), RMSLE, c='red')
s = plt.scatter(range(1, q), RMSLE_scale, c='green')
plt.title(u"Comparison without PCA")  # fixed typo: was "Comparision"
plt.xlabel(u'number of principal component used')
plt.ylabel('RMSLE')
# Horizontal reference line at RMSLE = 0.5.
plt.plot([0, q], [0.5, 0.5], 'b-')
plt.legend((ns, s), ('non-scaled', 'scaled'))
plt.show()
# --- Column correlation overview ---
# Reload the untouched data (the first experiment centered and dropped columns).
data = pd.read_csv('dataHW05.csv', index_col=0)
how_much = len(data.columns)
# Spearman rank correlation of every column against every other.
corr = data.corr(method='spearman')
# Columns ordered by |correlation| with SalePrice; index 0 is SalePrice's
# self-correlation (always 1), which is skipped when plotting below.
corr_cols = corr.SalePrice.abs().nlargest(how_much)
plt.subplots(1, 1, figsize=(15, 8))
ns = plt.scatter(range(1, how_much), corr_cols[1:], c='red')
plt.title(u"Column correlation indexes")
# Horizontal reference line at correlation = 1.
plt.plot([0, how_much], [1, 1], 'b-')
plt.savefig('correlation_indexes.jpeg')  # fixed typo: was 'corealtion_indexes.jpeg'
plt.show()
# Reorder columns by (default Pearson) correlation with SalePrice, descending,
# so "first n columns" in the PCA experiment means "n most correlated columns".
ix = data.corr().sort_values('SalePrice', ascending=False).index
data = data.loc[:, ix]
# --- Experiment 2: PCA-projected features, scaled vs. non-scaled ---
# Split data into train and test datasets (same seed as experiment 1).
dtrain, dtest = train_test_split(data, test_size=0.25, random_state=458)
X = dtrain.drop(['SalePrice'], axis=1, errors='ignore')
y = dtrain.SalePrice
Xtest = dtest.drop(['SalePrice'], axis=1, errors='ignore')
ytest = dtest.SalePrice
# Evaluate up to half of the available components.
q = int(len(data.columns) / 2)
# Non-scaled PCA: fit on training features only, then project both sets.
# (The original called fit_transform() and discarded the result, then
# transformed again — fit once and reuse the projection.)
pca = PCA()
X1 = pca.fit_transform(X)
Xtest1 = pca.transform(Xtest)
# Scaled PCA: standardize with *training* statistics, then apply the same
# transform to the test set. The original scaled the test set with its own
# mean/std, which leaks test information and mismatches the training space.
train_mean = X.mean(axis=0)
train_std = X.std(axis=0).replace(0, 1)  # guard against constant columns
X_std = (X - train_mean) / train_std
Xtest_std = (Xtest - train_mean) / train_std
pca_scaled = PCA()
X2 = pca_scaled.fit_transform(X_std)
Xtest2 = pca_scaled.transform(Xtest_std)
RMSLE = []
RMSLE_scale = []
# For each n, fit a linear model on the leading n principal components and
# record the test RMSLE — once for the raw projection, once for the scaled one.
for n_comp in range(1, q):
    model_raw = LinearRegression()
    model_raw.fit(X1[:, :n_comp], y)
    pred_raw = model_raw.predict(Xtest1[:, :n_comp])
    RMSLE.append(np.sqrt(mean_squared_error(pred_raw, ytest)))

    model_std = LinearRegression()
    model_std.fit(X2[:, :n_comp], y)
    pred_std = model_std.predict(Xtest2[:, :n_comp])
    RMSLE_scale.append(np.sqrt(mean_squared_error(pred_std, ytest)))
# Plot RMSLE vs. number of principal components for both variants.
plt.subplots(1, 1, figsize=(15, 8))
ns = plt.scatter(range(1, q), RMSLE, c='red')
s = plt.scatter(range(1, q), RMSLE_scale, c='green')
plt.title(u"RMSLE as a function of number of principal components used")
plt.xlabel(u'number of principal component used')
plt.ylabel('RMSLE')
# Horizontal reference line at RMSLE = 1.
plt.plot([0, q], [1, 1], 'b-')
plt.legend((ns, s), ('non-scaled', 'scaled'))
# Save BEFORE show(): plt.show() closes the figure, so calling savefig()
# afterwards (as the original did) writes an empty image.
plt.savefig('scaled_vs_nonscaled.jpeg')
plt.show()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment