【发布时间】:2019-02-16 14:39:20
【问题描述】:
我正在开始 ML 之旅,但我在这个编码练习中遇到了麻烦 这是我的代码
import numpy as np
import pandas as pd
import scipy.optimize as op
# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, name['Test1', 'Test2', 'Accepted'])
# Separate the features to make it fit into the mapFeature function
X1 = data['Test1'].values.T
X2 = data['Test2'].values.T
# This function makes more features (degree)
def mapFeature(x1, x2):
degree = 6
out = np.ones((x1.shape[0], sum(range(degree + 2))))
curr_column = 1
for i in range(1, degree + 1):
for j in range(i+1):
out[:,curr_column] = np.power(x1, i-j) * np.power(x2, j)
curr_column += 1
return out
# Separate the data into training and target, also initialize theta
X = mapFeature(X1, X2)
y = np.matrix(data['Accepted'].values).T
m, n = X.shape
cols = X.shape[1]
theta = np.matrix(np.zeros(cols))
#Initialize the learningRate(sigma)
learningRate = 1
# Define the Sigmoid Function (Output between 0 and 1)
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def cost(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X @ theta)
first = np.multiply(-y, np.log(error))
second = np.multiply(1 - y, np.log(1 - error))
j = np.sum((first - second)) / m + (learningRate * np.sum(np.power(theta, 2)) / 2 * m)
return j
# Define the gradient of the cost function
def gradient(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X @ theta)
grad = (X.T @ (error - y)) / m + ((learningRate * theta) / m)
grad_no = (X.T @ (error - y)) / m
grad[0] = grad_no[0]
return grad
Result = op.minimize(fun=cost, x0=theta, args=(X, y, learningRate), method='TNC', jac=gradient)
opt_theta = np.matrix(Result.x)
def predict(theta, X):
sigValue = sigmoid(X @ theta.T)
p = sigValue >= 0.5
return p
p = predict(opt_theta, X)
print('Train Accuracy: {:f}'.format(np.mean(p == y) * 100))
所以,当learningRate = 1 时,准确度应该在83,05% 左右,但我得到80.5%,当learningRate = 0 时,准确度应该是91.52%,但我得到87.28%
所以问题是 我做错了什么?为什么我的准确率低于问题默认答案?
希望有人能指引我正确的方向。谢谢!
P.D:这是数据集,也许它可以提供帮助
【问题讨论】:
-
数据集中有哪些列?
-
数据集的原始形状是 (118, 3) 所以 3 列是 X1(feature1) , X2(feature2) 和 y(target) 。我的“X”变量形状是 (118, 28) ,我的目标变量 "y" 形状是 (118, 1),我的 "theta (weights)" 变量形状是 (1, 28)。希望对您有所帮助。
-
您提供的 txt 文件有 2 列。这是正确的数据集吗?
-
还有,有什么特点和目标?
-
实际上它有 3 个,最后一个由 1 和 0 组成,Feature 是前 2 列,即“Test1”列和“Test2”列,目标是最后一个“已接受”列(1 和 0)
标签: python machine-learning logistic-regression regularized