sklearn
sklearn_01
YeeKal
•
•
"#sklearn"
intro to sklearn
datasets
sklearn.datasets module includes utilities to load datasets, including methods to load and fetch popular reference datasets. It also features some artificial data generators.
#load iris dataset
from sklearn.datasets import load_iris
data = load_iris()
data_X = data.data
data_y = data.target
# load Boston house price
from sklearn import datasets
loaded_data = datasets.load_boston()
#导入数据
data_X = loaded_data.data
data_y = loaded_data.target
通用学习模式
# 通用学习模式
model = KNeighborsClassifier()
model.fit(X_train, y_train)
model.score(x_test,y_test) #Return the mean accuracy on the given test data and labels.
print(model.predict(X_test))
print(y_test)
# model parameters
model.coef_
model.intercept_
model.get_params()
model.score(data_X, data_y) #coefficient of determination
# 预测结果
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(predictions)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
# 交叉验证
result=model_selection.cross_val_score(knn, X_train, Y_train, cv=5)
print(result.mean())
#保存模型
#保存Model(注:save文件夹要预先建立,否则会报错)
with open('save/clf.pickle', 'wb') as f:
pickle.dump(clf, f)
#读取Model
with open('save/clf.pickle', 'rb') as f:
clf2 = pickle.load(f)
#测试读取后的Model
print(clf2.predict(X[0:1]))
#使用jopblib保存模型
from sklearn.externals import joblib #jbolib模块
#保存Model(注:save文件夹要预先建立,否则会报错)
joblib.dump(clf, 'save/clf.pkl')
#读取Model
clf3 = joblib.load('save/clf.pkl')
#测试读取后的Model
print(clf3.predict(X[0:1]))
文本类别编码
简单数字映射
from sklearn.preprocessing import LabelEncoder
sex=df_train['Sex']
encoder=LabelEncoder()
sex_encoded=encoder.fit_transform(sex)
print(sex_encoded)
print(sex)
print(encoder.classes_)# 查看编码映射
独热编码
from sklearn.preprocessing import OneHotEncoder
encoder2=OneHotEncoder()
sex_hot=encoder2.fit_transform(sex_encoded.reshape(-1,1))
print(sex_hot)
'''
<891x2 sparse matrix of type '<class 'numpy.float64'>'
with 891 stored elements in Compressed Sparse Row format>
# 这是SciPy中的稀疏矩阵,每一行仅有一个是1,使用sex_hot.toarray()转换为Numpy数组
'''
合并上述两个编码
from sklearn.preprocessing import LabelBinarizer
sex=df_train['Sex']
encoder=LabelBinarizer()
sex_encoded=encoder.fit_transform(sex)
#通过sparse_output=True给LabelBinarizer构造函数,可以得到稀疏矩阵