📚 引言
在上一篇机器学习入门文章中,我们学习了机器学习的基础概念和完整流程。今天,我们将深入探讨10个核心算法的原理、数学基础和实践技巧,帮助你真正理解算法背后的工作机制,并能够在实际项目中灵活应用。
1. 线性回归:从最小二乘法到梯度下降
1.1 数学原理
线性回归的目标是找到最佳拟合直线:y = wx + b,其中w是权重,b是偏置。
# Loss function: mean squared error (MSE)
# J(w,b) = (1/2n) * Σ(y_i - (wx_i + b))²
# Gradient-descent update rules
# w = w - α * ∂J/∂w
# b = b - α * ∂J/∂b
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Generate synthetic data: y ≈ 2.5x plus Gaussian noise (seeded for reproducibility)
np.random.seed(42)
X = np.random.randn(100, 1) * 10
y = 2.5 * X.squeeze() + np.random.randn(100) * 5
# Fit ordinary least squares
model = LinearRegression()
model.fit(X, y)
# Predict on the training data itself (demo only — no held-out set here)
y_pred = model.predict(X)
# Report the learned parameters and goodness of fit
print(f"系数: {model.coef_[0]:.2f}")
print(f"截距: {model.intercept_:.2f}")
print(f"R²分数: {r2_score(y, y_pred):.4f}")
print(f"均方误差: {mean_squared_error(y, y_pred):.2f}")
1.2 从零实现线性回归
class LinearRegressionScratch:
    """Linear regression trained with batch gradient descent, from scratch.

    Minimizes the mean-squared-error loss J(w, b) = (1/2n) * Σ(y_i - (w·x_i + b))²
    by repeatedly stepping both parameters along the negative gradient.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        # Step size applied to each gradient update.
        self.lr = learning_rate
        # Number of full passes of gradient descent.
        self.n_iterations = n_iterations
        # Parameters are created lazily inside fit().
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Learn weights and bias for (X, y) via batch gradient descent.

        X is a 2-D array (n_samples, n_features); y is 1-D of length n_samples.
        """
        n_samples, n_features = X.shape
        # Start from the zero model.
        self.weights = np.zeros(n_features)
        self.bias = 0
        for _ in range(self.n_iterations):
            predictions = np.dot(X, self.weights) + self.bias
            residuals = predictions - y
            # Average gradients of the MSE loss w.r.t. each parameter.
            grad_w = np.dot(X.T, residuals) / n_samples
            grad_b = np.sum(residuals) / n_samples
            self.weights -= self.lr * grad_w
            self.bias -= self.lr * grad_b

    def predict(self, X):
        """Return the fitted linear model's predictions for X."""
        return np.dot(X, self.weights) + self.bias
# Exercise the from-scratch implementation on the same synthetic data
model_scratch = LinearRegressionScratch(learning_rate=0.01, n_iterations=1000)
model_scratch.fit(X, y)
y_pred_scratch = model_scratch.predict(X)
print(f"自定义实现 R²: {r2_score(y, y_pred_scratch):.4f}")
2. 逻辑回归:分类问题的基石
2.1 Sigmoid函数与决策边界
逻辑回归使用Sigmoid函数将输出映射到(0,1)区间,表示概率。
def sigmoid(z):
    """Map any real input to (0, 1) via the logistic function 1 / (1 + e^(-z))."""
    neg_exp = np.exp(-z)
    return 1 / (1 + neg_exp)
# Visualize the sigmoid curve and its 0.5 decision threshold
import matplotlib.pyplot as plt
z = np.linspace(-10, 10, 100)
s = sigmoid(z)
plt.figure(figsize=(10, 6))
plt.plot(z, s, linewidth=2, color='#2980b9')
plt.axhline(y=0.5, color='red', linestyle='--', label='决策边界 (0.5)')
plt.axvline(x=0, color='gray', linestyle='--')
plt.xlabel('z')
plt.ylabel('sigmoid(z)')
plt.title('Sigmoid函数', fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# Logistic regression example
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Generate a synthetic binary classification dataset
X, y = make_classification(n_samples=1000, n_features=4, n_classes=2, random_state=42)
# Hold out 20% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the classifier
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
# Predicted probability of the positive class, plus hard labels
y_proba = lr_model.predict_proba(X_test)[:, 1]
y_pred = lr_model.predict(X_test)
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
print(f"准确率: {accuracy_score(y_test, y_pred):.4f}")
print(f"AUC分数: {roc_auc_score(y_test, y_proba):.4f}")
print(f"混淆矩阵:\n{confusion_matrix(y_test, y_pred)}")
3. 决策树:直观的树形结构
3.1 信息增益与基尼系数
| 指标 | 公式 | 说明 |
|---|---|---|
| 信息熵 | H = -Σp_i·log₂(p_i) | 衡量不确定性 |
| 基尼系数 | Gini = 1 - Σp_i² | 衡量不纯度 |
| 信息增益 | IG = H_parent - Σ(w_i·H_child) | 分裂前后的信息减少量 |
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.datasets import load_iris
import pandas as pd  # FIX: pd was used below but never imported anywhere in the file

# Load the iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Train a shallow decision tree
dt_model = DecisionTreeClassifier(max_depth=3, random_state=42)
dt_model.fit(X, y)

# Tabulate feature importances, most important first
feature_importance = pd.DataFrame({
    'feature': iris.feature_names,
    'importance': dt_model.feature_importances_
}).sort_values('importance', ascending=False)
print("特征重要性:")
print(feature_importance)

# Visualize the fitted tree
plt.figure(figsize=(20, 10))
plot_tree(dt_model, feature_names=iris.feature_names,
          class_names=iris.target_names, filled=True, rounded=True)
plt.title('决策树可视化', fontsize=16)
plt.show()

# A pruned tree with stricter growth limits to reduce overfitting
dt_pruned = DecisionTreeClassifier(
    max_depth=4,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=42
)
# NOTE(review): X_train/y_train here still refer to the earlier
# make_classification split, not iris — confirm which dataset this
# pruning example is meant to use.
dt_pruned.fit(X_train, y_train)
4. 随机森林:集成学习的典范
4.1 Bagging与特征随机选择
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
# Random forest with its key hyperparameters spelled out
rf_model = RandomForestClassifier(
    n_estimators=100,      # number of trees
    max_depth=10,          # maximum depth per tree
    min_samples_split=5,   # minimum samples required to split a node
    min_samples_leaf=2,    # minimum samples required at a leaf
    max_features='sqrt',   # features considered at each split
    bootstrap=True,        # sample with replacement per tree
    oob_score=True,        # estimate accuracy from out-of-bag samples
    random_state=42
)
# Fit on the training split
rf_model.fit(X_train, y_train)
# Out-of-bag accuracy estimate (no separate validation set needed)
print(f"袋外分数: {rf_model.oob_score_:.4f}")
# 5-fold cross-validation on the full dataset
# NOTE(review): X, y here are the iris arrays reassigned in the decision-tree
# section, while rf_model was fitted on the make_classification split above —
# confirm which dataset this section is meant to evaluate.
cv_scores = cross_val_score(rf_model, X, y, cv=5)
print(f"交叉验证分数: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")
# Bar chart of feature importances in descending order
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 6))
plt.title('随机森林特征重要性', fontsize=14)
plt.bar(range(X.shape[1]), importances[indices])
plt.xticks(range(X.shape[1]), [iris.feature_names[i] for i in indices], rotation=45)
plt.tight_layout()
plt.show()
5. 支持向量机(SVM):寻找最优超平面
5.1 核函数与软间隔
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
# Standardize features (SVMs are sensitive to feature scale)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Compare the common kernel functions
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
svm_models = {}
for kernel in kernels:
    svm = SVC(kernel=kernel, random_state=42, probability=True)
    svm.fit(X_train_scaled, y_train)
    svm_models[kernel] = svm
    score = svm.score(X_test_scaled, y_test)
    print(f"{kernel}核准确率: {score:.4f}")
# Grid search over C and gamma for the RBF kernel
from sklearn.model_selection import GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1],
    'kernel': ['rbf']
}
grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳分数: {grid_search.best_score_:.4f}")
# Evaluate the refitted best estimator on the held-out test set
best_svm = grid_search.best_estimator_
print(f"测试集准确率: {best_svm.score(X_test_scaled, y_test):.4f}")
6. K-Means聚类:无监督学习的核心
6.1 肘部法则与轮廓系数
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.datasets import make_blobs
# Generate synthetic, well-separated clusters
X_blobs, y_blobs = make_blobs(n_samples=500, centers=4, cluster_std=0.6, random_state=42)
# Sweep K, recording inertia (elbow method) and silhouette score
inertias = []
silhouette_scores = []
K_range = range(2, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_blobs)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_blobs, kmeans.labels_))
# Plot both K-selection diagnostics side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
ax1.plot(K_range, inertias, 'bo-', linewidth=2)
ax1.set_xlabel('K值')
ax1.set_ylabel('惯性(Inertia)')
ax1.set_title('肘部法则', fontsize=14)
ax1.grid(True, alpha=0.3)
ax2.plot(K_range, silhouette_scores, 'ro-', linewidth=2)
ax2.set_xlabel('K值')
ax2.set_ylabel('轮廓系数')
ax2.set_title('轮廓系数法', fontsize=14)
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Final clustering with the chosen K
optimal_k = 4
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
labels = kmeans_final.fit_predict(X_blobs)
print(f"轮廓系数: {silhouette_score(X_blobs, labels):.4f}")
# Scatter plot of the clusters with their centroids marked
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_blobs[:, 0], X_blobs[:, 1], c=labels, cmap='viridis', s=50)
plt.scatter(kmeans_final.cluster_centers_[:, 0], kmeans_final.cluster_centers_[:, 1],
            c='red', marker='X', s=200, linewidths=3, edgecolors='white')
plt.colorbar(scatter)
plt.title(f'K-Means聚类结果 (K={optimal_k})', fontsize=14)
plt.show()
7. PCA降维:高维数据的可视化
7.1 主成分分析与方差解释率
from sklearn.decomposition import PCA
from sklearn.datasets import load_digits
# Load the handwritten-digits dataset
digits = load_digits()
X_digits, y_digits = digits.data, digits.target
print(f"原始维度: {X_digits.shape[1]}")
# Full PCA (all components kept) to inspect the variance spectrum
pca = PCA()
X_pca = pca.fit_transform(X_digits)
# Cumulative fraction of variance explained by the first k components
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
# Left panel: cumulative variance curve; right panel: 2-D projection
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
ax1.plot(range(1, len(cumulative_variance_ratio) + 1), cumulative_variance_ratio, 'bo-', linewidth=2)
ax1.axhline(y=0.95, color='r', linestyle='--', label='95%方差')
ax1.set_xlabel('主成分数量')
ax1.set_ylabel('累积方差解释率')
ax1.set_title('PCA累积方差解释率', fontsize=14)
ax1.legend()
ax1.grid(True, alpha=0.3)
# Smallest number of components that retains 95% of the variance
n_components_95 = np.argmax(cumulative_variance_ratio >= 0.95) + 1
ax1.axvline(x=n_components_95, color='g', linestyle='--', label=f'{n_components_95}个主成分')
ax1.legend()
# Project to two dimensions for visualization
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_digits)
scatter = ax2.scatter(X_pca_2d[:, 0], X_pca_2d[:, 1], c=y_digits, cmap='tab10', s=20, alpha=0.6)
ax2.set_xlabel('第一主成分')
ax2.set_ylabel('第二主成分')
ax2.set_title('手写数字PCA可视化 (2D)', fontsize=14)
plt.colorbar(scatter, ax=ax2)
plt.tight_layout()
plt.show()
print(f"保留95%方差需要 {n_components_95} 个主成分")
print(f"维度从 {X_digits.shape[1]} 降到 {n_components_95}")
8. XGBoost:竞赛神器
8.1 梯度提升与正则化
import xgboost as xgb
from sklearn.model_selection import train_test_split
# Re-split the data
# NOTE(review): X, y were last reassigned from the iris dataset (3 classes),
# but the binary AUC call below (y_proba[:, 1]) assumes 2 classes — confirm
# which dataset this section is meant to run on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# XGBoost with its main regularization knobs
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,   # L1 regularization
    reg_lambda=1,    # L2 regularization
    random_state=42
)
xgb_model.fit(X_train, y_train)
# Predict hard labels and class probabilities
y_pred_xgb = xgb_model.predict(X_test)
y_proba_xgb = xgb_model.predict_proba(X_test)
print(f"XGBoost准确率: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(f"XGBoost AUC: {roc_auc_score(y_test, y_proba_xgb[:, 1]):.4f}")
# Plot feature importances (split counts)
xgb.plot_importance(xgb_model, importance_type='weight', figsize=(10, 6))
plt.title('XGBoost特征重要性', fontsize=14)
plt.show()
# Early stopping to prevent overfitting: halt when validation logloss
# stops improving for 20 rounds
eval_set = [(X_train, y_train), (X_test, y_test)]
xgb_early_stop = xgb.XGBClassifier(
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.05,
    early_stopping_rounds=20,
    eval_metric='logloss',
    random_state=42
)
xgb_early_stop.fit(X_train, y_train, eval_set=eval_set, verbose=False)
print(f"最佳迭代轮数: {xgb_early_stop.best_iteration}")
print(f"早停版准确率: {xgb_early_stop.score(X_test, y_test):.4f}")
9. 神经网络基础:感知机到多层网络
9.1 激活函数与反向传播
from sklearn.neural_network import MLPClassifier
# 常见激活函数对比
def relu(x):
    """Rectified linear unit: passes positive inputs through, zeroes the rest."""
    return np.maximum(x, 0)
def tanh(x):
    """Hyperbolic tangent activation, squashing inputs into (-1, 1)."""
    out = np.tanh(x)
    return out
def sigmoid(x):
    """Logistic activation mapping inputs to (0, 1)."""
    denom = 1 + np.exp(-x)
    return 1 / denom
# Plot the three activation curves in a 2x2 grid (the 4th panel is filled later)
x = np.linspace(-5, 5, 100)
plt.figure(figsize=(12, 8))
plt.subplot(2, 2, 1)
plt.plot(x, relu(x), linewidth=2, color='#2980b9')
plt.title('ReLU激活函数', fontsize=12)
plt.grid(True, alpha=0.3)
plt.subplot(2, 2, 2)
plt.plot(x, tanh(x), linewidth=2, color='#e74c3c')
plt.title('Tanh激活函数', fontsize=12)
plt.grid(True, alpha=0.3)
plt.subplot(2, 2, 3)
plt.plot(x, sigmoid(x), linewidth=2, color='#27ae60')
plt.title('Sigmoid激活函数', fontsize=12)
plt.grid(True, alpha=0.3)
# MLP classifier with two hidden layers
mlp_model = MLPClassifier(
    hidden_layer_sizes=(100, 50),  # two hidden layers
    activation='relu',
    solver='adam',
    alpha=0.0001,                  # L2 penalty strength
    batch_size='auto',
    learning_rate='adaptive',
    learning_rate_init=0.001,
    max_iter=200,
    random_state=42
)
mlp_model.fit(X_train, y_train)
print(f"MLP准确率: {mlp_model.score(X_test, y_test):.4f}")
# Training-loss curve in the 4th subplot of the activation figure above
plt.subplot(2, 2, 4)
plt.plot(mlp_model.loss_curve_, linewidth=2, color='#9b59b6')
plt.xlabel('迭代次数')
plt.ylabel('损失值')
plt.title('训练损失曲线', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
10. 模型融合:Stacking与Voting
10.1 集成策略
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import pandas as pd  # FIX: pd was used below but never imported anywhere in the file

# Base estimators shared by every ensemble
base_models = [
    ('lr', LogisticRegression(random_state=42)),
    ('dt', DecisionTreeClassifier(max_depth=5, random_state=42)),
    ('svm', SVC(kernel='rbf', probability=True, random_state=42))
]

# Hard voting (majority rule over predicted labels)
voting_hard = VotingClassifier(estimators=base_models, voting='hard')
voting_hard.fit(X_train, y_train)

# Soft voting (average of predicted class probabilities)
voting_soft = VotingClassifier(estimators=base_models, voting='soft')
voting_soft.fit(X_train, y_train)

# Stacking: a meta-learner combines the base models' out-of-fold predictions
stacking = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5
)
stacking.fit(X_train, y_train)

# Compare every model trained so far on the current test split
# NOTE(review): lr_model/dt_model/best_svm/xgb_model were fitted on different
# datasets earlier in the article — confirm they all share X_test's feature space.
models = {
    '逻辑回归': lr_model,
    '决策树': dt_model,
    'SVM': best_svm,
    'XGBoost': xgb_model,
    '硬投票': voting_hard,
    '软投票': voting_soft,
    'Stacking': stacking
}
results = []
for name, model in models.items():
    if hasattr(model, 'predict'):
        score = model.score(X_test, y_test)
        results.append({'模型': name, '准确率': f'{score:.4f}'})
results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

# Bar-chart comparison with the score printed above each bar
plt.figure(figsize=(12, 6))
bars = plt.bar(results_df['模型'], [float(x) for x in results_df['准确率']],
               color='#2980b9', edgecolor='white')
plt.ylim(0.8, 1.0)
plt.ylabel('准确率')
plt.title('不同模型性能对比', fontsize=14)
plt.xticks(rotation=45, ha='right')
for bar, score in zip(bars, results_df['准确率']):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.002,
             score, ha='center', va='bottom')
plt.tight_layout()
plt.show()
算法选择指南
| 问题类型 | 推荐算法 | 优势 | 劣势 |
|---|---|---|---|
| 回归预测 | 线性回归、随机森林、XGBoost | 解释性强、效果好 | 易过拟合 |
| 二分类 | 逻辑回归、SVM、XGBoost | 概率输出、边界明确 | 参数敏感 |
| 多分类 | 随机森林、XGBoost、神经网络 | 准确率高 | 计算复杂 |
| 聚类 | K-Means、DBSCAN、层次聚类 | 无监督、发现模式 | K值难确定 |
| 降维 | PCA、t-SNE、UMAP | 可视化、去噪 | 信息损失 |
实战练习:端到端机器学习项目
# Complete machine-learning project workflow
def end_to_end_ml_pipeline(X, y, test_size=0.2):
    """Run an end-to-end ML workflow: split, scale, train candidates, pick the best by AUC.

    Returns (best_model, results) where results maps each model name to a dict
    with its test accuracy, AUC, and the fitted estimator.
    """
    # 1. Stratified train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    # 2. Standardize features (fit the scaler on train only to avoid leakage)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # 3. Train each candidate model and record its held-out metrics
    models = {
        'Logistic Regression': LogisticRegression(),
        'Random Forest': RandomForestClassifier(n_estimators=100),
        'XGBoost': xgb.XGBClassifier(n_estimators=100),
        'SVM': SVC(probability=True)
    }
    results = {}
    for name, estimator in models.items():
        estimator.fit(X_train_scaled, y_train)
        predicted = estimator.predict(X_test_scaled)
        positive_proba = estimator.predict_proba(X_test_scaled)[:, 1]
        results[name] = {
            'accuracy': accuracy_score(y_test, predicted),
            'auc': roc_auc_score(y_test, positive_proba),
            'model': estimator
        }

    # 4. Keep the model with the highest AUC
    best_model_name = max(results, key=lambda candidate: results[candidate]['auc'])
    best_model = results[best_model_name]['model']
    print(f"最佳模型: {best_model_name}")
    print(f"准确率: {results[best_model_name]['accuracy']:.4f}")
    print(f"AUC: {results[best_model_name]['auc']:.4f}")
    return best_model, results
# Run the pipeline on the most recently assigned (X, y)
best_model, results = end_to_end_ml_pipeline(X, y)
总结
本文深入讲解了10个核心机器学习算法:
- ✅ 线性回归:最小二乘法与梯度下降
- ✅ 逻辑回归:Sigmoid函数与决策边界
- ✅ 决策树:信息增益与基尼系数
- ✅ 随机森林:Bagging与特征随机
- ✅ 支持向量机:核函数与软间隔
- ✅ K-Means:肘部法则与轮廓系数
- ✅ PCA:主成分分析与降维
- ✅ XGBoost:梯度提升与正则化
- ✅ 神经网络:激活函数与反向传播
- ✅ 模型融合:Stacking与Voting
📌 进阶建议:理解算法原理比调参更重要。建议在真实数据集上实践每个算法,对比它们的表现差异。
🔗 相关文章推荐
如果这篇文章对你有帮助,欢迎点赞、收藏、转发!有任何问题请在评论区留言交流。