📚 Introduction

In the previous introductory article on machine learning, we covered the basic concepts and the end-to-end workflow. In this article we dig into the principles, mathematical foundations, and practical techniques behind 10 core algorithms, so that you genuinely understand how each one works and can apply it flexibly in real projects.


1. Linear Regression: From Least Squares to Gradient Descent

1.1 Mathematical Principles

Linear regression aims to find the best-fitting line y = wx + b, where w is the weight and b is the bias.

# Loss function: mean squared error (MSE)
# J(w,b) = (1/2n) * Σ(y_i - (wx_i + b))²

# Gradient descent update rules
# w = w - α * ∂J/∂w
# b = b - α * ∂J/∂b

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Generate sample data
np.random.seed(42)
X = np.random.randn(100, 1) * 10
y = 2.5 * X.squeeze() + np.random.randn(100) * 5

# Train the model
model = LinearRegression()
model.fit(X, y)

# Predict
y_pred = model.predict(X)

# Evaluate
print(f"Coefficient: {model.coef_[0]:.2f}")
print(f"Intercept: {model.intercept_:.2f}")
print(f"R² score: {r2_score(y, y_pred):.4f}")
print(f"MSE: {mean_squared_error(y, y_pred):.2f}")

1.2 Implementing Linear Regression from Scratch

class LinearRegressionScratch:
    """Linear regression implemented from scratch"""

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.lr = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # Initialize the parameters
        self.weights = np.zeros(n_features)
        self.bias = 0

        # Gradient descent
        for _ in range(self.n_iterations):
            # Forward pass
            y_pred = np.dot(X, self.weights) + self.bias

            # Compute the gradients
            dw = (1/n_samples) * np.dot(X.T, (y_pred - y))
            db = (1/n_samples) * np.sum(y_pred - y)

            # Update the parameters
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        return np.dot(X, self.weights) + self.bias

# Use the scratch implementation
model_scratch = LinearRegressionScratch(learning_rate=0.01, n_iterations=1000)
model_scratch.fit(X, y)
y_pred_scratch = model_scratch.predict(X)
print(f"Scratch implementation R²: {r2_score(y, y_pred_scratch):.4f}")

2. Logistic Regression: The Cornerstone of Classification

2.1 The Sigmoid Function and the Decision Boundary

Logistic regression uses the Sigmoid function to map its output into the [0, 1] interval, which can be read as a probability.

def sigmoid(z):
    """The Sigmoid function"""
    return 1 / (1 + np.exp(-z))

# Visualize the Sigmoid function
import matplotlib.pyplot as plt

z = np.linspace(-10, 10, 100)
s = sigmoid(z)

plt.figure(figsize=(10, 6))
plt.plot(z, s, linewidth=2, color='#2980b9')
plt.axhline(y=0.5, color='red', linestyle='--', label='Decision boundary (0.5)')
plt.axvline(x=0, color='gray', linestyle='--')
plt.xlabel('z')
plt.ylabel('sigmoid(z)')
plt.title('The Sigmoid Function', fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Logistic regression example
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate classification data
X, y = make_classification(n_samples=1000, n_features=4, n_classes=2, random_state=42)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Predicted probabilities
y_proba = lr_model.predict_proba(X_test)[:, 1]
y_pred = lr_model.predict(X_test)

from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"AUC: {roc_auc_score(y_test, y_proba):.4f}")
print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred)}")

3. Decision Trees: An Intuitive Tree Structure

3.1 Information Gain and Gini Impurity

| Metric | Formula | Meaning |
|---|---|---|
| Entropy | H = -Σ p_i · log₂(p_i) | Measures uncertainty |
| Gini impurity | Gini = 1 - Σ p_i² | Measures impurity |
| Information gain | IG = H_parent - Σ (w_i · H_child) | Reduction in entropy after a split |
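
Both impurity measures are easy to compute directly; here is a minimal sketch (the helper names entropy and gini are ours, not sklearn's):

# Entropy and Gini impurity computed by hand on a 1-D array of class labels
def entropy(labels):
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def gini(labels):
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1 - np.sum(p ** 2)

example = np.array([0, 0, 0, 1, 1])        # a 3-vs-2 class split
print(f"Entropy: {entropy(example):.4f}")  # ≈ 0.9710
print(f"Gini:    {gini(example):.4f}")     # = 0.4800
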
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.datasets import load_iris
import pandas as pd  # needed for the feature-importance table below

# Load the data
iris = load_iris()
X, y = iris.data, iris.target

# Train a decision tree
dt_model = DecisionTreeClassifier(max_depth=3, random_state=42)
dt_model.fit(X, y)

# Feature importances
feature_importance = pd.DataFrame({
    'feature': iris.feature_names,
    'importance': dt_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Feature importances:")
print(feature_importance)

# Visualize the tree
plt.figure(figsize=(20, 10))
plot_tree(dt_model, feature_names=iris.feature_names, 
          class_names=iris.target_names, filled=True, rounded=True)
plt.title('Decision Tree Visualization', fontsize=16)
plt.show()

# Pruning via hyperparameters to curb overfitting
# (split the iris data loaded above, so the tree is trained and evaluated on this dataset
# rather than the split left over from the previous section)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

dt_pruned = DecisionTreeClassifier(
    max_depth=4,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=42
)
dt_pruned.fit(X_train, y_train)

4. Random Forest: The Exemplar of Ensemble Learning

4.1 Bagging and Random Feature Selection

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random forest parameters explained
rf_model = RandomForestClassifier(
    n_estimators=100,      # number of trees
    max_depth=10,          # maximum tree depth
    min_samples_split=5,   # minimum samples required to split a node
    min_samples_leaf=2,    # minimum samples required at a leaf
    max_features='sqrt',   # features considered at each split
    bootstrap=True,        # use bootstrap sampling
    oob_score=True,        # compute the out-of-bag score
    random_state=42
)

# Train
rf_model.fit(X_train, y_train)

# Out-of-bag evaluation
print(f"OOB score: {rf_model.oob_score_:.4f}")

# Cross-validation
cv_scores = cross_val_score(rf_model, X, y, cv=5)
print(f"Cross-validation score: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

# Visualize feature importances
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 6))
plt.title('Random Forest Feature Importances', fontsize=14)
plt.bar(range(X.shape[1]), importances[indices])
plt.xticks(range(X.shape[1]), [iris.feature_names[i] for i in indices], rotation=45)
plt.tight_layout()
plt.show()
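
To make the Bagging idea concrete: every tree is fit on a bootstrap resample (sampling rows with replacement), and the forest predicts by majority vote. Here is a hand-rolled sketch of just that mechanism, reusing the train/test split from above; it is not how sklearn implements it internally:

from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(42)
bagged_trees = []
for _ in range(25):
    idx = rng.integers(0, len(X_train), size=len(X_train))  # bootstrap sample
    tree = DecisionTreeClassifier(max_features='sqrt', random_state=42)
    tree.fit(X_train[idx], y_train[idx])
    bagged_trees.append(tree)

# Majority vote across all trees, column by column
all_preds = np.array([t.predict(X_test) for t in bagged_trees])
vote = np.apply_along_axis(lambda col: np.bincount(col).argmax(), 0, all_preds)
print(f"Hand-rolled bagging accuracy: {(vote == y_test).mean():.4f}")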

5. Support Vector Machines (SVM): Finding the Optimal Hyperplane

5.1 Kernel Functions and Soft Margins

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Standardize the data (SVM is sensitive to feature scale)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVMs with different kernel functions
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
svm_models = {}

for kernel in kernels:
    svm = SVC(kernel=kernel, random_state=42, probability=True)
    svm.fit(X_train_scaled, y_train)
    svm_models[kernel] = svm
    score = svm.score(X_test_scaled, y_test)
    print(f"{kernel} kernel accuracy: {score:.4f}")

# Hyperparameter tuning for the RBF kernel
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1],
    'kernel': ['rbf']
}

grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Use the best model
best_svm = grid_search.best_estimator_
print(f"Test set accuracy: {best_svm.score(X_test_scaled, y_test):.4f}")

6. K-Means Clustering: The Heart of Unsupervised Learning

6.1 The Elbow Method and Silhouette Scores

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.datasets import make_blobs

# Generate clustering data
X_blobs, y_blobs = make_blobs(n_samples=500, centers=4, cluster_std=0.6, random_state=42)

# Use the elbow method to choose K
inertias = []
silhouette_scores = []
K_range = range(2, 11)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_blobs)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_blobs, kmeans.labels_))

# Visualize
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

ax1.plot(K_range, inertias, 'bo-', linewidth=2)
ax1.set_xlabel('K')
ax1.set_ylabel('Inertia')
ax1.set_title('Elbow Method', fontsize=14)
ax1.grid(True, alpha=0.3)

ax2.plot(K_range, silhouette_scores, 'ro-', linewidth=2)
ax2.set_xlabel('K')
ax2.set_ylabel('Silhouette score')
ax2.set_title('Silhouette Analysis', fontsize=14)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# K-Means with the chosen K
optimal_k = 4
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
labels = kmeans_final.fit_predict(X_blobs)

print(f"Silhouette score: {silhouette_score(X_blobs, labels):.4f}")

# Visualize the clustering result
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_blobs[:, 0], X_blobs[:, 1], c=labels, cmap='viridis', s=50)
plt.scatter(kmeans_final.cluster_centers_[:, 0], kmeans_final.cluster_centers_[:, 1], 
            c='red', marker='X', s=200, linewidths=3, edgecolors='white')
plt.colorbar(scatter)
plt.title(f'K-Means Clustering Result (K={optimal_k})', fontsize=14)
plt.show()
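
Behind KMeans is Lloyd's algorithm, which alternates two steps until nothing moves: assign every point to its nearest centroid, then recompute each centroid as the mean of its points. A minimal from-scratch sketch on X_blobs (the kmeans_scratch helper is ours, and it ignores the empty-cluster edge case for brevity):

def kmeans_scratch(X, k, n_iters=100, seed=42):
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), size=k, replace=False)]  # random initial centroids
    for _ in range(n_iters):
        # Assignment step: nearest centroid for every point
        dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        assign = dists.argmin(axis=1)
        # Update step: move each centroid to the mean of its points
        new_centers = np.array([X[assign == j].mean(axis=0) for j in range(k)])
        if np.allclose(new_centers, centers):
            break
        centers = new_centers
    return assign, centers

labels_scratch, _ = kmeans_scratch(X_blobs, k=4)
print(f"Scratch K-Means silhouette score: {silhouette_score(X_blobs, labels_scratch):.4f}")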

7. PCA Dimensionality Reduction: Visualizing High-Dimensional Data

7.1 Principal Component Analysis and Explained Variance

from sklearn.decomposition import PCA
from sklearn.datasets import load_digits

# Load the handwritten digits dataset
digits = load_digits()
X_digits, y_digits = digits.data, digits.target

print(f"Original dimensionality: {X_digits.shape[1]}")

# PCA
pca = PCA()
X_pca = pca.fit_transform(X_digits)

# Cumulative explained variance ratio
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

# Visualize
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

ax1.plot(range(1, len(cumulative_variance_ratio) + 1), cumulative_variance_ratio, 'bo-', linewidth=2)
ax1.axhline(y=0.95, color='r', linestyle='--', label='95% variance')
ax1.set_xlabel('Number of components')
ax1.set_ylabel('Cumulative explained variance ratio')
ax1.set_title('PCA Cumulative Explained Variance', fontsize=14)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Number of components needed to retain 95% of the variance
n_components_95 = np.argmax(cumulative_variance_ratio >= 0.95) + 1
ax1.axvline(x=n_components_95, color='g', linestyle='--', label=f'{n_components_95} components')
ax1.legend()

# Project to 2D for visualization
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_digits)

scatter = ax2.scatter(X_pca_2d[:, 0], X_pca_2d[:, 1], c=y_digits, cmap='tab10', s=20, alpha=0.6)
ax2.set_xlabel('First principal component')
ax2.set_ylabel('Second principal component')
ax2.set_title('PCA of Handwritten Digits (2D)', fontsize=14)
plt.colorbar(scatter, ax=ax2)

plt.tight_layout()
plt.show()

print(f"保留95%方差需要 {n_components_95} 个主成分")
print(f"维度从 {X_digits.shape[1]} 降到 {n_components_95}")

8. XGBoost: The Competition Powerhouse

8.1 Gradient Boosting and Regularization

import xgboost as xgb
from sklearn.model_selection import train_test_split

# Prepare the data (note: X and y currently hold the iris dataset, so this is a 3-class problem)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost model
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,   # L1 regularization
    reg_lambda=1,    # L2 regularization
    random_state=42
)

xgb_model.fit(X_train, y_train)

# Predict
y_pred_xgb = xgb_model.predict(X_test)
y_proba_xgb = xgb_model.predict_proba(X_test)

print(f"XGBoost accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
# Multiclass AUC needs the full probability matrix and a one-vs-rest average
print(f"XGBoost AUC: {roc_auc_score(y_test, y_proba_xgb, multi_class='ovr'):.4f}")

# Feature importance (plot_importance takes an Axes, not a figsize, so create the figure first)
fig, ax = plt.subplots(figsize=(10, 6))
xgb.plot_importance(xgb_model, importance_type='weight', ax=ax)
plt.title('XGBoost Feature Importance', fontsize=14)
plt.show()

# Early stopping to prevent overfitting
eval_set = [(X_train, y_train), (X_test, y_test)]
xgb_early_stop = xgb.XGBClassifier(
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.05,
    early_stopping_rounds=20,
    eval_metric='mlogloss',  # multiclass log loss, since iris has 3 classes
    random_state=42
)

xgb_early_stop.fit(X_train, y_train, eval_set=eval_set, verbose=False)

print(f"Best iteration: {xgb_early_stop.best_iteration}")
print(f"Early-stopping accuracy: {xgb_early_stop.score(X_test, y_test):.4f}")

9. Neural Network Basics: From the Perceptron to Multi-Layer Networks

9.1 Activation Functions and Backpropagation

from sklearn.neural_network import MLPClassifier

# Comparing common activation functions
def relu(x):
    return np.maximum(0, x)

def tanh(x):
    return np.tanh(x)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

x = np.linspace(-5, 5, 100)

plt.figure(figsize=(12, 8))

plt.subplot(2, 2, 1)
plt.plot(x, relu(x), linewidth=2, color='#2980b9')
plt.title('ReLU Activation', fontsize=12)
plt.grid(True, alpha=0.3)

plt.subplot(2, 2, 2)
plt.plot(x, tanh(x), linewidth=2, color='#e74c3c')
plt.title('Tanh Activation', fontsize=12)
plt.grid(True, alpha=0.3)

plt.subplot(2, 2, 3)
plt.plot(x, sigmoid(x), linewidth=2, color='#27ae60')
plt.title('Sigmoid Activation', fontsize=12)
plt.grid(True, alpha=0.3)

# MLP classifier
mlp_model = MLPClassifier(
    hidden_layer_sizes=(100, 50),  # two hidden layers
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='adaptive',
    learning_rate_init=0.001,
    max_iter=200,
    random_state=42
)

mlp_model.fit(X_train, y_train)
print(f"MLP accuracy: {mlp_model.score(X_test, y_test):.4f}")

# Loss curve
plt.subplot(2, 2, 4)
plt.plot(mlp_model.loss_curve_, linewidth=2, color='#9b59b6')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Training Loss Curve', fontsize=12)
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
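
To see what backpropagation actually does, here is a one-hidden-layer network trained by hand on XOR, using the identity sigmoid′(z) = s·(1 − s). This is a sketch of the math only, not how MLPClassifier is implemented, and convergence can depend on the random seed:

# Minimal backprop on XOR: 2 inputs -> 4 hidden units -> 1 output, all sigmoid
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=float)
y_xor = np.array([[0], [1], [1], [0]], dtype=float)

rng = np.random.default_rng(0)
W1, b1 = rng.normal(0, 1, (2, 4)), np.zeros(4)
W2, b2 = rng.normal(0, 1, (4, 1)), np.zeros(1)

for _ in range(5000):
    # Forward pass
    h = sigmoid(X_xor @ W1 + b1)
    out = sigmoid(h @ W2 + b2)
    # Backward pass: chain rule, layer by layer
    d_out = (out - y_xor) * out * (1 - out)
    d_h = (d_out @ W2.T) * h * (1 - h)
    # Gradient descent updates
    W2 -= 0.5 * h.T @ d_out;   b2 -= 0.5 * d_out.sum(axis=0)
    W1 -= 0.5 * X_xor.T @ d_h; b1 -= 0.5 * d_h.sum(axis=0)

print("XOR predictions:", out.ravel().round(3))  # should approach [0, 1, 1, 0]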

10. Model Ensembling: Stacking and Voting

10.1 Ensemble Strategies

from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Define the base models
base_models = [
    ('lr', LogisticRegression(random_state=42)),
    ('dt', DecisionTreeClassifier(max_depth=5, random_state=42)),
    ('svm', SVC(kernel='rbf', probability=True, random_state=42))
]

# Hard voting (majority rule)
voting_hard = VotingClassifier(estimators=base_models, voting='hard')
voting_hard.fit(X_train, y_train)

# Soft voting (probability averaging)
voting_soft = VotingClassifier(estimators=base_models, voting='soft')
voting_soft.fit(X_train, y_train)

# Stacking
stacking = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5
)
stacking.fit(X_train, y_train)

# Compare the results (refit each model on the current split so the comparison is apples-to-apples)
models = {
    'Logistic Regression': lr_model,
    'Decision Tree': dt_model,
    'SVM': best_svm,
    'XGBoost': xgb_model,
    'Hard Voting': voting_hard,
    'Soft Voting': voting_soft,
    'Stacking': stacking
}

results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    results.append({'Model': name, 'Accuracy': score})

results_df = pd.DataFrame(results)
print(results_df.to_string(index=False, float_format=lambda v: f'{v:.4f}'))

# Visualize the comparison
plt.figure(figsize=(12, 6))
bars = plt.bar(results_df['Model'], results_df['Accuracy'],
               color='#2980b9', edgecolor='white')
plt.ylim(0.8, 1.0)
plt.ylabel('Accuracy')
plt.title('Model Performance Comparison', fontsize=14)
plt.xticks(rotation=45, ha='right')

for bar, score in zip(bars, results_df['Accuracy']):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.002,
             f'{score:.4f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()
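
One detail worth spelling out: the Stacking meta-learner must be trained on out-of-fold predictions from the base models, otherwise it just inherits their training-set overconfidence (this is what the cv=5 argument above arranges). A sketch of that step done manually with cross_val_predict:

from sklearn.model_selection import cross_val_predict

# Out-of-fold class probabilities from each base model become the meta-features
meta_features = np.column_stack([
    cross_val_predict(model, X_train, y_train, cv=5, method='predict_proba')
    for _, model in base_models
])

meta_learner = LogisticRegression(max_iter=1000)
meta_learner.fit(meta_features, y_train)
print(f"Meta-feature matrix shape: {meta_features.shape}")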

Algorithm Selection Guide

| Problem type | Recommended algorithms | Strengths | Weaknesses |
|---|---|---|---|
| Regression | Linear Regression, Random Forest, XGBoost | Interpretable, strong performance | Prone to overfitting |
| Binary classification | Logistic Regression, SVM, XGBoost | Probability output, clear decision boundary | Sensitive to hyperparameters |
| Multiclass classification | Random Forest, XGBoost, Neural Networks | High accuracy | Computationally expensive |
| Clustering | K-Means, DBSCAN, Hierarchical Clustering | Unsupervised, discovers structure | K is hard to choose |
| Dimensionality reduction | PCA, t-SNE, UMAP | Visualization, denoising | Information loss |

Hands-On Exercise: An End-to-End Machine Learning Project

# A complete machine learning project workflow
def end_to_end_ml_pipeline(X, y, test_size=0.2):
    """End-to-end machine learning pipeline"""

    # 1. Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    # 2. Standardize the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # 3. Train and compare candidate models
    models = {
        'Logistic Regression': LogisticRegression(),
        'Random Forest': RandomForestClassifier(n_estimators=100),
        'XGBoost': xgb.XGBClassifier(n_estimators=100),
        'SVM': SVC(probability=True)
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_proba = model.predict_proba(X_test_scaled)

        # Binary problems use the positive-class column; multiclass uses one-vs-rest AUC
        if y_proba.shape[1] == 2:
            auc = roc_auc_score(y_test, y_proba[:, 1])
        else:
            auc = roc_auc_score(y_test, y_proba, multi_class='ovr')

        results[name] = {
            'accuracy': accuracy_score(y_test, y_pred),
            'auc': auc,
            'model': model
        }

    # 4. Pick the best model by AUC
    best_model_name = max(results, key=lambda x: results[x]['auc'])
    best_model = results[best_model_name]['model']

    print(f"Best model: {best_model_name}")
    print(f"Accuracy: {results[best_model_name]['accuracy']:.4f}")
    print(f"AUC: {results[best_model_name]['auc']:.4f}")

    return best_model, results

# Run it
best_model, results = end_to_end_ml_pipeline(X, y)

Summary

This article took a deep dive into 10 core machine learning algorithms:

  • Linear Regression: least squares and gradient descent
  • Logistic Regression: the Sigmoid function and decision boundaries
  • Decision Trees: information gain and Gini impurity
  • Random Forest: Bagging and feature randomness
  • Support Vector Machines: kernel functions and soft margins
  • K-Means: the elbow method and silhouette scores
  • PCA: principal component analysis and dimensionality reduction
  • XGBoost: gradient boosting and regularization
  • Neural Networks: activation functions and backpropagation
  • Model Ensembling: Stacking and Voting

📌 Next steps: understanding how an algorithm works matters more than tuning its hyperparameters. Practice each algorithm on real datasets and compare how their results differ.


If this article helped you, please like, bookmark, and share! Questions are welcome in the comments.