感知机、多层感知机与万能近似定理

前言

神经网络是深度学习的基础。本文从最简单的感知机开始,逐步介绍多层感知机(MLP)的结构和原理,为后续深度学习内容奠定基础。


生物神经元

结构

          树突(输入)
              ↓
   输入1 ──→ [  细胞体  ] ──→ 轴突(输出)
   输入2 ──→ [  (求和)  ]
   输入3 ──→ [   ↓     ]
          [ 激活阈值 ]
| 生物结构 | 人工神经元 |
| --- | --- |
| 树突 | 输入 |
| 突触权重 | 权重 |
| 细胞体 | 加权求和 |
| 激活阈值 | 激活函数 |
| 轴突 | 输出 |

感知机(Perceptron)

模型定义

\[y = \text{sign}(\sum_{i=1}^{n} w_i x_i + b) = \text{sign}(\mathbf{w}^T \mathbf{x} + b)\]
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)

class Perceptron:
    """Perceptron binary classifier trained with the classic mistake-driven rule.

    Labels are internally mapped to {-1, +1}. An activation of exactly 0 is
    treated as class -1 both during training and in predict(), so the two
    phases share one decision convention.
    """

    def __init__(self, learning_rate=0.1, n_iterations=1000):
        self.lr = learning_rate            # step size applied on each mistake
        self.n_iterations = n_iterations   # maximum number of passes over the data
        self.weights = None                # learned weight vector, set by fit()
        self.bias = None                   # learned bias term, set by fit()
        self.errors = []                   # misclassification count per epoch

    def fit(self, X, y):
        """Train on X of shape (n_samples, n_features) with labels y; returns self.

        y may use any encoding: values <= 0 are mapped to -1, the rest to +1.
        """
        n_samples, n_features = X.shape

        # Initialize weights
        self.weights = np.zeros(n_features)
        self.bias = 0
        # Reset the history so repeated fit() calls do not accumulate stale counts.
        self.errors = []

        # Ensure labels are -1 and +1
        y_ = np.where(y <= 0, -1, 1)

        for _ in range(self.n_iterations):
            errors = 0
            for idx, x_i in enumerate(X):
                linear_output = np.dot(x_i, self.weights) + self.bias
                # Convention: an output of exactly 0 counts as class -1.
                y_predicted = np.sign(linear_output) if linear_output != 0 else -1

                # Perceptron rule: update weights only on misclassified samples.
                if y_[idx] * y_predicted <= 0:
                    self.weights += self.lr * y_[idx] * x_i
                    self.bias += self.lr * y_[idx]
                    errors += 1

            self.errors.append(errors)

            # Converged: a full error-free pass (data is linearly separable).
            if errors == 0:
                break

        return self

    def predict(self, X):
        """Return predicted labels in {-1, +1} for X of shape (n_samples, n_features)."""
        linear_output = np.dot(X, self.weights) + self.bias
        # Map 0 to -1 (np.sign would return 0) to match the training convention.
        return np.where(linear_output > 0, 1, -1)

训练示例

# Generate a linearly separable 2-D dataset (class_sep=2.0 keeps the classes apart).
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                          n_informative=2, n_clusters_per_class=1,
                          flip_y=0, class_sep=2.0, random_state=42)
# Map the {0, 1} labels to {-1, +1}, the convention the Perceptron expects.
y = np.where(y == 0, -1, 1)

# Train the perceptron
perceptron = Perceptron(learning_rate=0.1, n_iterations=100)
perceptron.fit(X, y)

# Visualization: decision boundary (left) and per-epoch error count (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Decision boundary: evaluate the trained model on a dense grid over the data range
ax = axes[0]
xx, yy = np.meshgrid(np.linspace(X[:, 0].min()-1, X[:, 0].max()+1, 100),
                      np.linspace(X[:, 1].min()-1, X[:, 1].max()+1, 100))
Z = perceptron.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

ax.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')
ax.scatter(X[y==-1, 0], X[y==-1, 1], c='blue', label='Class -1', edgecolors='k')
ax.scatter(X[y==1, 0], X[y==1, 1], c='red', label='Class 1', edgecolors='k')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('感知机决策边界')
ax.legend()

# Misclassification count per epoch (reaches 0 once the data is separated)
ax = axes[1]
ax.plot(perceptron.errors)
ax.set_xlabel('Iteration')
ax.set_ylabel('Errors')
ax.set_title('训练过程中的错误数')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

感知机的局限性

# The XOR problem -- a single perceptron cannot solve it (not linearly separable).
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_xor = np.array([-1, 1, 1, -1])  # XOR truth table encoded as {-1, +1}

perceptron_xor = Perceptron(n_iterations=1000)
perceptron_xor.fit(X_xor, y_xor)

print("XOR问题预测结果:")
for i, (x, y_true) in enumerate(zip(X_xor, y_xor)):
    y_pred = perceptron_xor.predict(x.reshape(1, -1))[0]
    print(f"  输入{x} -> 预测:{int(y_pred):2d}, 真实:{int(y_true):2d}")

# Visualize the XOR problem: no single straight line separates the two colors.
plt.figure(figsize=(8, 6))
colors = ['blue' if y == -1 else 'red' for y in y_xor]
plt.scatter(X_xor[:, 0], X_xor[:, 1], c=colors, s=200, edgecolors='k')
for i, (x, y) in enumerate(zip(X_xor, y_xor)):
    plt.annotate(f'XOR={1 if y==1 else 0}', (x[0]+0.05, x[1]+0.05), fontsize=12)
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('XOR问题(线性不可分)')
plt.grid(True, alpha=0.3)
plt.show()

多层感知机(MLP)

网络结构

输入层      隐藏层       输出层
  x1 ───→  h1 
     ╲  ╱    ╲
  x2 ───→  h2 ───→  y
     ╱  ╲    ╱
  x3 ───→  h3

从零实现

class NeuralNetwork:
    """Simple multi-layer perceptron: sigmoid activations, full-batch gradient descent.

    The output layer is also sigmoid, so targets are expected in [0, 1].
    forward() caches per-layer activations; backward() consumes that cache.
    """

    def __init__(self, layer_sizes, learning_rate=0.01):
        """
        layer_sizes: neurons per layer, e.g. [2, 4, 1] = 2 inputs, 4 hidden, 1 output
        learning_rate: gradient-descent step size
        """
        self.layer_sizes = layer_sizes
        self.lr = learning_rate
        self.weights = []
        self.biases = []

        # Small random weights break symmetry between hidden units; biases start at 0.
        for i in range(len(layer_sizes) - 1):
            w = np.random.randn(layer_sizes[i], layer_sizes[i+1]) * 0.5
            b = np.zeros((1, layer_sizes[i+1]))
            self.weights.append(w)
            self.biases.append(b)

    def sigmoid(self, z):
        # Clip z so exp() cannot overflow for very large magnitudes.
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def sigmoid_derivative(self, a):
        # Derivative written in terms of the activation: s'(z) = s(z) * (1 - s(z)).
        return a * (1 - a)

    def forward(self, X):
        """Forward pass: cache each layer's activation and return the output layer."""
        self.activations = [X]

        for w, b in zip(self.weights, self.biases):
            z = np.dot(self.activations[-1], w) + b
            self.activations.append(self.sigmoid(z))

        return self.activations[-1]

    def backward(self, X, y):
        """One gradient-descent step using the activations cached by forward()."""
        m = X.shape[0]

        # Output-layer error. NOTE(review): (a - y) is the exact gradient for
        # sigmoid + cross-entropy; relative to the MSE loss reported by fit()
        # it omits the sigmoid' factor (a benign rescaling in this tutorial).
        delta = self.activations[-1] - y

        for i in range(len(self.weights) - 1, -1, -1):
            # Gradients for layer i (averaged over the batch).
            dw = np.dot(self.activations[i].T, delta) / m
            db = np.mean(delta, axis=0, keepdims=True)

            # Propagate the error to the previous layer BEFORE updating this
            # layer's weights: backprop must use the same weights the forward
            # pass used. (The original updated first -- a subtle bug.)
            if i > 0:
                delta = np.dot(delta, self.weights[i].T) * self.sigmoid_derivative(self.activations[i])

            # Gradient-descent update for layer i.
            self.weights[i] -= self.lr * dw
            self.biases[i] -= self.lr * db

    def fit(self, X, y, epochs=1000, verbose=True):
        """Train with full-batch gradient descent; returns the per-epoch MSE losses."""
        losses = []

        for epoch in range(epochs):
            # Forward pass
            output = self.forward(X)

            # Mean squared error over the batch.
            loss = np.mean((output - y) ** 2)
            losses.append(loss)

            # Backward pass + parameter update
            self.backward(X, y)

            if verbose and epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.6f}")

        return losses

    def predict(self, X):
        """Threshold the sigmoid output at 0.5 to get hard {0, 1} labels."""
        return (self.forward(X) > 0.5).astype(int)

解决XOR问题

# Solve the XOR problem with an MLP -- one hidden layer makes it separable.
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_xor = np.array([[0], [1], [1], [0]])

# Build the network: 2 inputs -> 4 hidden units -> 1 output
mlp = NeuralNetwork([2, 4, 1], learning_rate=0.5)

# Train
losses = mlp.fit(X_xor, y_xor, epochs=5000, verbose=False)

# Test: print the raw sigmoid output and the thresholded prediction
print("\nMLP解决XOR问题:")
for x, y_true in zip(X_xor, y_xor):
    y_pred = mlp.forward(x.reshape(1, -1))[0, 0]
    print(f"  输入{x} -> 输出:{y_pred:.4f}, 预测:{int(y_pred > 0.5)}, 真实:{y_true[0]}")

# Visualization: learned decision surface (left) and training loss (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Decision boundary: network output evaluated on a dense grid
ax = axes[0]
xx, yy = np.meshgrid(np.linspace(-0.5, 1.5, 100),
                      np.linspace(-0.5, 1.5, 100))
Z = mlp.forward(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

ax.contourf(xx, yy, Z, levels=20, cmap='coolwarm', alpha=0.8)
ax.scatter(X_xor[:, 0], X_xor[:, 1], c=y_xor.flatten(), 
           s=200, cmap='coolwarm', edgecolors='k', linewidths=2)
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_title('MLP解决XOR问题')
ax.set_xlim(-0.5, 1.5)
ax.set_ylim(-0.5, 1.5)

# Loss curve (log scale shows the late slow convergence)
ax = axes[1]
ax.plot(losses)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('训练损失曲线')
ax.set_yscale('log')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

万能近似定理

定理内容

具有一个隐藏层的前馈神经网络,只要隐藏神经元数量足够多,可以以任意精度逼近任何连续函数。

函数近似示例

# Approximate a complex target function
def target_function(x):
    """Damped sine wave sin(2*pi*x) * exp(-x) -- the curve the MLP will learn."""
    oscillation = np.sin(2 * np.pi * x)
    damping = np.exp(-x)
    return oscillation * damping

# Generate training data on [0, 2]
X_func = np.linspace(0, 2, 100).reshape(-1, 1)
y_func = target_function(X_func)

# Min-max normalize targets into [0, 1] so the sigmoid output can represent them.
y_func_norm = (y_func - y_func.min()) / (y_func.max() - y_func.min())

# Vary the hidden-layer width to see the approximation improve.
hidden_sizes = [2, 5, 20, 50]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for ax, n_hidden in zip(axes, hidden_sizes):
    nn = NeuralNetwork([1, n_hidden, 1], learning_rate=0.5)
    nn.fit(X_func, y_func_norm, epochs=5000, verbose=False)
    
    y_pred = nn.forward(X_func)
    
    ax.plot(X_func, y_func_norm, 'b-', label='目标函数', linewidth=2)
    ax.plot(X_func, y_pred, 'r--', label='神经网络', linewidth=2)
    ax.set_title(f'隐藏单元数: {n_hidden}')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.suptitle('万能近似定理演示', y=1.02)
plt.show()

使用sklearn

from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Generate a nonlinear (two interleaving moons) dataset
X_moons, y_moons = make_moons(n_samples=500, noise=0.2, random_state=42)

# Standardize features -- MLPs train much better on zero-mean, unit-variance input.
scaler = StandardScaler()
X_moons_scaled = scaler.fit_transform(X_moons)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_moons_scaled, y_moons, test_size=0.2, random_state=42
)

# Train an MLP classifier
mlp_sklearn = MLPClassifier(
    hidden_layer_sizes=(10, 5),  # two hidden layers
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42
)
mlp_sklearn.fit(X_train, y_train)

print(f"训练准确率: {mlp_sklearn.score(X_train, y_train):.4f}")
print(f"测试准确率: {mlp_sklearn.score(X_test, y_test):.4f}")

# Visualize the decision boundary on a dense grid over the scaled feature space
plt.figure(figsize=(10, 6))
xx, yy = np.meshgrid(np.linspace(X_moons_scaled[:, 0].min()-1, X_moons_scaled[:, 0].max()+1, 100),
                      np.linspace(X_moons_scaled[:, 1].min()-1, X_moons_scaled[:, 1].max()+1, 100))
Z = mlp_sklearn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.contourf(xx, yy, Z, alpha=0.4, cmap='coolwarm')
plt.scatter(X_moons_scaled[:, 0], X_moons_scaled[:, 1], c=y_moons, cmap='coolwarm', edgecolors='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('MLP分类 (sklearn)')
plt.show()

不同网络结构比较

# Compare different network architectures on the same train/test split.
structures = [
    (10,),
    (10, 10),
    (50,),
    (10, 10, 10)
]

results = []

for hidden_layers in structures:
    mlp = MLPClassifier(
        hidden_layer_sizes=hidden_layers,
        max_iter=1000,
        random_state=42
    )
    mlp.fit(X_train, y_train)
    
    train_score = mlp.score(X_train, y_train)
    test_score = mlp.score(X_test, y_test)
    
    # n_params counts every weight and bias in the fitted network.
    results.append({
        'structure': str(hidden_layers),
        'train': train_score,
        'test': test_score,
        'n_params': sum(coef.size for coef in mlp.coefs_) + sum(b.size for b in mlp.intercepts_)
    })

import pandas as pd
results_df = pd.DataFrame(results)
print("\n不同网络结构比较:")
print(results_df.to_string(index=False))

前向传播详解

计算过程

对于输入 $\mathbf{x}$,第 $l$ 层的计算:

\(\mathbf{z}^{(l)} = \mathbf{W}^{(l)} \mathbf{a}^{(l-1)} + \mathbf{b}^{(l)}\) \(\mathbf{a}^{(l)} = f(\mathbf{z}^{(l)})\)

def forward_pass_visualization(X, weights, biases, activations_func):
    """Print every layer's pre-activation z and activation a, then return the output.

    Walks the layers in order, applying a = f(a @ W + b) at each step, and
    echoes the intermediate values so the forward pass can be followed by hand.
    """
    print("前向传播过程:")
    print(f"输入: {X}")

    current = X
    layer_no = 0
    for w, b in zip(weights, biases):
        layer_no += 1
        z = np.dot(current, w) + b
        current = activations_func(z)
        print(f"\n{layer_no}:")
        print(f"  z = a * W + b = {z}")
        print(f"  a = sigmoid(z) = {current}")

    return current

# Example: a tiny 2 -> 2 -> 1 network traced step by step
sample_weights = [np.array([[0.5, -0.5], [0.5, -0.5]]), np.array([[1.0], [1.0]])]
sample_biases = [np.array([0.0, 0.0]), np.array([0.0])]
sample_input = np.array([[1, 0]])

sigmoid = lambda z: 1 / (1 + np.exp(-z))
output = forward_pass_visualization(sample_input, sample_weights, sample_biases, sigmoid)

常见问题

Q1: 为什么需要隐藏层?

隐藏层使网络能够学习输入的非线性变换,从而解决线性不可分问题。

Q2: 如何选择隐藏层数量和大小?

| 问题复杂度 | 建议 |
| --- | --- |
| 简单问题 | 1层,10-100神经元 |
| 中等问题 | 2-3层 |
| 复杂问题 | 深层网络 |

Q3: 为什么使用非线性激活函数?

多层线性变换等价于单层,非线性激活函数是实现非线性映射的关键。

Q4: 神经网络如何初始化权重?

  • 零初始化:错误(对称性问题)
  • 随机初始化:常用
  • Xavier/He初始化:推荐

总结

| 概念 | 描述 |
| --- | --- |
| 感知机 | 单层线性分类器 |
| MLP | 多层非线性网络 |
| 前向传播 | 输入→输出的计算 |
| 万能近似 | 足够宽的网络可以近似任意函数 |

参考资料

  • Rosenblatt, F. (1958). “The Perceptron: A Probabilistic Model”
  • Cybenko, G. (1989). “Approximation by Superpositions of a Sigmoidal Function”
  • Goodfellow, I. et al. “Deep Learning” - Chapter 6
  • Nielsen, M. “Neural Networks and Deep Learning”

版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。

(采用 CC BY-NC-SA 4.0 许可协议进行授权)

本文标题:《 机器学习基础系列——神经网络基础 》

本文链接:http://localhost:3015/ai/%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C%E5%9F%BA%E7%A1%80.html

本文距最后一次更新已有一段时间,文章中的某些内容可能已过时!