感知机、多层感知机与万能近似定理
前言
神经网络是深度学习的基础。本文从最简单的感知机开始,逐步介绍多层感知机(MLP)的结构和原理,为后续深度学习内容奠定基础。
生物神经元
结构
树突(输入)
↓
输入1 ──→ [ 细胞体 ] ──→ 轴突(输出)
输入2 ──→ [ (求和) ]
输入3 ──→ [ ↓ ]
[ 激活阈值 ]
| 生物结构 | 人工神经元 |
|---|---|
| 树突 | 输入 |
| 突触权重 | 权重 |
| 细胞体 | 加权求和 |
| 激活阈值 | 激活函数 |
| 轴突 | 输出 |
感知机(Perceptron)
模型定义
\[y = \text{sign}(\sum_{i=1}^{n} w_i x_i + b) = \text{sign}(\mathbf{w}^T \mathbf{x} + b)\]

import numpy as np
import matplotlib.pyplot as plt
np.random.seed(42)
class Perceptron:
    """Rosenblatt perceptron: a single-layer linear classifier trained
    with the classic mistake-driven update rule."""

    def __init__(self, learning_rate=0.1, n_iterations=1000):
        self.lr = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None
        self.errors = []  # misclassification count recorded per epoch

    def fit(self, X, y):
        """Learn a separating hyperplane for X/y.

        Stops early as soon as one full epoch produces no mistakes.
        Returns self so calls can be chained.
        """
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        # Map arbitrary labels onto the canonical {-1, +1} pair.
        targets = np.where(y <= 0, -1, 1)
        for _ in range(self.n_iterations):
            mistakes = 0
            for sample, target in zip(X, targets):
                score = np.dot(sample, self.weights) + self.bias
                # Treat a score of exactly 0 as the negative class so the
                # prediction is always in {-1, +1}.
                prediction = -1 if score == 0 else np.sign(score)
                if target * prediction <= 0:
                    # Perceptron rule: nudge the boundary toward the sample.
                    self.weights += self.lr * target * sample
                    self.bias += self.lr * target
                    mistakes += 1
            self.errors.append(mistakes)
            if mistakes == 0:
                break
        return self

    def predict(self, X):
        """Return the sign of the linear score for each row of X."""
        return np.sign(np.dot(X, self.weights) + self.bias)
训练示例
# Generate a linearly separable 2-D toy dataset
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                           n_informative=2, n_clusters_per_class=1,
                           flip_y=0, class_sep=2.0, random_state=42)
# Remap sklearn's {0, 1} labels to the perceptron's {-1, +1} convention
y = np.where(y == 0, -1, 1)
# Train the perceptron
perceptron = Perceptron(learning_rate=0.1, n_iterations=100)
perceptron.fit(X, y)
# Visualization: decision boundary (left) and per-epoch error count (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Decision boundary: evaluate the model on a dense grid and shade by predicted class
ax = axes[0]
xx, yy = np.meshgrid(np.linspace(X[:, 0].min()-1, X[:, 0].max()+1, 100),
                     np.linspace(X[:, 1].min()-1, X[:, 1].max()+1, 100))
Z = perceptron.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')
ax.scatter(X[y==-1, 0], X[y==-1, 1], c='blue', label='Class -1', edgecolors='k')
ax.scatter(X[y==1, 0], X[y==1, 1], c='red', label='Class 1', edgecolors='k')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('感知机决策边界')
ax.legend()
# Misclassification count per epoch (drops to 0 once the data is separated)
ax = axes[1]
ax.plot(perceptron.errors)
ax.set_xlabel('Iteration')
ax.set_ylabel('Errors')
ax.set_title('训练过程中的错误数')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
感知机的局限性
# XOR problem — a single-layer perceptron cannot solve it (not linearly separable)
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_xor = np.array([-1, 1, 1, -1])  # XOR truth table encoded as {-1, +1}
perceptron_xor = Perceptron(n_iterations=1000)
perceptron_xor.fit(X_xor, y_xor)
print("XOR问题预测结果:")
for i, (x, y_true) in enumerate(zip(X_xor, y_xor)):
    y_pred = perceptron_xor.predict(x.reshape(1, -1))[0]
    print(f" 输入{x} -> 预测:{int(y_pred):2d}, 真实:{int(y_true):2d}")
# Plot the four XOR points: no single straight line separates the two classes
plt.figure(figsize=(8, 6))
colors = ['blue' if y == -1 else 'red' for y in y_xor]
plt.scatter(X_xor[:, 0], X_xor[:, 1], c=colors, s=200, edgecolors='k')
for i, (x, y) in enumerate(zip(X_xor, y_xor)):
    plt.annotate(f'XOR={1 if y==1 else 0}', (x[0]+0.05, x[1]+0.05), fontsize=12)
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('XOR问题(线性不可分)')
plt.grid(True, alpha=0.3)
plt.show()
多层感知机(MLP)
网络结构
输入层 隐藏层 输出层
x1 ───→ h1
╲ ╱ ╲
x2 ───→ h2 ───→ y
╱ ╲ ╱
x3 ───→ h3
从零实现
class NeuralNetwork:
    """Fully-connected feed-forward network (MLP) with sigmoid units,
    trained by full-batch gradient descent.

    NOTE(review): the reported loss is MSE while the output-layer delta
    is `activation - target` (the cross-entropy/sigmoid shortcut) — a
    common pedagogical simplification, kept intact here on purpose.
    """

    def __init__(self, layer_sizes, learning_rate=0.01):
        """
        layer_sizes: neuron count per layer, e.g. [2, 4, 1]
        """
        self.layer_sizes = layer_sizes
        self.lr = learning_rate
        self.weights = []
        self.biases = []
        # Small random weights break symmetry; biases start at zero.
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            self.weights.append(np.random.randn(fan_in, fan_out) * 0.5)
            self.biases.append(np.zeros((1, fan_out)))

    def sigmoid(self, z):
        # Clip the pre-activation so exp() cannot overflow on extreme inputs.
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def sigmoid_derivative(self, a):
        # Derivative expressed through the activation itself: s'(z) = a(1-a).
        return a * (1 - a)

    def forward(self, X):
        """Propagate X through every layer, caching activations for backprop."""
        self.activations = [X]
        current = X
        for w, b in zip(self.weights, self.biases):
            current = self.sigmoid(current @ w + b)
            self.activations.append(current)
        return current

    def backward(self, X, y):
        """Apply one gradient-descent step using the cached activations."""
        m = X.shape[0]
        delta = self.activations[-1] - y  # output-layer error signal
        for layer in reversed(range(len(self.weights))):
            grad_w = self.activations[layer].T @ delta / m
            grad_b = delta.mean(axis=0, keepdims=True)
            self.weights[layer] -= self.lr * grad_w
            self.biases[layer] -= self.lr * grad_b
            # Back-propagate the error (skip the input layer). As in the
            # original, this intentionally uses the already-updated weights.
            if layer > 0:
                delta = (delta @ self.weights[layer].T) * self.sigmoid_derivative(self.activations[layer])

    def fit(self, X, y, epochs=1000, verbose=True):
        """Train for `epochs` full-batch steps; returns the MSE loss history."""
        losses = []
        for epoch in range(epochs):
            output = self.forward(X)
            loss = np.mean((output - y) ** 2)  # monitored loss (MSE)
            losses.append(loss)
            self.backward(X, y)
            if verbose and epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.6f}")
        return losses

    def predict(self, X):
        """Threshold the network output at 0.5 into {0, 1} labels."""
        return (self.forward(X) > 0.5).astype(int)
解决XOR问题
# Solve the XOR problem with an MLP (one hidden layer is enough)
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_xor = np.array([[0], [1], [1], [0]])
# Build the network: 2 inputs -> 4 hidden units -> 1 output
mlp = NeuralNetwork([2, 4, 1], learning_rate=0.5)
# Train (loss history is kept for plotting below)
losses = mlp.fit(X_xor, y_xor, epochs=5000, verbose=False)
# Check all four XOR cases
print("\nMLP解决XOR问题:")
for x, y_true in zip(X_xor, y_xor):
    y_pred = mlp.forward(x.reshape(1, -1))[0, 0]
    print(f" 输入{x} -> 输出:{y_pred:.4f}, 预测:{int(y_pred > 0.5)}, 真实:{y_true[0]}")
# Visualization: decision surface (left) and loss curve (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Decision surface: raw network output over a dense grid (not thresholded)
ax = axes[0]
xx, yy = np.meshgrid(np.linspace(-0.5, 1.5, 100),
                     np.linspace(-0.5, 1.5, 100))
Z = mlp.forward(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, levels=20, cmap='coolwarm', alpha=0.8)
ax.scatter(X_xor[:, 0], X_xor[:, 1], c=y_xor.flatten(),
           s=200, cmap='coolwarm', edgecolors='k', linewidths=2)
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_title('MLP解决XOR问题')
ax.set_xlim(-0.5, 1.5)
ax.set_ylim(-0.5, 1.5)
# Training loss on a log scale to show the late-stage decrease
ax = axes[1]
ax.plot(losses)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('训练损失曲线')
ax.set_yscale('log')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
万能近似定理
定理内容
具有一个隐藏层的前馈神经网络,只要隐藏神经元数量足够多,并且使用非线性激活函数(如 sigmoid),就可以以任意精度逼近定义在紧集上的任意连续函数。
函数近似示例
# Approximate a non-trivial 1-D function with networks of increasing width
def target_function(x):
    """Damped sine wave: sin(2*pi*x) * exp(-x)."""
    return np.sin(2 * np.pi * x) * np.exp(-x)
# Generate training data on [0, 2]
X_func = np.linspace(0, 2, 100).reshape(-1, 1)
y_func = target_function(X_func)
# Min-max normalize the targets into [0, 1] to match the sigmoid output range
y_func_norm = (y_func - y_func.min()) / (y_func.max() - y_func.min())
# Hidden-layer widths to compare
hidden_sizes = [2, 5, 20, 50]
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()
for ax, n_hidden in zip(axes, hidden_sizes):
    # One subplot per width: wider networks should fit the curve better
    nn = NeuralNetwork([1, n_hidden, 1], learning_rate=0.5)
    nn.fit(X_func, y_func_norm, epochs=5000, verbose=False)
    y_pred = nn.forward(X_func)
    ax.plot(X_func, y_func_norm, 'b-', label='目标函数', linewidth=2)
    ax.plot(X_func, y_pred, 'r--', label='神经网络', linewidth=2)
    ax.set_title(f'隐藏单元数: {n_hidden}')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.legend()
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.suptitle('万能近似定理演示', y=1.02)
plt.show()
使用sklearn
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Generate a nonlinear (two interleaved half-moons) dataset
X_moons, y_moons = make_moons(n_samples=500, noise=0.2, random_state=42)
# Standardize features (zero mean, unit variance) — helps MLP training
scaler = StandardScaler()
X_moons_scaled = scaler.fit_transform(X_moons)
# Train/test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X_moons_scaled, y_moons, test_size=0.2, random_state=42
)
# Train an sklearn MLP
mlp_sklearn = MLPClassifier(
    hidden_layer_sizes=(10, 5),  # two hidden layers
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42
)
mlp_sklearn.fit(X_train, y_train)
print(f"训练准确率: {mlp_sklearn.score(X_train, y_train):.4f}")
print(f"测试准确率: {mlp_sklearn.score(X_test, y_test):.4f}")
# Visualize the decision boundary over a dense grid
plt.figure(figsize=(10, 6))
xx, yy = np.meshgrid(np.linspace(X_moons_scaled[:, 0].min()-1, X_moons_scaled[:, 0].max()+1, 100),
                     np.linspace(X_moons_scaled[:, 1].min()-1, X_moons_scaled[:, 1].max()+1, 100))
Z = mlp_sklearn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.4, cmap='coolwarm')
plt.scatter(X_moons_scaled[:, 0], X_moons_scaled[:, 1], c=y_moons, cmap='coolwarm', edgecolors='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('MLP分类 (sklearn)')
plt.show()
不同网络结构比较
# Compare different hidden-layer architectures on the same train/test split.
# Fix: `import pandas as pd` was buried mid-script after the training loop;
# hoisted to the top of the block per import convention.
import pandas as pd

structures = [
    (10,),         # one hidden layer
    (10, 10),      # two hidden layers
    (50,),         # one wider hidden layer
    (10, 10, 10)   # three hidden layers
]
results = []
for hidden_layers in structures:
    mlp = MLPClassifier(
        hidden_layer_sizes=hidden_layers,
        max_iter=1000,
        random_state=42
    )
    mlp.fit(X_train, y_train)
    train_score = mlp.score(X_train, y_train)
    test_score = mlp.score(X_test, y_test)
    results.append({
        'structure': str(hidden_layers),
        'train': train_score,
        'test': test_score,
        # Total trainable parameters: every weight matrix plus every bias vector
        'n_params': sum(coef.size for coef in mlp.coefs_) + sum(b.size for b in mlp.intercepts_)
    })
results_df = pd.DataFrame(results)
print("\n不同网络结构比较:")
print(results_df.to_string(index=False))
前向传播详解
计算过程
对于输入 $\mathbf{x}$,第 $l$ 层的计算:
\(\mathbf{z}^{(l)} = \mathbf{W}^{(l)} \mathbf{a}^{(l-1)} + \mathbf{b}^{(l)}\) \(\mathbf{a}^{(l)} = f(\mathbf{z}^{(l)})\)
def forward_pass_visualization(X, weights, biases, activations_func):
    """Trace a forward pass layer by layer, printing z and a at each step.

    X: input row vector(s); weights/biases: per-layer parameter lists;
    activations_func: elementwise activation applied after each layer.
    Returns the final-layer activation.
    """
    print("前向传播过程:")
    print(f"输入: {X}")
    activation = X
    for layer, (w, b) in enumerate(zip(weights, biases), start=1):
        pre_activation = np.dot(activation, w) + b
        activation = activations_func(pre_activation)
        print(f"\n层 {layer}:")
        print(f" z = a * W + b = {pre_activation}")
        print(f" a = sigmoid(z) = {activation}")
    return activation
# Example: trace a tiny 2-2-1 network by hand
sample_weights = [np.array([[0.5, -0.5], [0.5, -0.5]]), np.array([[1.0], [1.0]])]
sample_biases = [np.array([0.0, 0.0]), np.array([0.0])]
sample_input = np.array([[1, 0]])
sigmoid = lambda z: 1 / (1 + np.exp(-z))
output = forward_pass_visualization(sample_input, sample_weights, sample_biases, sigmoid)
常见问题
Q1: 为什么需要隐藏层?
隐藏层使网络能够学习输入的非线性变换,从而解决线性不可分问题。
Q2: 如何选择隐藏层数量和大小?
| 问题复杂度 | 建议 |
|---|---|
| 简单问题 | 1层,10-100神经元 |
| 中等问题 | 2-3层 |
| 复杂问题 | 深层网络 |
Q3: 为什么使用非线性激活函数?
多层线性变换等价于单层,非线性激活函数是实现非线性映射的关键。
Q4: 神经网络如何初始化权重?
- 零初始化:错误(对称性问题)
- 随机初始化:常用
- Xavier/He初始化:推荐
总结
| 概念 | 描述 |
|---|---|
| 感知机 | 单层线性分类器 |
| MLP | 多层非线性网络 |
| 前向传播 | 输入→输出的计算 |
| 万能近似 | 足够宽的网络可以近似任意函数 |
参考资料
- Rosenblatt, F. (1958). “The Perceptron: A Probabilistic Model for Information Storage and Organization in the Brain”
- Cybenko, G. (1989). “Approximation by Superpositions of a Sigmoidal Function”
- Goodfellow, I. et al. “Deep Learning” - Chapter 6
- Nielsen, M. “Neural Networks and Deep Learning”
版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。
(采用 CC BY-NC-SA 4.0 许可协议进行授权)
本文标题:《 机器学习基础系列——神经网络基础 》
本文链接:http://localhost:3015/ai/%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C%E5%9F%BA%E7%A1%80.html
本文最后一次更新时间未能正确显示,文章中的某些内容可能已过时!