CNN架构、卷积操作与经典模型

前言

卷积神经网络(CNN)是处理图像数据的核心架构。它利用卷积操作自动学习空间特征,在计算机视觉任务中取得了巨大成功。


全连接网络的问题

参数爆炸

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)

# A single 224x224 RGB image, flattened to a vector.
image_size = 224 * 224 * 3
hidden_size = 1000

# Parameters of a fully connected layer mapping the flat image to the hidden layer.
fc_params = image_size * hidden_size
print(f"全连接层参数: {fc_params:,} ({fc_params / 1e6:.1f}M)")

# Parameters of a conv layer (3x3 kernels, 3 input channels, 64 filters).
conv_params = 3 * 3 * 3 * 64  # kernel_size * kernel_size * in_channels * out_channels
print(f"卷积层参数: {conv_params:,} ({conv_params / 1e3:.1f}K)")

print(f"\n参数比例: {fc_params / conv_params:.0f}x")

缺乏空间不变性

全连接网络不能识别平移后的模式。


卷积操作

2D卷积

\[y_{i,j} = \sum_{m}\sum_{n} x_{i+m, j+n} \cdot k_{m,n}\]
def conv2d(image, kernel, stride=1, padding=0):
    """Single-channel 2D convolution (cross-correlation: the kernel is not flipped).

    Args:
        image: 2D array of shape (H, W).
        kernel: 2D array of shape (kH, kW).
        stride: step between successive window positions.
        padding: zero padding added on every border.

    Returns:
        2D array of shape ((H + 2*padding - kH)//stride + 1,
                           (W + 2*padding - kW)//stride + 1).
    """
    if padding > 0:
        pad = (padding, padding)
        image = np.pad(image, (pad, pad), mode='constant')

    kh, kw = kernel.shape
    rows = (image.shape[0] - kh) // stride + 1
    cols = (image.shape[1] - kw) // stride + 1

    out = np.zeros((rows, cols))
    for r in range(rows):
        top = r * stride
        for c in range(cols):
            left = c * stride
            window = image[top:top + kh, left:left + kw]
            out[r, c] = (window * kernel).sum()

    return out

# Example: run the edge-detection kernel over a small ramp image.
image = np.array([
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10],
    [11, 12, 13, 14, 15],
    [16, 17, 18, 19, 20],
    [21, 22, 23, 24, 25]
], dtype=float)

# Laplacian-style edge-detection kernel (coefficients sum to zero).
kernel_edge = np.array([
    [-1, -1, -1],
    [-1,  8, -1],
    [-1, -1, -1]
], dtype=float)

output = conv2d(image, kernel_edge)
print("输入:")
print(image)
print("\n边缘检测输出:")
print(output)

可视化卷积过程

def visualize_convolution():
    """Apply several classic 3x3 kernels to a random image and plot the results."""
    img = np.random.randn(28, 28)

    # Well-known hand-crafted kernels for comparison.
    kernels = {
        '恒等': np.array([[0, 0, 0], [0, 1, 0], [0, 0, 0]]),
        '边缘检测': np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]]),
        '锐化': np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]]),
        '模糊': np.ones((3, 3)) / 9,
        'Sobel-X': np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]),
        'Sobel-Y': np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]])
    }

    fig, axes = plt.subplots(2, 4, figsize=(16, 8))
    panels = axes.flatten()

    # Panel 0: the untouched input; panel 1 stays empty as a spacer.
    panels[0].imshow(img, cmap='gray')
    panels[0].set_title('原图')
    panels[0].axis('off')
    panels[1].axis('off')

    # Panels 2..7: one filtered image per kernel.
    for slot, (name, kernel) in enumerate(kernels.items(), start=2):
        panel = panels[slot]
        panel.imshow(conv2d(img, kernel, padding=1), cmap='gray')
        panel.set_title(name)
        panel.axis('off')

    plt.tight_layout()
    plt.show()

visualize_convolution()

多通道卷积

def conv2d_multichannel(image, kernels, bias=0, stride=1, padding=0):
    """Multi-channel 2D convolution built on the single-channel conv2d.

    Args:
        image: input feature maps, shape (C_in, H, W).
        kernels: filter bank, shape (C_out, C_in, kH, kW).
        bias: scalar applied to every output channel (original behavior),
            or an array-like of shape (C_out,) with one bias per channel.
        stride: convolution stride, forwarded to conv2d.
        padding: zero padding, forwarded to conv2d.

    Returns:
        Output feature maps of shape (C_out, H_out, W_out).
    """
    C_out, C_in = kernels.shape[0], kernels.shape[1]
    # Broadcasting accepts both the scalar and the per-channel bias form.
    bias_vec = np.broadcast_to(np.asarray(bias, dtype=float), (C_out,))

    output = None
    for c_out in range(C_out):
        # Sum the per-input-channel convolutions for this output channel.
        acc = conv2d(image[0], kernels[c_out, 0], stride, padding)
        for c_in in range(1, C_in):
            acc = acc + conv2d(image[c_in], kernels[c_out, c_in], stride, padding)
        if output is None:
            # Allocate once the spatial output size is known — this avoids
            # the throwaway "probe" convolution the original ran just to
            # discover the output shape.
            output = np.zeros((C_out,) + acc.shape)
        output[c_out] = acc + bias_vec[c_out]

    return output

# Sanity check: convolve a random RGB image with a random filter bank.
rgb_image = np.random.randn(3, 32, 32)  # 3 channels, 32x32
kernels = np.random.randn(16, 3, 3, 3)  # 16 output channels, 3x3 kernels

output = conv2d_multichannel(rgb_image, kernels, padding=1)
print(f"输入形状: {rgb_image.shape}")
print(f"卷积核形状: {kernels.shape}")
print(f"输出形状: {output.shape}")

池化层

最大池化

def max_pool2d(image, pool_size=2, stride=2):
    """Max-pool a 2D array: keep the maximum of each pool_size x pool_size window."""
    rows = (image.shape[0] - pool_size) // stride + 1
    cols = (image.shape[1] - pool_size) // stride + 1

    pooled = np.zeros((rows, cols))
    for r in range(rows):
        y = r * stride
        for c in range(cols):
            x = c * stride
            pooled[r, c] = image[y:y + pool_size, x:x + pool_size].max()

    return pooled

def avg_pool2d(image, pool_size=2, stride=2):
    """Average-pool a 2D array: mean of each pool_size x pool_size window."""
    rows = (image.shape[0] - pool_size) // stride + 1
    cols = (image.shape[1] - pool_size) // stride + 1

    pooled = np.zeros((rows, cols))
    for r in range(rows):
        y = r * stride
        for c in range(cols):
            x = c * stride
            pooled[r, c] = image[y:y + pool_size, x:x + pool_size].mean()

    return pooled

# Test: pool a 4x4 input with 2x2 windows at stride 2.
image = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
    [13, 14, 15, 16]
], dtype=float)

print("输入:")
print(image)
print("\n最大池化 (2x2, stride=2):")
print(max_pool2d(image))
print("\n平均池化 (2x2, stride=2):")
print(avg_pool2d(image))

池化的作用

作用 说明
降维 减少计算量
平移不变性 对小位移鲁棒
防止过拟合 降低特征维度,间接减少后续层的参数量(池化本身无参数)

CNN完整实现

class ConvLayer:
    """2D convolution layer (forward pass only).

    Weights use He initialization — std = sqrt(2 / fan_in), which matches
    ReLU activations (the original comment called this Xavier, but the
    formula is the He variant).
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        self.stride = stride
        self.padding = padding

        fan_in = in_channels * kernel_size * kernel_size
        std = np.sqrt(2.0 / fan_in)
        self.W = np.random.randn(out_channels, in_channels, kernel_size, kernel_size) * std
        self.b = np.zeros(out_channels)

        # Gradient slots; a backward pass (not implemented here) would fill them.
        self.dW = None
        self.db = None

    def forward(self, X):
        """Convolve a batch X of shape (N, C_in, H, W) -> (N, C_out, H_out, W_out)."""
        self.X = X
        N, _, H, W = X.shape
        C_out, _, kH, kW = self.W.shape
        s, p = self.stride, self.padding

        H_out = (H + 2 * p - kH) // s + 1
        W_out = (W + 2 * p - kW) // s + 1

        # Zero-pad the spatial dimensions only.
        if p > 0:
            X_pad = np.pad(X, ((0, 0), (0, 0), (p, p), (p, p)), mode='constant')
        else:
            X_pad = X
        self.X_pad = X_pad

        out = np.zeros((N, C_out, H_out, W_out))
        for n in range(N):
            for f in range(C_out):
                for i in range(H_out):
                    for j in range(W_out):
                        patch = X_pad[n, :, i * s:i * s + kH, j * s:j * s + kW]
                        out[n, f, i, j] = np.sum(patch * self.W[f]) + self.b[f]

        return out


class MaxPool2D:
    """Max-pooling layer applied independently to every channel of a batch."""

    def __init__(self, pool_size=2, stride=2):
        self.pool_size = pool_size
        self.stride = stride

    def forward(self, X):
        """Pool X of shape (N, C, H, W) down to (N, C, H_out, W_out)."""
        N, C, H, W = X.shape
        k, s = self.pool_size, self.stride

        H_out = (H - k) // s + 1
        W_out = (W - k) // s + 1

        out = np.zeros((N, C, H_out, W_out))
        for n in range(N):
            for c in range(C):
                for i in range(H_out):
                    for j in range(W_out):
                        window = X[n, c, i * s:i * s + k, j * s:j * s + k]
                        out[n, c, i, j] = window.max()

        return out


class Flatten:
    """Collapse every dimension after the batch axis into a single one."""

    def forward(self, X):
        # Remember the input shape so a backward pass could undo the reshape.
        self.shape = X.shape
        batch = X.shape[0]
        return X.reshape(batch, -1)


class Dense:
    """Fully connected layer: y = X W + b, with He-initialized weights."""

    def __init__(self, in_features, out_features):
        std = np.sqrt(2.0 / in_features)
        self.W = std * np.random.randn(in_features, out_features)
        self.b = np.zeros(out_features)

    def forward(self, X):
        # Cache the input; a backward pass would need it for dW.
        self.X = X
        return np.dot(X, self.W) + self.b


class ReLU:
    """Elementwise rectified linear activation: max(x, 0)."""

    def forward(self, X):
        # Cache the input; a backward pass would use it to mask gradients.
        self.X = X
        return np.maximum(X, 0)


class SimpleCNN:
    """Minimal two-stage CNN for 28x28 single-channel inputs, 10 output classes."""

    def __init__(self):
        # Feature extractor; spatial sizes assume a 28x28 input.
        conv_stack = [
            ConvLayer(1, 16, 3, padding=1),    # 28x28 -> 28x28
            ReLU(),
            MaxPool2D(2, 2),                   # 28x28 -> 14x14
            ConvLayer(16, 32, 3, padding=1),   # 14x14 -> 14x14
            ReLU(),
            MaxPool2D(2, 2),                   # 14x14 -> 7x7
        ]
        # Classifier head on the flattened 32*7*7 = 1568 features.
        head = [
            Flatten(),
            Dense(32 * 7 * 7, 128),
            ReLU(),
            Dense(128, 10),
        ]
        self.layers = conv_stack + head

    def forward(self, X):
        """Run X through every layer in order and return the class scores."""
        out = X
        for layer in self.layers:
            out = layer.forward(out)
        return out

# Quick shape check on a dummy batch.
cnn = SimpleCNN()
x = np.random.randn(2, 1, 28, 28)  # batch of two 28x28 grayscale images
output = cnn.forward(x)
print(f"输入形状: {x.shape}")
print(f"输出形状: {output.shape}")

经典CNN架构

LeNet-5

# LeNet-5 architecture, as a printable text diagram.
lenet_structure = """
LeNet-5 (1998, Yann LeCun)
├── Conv: 1@32x32 -> 6@28x28 (5x5 kernel)
├── Pool: 6@28x28 -> 6@14x14 (2x2)
├── Conv: 6@14x14 -> 16@10x10 (5x5 kernel)
├── Pool: 16@10x10 -> 16@5x5 (2x2)
├── Flatten: 16*5*5 = 400
├── FC: 400 -> 120
├── FC: 120 -> 84
└── FC: 84 -> 10
"""
print(lenet_structure)

AlexNet

# AlexNet architecture, as a printable text diagram.
alexnet_structure = """
AlexNet (2012, Alex Krizhevsky)
├── Conv: 3@224x224 -> 96@55x55 (11x11, stride=4)
├── Pool: 96@55x55 -> 96@27x27 (3x3, stride=2)
├── Conv: 96@27x27 -> 256@27x27 (5x5, padding=2)
├── Pool: 256@27x27 -> 256@13x13
├── Conv: 256@13x13 -> 384@13x13 (3x3, padding=1)
├── Conv: 384@13x13 -> 384@13x13 (3x3, padding=1)
├── Conv: 384@13x13 -> 256@13x13 (3x3, padding=1)
├── Pool: 256@13x13 -> 256@6x6
├── FC: 256*6*6 -> 4096 + Dropout
├── FC: 4096 -> 4096 + Dropout
└── FC: 4096 -> 1000

关键创新:
- ReLU激活函数
- Dropout正则化
- 数据增强
- GPU训练
"""
print(alexnet_structure)

VGGNet

# VGG16 architecture, as a printable text diagram.
vgg16_structure = """
VGG16 (2014, Karen Simonyan)

特点: 全部使用3x3卷积

├── Block 1: 2x Conv(3x3, 64) + Pool
├── Block 2: 2x Conv(3x3, 128) + Pool
├── Block 3: 3x Conv(3x3, 256) + Pool
├── Block 4: 3x Conv(3x3, 512) + Pool
├── Block 5: 3x Conv(3x3, 512) + Pool
├── FC: 25088 -> 4096
├── FC: 4096 -> 4096
└── FC: 4096 -> 1000

参数: 138M
"""
print(vgg16_structure)

ResNet

class ResidualBlock:
    """Basic ResNet block: two 3x3 convs plus an identity (or 1x1-projected) skip."""

    def __init__(self, in_channels, out_channels, stride=1):
        self.conv1 = ConvLayer(in_channels, out_channels, 3, stride, 1)
        self.conv2 = ConvLayer(out_channels, out_channels, 3, 1, 1)

        # When the spatial size or channel count changes, a 1x1 convolution
        # projects the skip path so the residual addition lines up.
        if stride != 1 or in_channels != out_channels:
            self.shortcut = ConvLayer(in_channels, out_channels, 1, stride, 0)
        else:
            self.shortcut = None

    def forward(self, X):
        """Compute ReLU(conv2(ReLU(conv1(X))) + skip(X))."""
        skip = X if self.shortcut is None else self.shortcut.forward(X)

        out = np.maximum(0, self.conv1.forward(X))  # ReLU
        out = self.conv2.forward(out)

        out += skip                 # residual addition
        return np.maximum(0, out)   # final ReLU

# Residual-block shape check: identity skip (same channels, stride 1)
# must preserve the input shape.
res_block = ResidualBlock(64, 64)
x = np.random.randn(1, 64, 28, 28)
y = res_block.forward(x)
print(f"残差块 输入形状: {x.shape}, 输出形状: {y.shape}")

PyTorch实现

try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class CNN(nn.Module):
        """Compact CNN: three Conv/BN/ReLU stages, global average pooling, MLP head."""

        def __init__(self, num_classes=10):
            super().__init__()

            # Feature extractor; AdaptiveAvgPool2d(1) makes the head
            # independent of the input's spatial size.
            self.features = nn.Sequential(
                nn.Conv2d(1, 32, 3, padding=1),
                nn.BatchNorm2d(32),
                nn.ReLU(),
                nn.MaxPool2d(2),

                nn.Conv2d(32, 64, 3, padding=1),
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d(2),

                nn.Conv2d(64, 128, 3, padding=1),
                nn.BatchNorm2d(128),
                nn.ReLU(),
                nn.AdaptiveAvgPool2d(1)
            )

            # Classifier head on the 128-dim pooled features.
            self.classifier = nn.Sequential(
                nn.Flatten(),
                nn.Linear(128, 256),
                nn.ReLU(),
                nn.Dropout(0.5),
                nn.Linear(256, num_classes)
            )

        def forward(self, x):
            x = self.features(x)
            x = self.classifier(x)
            return x

    model = CNN()
    print("PyTorch CNN:")
    print(model)

    # Quick shape check on a dummy batch.
    x = torch.randn(4, 1, 28, 28)
    y = model(x)
    print(f"\n输入形状: {x.shape}")
    print(f"输出形状: {y.shape}")

    # Parameter counts: total vs. trainable (identical here — nothing is frozen).
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\n总参数: {total_params:,}")
    print(f"可训练参数: {trainable_params:,}")

except ImportError:
    print("PyTorch未安装")

常见问题

Q1: 卷积核大小如何选择?

大小 特点 用途
1x1 通道混合 降维、升维
3x3 最常用 特征提取
5x5 较大感受野 早期网络
7x7 大感受野 第一层

Q2: 为什么用3x3卷积?

两个3x3等效于一个5x5,但参数更少,非线性更强。

Q3: padding="same"是什么意思?

输出尺寸与输入相同。当 stride=1 且卷积核为奇数尺寸时,取 $\text{padding} = (\text{kernel\_size} - 1) / 2$ 即可。

Q4: 如何计算感受野?

\[RF_{l} = RF_{l-1} + (k_l - 1) \times \prod_{i=1}^{l-1} s_i\]

总结

概念 描述
卷积层 提取局部特征
池化层 降维、增加不变性
感受野 输出像素对应的输入区域
残差连接 解决深层网络训练问题

参考资料

  • LeCun, Y. et al. (1998). “Gradient-Based Learning Applied to Document Recognition”
  • Krizhevsky, A. et al. (2012). “ImageNet Classification with Deep Convolutional Neural Networks”
  • He, K. et al. (2016). “Deep Residual Learning for Image Recognition”
  • CS231n: Convolutional Neural Networks for Visual Recognition

版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。

(采用 CC BY-NC-SA 4.0 许可协议进行授权)

本文标题:《 机器学习基础系列——卷积神经网络 》

本文链接:http://localhost:3015/ai/%E5%8D%B7%E7%A7%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C.html

本文距最后一次更新已有一段时间,文章中的某些内容可能已过时!