深度学习：CNN之验证码识别

CodeGoat24 • 2023-01-02 • 云技术社区 • 304 阅读

一、前言

当想要训练处理图像的深度学习模型时，遇到了大尺寸图像，如果我们直接使用全连接层来无脑叠加，那会造成以下几个问题：

（1）将图像展开为向量会丢失空间信息； （2）参数过多导致效率低下，训练困难；（3）大量的参数也很快会导致网络过拟合。

此时，使用CNN卷积神经网络，就能很好地解决上述问题。

今天，我将分享一下我自己使用pytorch搭建的CNN模型，识别由英文字母和数字组成的验证码图像。

阅读本文需提前掌握以下知识：

卷积神经网络的原理
卷积层和全连接层的构成
pytorch建立CNN模型
pytorch的Dataset类

二、CNN的2大特点：

能够有效的将大数据量的图片降维成小数据量
能够有效的保留图片特征，符合图片处理的原则

三、什么类型的图像识别适用CNN？

关键特征只出现在整个图像的一小部分位置

关键特征可能在一个图像中重复出现
适当的压缩图像不影响识别

接下来我将开始介绍我是如何一步一步从0到1搭建这个模型并成功训练的

四、全局变量的设置

创建config.yaml文件
配置参数

# Image width and height in pixels (every image is resized to this size)
width: 150
height: 30
# Characters a captcha position may contain (one class per character)
alphabet: 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
# Number of characters in one captcha
numchar: 5 

train:
#  Number of passes over the training set
  epoch: 100
#  Number of samples per batch
  batch_size: 32
#  Learning rate
  lr: 0.001
#  Directory holding the training images
  train_data: ./data2/train
#  Directory holding the test images
  test_data: ./data2/test
#  Whether to run on the GPU
  is_gpu: True
#  Passed to DataLoader as num_workers (data-loading worker processes, not GPU processes)
  num_workers: 3
#  Directory the trained model checkpoints are written to
  out_model_path: ./model2

test:
#  Path of the trained model used for prediction
  model_path: ./model2/model_76_91%.path
#  Whether to run on the GPU
  is_gpu: False
#  Directory holding the sample images to predict
  samples_path: ./data2/samples

这些都是之后建模、训练、测试会用到的通用参数，为了避免在不同文件重复输入参数值，这里单独使用yaml统一管理这些配置参数。

该模型使用了尺寸为150*30的数据集，验证码长度为5。如果拿到了不同尺寸和验证码长度的数据集想要训练，只需要在这里修改即可。

五、搭建模型

这里我使用了4层卷积层+2层全连接层。

本来只使用了一层全连接层，但是准确度一直上不去，改为2层后，准确度能到91%

什么是flatten？flatten 指的是把卷积层输出的多维特征图（通道数×高×宽）展平成一维向量，这样才能送入全连接层；下面代码中的 x.view(-1, self.line_size) 做的就是这一步。

import torch.nn as nn


class CNN(nn.Module):
    """Convolutional network that scores every character of a captcha at once.

    Four convolution stages (3x3 conv -> 2x2 max-pool -> batch-norm -> ReLU)
    are followed by two stacked linear layers. For each input image the
    network outputs ``num_char * num_class`` raw scores.
    """

    def __init__(self, num_class=36, num_char=4, width=180, height=100):
        """Create the layers.

        Args:
            num_class: number of classes each character position can take.
            num_char: number of characters in one captcha.
            width: width of the input images in pixels.
            height: height of the input images in pixels.
        """
        super().__init__()
        self.num_class = num_class
        self.num_char = num_char

        # Every stage halves both spatial dimensions (floor division) and the
        # last stage emits 512 channels; the product is the length of the
        # flattened vector handed to the linear layers.
        reduced_w, reduced_h = width, height
        for _ in range(4):
            reduced_w //= 2
            reduced_h //= 2
        self.line_size = int(512 * reduced_w * reduced_h)

        # The first stage takes 3-channel input; channel counts then grow
        # 16 -> 64 -> 512 -> 512.
        self.conv1 = self._stage(3, 16)
        self.conv2 = self._stage(16, 64)
        self.conv3 = self._stage(64, 512)
        self.conv4 = self._stage(512, 512)

        # Fully connected part: a square hidden layer, then one output per
        # (character position, class) pair.
        hidden = nn.Linear(self.line_size, self.line_size)
        head = nn.Linear(self.line_size, self.num_char * self.num_class)
        self.fc = nn.Sequential(hidden, head)

    @staticmethod
    def _stage(in_channels, out_channels):
        """Return one conv -> max-pool -> batch-norm -> ReLU stage."""
        return nn.Sequential(
            # 3x3 kernel with one pixel of padding on every side keeps the
            # spatial size unchanged through the convolution.
            nn.Conv2d(in_channels, out_channels, 3, padding=(1, 1)),
            # Keep the maximum of every 2x2 window.
            nn.MaxPool2d(2, 2),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )

    def forward(self, x):
        """Run the four stages, flatten, and apply the linear layers."""
        for stage in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = stage(x)
        # Reshape to (rows, line_size); this is the flatten step.
        flattened = x.view(-1, self.line_size)
        return self.fc(flattened)

六、配置数据集处理的Dataset类

Dataset是pytorch提供的对数据进行读取和预处理的工具类，这里不进行过多介绍。

import os
from PIL import Image
import torch
from torch.utils.data import Dataset

def img_loader(img_path):
    """Load the image stored at ``img_path`` and return it in RGB mode.

    Args:
        img_path: path of the image file to read.

    Returns:
        A new ``PIL.Image.Image`` in ``'RGB'`` mode.
    """
    # ``Image.open`` keeps the underlying file open. ``convert`` returns a
    # separate image object, so the file can be closed right after it.
    with Image.open(img_path) as img:
        return img.convert('RGB')

# Build the sample list from the files in a dataset directory
def make_dataset(data_path, alphabet, num_class, num_char):
    """Scan ``data_path`` and return ``(image_path, target)`` pairs.

    The label of a file is the leading part of its name, up to the first
    ``.`` or ``_`` (``"ab3de_0012.png"`` -> ``"ab3de"``). Each character of
    the label becomes a one-hot vector of length ``num_class`` and the
    vectors are concatenated, so a target has ``num_char * num_class`` items.

    Args:
        data_path: directory whose entries are the image files.
        alphabet: string of allowed characters; a character's position in
            this string is its class index.
        num_class: length of each one-hot vector.
        num_char: required number of characters in every label.

    Returns:
        A list of ``(img_path, target)`` tuples, one per directory entry.

    Raises:
        ValueError: if a label does not have ``num_char`` characters or
            contains a character that is not in ``alphabet``.
    """
    samples = []
    for img_name in os.listdir(data_path):
        img_path = os.path.join(data_path, img_name)
        # Derive the label from the file name.
        target_str = img_name.replace("\\\\", '/').split('/')[-1].split('.')[0].split("_")[0]
        # Explicit check instead of ``assert`` so it still runs under ``-O``.
        if len(target_str) != num_char:
            raise ValueError(
                "label {!r} of file {!r} has {} characters, expected {}".format(
                    target_str, img_path, len(target_str), num_char))
        target = []
        for char in target_str:
            class_index = alphabet.find(char)
            # ``find`` returns -1 for a missing character; indexing with -1
            # would silently mark the last class, so reject it.
            if class_index < 0:
                raise ValueError(
                    "character {!r} in label of file {!r} is not in the alphabet".format(
                        char, img_path))
            vec = [0] * num_class
            vec[class_index] = 1
            target += vec

        samples.append((img_path, target))
    return samples


class CaptchaData(Dataset):
    """Dataset of captcha images whose labels are encoded in the file names.

    The sample list is built once, at construction time, by ``make_dataset``;
    images themselves are read lazily in ``__getitem__``.
    """

    def __init__(self, data_path, num_class=62, num_char=5, transform=None, target_transform=None,
                 alphabet="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"):
        """Index the images found in ``data_path``.

        Args:
            data_path: directory holding the image files.
            num_class: number of classes per character position.
            num_char: number of characters in every label.
            transform: optional callable applied to each loaded image.
            target_transform: optional callable applied to each target list.
            alphabet: string of allowed characters.
        """
        # ``super(Dataset, self)`` started the lookup after ``Dataset`` and so
        # skipped it; the zero-argument form walks the MRO from this class.
        super().__init__()
        self.data_path = data_path
        self.num_class = num_class
        self.num_char = num_char
        self.transform = transform
        self.target_transform = target_transform
        self.alphabet = alphabet
        self.samples = make_dataset(self.data_path, self.alphabet,
                                    self.num_class, self.num_char)

    def __len__(self):
        """Return the number of indexed samples."""
        return len(self.samples)

    def __getitem__(self, index):
        """Return ``(image, target_tensor)`` for the sample at ``index``."""
        img_path, target = self.samples[index]
        img = img_loader(img_path)
        # Apply the optional preprocessing callables when they were supplied.
        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, torch.Tensor(target)

七、训练模型

import logging
import torch
import torch.nn as nn
import yaml
from torch.autograd import Variable
from models import CNN
from datasets import CaptchaData
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, ToTensor, Resize
import time
import os
# Configure the root logger: INFO level, with timestamp, PID, level, function
# name and line number in every record.
logging.basicConfig(level=logging.INFO,
                format='%(asctime)s -[PID:%(process)s]-%(levelname)s-%(funcName)s-%(lineno)d: [ %(message)s ]',
                datefmt="%Y-%m-%d %H:%M:%S")

# Read the shared settings from config.yaml in the current working directory.
# NOTE(review): FullLoader is used here; the file is assumed to be trusted.
with open('./config.yaml', 'r', encoding='utf-8') as f_config:
    config_result = f_config.read()
    config = yaml.load(config_result, Loader=yaml.FullLoader)


# Training hyper-parameters and paths, taken from the "train" section.
batch_size = config["train"]["batch_size"]
base_lr = config["train"]["lr"]
max_epoch = config["train"]["epoch"]
model_path = config["train"]["out_model_path"]
train_data_path = config["train"]["train_data"]
test_data_path = config["train"]["test_data"]
num_workers = config["train"]["num_workers"]
use_gpu = config["train"]["is_gpu"]
# Image geometry and label description, taken from the top-level keys.
width = config["width"]
height = config["height"]
alphabet = config["alphabet"]
numchar = config["numchar"]
# restor = False

# Create the checkpoint output directory on first use.
if not os.path.exists(model_path):
    logging.info("新建训练模型保存路径：{}".format(model_path))
    os.makedirs(model_path)


# Whole-captcha accuracy of one batch
def calculat_acc(output, target):
    """Return the fraction of captchas whose every character is predicted right.

    Both tensors are reshaped to rows of ``len(alphabet)`` values, one row per
    character, reduced to class indices with ``argmax``, and regrouped into
    rows of ``numchar`` indices. A captcha counts as correct only when its
    whole row of indices matches.

    Args:
        output: scores produced by the network.
        target: one-hot encoded labels.

    Returns:
        The number of fully matching rows divided by the number of rows.
    """
    num_class = len(alphabet)
    chars_per_code = int(numchar)

    per_char_scores = output.view(-1, num_class)
    per_char_truth = target.view(-1, num_class)

    per_char_probs = nn.functional.softmax(per_char_scores, dim=1)
    predicted = torch.argmax(per_char_probs, dim=1).view(-1, chars_per_code)
    expected = torch.argmax(per_char_truth, dim=1).view(-1, chars_per_code)

    # One flag per captcha: 1 when all of its characters match, otherwise 0.
    hits = [1 if torch.equal(truth, guess) else 0
            for truth, guess in zip(expected, predicted)]
    return sum(hits) / len(hits)


def train():
    """Train the captcha CNN and save a checkpoint after every epoch.

    Uses the module-level settings read from the config file (paths, batch
    size, learning rate, number of epochs, image size, alphabet, ...). For
    each epoch it runs one pass over the training data, evaluates on the
    test data, prints loss/accuracy for both, and writes the model's
    ``state_dict`` to ``model_path`` as ``model_<epoch>.path``.
    """
    # Resize every image to the configured size and convert it to a tensor.
    transforms = Compose([Resize((height, width)), ToTensor()])
    train_dataset = CaptchaData(train_data_path, num_class=len(alphabet), num_char=int(numchar),
                                transform=transforms, alphabet=alphabet)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers,
                                   shuffle=True, drop_last=True)
    test_data = CaptchaData(test_data_path, num_class=len(alphabet), num_char=int(numchar),
                            transform=transforms, alphabet=alphabet)
    # Evaluate on every test sample in a fixed order: no shuffling and no
    # dropped final batch, so the reported numbers cover the whole test set.
    test_data_loader = DataLoader(test_data, batch_size=batch_size,
                                  num_workers=num_workers, shuffle=False, drop_last=False)

    cnn = CNN(num_class=len(alphabet), num_char=int(numchar), width=width, height=height)
    if use_gpu:
        cnn.cuda()

    optimizer = torch.optim.Adam(cnn.parameters(), lr=base_lr)
    criterion = nn.MultiLabelSoftMarginLoss()

    for epoch in range(max_epoch):
        start_ = time.time()

        # ---- training pass ----
        loss_history = []
        acc_history = []
        cnn.train()
        for img, target in train_data_loader:
            if use_gpu:
                img = img.cuda()
                target = target.cuda()
            output = cnn(img)
            loss = criterion(output, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            acc = calculat_acc(output, target)
            acc_history.append(float(acc))
            loss_history.append(float(loss))
        print('epoch:{},train_loss: {:.4}|train_acc: {:.4}'.format(
            epoch,
            torch.mean(torch.Tensor(loss_history)),
            torch.mean(torch.Tensor(acc_history)),
        ))

        # ---- evaluation pass ----
        test_loss_sum = 0.0
        test_acc_sum = 0.0
        test_seen = 0
        cnn.eval()
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for img, target in test_data_loader:
                # Follow the same switch as the model itself; checking
                # torch.cuda.is_available() here would move the data to the
                # GPU even when the model was left on the CPU.
                if use_gpu:
                    img = img.cuda()
                    target = target.cuda()
                output = cnn(img)

                # Weight by batch size so a smaller final batch does not skew
                # the averages.
                batch_len = img.size(0)
                test_loss_sum += float(criterion(output, target)) * batch_len
                test_acc_sum += float(calculat_acc(output, target)) * batch_len
                test_seen += batch_len
        # An empty test set yields nan rather than a division error.
        test_loss = test_loss_sum / test_seen if test_seen else float('nan')
        test_acc = test_acc_sum / test_seen if test_seen else float('nan')
        print('test_loss: {:.4}|test_acc: {:.4}'.format(test_loss, test_acc))
        print('epoch: {}|time: {:.4f}'.format(epoch, time.time() - start_))
        torch.save(cnn.state_dict(), os.path.join(model_path, "model_{}.path".format(epoch)))


# Start training only when this file is executed as a script.
if __name__ == "__main__":
    train()

训练效果图：

八、测试模型

import logging
import torch
import torch.nn as nn
import yaml
from PIL import Image
from models import CNN
from torchvision.transforms import Compose, ToTensor, Resize
import matplotlib.pyplot as plt
import os
import random

# Configure the root logger: INFO level, with timestamp, PID, level, function
# name and line number in every record.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s -[PID:%(process)s]-%(levelname)s-%(funcName)s-%(lineno)d: [ %(message)s ]',
                    datefmt="%Y-%m-%d %H:%M:%S")

# Read the shared settings from config.yaml in the current working directory.
# NOTE(review): FullLoader is used here; the file is assumed to be trusted.
with open('./config.yaml', 'r', encoding='utf-8') as f_config:
    config_result = f_config.read()
    config = yaml.load(config_result, Loader=yaml.FullLoader)

# Path of the trained model to load
model_path = config["test"]["model_path"]
# Whether to run on the GPU. This is the inference script, so it reads the
# "test" section; the original read the "train" flag by mistake.
use_gpu = config["test"]["is_gpu"]
# Image width
width = config["width"]
# Image height
height = config["height"]
# Characters a captcha position may contain
alphabet = config["alphabet"]
# Number of characters in one captcha
numchar = config["numchar"]
# Directory holding the sample images
samples_path = config["test"]["samples_path"]
# Placeholder until load_net() builds the real model. Constructing CNN() with
# its default 180x100 input would allocate a 33792x33792 linear layer that is
# thrown away as soon as load_net() runs.
model_net = None

# Build the network and load the trained weights
def load_net():
    """Create the CNN, load the saved weights and store it in ``model_net``.

    The model is built from the module-level settings, switched to
    evaluation mode, and filled with the ``state_dict`` stored at
    ``model_path``. With ``use_gpu`` set the model is moved to CUDA and the
    checkpoint is loaded with the default device mapping; otherwise the
    checkpoint is mapped to the CPU.
    """
    global model_net
    model_net = CNN(num_class=len(alphabet), num_char=int(numchar), width=width, height=height)
    if use_gpu:
        model_net = model_net.cuda()
        # None is torch.load's default: keep the devices stored in the file.
        weights_location = None
    else:
        weights_location = 'cpu'
    model_net.eval()
    model_net.load_state_dict(torch.load(model_path, map_location=weights_location))

# Predict the text of one captcha image
def predict_image(img):
    """Return the captcha text the loaded model predicts for ``img``.

    Args:
        img: a PIL image; it is converted to RGB and resized to the
            configured ``height`` x ``width`` before being fed to the model.

    Returns:
        A string with one character from ``alphabet`` per captcha position.
    """
    preprocess = Compose([Resize((height, width)), ToTensor()])
    with torch.no_grad():
        # Add the batch dimension: a single image becomes a batch of one.
        batch = preprocess(img.convert('RGB')).view(1, 3, height, width)
        if use_gpu:
            batch = batch.cuda()

        # One row of class scores per character position.
        scores = model_net(batch).view(-1, len(alphabet))
        probs = nn.functional.softmax(scores, dim=1)
        # Class index per position, regrouped per captcha; take the first
        # (and only) captcha of the batch.
        indices = torch.argmax(probs, dim=1).view(-1, numchar)[0]
        return ''.join(alphabet[i] for i in indices.cpu().detach().numpy())



# Predict every sample image in random order and show each one with the
# predicted text as the figure title.
if __name__ == "__main__":
    load_net()
    # List the sample directory and visit its files in random order.
    img_names = os.listdir(samples_path)
    random.shuffle(img_names)
    for img_name in img_names:
        img_path = os.path.join(samples_path, img_name)
        # Close the image file once it has been predicted and displayed.
        with Image.open(img_path) as img:
            v_code = predict_image(img)
            plt.figure()
            plt.title("{}".format(v_code))
            plt.imshow(img)
            plt.show()