OpenVINO CPU Acceleration Survey

Theory

1. Introduction

OpenVINO™ is an open-source toolkit for optimizing and deploying AI inference.

  • Boost deep learning performance in computer vision, automatic speech recognition, natural language processing, and other common tasks
  • Use models trained with popular frameworks such as TensorFlow and PyTorch
  • Reduce resource demands and deploy efficiently on a range of Intel® platforms from edge to cloud

(Figure: the OpenVINO workflow of training, optimization, and deployment)

2. Optimization Principles

  • Linear Operations Fusing (operator fusion): adjacent linear operations, such as a convolution followed by BatchNorm, are merged into a single node (see the sketch after this list)
  • Precision Calibration
    In practice this means INT8 quantization of the model; Intel's NNCF can also be used for other kinds of model compression
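
To make operator fusion concrete, here is a minimal numpy sketch (an illustration only, not OpenVINO's internal implementation): a BatchNorm that follows a linear or convolution layer can be folded into that layer's weights and bias ahead of time, so one fused op does the work of two at inference time.

import numpy as np

# Toy example: fold y = gamma * ((W x + b) - mean) / sqrt(var + eps) + beta
# into a single linear layer y = W' x + b'.
rng = np.random.default_rng(0)
W, b = rng.normal(size=(4, 8)), rng.normal(size=4)      # linear layer
gamma, beta = rng.normal(size=4), rng.normal(size=4)    # batch-norm scale / shift
mean, var, eps = rng.normal(size=4), rng.random(4) + 0.1, 1e-5

scale = gamma / np.sqrt(var + eps)
W_fused = W * scale[:, None]
b_fused = (b - mean) * scale + beta

x = rng.normal(size=8)
unfused = gamma * ((W @ x + b) - mean) / np.sqrt(var + eps) + beta
fused = W_fused @ x + b_fused
print(np.allclose(unfused, fused))  # True: one fused op replaces two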

3. Commonly Used OpenVINO Tools

The hands-on part below uses three of them: Model Optimizer (mo) to convert trained models into OpenVINO IR, the Post-training Optimization Tool (POT) for INT8 quantization, and benchmark_app for performance measurement.

Hands-On

1. Environment Setup

# Pull and start the container
docker pull openvino/ubuntu18_dev:latest
docker run -itd -p 8501:8501 -p 8500:8500 -p 8889:8889 -v "/root/openvino_notebooks:/openvino_notebooks" openvino/ubuntu18_dev:latest

# Enter the container (replace bc89fe5f98e6 with your container ID from docker ps)
docker exec -it -u root bc89fe5f98e6 /bin/bash

# Clone the examples repository
git clone --depth=1 https://github.com/openvinotoolkit/openvino_notebooks.git

# Install Jupyter and the notebook requirements
cd openvino_notebooks
sudo apt-get update
sudo apt-get upgrade
sudo apt-get install python3-venv build-essential python3-dev git-all
python -m pip install --upgrade pip
pip install -r requirements.txt
python -m ipykernel install --user --name openvino_env

# (Optional) install an editor
apt-get install vim
# Start JupyterLab
jupyter lab notebooks --allow-root

2. Model Conversion (in a Jupyter notebook)

import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Markdown, display

# Construct the command for Model Optimizer
mo_command = f"""mo
                 --saved_model_dir "/openvino_notebooks/open_model_zoo_models/custom/origin_model"
                 --data_type FP32
                 --input dense_input,sparse_ids_input,sparse_wgt_input,seq_50_input
                 --input_shape [100,587],[100,53],[100,53],[100,6,50]
                 --output_dir "/openvino_notebooks/open_model_zoo_models/custom/fp32"
                 --output "Identity"
                 """
mo_command = " ".join(mo_command.split())
print("Model Optimizer command to convert TensorFlow to OpenVINO:")
display(Markdown(f"`{mo_command}`"))

! $mo_command
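
A quick smoke test of the converted IR with OpenVINO Runtime. This is a minimal sketch: it assumes the IR keeps the original input names and int32 integer inputs, and it feeds random data whose shapes follow the --input_shape values passed to mo above.

import numpy as np
from openvino.runtime import Core

core = Core()
model = core.read_model("/openvino_notebooks/open_model_zoo_models/custom/fp32/saved_model.xml")
compiled_model = core.compile_model(model, device_name="CPU")

# Random inputs purely to check that inference runs; shapes match --input_shape.
dummy_inputs = {
    "dense_input": np.random.rand(100, 587).astype(np.float32),
    "sparse_ids_input": np.random.randint(0, 10, size=(100, 53)).astype(np.int32),
    "sparse_wgt_input": np.random.rand(100, 53).astype(np.float32),
    "seq_50_input": np.random.randint(0, 10, size=(100, 6, 50)).astype(np.int32),
}
result = compiled_model(dummy_inputs)[compiled_model.output(0)]
print(result.shape)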

3. Model Quantization (in a Jupyter notebook)

import os
from pathlib import Path
from openvino.tools.pot import DataLoader
import tensorflow as tf
import math
from yaspin import yaspin


# Read data from the TFRecord file
def input_fn_tfrecord(filenames, batch_size=256):
    """make input fn for tfrecord file
    """
    reader = tf.data.TFRecordDataset(
        filenames,
        num_parallel_reads=10,
    ).shuffle(100000, reshuffle_each_iteration=True)

    features = {
        'dense_input': tf.io.FixedLenSequenceFeature([], tf.float32, allow_missing=True),
        'sparse_ids_input': tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
        'sparse_wgt_input': tf.io.FixedLenSequenceFeature([], tf.float32, allow_missing=True),
        'seq_50_input': tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
        'is_click': tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
    }

    def _parse_example(example):
        """
            parse data
        """
        parse_data = tf.io.parse_single_example(example, features)
        return [
            tf.reshape(parse_data['dense_input'][:587], shape=[587]),
            tf.reshape(tf.cast(parse_data["sparse_ids_input"], tf.int32), shape=[53]),
            tf.reshape(parse_data["sparse_wgt_input"], shape=[53]),
            tf.reshape(tf.reshape(tf.cast(parse_data['seq_50_input'], tf.int32), [-1, 50])[:6, :], shape=[6, 50]),
            tf.reshape(parse_data['is_click'], shape=[1])]

    dataset = reader.map(_parse_example, num_parallel_calls=11)  # parse records
    dataset = dataset.prefetch(buffer_size=batch_size)
    batch = dataset.batch(batch_size=batch_size)
    return batch

# Data preprocessing
data_file = "/openvino_notebooks/open_model_zoo_models/custom/eval_processed_data.tfrecords"
batch_size = 100
inputs_list = ['dense_input', 'sparse_ids_input', 'sparse_wgt_input', 'seq_50_input']
total_samples = sum(1 for _ in tf.compat.v1.python_io.tf_record_iterator(data_file))
n = math.ceil(float(total_samples) / batch_size)
data = []
with tf.compat.v1.Session() as sess:
    dataset = input_fn_tfrecord(data_file, batch_size)
    dataset_iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
    next_element = dataset_iterator.get_next()
    for i in range(n):
        # Run the iterator inside the loop so every pass yields a new batch
        batch_arrays = sess.run(next_element)
        records = {
            'dense_input': batch_arrays[0],
            'sparse_ids_input': batch_arrays[1],
            'sparse_wgt_input': batch_arrays[2],
            'seq_50_input': batch_arrays[3],
            'label': batch_arrays[4],
        }
        data.append(records)


class OriginModelDataLoader(DataLoader):
    def __init__(self, data_list):
        """POT data loader backed by pre-batched records.

        Args:
            data_list (list): batches of model inputs and labels read from the TFRecord file
        """
        self.data_list = data_list

    def __getitem__(self, index):
        if index >= len(self.data_list):
            raise IndexError("Index out of dataset size")
        current_item = self.data_list[index]
        label = current_item['label']
        feat_names = {'dense_input', 'sparse_ids_input', 'sparse_wgt_input', 'seq_50_input'}
        p2 = {key: value for key, value in current_item.items() if key in feat_names}
        return ((index, label), p2)

    def __len__(self):
        return len(self.data_list)
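
A quick sanity check of the loader (a small sketch; it simply inspects the legacy POT item layout of ((index, annotation), data) that the class above returns):

check_loader = OriginModelDataLoader(data)
annotation, features = check_loader[0]
print(len(check_loader))                   # number of pre-batched records
print(annotation[0], annotation[1].shape)  # batch index and label array shape
print(sorted(features.keys()))             # the four model input names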

# Run model quantization with POT
import time
import addict
from openvino.tools.pot import IEEngine
from openvino.tools.pot import load_model, save_model
from openvino.tools.pot import compress_model_weights
from openvino.tools.pot import create_pipeline
from openvino.tools.pot import Metric

path_to_xml = "/openvino_notebooks/open_model_zoo_models/custom/fp32/saved_model.xml"
path_to_bin = "/openvino_notebooks/open_model_zoo_models/custom/fp32/saved_model.bin"
data_file = "/openvino_notebooks/open_model_zoo_models/custom/eval_processed_data.tfrecords"
batch_size = 512

# Model config specifies the model name and paths to model .xml and .bin file
model_config = addict.Dict(
    {
        "model_name": "origin_model",
        "model": path_to_xml,
        "weights": path_to_bin,
    }
)

# Engine config
engine_config = addict.Dict({"device": "CPU"})

algorithms = [
    {
        "name": "AccuracyAwareQuantization",
        "params": {
            "target_device": "CPU",
            "stat_subset_size": 300,
            "maximal_drop": 0.001, # 制定精度损失不超过0.001
        },
    }
]
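
# NOTE: the Accuracy metric used below was not included in this write-up, so the class
# here is a minimal sketch (an assumption): it treats the single "Identity" output as a
# click probability and scores binary accuracy against the is_click label at a 0.5 threshold.
import numpy as np

class Accuracy(Metric):
    def __init__(self):
        super().__init__()
        self._name = "accuracy"
        self._matches = []

    @property
    def value(self):
        # Accuracy of the most recently processed batch
        return {self._name: float(np.mean(self._matches[-1]))}

    @property
    def avg_value(self):
        # Average accuracy over all processed batches
        return {self._name: float(np.mean(np.concatenate(self._matches)))}

    def update(self, output, target):
        # Threshold the predicted probability and compare with the labels
        predictions = (np.ravel(output[0]) > 0.5).astype(np.int64)
        labels = np.ravel(target).astype(np.int64)
        self._matches.append(predictions == labels)

    def reset(self):
        # Clear accumulated matches before a new evaluation run
        self._matches = []

    def get_attributes(self):
        # Metric metadata used by POT; AccuracyAwareQuantization needs the direction
        return {self._name: {"direction": "higher-better", "type": "accuracy"}}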

# Step 1: implement and create user's data loader
data_loader = OriginModelDataLoader(data)

# Step 2: load model
ir_model = load_model(model_config=model_config)
metric = Accuracy()

# Step 3: Initialize the engine for metric calculation and statistics collection.
engine = IEEngine(config=engine_config, data_loader=data_loader, metric=metric)

# Step 4: Create a pipeline of compression algorithms and run it.
pipeline = create_pipeline(algorithms, engine)
algorithm_name = pipeline.algo_seq[0].name
with yaspin(
    text=f"Executing POT pipeline on {model_config['model']} with {algorithm_name}"
) as sp:
    start_time = time.perf_counter()
    compressed_model = pipeline.run(ir_model)
    end_time = time.perf_counter()
    sp.ok("✔")
print(f"Quantization finished in {end_time - start_time:.2f} seconds")

# Step 5 (Optional): Compress model weights to quantized precision
#                    in order to reduce the size of the final .bin file.
compress_model_weights(compressed_model)

# Step 6: Save the compressed model to the desired path.
# Set save_path to the directory where the model should be saved
compressed_model_paths = save_model(
    model=compressed_model,
    save_path="optimized_model",
    model_name="optimized_model",
)

# Step 7 (Optional): Evaluate the original and the quantized model and print the results.
original_metric_results = pipeline.evaluate(ir_model)
if original_metric_results:
    print(f"Accuracy of the original model:  {next(iter(original_metric_results.values())):.5f}")

quantized_metric_results = pipeline.evaluate(compressed_model)
if quantized_metric_results:
    print(f"Accuracy of the quantized model: {next(iter(quantized_metric_results.values())):.5f}")

Comparison Before and After Optimization

# Compare model sizes before and after optimization
ir_path = "/openvino_notebooks/open_model_zoo_models/custom/fp32/saved_model.xml"
quantized_model_path = "/openvino_notebooks/notebooks/002-openvino-api/optimized_model/optimized_model.xml"
original_model_size = Path(ir_path).with_suffix(".bin").stat().st_size / 1024
quantized_model_size = Path(quantized_model_path).with_suffix(".bin").stat().st_size / 1024
compression_ratio = (original_model_size - quantized_model_size) / original_model_size * 100
print(f"FP32 model size: {original_model_size:.2f} KB")
print(f"INT8 model size: {quantized_model_size:.2f} KB")
print(f"Compression ratio : {compression_ratio:.4f}%")

# Performance comparison; benchmark_app is the official OpenVINO benchmarking tool
#!benchmark_app --help
model_name = "quantized_model"
benchmark_command = f"benchmark_app -m {quantized_model_path} -t 15 -d CPU -api async -hint latency"
display(Markdown(f"Benchmark command: `{benchmark_command}`"))
display(Markdown(f"Benchmarking {model_name} on CPU with async inference for 15 seconds..."))
! $benchmark_command

#!benchmark_app --help
model_path = "/openvino_notebooks/open_model_zoo_models/custom/fp32/saved_model.xml"
model_name = "origin_model"
benchmark_command = f"benchmark_app -m {model_path} -t 15 -hint latency "
display(Markdown(f"Benchmark command: `{benchmark_command}`"))
display(Markdown(f"Benchmarking {model_name} on CPU with async inference for 15 seconds..."))
! $benchmark_command

4. Experimental Conclusions

Model name            Size                  QPS
origin_model          34231.60 KB           88.93
quantization model    12384.25 KB           105.58
Improvement ratio     63.8222% reduction    18.72% increase

Looking at the logs produced during conversion, we found that because the model structure is fairly simple and compact and its features are very sparse, few nodes could actually be fused or quantized during conversion, so the performance gain is not especially pronounced.
