Advanced PTQ
MQBench provides a simple API for advanced PTQ. Follow the step-by-step instructions below to quantize your model.
1. Prepare the FP32 model.
import torchvision.models as models
from mqbench.convert_deploy import convert_deploy
from mqbench.prepare_by_platform import prepare_by_platform, BackendType
from mqbench.utils.state import enable_calibration, enable_quantization
from mqbench.advanced_ptq import ptq_reconstruction
# first, initialize the FP32 model with pretrained parameters.
model = models.__dict__["resnet18"](pretrained=True)
model.eval()
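The later loops iterate over a data object that this guide does not define. As a minimal sketch, it could be a torchvision DataLoader over a held-out calibration set; the dataset path and batch size below are assumptions, not part of the MQBench API.

import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

# standard ImageNet preprocessing; adjust to match your training recipe
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# hypothetical calibration data location; a few hundred training images suffice
dataset = ImageFolder('/path/to/imagenet/train', transform=transform)
data = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=4)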
2. Configure the advanced PTQ algorithm and the backend (algorithm variants are sketched after the code).
# configuration
ptq_reconstruction_config = {
    'pattern': 'block',                   # 'layer' for Adaround, 'block' for BRECQ and QDrop
    'scale_lr': 4.0e-5,                   # learning rate for learning the step size of activation quantizers
    'warm_up': 0.2,                       # first 0.2 * max_count iterations run without rounding regularization
    'weight': 0.01,                       # loss weight of the rounding regularization term
    'max_count': 20000,                   # number of optimization iterations
    'b_range': [20, 2],                   # beta decaying range
    'keep_gpu': True,                     # keep calibration data on GPU (True) or CPU (False)
    'round_mode': 'learned_hard_sigmoid', # weight reconstruction mode; only learned_hard_sigmoid is supported
    'prob': 1.0,                          # QDrop's probability of quantizing activations; 1.0 for Adaround and BRECQ
}
# backend options
backend = BackendType.Tensorrt
# backend = BackendType.SNPE
# backend = BackendType.PPLW8A16
# backend = BackendType.NNIE
# backend = BackendType.Vitis
# backend = BackendType.ONNX_QNN
# backend = BackendType.PPLCUDA
# backend = BackendType.OPENVINO
# backend = BackendType.Tengine_u8
# backend = BackendType.Tensorrt_NLP
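The same config selects among the three supported algorithms via 'pattern' and 'prob'. As a sketch of the variants (the 0.5 keep probability for QDrop is an assumed typical value, not prescribed by this guide):

# the config above corresponds to BRECQ: block-wise reconstruction, prob 1.0

# Adaround variant: layer-wise reconstruction, no activation dropping
adaround_config = dict(ptq_reconstruction_config, pattern='layer', prob=1.0)

# QDrop variant: block-wise reconstruction with random activation dropping
qdrop_config = dict(ptq_reconstruction_config, pattern='block', prob=0.5)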
3. Prepare and quantize the model.
# trace model and add quant nodes for the chosen backend
model = prepare_by_platform(model, backend)

# calibration loop
enable_calibration(model)
for i, batch in enumerate(data):
    # do forward procedures
    ...
# ptq_reconstruction loop: collect cali_batchsize batches of reconstruction data
# (cali_batchsize is the user-chosen number of calibration batches)
stacked_tensor = []
for i, batch_data in enumerate(data):
    if i == cali_batchsize:
        break
    stacked_tensor.append(batch_data)

# start ptq_reconstruction
model = ptq_reconstruction(model, stacked_tensor, ptq_reconstruction_config)
# evaluation loop
enable_quantization(model)
for i, batch in enumerate(data):
    # do forward procedures
    ...
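As a concrete version of the evaluation loop above, here is a top-1 accuracy sketch. It assumes the DataLoader from step 1 yields (images, labels) pairs and that enable_quantization(model) has already been called.

import torch

# measure top-1 accuracy of the quantized model
correct, total = 0, 0
with torch.no_grad():
    for images, labels in data:
        preds = model(images).argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.numel()
print(f'quantized top-1 accuracy: {correct / total:.4f}')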
4. Export the quantized model.
# deploy model: remove fake-quantize nodes and dump quantization params such as clip ranges
input_shape = {'data': [10, 3, 224, 224]}
convert_deploy(model, backend, input_shape)
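convert_deploy writes a backend-ready ONNX model along with backend-specific quantization parameter files. A quick sanity check with onnxruntime might look like the sketch below; 'mqbench_qmodel.onnx' is a hypothetical file name, so substitute the file convert_deploy actually wrote.

import numpy as np
import onnxruntime as ort

# 'mqbench_qmodel.onnx' is a placeholder for the exported file
sess = ort.InferenceSession('mqbench_qmodel.onnx', providers=['CPUExecutionProvider'])
dummy = np.random.randn(10, 3, 224, 224).astype(np.float32)
outputs = sess.run(None, {'data': dummy})  # 'data' matches the input_shape key
print(outputs[0].shape)  # expect (10, 1000) for resnet18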
You can find algorithm details in the Advanced PTQ documentation. We also provide an example here.