TensorRT Inference Acceleration

Using TensorRT

work3

The wrapper classes and their methods are defined as follows:

  • Frozen graph
    • Imports the Keras model, converts its variables to constants, and removes the nodes that are only used for training.
    • get_freeze_session(): returns the frozen graph.
  • TF engine
    • __init__(): initialization; creates the session for the TF graph.
    • infer(): inference; runs the model on an input image and returns the result.
    • show_graph(): uses a third-party helper to embed TensorBoard in the notebook and visualize the nodes of the computation graph.
    • time_graphdef(): benchmarks inference speed and reports throughput in images per second.
  • TF-TRT engine
    • __init__(): initialization; creates the session for the TF graph.
    • infer(): inference; runs the model on an input image and returns the result.
    • show_graph(): uses a third-party helper to embed TensorBoard in the notebook and visualize the nodes of the computation graph.
    • save_engine(): serializes the TensorRT-optimized engine and saves it to disk.
    • time_graphdef(): benchmarks inference speed and reports throughput in images per second.
# Imports assumed for this notebook (they are not shown in the original snippet):
# TF 1.x with contrib TensorRT support, standalone Keras, and a show_graph()
# helper defined elsewhere in the notebook for the TensorBoard display.
import copy
import time

import numpy as np
import tensorflow as tf
import tensorflow.contrib.tensorrt as tftrt
from tensorflow.python.framework import ops
from keras import backend as K
from keras.models import load_model


class FrozenGraph(object):
    def __init__(self, model, shape):
        shape = (None, shape[0], shape[1], shape[2])
        x_name = 'image_tensor_x'
        with K.get_session() as sess:
            # Only the names of the entry input and the final output are needed.
            x_tensor = tf.placeholder(tf.float32, shape, x_name)
            K.set_learning_phase(0)  # must be set before removing the training nodes
            y_tensor = model(x_tensor)
            y_name = y_tensor.name[:-2]
            graph = sess.graph.as_graph_def()
            # Convert all weight variables to constants.
            graph0 = tf.graph_util.convert_variables_to_constants(sess, graph, [y_name])
            # Remove the nodes that are only used for training.
            graph1 = tf.graph_util.remove_training_nodes(graph0)

        self.x_name = [x_name]
        self.y_name = [y_name]
        self.frozen = graph1

    def get_freeze_session(self, keep_var_names=None, output_names=None, clear_devices=True):
        return self.frozen
class TfEngine(object):
    def __init__(self, graph, batch_size):
        g = tf.Graph()
        with g.as_default():
            x_op, y_op = tf.import_graph_def(
                graph_def=graph.frozen, return_elements=graph.x_name + graph.y_name)
            print("graph.x_name + graph.y_name : ", graph.x_name, " + ", graph.y_name)
            self.x_tensor = x_op.outputs[0]
            self.y_tensor = y_op.outputs[0]

        config = tf.ConfigProto(gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=0.2,
            allow_growth=True))

        self.sess = tf.Session(graph=g, config=config)
        self.batch_size = batch_size

    def infer(self, x):
        y = self.sess.run(self.y_tensor,
                          feed_dict={self.x_tensor: x})
        return y

    def show_graph(self):
        # Show the current session graph with TensorBoard in a Jupyter notebook.
        return show_graph(tf.get_default_graph().as_graph_def())

    def time_graphdef(self, input_data, iteration_time):
        # Benchmark the plain TensorFlow graph: run it repeatedly and record per-batch latency.
        with self.sess as sess:
            times = []
            # Do a few dummy iterations to let things warm up.
            for i in range(20):
                val = sess.run(self.y_tensor, {self.x_tensor: input_data})

            print("start per second inference benchmark ")
            for i in range(iteration_time):
                start = time.time()
                val = sess.run(self.y_tensor, {self.x_tensor: input_data})
                times.append(time.time() - start)
                if i % 100 == 0:
                    imgs_per = (1 / times[i]) * self.batch_size
                    print("Iteration %8.2f / %8.2f inference speed %5.2f imgs/sec, Sec/batch time: %8.5f "
                          % (i, iteration_time, imgs_per, times[i]))

            imgs_per = (1 / np.mean(times)) * self.batch_size
            print("Benchmark inference of the TF engine : %8.2f imgs/sec" % (imgs_per))

        return times
class TftrtEngine(TfEngine):
    def __init__(self, graph, batch_size, precision):
        tftrt_graph = tftrt.create_inference_graph(
            # The GraphDef object that contains the model to be transformed.
            graph.frozen,
            # The output nodes of the graph. Tensors not marked as outputs are treated as
            # transient values that the builder may optimize away.
            outputs=graph.y_name,
            # The maximum batch size TensorRT will optimize for. A smaller batch size may be
            # used at runtime; a larger one is not supported.
            max_batch_size=batch_size,
            # TensorRT operators often need temporary workspace. This limits the scratch memory
            # any layer may use; if it is too small, TensorRT may not find an implementation
            # for a given layer.
            max_workspace_size_bytes=1 << 25,
            # Precision mode: one of FP32, FP16, or INT8.
            precision_mode=precision,
            # Minimum number of TensorFlow nodes required for a subgraph to be converted to a
            # TensorRT engine; subgraphs with fewer nodes stay in TensorFlow. Smaller values
            # such as 5 are generally preferred, and this can also be tuned for INT8 engines
            # to trade conversion coverage against final accuracy.
            minimum_segment_size=50
        )

        self.opt_graph = copy.deepcopy(graph)
        self.opt_graph.frozen = tftrt_graph
        self.batch_size = batch_size
        super(TftrtEngine, self).__init__(self.opt_graph, self.batch_size)

        tf.reset_default_graph()

        # Verify that we can access the list of operations in the converted graph.
        with tf.Graph().as_default() as graph_check:
            tf.import_graph_def(tftrt_graph, name="")
            for op in graph_check.get_operations():
                print(op.name)

    def infer(self, x):
        num_tests = x.shape[0]
        y = np.empty((num_tests, self.y_tensor.shape[1]), np.float32)
        batch_size = self.batch_size

        for i in range(0, num_tests, batch_size):
            x_part = x[i:i + batch_size]
            y_part = self.sess.run(self.y_tensor,
                                   feed_dict={self.x_tensor: x_part})
            y[i:i + batch_size] = y_part
        return y

    def save_engine(self, output_file_name):
        # Serialize the TF-TRT optimized GraphDef and save it to disk.
        print('Number of nodes after conversion: {}'.format(len(self.opt_graph.frozen.node)))
        with tf.gfile.GFile(output_file_name, 'wb') as f:
            f.write(self.opt_graph.frozen.SerializeToString())

    def show_graph(self):
        # Print the operations of the optimized graph.
        ops.reset_default_graph()
        with tf.Graph().as_default() as graph:
            tf.import_graph_def(self.opt_graph.frozen, name="")
            for op in graph.get_operations():
                print(op.name)

    def time_graphdef(self, input_data, iteration_time):
        # Benchmark the TF-TRT graph: run it repeatedly and record per-batch latency.
        with self.sess as sess:
            times = []
            # Do a few dummy iterations to let things warm up.
            for i in range(20):
                val = sess.run(self.y_tensor, {self.x_tensor: input_data})

            print("start per second inference benchmark ")
            for i in range(iteration_time):
                start = time.time()
                val = sess.run(self.y_tensor, {self.x_tensor: input_data})
                times.append(time.time() - start)
                if i % 100 == 0:
                    imgs_per = (1 / times[i]) * self.batch_size
                    print("Iteration %8.2f / %8.2f inference speed %5.2f imgs/sec, Sec/batch time: %8.5f "
                          % (i, iteration_time, imgs_per, times[i]))

            imgs_per = (1 / np.mean(times)) * self.batch_size
            print("Benchmark inference of the TF-TRT engine : %8.2f imgs/sec" % (imgs_per))
        return times

Usage:

# This line must be executed before loading the Keras model.
K.set_learning_phase(0)
model = load_model('/dli/data/1217model_v2.h5')
batch_size = 128

img_shape = (120, 120, 3)
input_data = np.random.randn(batch_size,120, 120, 3)

frozen_graph = FrozenGraph(model, img_shape)
#frozen_graph = FrozenGraph(prebuilt_graph_path, img_shape)
del model

tf_engine = TfEngine(frozen_graph, batch_size)
tf_times = tf_engine.time_graphdef(input_data,500)

print('Average TF graph execution time: {:0.5f} s'.format(np.mean(tf_times)))

del tf_engine

##
trt_graph_def = TftrtEngine(frozen_graph, batch_size, 'FP16')
#trt_graph_def = TftrtEngine(frozen_graph, batch_size, 'FP32')
trt_times = trt_graph_def.time_graphdef(input_data,500)
##

#tf_times = time_graphdef(tf_engine, input_data)
#trt_times = time_graphdef(trt_graph, input_data)

print("\n \n Summary : \n ")
print('Average TF graph execution time: {:0.5f} s'.format(np.mean(tf_times)))
print('Average TRT graph execution time: {:0.5f} s'.format(np.mean(trt_times)))
print('Speedup factor: {:0.2f}'.format(np.mean(tf_times) / np.mean(trt_times)))

# Save the engine for future deployment.
trt_graph_def.save_engine('TF_TRT_FP16.engine')
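
For deployment, the GraphDef serialized by save_engine() can later be read back and executed with plain TF 1.x APIs. The sketch below is a minimal, hypothetical example: the input tensor name comes from the FrozenGraph class ('image_tensor_x'), while the output tensor name depends on the Keras model, so 'model_output' is only a placeholder (in practice use frozen_graph.y_name[0]).

# Minimal sketch (TF 1.x) of reloading the serialized TF-TRT graph for inference.
# 'model_output' is a placeholder name, not taken from the original notebook.
import numpy as np
import tensorflow as tf

with tf.gfile.GFile('TF_TRT_FP16.engine', 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

graph = tf.Graph()
with graph.as_default():
    tf.import_graph_def(graph_def, name='')

with tf.Session(graph=graph) as sess:
    x = graph.get_tensor_by_name('image_tensor_x:0')
    y = graph.get_tensor_by_name('model_output:0')  # placeholder output name
    result = sess.run(y, feed_dict={x: np.random.randn(1, 120, 120, 3)})
    print(result.shape)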

Sliding-window demo

%matplotlib inline

# Note: this demo is written for Python 2 (print statements, integer division).
import numpy as np
import matplotlib.pyplot as plt
import caffe
import time

MODEL_JOB_NUM = '##FIXME##'  ## Remember to set this to be the job number for your model
DATASET_JOB_NUM = '##FIXME##'  ## Remember to set this to be the job number for your dataset

MODEL_FILE = '/dli/data/digits/' + MODEL_JOB_NUM + '/deploy.prototxt'  # Do not change
PRETRAINED = '/dli/data/digits/' + MODEL_JOB_NUM + '/snapshot_iter_735.caffemodel'  # Do not change
MEAN_IMAGE = '/dli/data/digits/' + DATASET_JOB_NUM + '/mean.jpg'  # Do not change

# load the mean image
mean_image = caffe.io.load_image(MEAN_IMAGE)

# Choose a random image to test against
# RANDOM_IMAGE = str(np.random.randint(10))
IMAGE_FILE = '/dli/data/LouieReady.png'

# Tell Caffe to use the GPU
caffe.set_mode_gpu()
# Initialize the Caffe model using the model trained in DIGITS
net = caffe.Classifier(MODEL_FILE, PRETRAINED,
                       channel_swap=(2, 1, 0),
                       raw_scale=255,
                       image_dims=(256, 256))

# Load the input image into a numpy array and display it
input_image = caffe.io.load_image(IMAGE_FILE)
plt.imshow(input_image)
plt.show()

# Calculate how many 256x256 grid squares are in the image
rows = input_image.shape[0]/256
cols = input_image.shape[1]/256

# Subtract the mean image
for i in range(0, rows):
    for j in range(0, cols):
        input_image[i*256:(i+1)*256, j*256:(j+1)*256] -= mean_image

# Initialize an empty array for the detections
detections = np.zeros((rows, cols))

# Iterate over each grid square using the model to make a class prediction
start = time.time()
for i in range(0, rows):
    for j in range(0, cols):
        grid_square = input_image[i*256:(i+1)*256, j*256:(j+1)*256]
        # make prediction
        prediction = net.predict([grid_square])
        detections[i, j] = prediction[0].argmax()
end = time.time()

# Display the predicted class for each grid square
plt.imshow(detections)
plt.show()

# Display total time to perform inference
print 'Total inference time (sliding window without overlap): ' + str(end-start) + ' seconds'

# define the amount of overlap between grid cells
OVERLAP = 0.25
grid_rows = int((rows-1)/(1-OVERLAP))+1
grid_cols = int((cols-1)/(1-OVERLAP))+1

print "Image has %d*%d blocks of 256 pixels" % (rows, cols)
print "With overlap=%f grid_size=%d*%d" % (OVERLAP, grid_rows, grid_cols)

# Initialize an empty array for the detections
detections = np.zeros((grid_rows, grid_cols))

# Iterate over each grid square using the model to make a class prediction
start = time.time()
for i in range(0, grid_rows):
    for j in range(0, grid_cols):
        start_col = int(j*256*(1-OVERLAP))
        start_row = int(i*256*(1-OVERLAP))
        grid_square = input_image[start_row:start_row+256, start_col:start_col+256]
        # make prediction
        prediction = net.predict([grid_square])
        detections[i, j] = prediction[0].argmax()
end = time.time()

# Display the predicted class for each grid square
plt.imshow(detections)
plt.show()

# Display total time to perform inference
print ('Total inference time (sliding window with %f%% overlap): ' % (OVERLAP*100)) + str(end-start) + ' seconds'

# now with batched inference (one row of grid squares per forward pass)
# we are not using a caffe.Classifier here so we need to do the pre-processing
# manually. The model was trained on random crops (256*256->227*227) so we
# need to do the cropping below. Similarly, we need to convert images
# from Numpy's Height*Width*Channel (HWC) format to Channel*Height*Width (CHW)
# Lastly, we need to swap channels from RGB to BGR
net = caffe.Net(MODEL_FILE, PRETRAINED, caffe.TEST)
start = time.time()
net.blobs['data'].reshape(*[grid_cols, 3, 227, 227])

# Initialize an empty array for the detections
detections = np.zeros((rows, cols))

for i in range(0, rows):
    for j in range(0, cols):
        grid_square = input_image[i*256:(i+1)*256, j*256:(j+1)*256]
        # add to batch
        grid_square = grid_square[14:241, 14:241]  # 227*227 center crop
        image = np.copy(grid_square.transpose(2, 0, 1))  # transpose from HWC to CHW
        image = image * 255  # rescale
        image = image[(2, 1, 0), :, :]  # swap channels from RGB to BGR
        net.blobs['data'].data[j] = image
    # make one batched prediction for the whole row
    output = net.forward()[net.outputs[-1]]
    for j in range(0, cols):
        detections[i, j] = output[j].argmax()
end = time.time()

# Display the predicted class for each grid square
plt.imshow(detections)
plt.show()

# Display total time to perform inference
print 'Total inference time (batched inference): ' + str(end-start) + ' seconds'
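
As a quick check of the overlap arithmetic used above, the snippet below recomputes grid_rows and grid_cols for a hypothetical image of 4 x 5 blocks of 256 pixels (the real values depend on the loaded image):

# Hypothetical example of the overlap grid arithmetic (not tied to a specific image).
rows, cols = 4, 5          # image assumed to be 4 x 5 blocks of 256 pixels
OVERLAP = 0.25
grid_rows = int((rows - 1) / (1 - OVERLAP)) + 1   # int(3 / 0.75) + 1 = 5
grid_cols = int((cols - 1) / (1 - OVERLAP)) + 1   # int(4 / 0.75) + 1 = 6
print("grid: %d x %d" % (grid_rows, grid_cols))   # grid: 5 x 6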

F1-Score calculation demo

  • FP: False Positive
  • TP: True Positive
  • FN: False Negative
  • TN: True Negative
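
From these four counts, the precision, recall, and F1-score used in the function below follow the standard definitions. A small self-contained numeric example (the counts are made up for illustration):

# Standard precision / recall / F1 definitions (Python 3), with made-up counts.
TP, FP, FN = 8, 2, 4

precision = TP / (TP + FP)                                  # 8 / 10 = 0.8
recall = TP / (TP + FN)                                     # 8 / 12 ~= 0.667
f1 = 2 * precision * recall / (precision + recall)          # ~= 0.727
print(round(precision, 3), round(recall, 3), round(f1, 3))  # 0.8 0.667 0.727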
# import the required modules/libraries.
# This should be a self-sufficient method.
import numpy as np
import pandas as pd


# Function to find the threshold that yields the given F1-score on the given data.
def get_threshold(predictions, ground_truth, f1_score):
    """
    Return the proper threshold value.

    :param predictions: an array of prediction scores
    :param ground_truth: an array of the same size as predictions containing the labels of the dataset
    :param f1_score: the desired F1-score
    :return: the proper threshold value
    """
    # HINT: to find the right value, iterate the threshold from 0.01 to 0.99 in 0.01 increments,
    # calculate the F1-score at each threshold, and return the first value whose F1-score
    # matches the input f1_score.

    result = -0.01

    predicted_values = predictions.reshape(len(ground_truth))

    # List of candidate thresholds: [0.01, 0.02, ..., 0.99]
    threshold = [x / 100 for x in range(1, 100)]

    for value in threshold:

        predicted_labels = (predicted_values > value).astype(int)

        # True Positives: predicted label 1 and true label 1
        TP = np.sum((predicted_labels == 1) & (ground_truth == 1))

        # True Negatives: predicted label 0 and true label 0
        TN = np.sum((predicted_labels == 0) & (ground_truth == 0))

        # False Positives: predicted label 1 and true label 0
        FP = np.sum((predicted_labels == 1) & (ground_truth == 0))

        # False Negatives: predicted label 0 and true label 1
        FN = np.sum((predicted_labels == 0) & (ground_truth == 1))

        # Precision and recall
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)

        # F1-score, rounded to two decimals before comparison
        f1_score_calculated = round(2 * (precision * recall / (precision + recall)), 2)
        print(f1_score_calculated)
        if f1_score_calculated == f1_score:
            result = value
            break

    return result
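
A minimal usage sketch with synthetic data (the variable names and values here are illustrative assumptions, not part of the original exercise):

# Hypothetical usage of get_threshold() with synthetic scores and labels.
import numpy as np

rng = np.random.RandomState(0)
ground_truth = rng.randint(0, 2, size=1000)                    # binary labels
scores = ground_truth * 0.6 + 0.2 + rng.normal(0, 0.2, 1000)   # noisy scores around the labels
predictions = np.clip(scores, 0.0, 1.0)

threshold = get_threshold(predictions, ground_truth, f1_score=0.9)
print('threshold found:', threshold)  # -0.01 means no threshold matched that F1-score exactly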