@@ -1,16 +1,19 @@
 import torch
-from shark.shark_runner import SharkBenchmarkRunner
+from shark.shark_benchmark_runner import SharkBenchmarkRunner
 from shark.parser import shark_args
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
-from onnxruntime.transformers.benchmark import run_pytorch, run_tensorflow, run_onnxruntime
+from onnxruntime.transformers.benchmark import (
+    run_pytorch,
+    run_tensorflow,
+    run_onnxruntime,
+)
 from onnxruntime.transformers.huggingface_models import MODELS
 from onnxruntime.transformers.benchmark_helper import ConfigModifier, Precision
 import os
 import psutil


 class OnnxFusionOptions(object):
-
     def __init__(self):
         self.disable_gelu = False
         self.disable_layer_norm = False
@@ -25,17 +28,13 @@ def __init__(self):


 class HuggingFaceLanguage(torch.nn.Module):
-
     def __init__(self, hf_model_name):
         super().__init__()
         self.model = AutoModelForSequenceClassification.from_pretrained(
             hf_model_name,  # The pretrained model.
-            num_labels=
-            2,  # The number of output labels--2 for binary classification.
-            output_attentions=
-            False,  # Whether the model returns attentions weights.
-            output_hidden_states=
-            False,  # Whether the model returns all hidden-states.
+            num_labels=2,  # The number of output labels--2 for binary classification.
+            output_attentions=False,  # Whether the model returns attentions weights.
+            output_hidden_states=False,  # Whether the model returns all hidden-states.
             torchscript=True,
         )

@@ -62,8 +61,16 @@ def __init__(
         )
         self.model_name = model_name
         model = HuggingFaceLanguage(model_name)
-        SharkBenchmarkRunner.__init__(self, model, input, dynamic, self.device,
-                                      jit_trace, from_aot, frontend)
+        SharkBenchmarkRunner.__init__(
+            self,
+            model,
+            input,
+            dynamic,
+            self.device,
+            jit_trace,
+            from_aot,
+            frontend,
+        )

     def benchmark_torch(self, inputs):
         use_gpu = self.device == "gpu"
@@ -74,10 +81,20 @@ def benchmark_torch(self, inputs):
         sequence_lengths = [inputs.shape[-1]]
         cache_dir = os.path.join(".", "cache_models")
         verbose = False
-        result = run_pytorch(use_gpu, [self.model_name], None, config_modifier,
-                             Precision.FLOAT32, num_threads, batch_sizes,
-                             sequence_lengths, shark_args.num_iterations, False,
-                             cache_dir, verbose)
+        result = run_pytorch(
+            use_gpu,
+            [self.model_name],
+            None,
+            config_modifier,
+            Precision.FLOAT32,
+            num_threads,
+            batch_sizes,
+            sequence_lengths,
+            shark_args.num_iterations,
+            False,
+            cache_dir,
+            verbose,
+        )
         print(
             f"ONNX Pytorch-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
         )
@@ -92,10 +109,19 @@ def benchmark_tf(self, inputs):
         sequence_lengths = [inputs.shape[-1]]
         cache_dir = os.path.join(".", "cache_models")
         verbose = False
-        result = run_tensorflow(use_gpu, [self.model_name], None,
-                                config_modifier, Precision.FLOAT32, num_threads,
-                                batch_sizes, sequence_lengths,
-                                shark_args.num_iterations, cache_dir, verbose)
+        result = run_tensorflow(
+            use_gpu,
+            [self.model_name],
+            None,
+            config_modifier,
+            Precision.FLOAT32,
+            num_threads,
+            batch_sizes,
+            sequence_lengths,
+            shark_args.num_iterations,
+            cache_dir,
+            verbose,
+        )
         print(
             f"ONNX TF-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
         )
@@ -105,7 +131,8 @@ def benchmark_onnx(self, inputs):
             print(
                 f"{self.model_name} is currently not supported in ORT's HF. Check \
 https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/huggingface_models.py \
-for currently supported models. Exiting benchmark ONNX.")
+for currently supported models. Exiting benchmark ONNX."
+            )
             return
         use_gpu = self.device == "gpu"
         num_threads = psutil.cpu_count(logical=False)
@@ -121,17 +148,34 @@ def benchmark_onnx(self, inputs):
         use_raw_attention_mask = True
         model_fusion_statistics = {}
         overwrite = False
-        model_source = "pt" #Either "pt" or "tf"
+        model_source = "pt"  # Either "pt" or "tf"
         provider = None
         config_modifier = ConfigModifier(None)
         onnx_args = OnnxFusionOptions()
         result = run_onnxruntime(
-            use_gpu, provider, [self.model_name], None, config_modifier,
-            Precision.FLOAT32, num_threads, batch_sizes, sequence_lengths,
-            shark_args.num_iterations, input_counts, optimize_onnx,
-            validate_onnx, cache_dir, onnx_dir, verbose, overwrite,
-            disable_ort_io_binding, use_raw_attention_mask,
-            model_fusion_statistics, model_source, onnx_args)
+            use_gpu,
+            provider,
+            [self.model_name],
+            None,
+            config_modifier,
+            Precision.FLOAT32,
+            num_threads,
+            batch_sizes,
+            sequence_lengths,
+            shark_args.num_iterations,
+            input_counts,
+            optimize_onnx,
+            validate_onnx,
+            cache_dir,
+            onnx_dir,
+            verbose,
+            overwrite,
+            disable_ort_io_binding,
+            use_raw_attention_mask,
+            model_fusion_statistics,
+            model_source,
+            onnx_args,
+        )
         print(
             f"ONNX ORT-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{shark_args.num_iterations}"
         )
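For context, a minimal sketch of how the runner touched by this diff might be driven. The wrapper class name SharkHFBenchmarkRunner and the exact constructor parameter order are assumptions (the diff shows only the __init__ body and the benchmark_* methods); the tokenizer setup is standard transformers usage.

    from transformers import AutoTokenizer

    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Encode one fixed-length example; the benchmark_* methods derive the
    # sequence length from inputs.shape[-1].
    inputs = tokenizer(
        "This is a test.",
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )["input_ids"]

    # Hypothetical class name; the argument order mirrors the
    # SharkBenchmarkRunner.__init__ call visible in this diff.
    runner = SharkHFBenchmarkRunner(
        model_name,
        (inputs,),
        dynamic=False,
        device="cpu",
        jit_trace=True,
        from_aot=False,
        frontend="torch",
    )
    runner.benchmark_torch(inputs)  # PyTorch baseline via ORT's run_pytorch
    runner.benchmark_onnx(inputs)   # ORT baseline via run_onnxruntime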