@@ -31,6 +31,10 @@ class QWen2VLTokenizer(BaseMultiModalTokenizer):
31
31
def __init__ (self , tokenizer = None , image_processor = None , ** kwargs ):
32
32
super ().__init__ (tokenizer )
33
33
self .image_processor = image_processor
34
+ self .min_pixel = self .image_processor .min_pixels
35
+ self .max_pixel = self .image_processor .max_pixels
36
+ self .patch_size = self .image_processor .patch_size
37
+ self .merge_size = self .image_processor .merge_size
34
38
self .image_start_id = kwargs ["model_cfg" ]["vision_start_token_id" ]
35
39
self .image_end_id = kwargs ["model_cfg" ]["vision_end_token_id" ]
36
40
self .image_token_id = kwargs ["model_cfg" ]["image_token_id" ]
@@ -46,17 +50,13 @@ def init_audioitem_extral_params(
46
50
raise NotImplementedError
47
51
48
52
def get_image_token_length(self, img: ImageItem):
    """Return the number of image placeholder tokens needed for ``img``.

    The image's original width/height are passed through ``smart_resize``
    (bounded by this tokenizer's ``min_pixel`` / ``max_pixel``), the resized
    size is cut into ``patch_size`` x ``patch_size`` patches, and every
    ``merge_size ** 2`` patches are counted as one token.

    Args:
        img: Image descriptor; only ``image_w`` and ``image_h`` are read.

    Returns:
        The token count for a single image, as an ``int``.

    This method does not modify the tokenizer: ``min_pixel``, ``max_pixel``,
    ``patch_size`` and ``merge_size`` are read from attributes that
    ``__init__`` copies from ``image_processor``.
    """
    # NOTE(review): if smart_resize rounds to a fixed default multiple,
    # confirm that multiple equals patch_size * merge_size; otherwise the
    # floor divisions below would silently discard the remainder.
    resized_height, resized_width = smart_resize(
        height=img.image_h,
        width=img.image_w,
        min_pixels=self.min_pixel,
        max_pixels=self.max_pixel,
    )

    # Patch grid of the resized image (a single frame, so no temporal axis).
    grid_h = resized_height // self.patch_size
    grid_w = resized_width // self.patch_size

    # merge_size x merge_size neighbouring patches collapse into one token.
    return (grid_h * grid_w) // (self.merge_size ** 2)
60
60
61
61
def get_audio_token_length(self, audio: AudioItem):
    """Audio token counting is not implemented by this tokenizer.

    Raises:
        NotImplementedError: Always, regardless of ``audio``.
    """
    raise NotImplementedError
0 commit comments