diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
index 2b3024187bf7f5..3b458d7a23e6c1 100644
--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
@@ -291,8 +291,8 @@ def _preprocess(
         patches = np.array(processed_images)
         if data_format == ChannelDimension.LAST:
             patches = patches.transpose(0, 3, 1, 2)
-        if patches.shape[0] == 1:
-            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
+        if patches.shape[0] % 2 == 1:
+            patches = np.concatenate([patches, patches[-1][np.newaxis]], axis=0)
         channel = patches.shape[1]
         grid_t = patches.shape[0] // self.temporal_patch_size
         grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
diff --git a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
index a6004349b49d11..6900e22b182706 100644
--- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
@@ -247,3 +247,14 @@ def test_nested_input(self):
         # Image processor should return same pixel values, independently of ipnut format
         self.assertTrue((encoded_images_nested == encoded_images).all())
         self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all())
+
+    def test_odd_number_of_frames(self):
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        expected_dims_by_frames = {1: 324, 3: 648, 5: 972, 7: 1296, 9: 1620}
+
+        for num_frames, expected_dims in expected_dims_by_frames.items():
+            image_inputs = np.random.randint(0, 255, size=(num_frames, 256, 256, 3))
+            process_out = image_processing(image_inputs, return_tensors="pt")
+            encoded_images = process_out.pixel_values
+            expected_output_image_shape = (expected_dims, 1176)
+            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
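
For context, here is a minimal standalone sketch (not part of the diff) of the padding behavior the change introduces, assuming the processor's default `temporal_patch_size` of 2: an odd frame count is padded by repeating the last frame once, so the frames always split evenly into temporal patches.

```python
import numpy as np

# Standalone illustration of the frame-padding step above.
# Assumes temporal_patch_size == 2; frames are already channels-first,
# e.g. 5 frames of shape (3, 252, 252).
temporal_patch_size = 2
patches = np.random.rand(5, 3, 252, 252)

# An odd number of frames cannot be grouped into pairs, so the last
# frame is duplicated once to make the count even.
if patches.shape[0] % 2 == 1:
    patches = np.concatenate([patches, patches[-1][np.newaxis]], axis=0)

grid_t = patches.shape[0] // temporal_patch_size
print(patches.shape[0], grid_t)  # 6 3
```

Assuming the default spatial `patch_size` of 14, this also accounts for the expected sizes in the new test: a 256x256 input is resized to 252x252, giving an 18x18 spatial grid (324 patches per temporal group), so 1, 3, 5, 7, 9 frames pad to 2, 4, 6, 8, 10 and yield 324, 648, 972, 1296, 1620 rows, each of length 3 * 2 * 14 * 14 = 1176.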