From e6784c985eb2321046f6b78f1ba921567256789a Mon Sep 17 00:00:00 2001
From: Jacky Lee
Date: Thu, 26 Dec 2024 21:06:01 -0800
Subject: [PATCH 1/3] fix: processing odd number of frames

---
 src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
index 2b3024187bf7f5..3b458d7a23e6c1 100644
--- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py
@@ -291,8 +291,8 @@ def _preprocess(
         patches = np.array(processed_images)
         if data_format == ChannelDimension.LAST:
             patches = patches.transpose(0, 3, 1, 2)
-        if patches.shape[0] == 1:
-            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
+        if patches.shape[0] % 2 == 1:
+            patches = np.concatenate([patches, patches[-1][np.newaxis]], axis=0)
         channel = patches.shape[1]
         grid_t = patches.shape[0] // self.temporal_patch_size
         grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size

From 105fa10c9f669eb4ff6414af2cc323e18376396f Mon Sep 17 00:00:00 2001
From: Jacky Lee
Date: Thu, 26 Dec 2024 21:20:34 -0800
Subject: [PATCH 2/3] feat: add test case

---
 .../models/qwen2_vl/test_image_processing_qwen2_vl.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
index a6004349b49d11..3cc3f836d080c4 100644
--- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
@@ -247,3 +247,14 @@ def test_nested_input(self):
         # Image processor should return same pixel values, independently of ipnut format
         self.assertTrue((encoded_images_nested == encoded_images).all())
         self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all())
+
+    def test_odd_number_of_frames(self):
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        expected_dims_by_frames = {3: 648, 5: 972, 7: 1296, 9: 1620}
+
+        for num_frames, expected_dims in expected_dims_by_frames.items():
+            image_inputs = np.random.randint(0, 255, size=(num_frames, 256, 256, 3))
+            prcocess_out = image_processing(image_inputs, return_tensors="pt")
+            encoded_images = prcocess_out.pixel_values
+            expected_output_image_shape = (expected_dims, 1176)
+            self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)

From a0afcb8e9a4c4d24875a399344fe27ad6d5915b4 Mon Sep 17 00:00:00 2001
From: Jacky Lee
Date: Fri, 27 Dec 2024 16:19:28 -0800
Subject: [PATCH 3/3] update: test one frame

---
 tests/models/qwen2_vl/test_image_processing_qwen2_vl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
index 3cc3f836d080c4..6900e22b182706 100644
--- a/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_image_processing_qwen2_vl.py
@@ -250,7 +250,7 @@ def test_nested_input(self):
 
     def test_odd_number_of_frames(self):
         image_processing = self.image_processing_class(**self.image_processor_dict)
-        expected_dims_by_frames = {3: 648, 5: 972, 7: 1296, 9: 1620}
+        expected_dims_by_frames = {1: 324, 3: 648, 5: 972, 7: 1296, 9: 1620}
 
         for num_frames, expected_dims in expected_dims_by_frames.items():
            image_inputs = np.random.randint(0, 255, size=(num_frames, 256, 256, 3))
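
Note on the change in PATCH 1/3: the old guard `patches.shape[0] == 1` only covered single images, tiling them up to `temporal_patch_size`; any other odd frame count (3, 5, 7, ...) left a temporal dimension that does not divide `temporal_patch_size` (2 by default), so the later reshape into temporal patches broke. The new guard instead pads an odd-length stack by repeating its last frame. The sketch below is illustrative, not the library code: the helper name `pad_to_even_frames`, the hard-coded constants, and the pre-resized 252x252 frames are assumptions (the processor resizes 256x256 test inputs to a multiple of patch_size * merge_size = 28 before this step), but it reproduces the arithmetic behind the expected values in the new test, e.g. 3 frames -> 648 patches of length 1176.

    import numpy as np

    # Minimal sketch of the padding introduced in PATCH 1/3 -- not the library
    # implementation. Helper name and constants are illustrative; the real processor
    # takes them from its config and resizes/normalizes the frames first.
    TEMPORAL_PATCH_SIZE = 2   # Qwen2-VL default
    PATCH_SIZE = 14
    MERGE_SIZE = 2

    def pad_to_even_frames(patches):
        # Repeat the last frame when the frame count is odd, so the temporal axis
        # divides evenly by TEMPORAL_PATCH_SIZE (same trick as the patched code).
        if patches.shape[0] % 2 == 1:
            patches = np.concatenate([patches, patches[-1][np.newaxis]], axis=0)
        return patches

    # 3 frames, channels-first, already resized: 256x256 rounds to 252x252, the
    # nearest multiple of PATCH_SIZE * MERGE_SIZE = 28 (assumption matching the test inputs).
    frames = np.random.rand(3, 3, 252, 252)
    frames = pad_to_even_frames(frames)              # shape (4, 3, 252, 252)

    grid_t = frames.shape[0] // TEMPORAL_PATCH_SIZE  # 4 // 2 = 2
    grid_h = frames.shape[2] // PATCH_SIZE           # 252 // 14 = 18
    grid_w = frames.shape[3] // PATCH_SIZE           # 252 // 14 = 18

    print(grid_t * grid_h * grid_w)                  # 648 rows of pixel_values for 3 frames
    print(frames.shape[1] * TEMPORAL_PATCH_SIZE * PATCH_SIZE ** 2)  # 1176, per-patch feature length

One design detail worth noting: the new condition checks `% 2` directly rather than `% self.temporal_patch_size`; the two coincide only because the default temporal patch size is 2.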