From bed02119479d38dfabd70b1cd0a92e4691e9b56f Mon Sep 17 00:00:00 2001 From: JiaX-TCS Date: Wed, 23 Jul 2025 13:34:57 +0800 Subject: [PATCH 01/10] change function names --- tests/models/test_gpt_4_1.py | 6 +++--- tests/models/test_gpt_4_v.py | 6 +++--- tests/models/test_grok_2_v.py | 6 +++--- tests/models/test_qwen_vl.py | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/models/test_gpt_4_1.py b/tests/models/test_gpt_4_1.py index 1982848..e11a894 100644 --- a/tests/models/test_gpt_4_1.py +++ b/tests/models/test_gpt_4_1.py @@ -4,7 +4,7 @@ from spectrumlab.evaluator.choice_evaluator import ChoiceEvaluator -def test_claude_text_generation(): +def test_gpt_4_1_text_generation(): model = GPT4_1() prompt = "What is spectroscopy?" response = model.generate(prompt) @@ -12,7 +12,7 @@ def test_claude_text_generation(): assert len(response) > 0 -def test_claude_multimodal_generation(): +def test_gpt_4_1_multimodal_generation(): model = GPT4_1() image_path = "playground/models/test.jpg" image_base64 = encode_image_to_base64(image_path) @@ -30,7 +30,7 @@ def test_claude_multimodal_generation(): assert len(response) > 0 -def test_claude_signalgroup_evaluation(): +def test_gpt_4_1_signalgroup_evaluation(): model = GPT4_1() signal_group = SignalGroup("data") data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) diff --git a/tests/models/test_gpt_4_v.py b/tests/models/test_gpt_4_v.py index 9b52442..59e87be 100644 --- a/tests/models/test_gpt_4_v.py +++ b/tests/models/test_gpt_4_v.py @@ -4,7 +4,7 @@ from spectrumlab.evaluator.choice_evaluator import ChoiceEvaluator -def test_claude_text_generation(): +def test_gpt_4_vision_text_generation(): model = GPT4_Vision() prompt = "What is spectroscopy?" 
response = model.generate(prompt) @@ -12,7 +12,7 @@ def test_claude_text_generation(): assert len(response) > 0 -def test_claude_multimodal_generation(): +def test_gpt_4_vision_multimodal_generation(): model = GPT4_Vision() image_path = "playground/models/test.jpg" image_base64 = encode_image_to_base64(image_path) @@ -30,7 +30,7 @@ def test_claude_multimodal_generation(): assert len(response) > 0 -def test_claude_signalgroup_evaluation(): +def test_gpt_4_vision_signalgroup_evaluation(): model = GPT4_Vision() signal_group = SignalGroup("data") data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) diff --git a/tests/models/test_grok_2_v.py b/tests/models/test_grok_2_v.py index 29bb571..9fc1c6c 100644 --- a/tests/models/test_grok_2_v.py +++ b/tests/models/test_grok_2_v.py @@ -4,7 +4,7 @@ from spectrumlab.evaluator.choice_evaluator import ChoiceEvaluator -def test_claude_text_generation(): +def test_grok_2_vision_text_generation(): model = Grok_2_Vision() prompt = "What is spectroscopy?" 
response = model.generate(prompt) @@ -12,7 +12,7 @@ def test_claude_text_generation(): assert len(response) > 0 -def test_claude_multimodal_generation(): +def test_grok_2_vision_multimodal_generation(): model = Grok_2_Vision() image_path = "playground/models/test.jpg" image_base64 = encode_image_to_base64(image_path) @@ -30,7 +30,7 @@ def test_claude_multimodal_generation(): assert len(response) > 0 -def test_claude_signalgroup_evaluation(): +def test_grok_2_vision_signalgroup_evaluation(): model = Grok_2_Vision() signal_group = SignalGroup("data") data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) diff --git a/tests/models/test_qwen_vl.py b/tests/models/test_qwen_vl.py index 8310935..7b7c256 100644 --- a/tests/models/test_qwen_vl.py +++ b/tests/models/test_qwen_vl.py @@ -4,7 +4,7 @@ from spectrumlab.evaluator.choice_evaluator import ChoiceEvaluator -def test_claude_text_generation(): +def test_qwen_vl_max_text_generation(): model = Qwen_VL_Max() prompt = "What is spectroscopy?" 
response = model.generate(prompt) @@ -12,7 +12,7 @@ def test_claude_text_generation(): assert len(response) > 0 -def test_claude_multimodal_generation(): +def test_qwen_vl_max_multimodal_generation(): model = Qwen_VL_Max() image_path = "playground/models/test.jpg" image_base64 = encode_image_to_base64(image_path) @@ -30,7 +30,7 @@ def test_claude_multimodal_generation(): assert len(response) > 0 -def test_claude_signalgroup_evaluation(): +def test_qwen_vl_max_signalgroup_evaluation(): model = Qwen_VL_Max() signal_group = SignalGroup("data") data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) From 93fa1811301e71b7667b2abfac8e4eeab0db3484 Mon Sep 17 00:00:00 2001 From: JiaX-TCS Date: Wed, 23 Jul 2025 13:35:16 +0800 Subject: [PATCH 02/10] add qwen vl models --- tests/models/test_qwen_vl_2_5_32b.py | 40 ++++++++++++++++++++++++++++ tests/models/test_qwen_vl_2_5_72b.py | 40 ++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 tests/models/test_qwen_vl_2_5_32b.py create mode 100644 tests/models/test_qwen_vl_2_5_72b.py diff --git a/tests/models/test_qwen_vl_2_5_32b.py b/tests/models/test_qwen_vl_2_5_32b.py new file mode 100644 index 0000000..e77c3d0 --- /dev/null +++ b/tests/models/test_qwen_vl_2_5_32b.py @@ -0,0 +1,40 @@ +from spectrumlab.models import Qwen_2_5_VL_32B +from spectrumlab.utils.image_utils import encode_image_to_base64 +from spectrumlab.benchmark.signal_group import SignalGroup +from spectrumlab.evaluator.choice_evaluator import ChoiceEvaluator + + +def test_qwen_2_5_vl_32b_text_generation(): + model = Qwen_2_5_VL_32B() + prompt = "What is spectroscopy?" 
+ response = model.generate(prompt) + assert isinstance(response, str) + assert len(response) > 0 + + +def test_qwen_2_5_vl_32b_multimodal_generation(): + model = Qwen_2_5_VL_32B() + image_path = "playground/models/test.jpg" + image_base64 = encode_image_to_base64(image_path) + prompt = { + "text": "Please explain this spectroscopy image.", + "images": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/jpg;base64,{image_base64}"}, + } + ], + } + response = model.generate(prompt) + assert isinstance(response, str) + assert len(response) > 0 + + +def test_qwen_2_5_vl_32b_signalgroup_evaluation(): + model = Qwen_2_5_VL_32B() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] diff --git a/tests/models/test_qwen_vl_2_5_72b.py b/tests/models/test_qwen_vl_2_5_72b.py new file mode 100644 index 0000000..c12351b --- /dev/null +++ b/tests/models/test_qwen_vl_2_5_72b.py @@ -0,0 +1,40 @@ +from spectrumlab.models import Qwen_2_5_VL_72B +from spectrumlab.utils.image_utils import encode_image_to_base64 +from spectrumlab.benchmark.signal_group import SignalGroup +from spectrumlab.evaluator.choice_evaluator import ChoiceEvaluator + + +def test_qwen_2_5_vl_72b_text_generation(): + model = Qwen_2_5_VL_72B() + prompt = "What is spectroscopy?" 
+ response = model.generate(prompt) + assert isinstance(response, str) + assert len(response) > 0 + + +def test_qwen_2_5_vl_72b_multimodal_generation(): + model = Qwen_2_5_VL_72B() + image_path = "playground/models/test.jpg" + image_base64 = encode_image_to_base64(image_path) + prompt = { + "text": "Please explain this spectroscopy image.", + "images": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/jpg;base64,{image_base64}"}, + } + ], + } + response = model.generate(prompt) + assert isinstance(response, str) + assert len(response) > 0 + + +def test_qwen_2_5_vl_72b_signalgroup_evaluation(): + model = Qwen_2_5_VL_72B() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] From c98c8ea68853d86e32aa89b62fefa6de286f8674 Mon Sep 17 00:00:00 2001 From: JiaX-TCS Date: Wed, 23 Jul 2025 13:35:33 +0800 Subject: [PATCH 03/10] add llama vision instruction models --- tests/models/test_llama_3_2_vision_11b.py | 40 +++++++++++++++++++++++ tests/models/test_llama_3_2_vision_90b.py | 40 +++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 tests/models/test_llama_3_2_vision_11b.py create mode 100644 tests/models/test_llama_3_2_vision_90b.py diff --git a/tests/models/test_llama_3_2_vision_11b.py b/tests/models/test_llama_3_2_vision_11b.py new file mode 100644 index 0000000..340e8ab --- /dev/null +++ b/tests/models/test_llama_3_2_vision_11b.py @@ -0,0 +1,40 @@ +from spectrumlab.models import Llama_Vision_11B +from spectrumlab.utils.image_utils import encode_image_to_base64 +from spectrumlab.benchmark.signal_group import SignalGroup +from spectrumlab.evaluator.choice_evaluator import ChoiceEvaluator + + +def test_llama_vision_11b_text_generation(): + model = Llama_Vision_11B() + prompt = "What is 
spectroscopy?" + response = model.generate(prompt) + assert isinstance(response, str) + assert len(response) > 0 + + +def test_llama_vision_11b_multimodal_generation(): + model = Llama_Vision_11B() + image_path = "playground/models/test.jpg" + image_base64 = encode_image_to_base64(image_path) + prompt = { + "text": "Please explain this spectroscopy image.", + "images": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/jpg;base64,{image_base64}"}, + } + ], + } + response = model.generate(prompt) + assert isinstance(response, str) + assert len(response) > 0 + + +def test_llama_vision_11b_signalgroup_evaluation(): + model = Llama_Vision_11B() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] diff --git a/tests/models/test_llama_3_2_vision_90b.py b/tests/models/test_llama_3_2_vision_90b.py new file mode 100644 index 0000000..06139c3 --- /dev/null +++ b/tests/models/test_llama_3_2_vision_90b.py @@ -0,0 +1,40 @@ +from spectrumlab.models import Llama_Vision_90B +from spectrumlab.utils.image_utils import encode_image_to_base64 +from spectrumlab.benchmark.signal_group import SignalGroup +from spectrumlab.evaluator.choice_evaluator import ChoiceEvaluator + + +def test_llama_vision_90b_text_generation(): + model = Llama_Vision_90B() + prompt = "What is spectroscopy?" 
+ response = model.generate(prompt) + assert isinstance(response, str) + assert len(response) > 0 + + +def test_llama_vision_90b_multimodal_generation(): + model = Llama_Vision_90B() + image_path = "playground/models/test.jpg" + image_base64 = encode_image_to_base64(image_path) + prompt = { + "text": "Please explain this spectroscopy image.", + "images": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/jpg;base64,{image_base64}"}, + } + ], + } + response = model.generate(prompt) + assert isinstance(response, str) + assert len(response) > 0 + + +def test_llama_vision_90b_signalgroup_evaluation(): + model = Llama_Vision_90B() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] From ae907bbfe9c12964103070450a16e81cf76a8f5b Mon Sep 17 00:00:00 2001 From: JiaX-TCS Date: Wed, 23 Jul 2025 13:35:52 +0800 Subject: [PATCH 04/10] add doubao vision models --- tests/models/test_doubao_1_5_vision_pro.py | 40 +++++++++++++++++++ .../test_doubao_1_5_vision_pro_thinking.py | 40 +++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 tests/models/test_doubao_1_5_vision_pro.py create mode 100644 tests/models/test_doubao_1_5_vision_pro_thinking.py diff --git a/tests/models/test_doubao_1_5_vision_pro.py b/tests/models/test_doubao_1_5_vision_pro.py new file mode 100644 index 0000000..c104264 --- /dev/null +++ b/tests/models/test_doubao_1_5_vision_pro.py @@ -0,0 +1,40 @@ +from spectrumlab.models import Doubao_1_5_Vision_Pro +from spectrumlab.utils.image_utils import encode_image_to_base64 +from spectrumlab.benchmark.signal_group import SignalGroup +from spectrumlab.evaluator.choice_evaluator import ChoiceEvaluator + + +def test_doubao_1_5_vision_pro_text_generation(): + model = Doubao_1_5_Vision_Pro() + prompt = 
"What is spectroscopy?" + response = model.generate(prompt) + assert isinstance(response, str) + assert len(response) > 0 + + +def test_doubao_1_5_vision_pro_multimodal_generation(): + model = Doubao_1_5_Vision_Pro() + image_path = "playground/models/test.jpg" + image_base64 = encode_image_to_base64(image_path) + prompt = { + "text": "Please explain this spectroscopy image.", + "images": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/jpg;base64,{image_base64}"}, + } + ], + } + response = model.generate(prompt) + assert isinstance(response, str) + assert len(response) > 0 + + +def test_doubao_1_5_vision_pro_signalgroup_evaluation(): + model = Doubao_1_5_Vision_Pro() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] diff --git a/tests/models/test_doubao_1_5_vision_pro_thinking.py b/tests/models/test_doubao_1_5_vision_pro_thinking.py new file mode 100644 index 0000000..bc4395e --- /dev/null +++ b/tests/models/test_doubao_1_5_vision_pro_thinking.py @@ -0,0 +1,40 @@ +from spectrumlab.models import Doubao_1_5_Vision_Pro_Thinking +from spectrumlab.utils.image_utils import encode_image_to_base64 +from spectrumlab.benchmark.signal_group import SignalGroup +from spectrumlab.evaluator.choice_evaluator import ChoiceEvaluator + + +def test_doubao_1_5_vision_pro_thinking_text_generation(): + model = Doubao_1_5_Vision_Pro_Thinking() + prompt = "What is spectroscopy?" 
+ response = model.generate(prompt) + assert isinstance(response, str) + assert len(response) > 0 + + +def test_doubao_1_5_vision_pro_thinking_multimodal_generation(): + model = Doubao_1_5_Vision_Pro_Thinking() + image_path = "playground/models/test.jpg" + image_base64 = encode_image_to_base64(image_path) + prompt = { + "text": "Please explain this spectroscopy image.", + "images": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/jpg;base64,{image_base64}"}, + } + ], + } + response = model.generate(prompt) + assert isinstance(response, str) + assert len(response) > 0 + + +def test_doubao_1_5_vision_pro_thinking_signalgroup_evaluation(): + model = Doubao_1_5_Vision_Pro_Thinking() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] From 5eb94dca8267004595fe9bb4bc7f44c1ac0a48de Mon Sep 17 00:00:00 2001 From: JiaX-TCS Date: Wed, 23 Jul 2025 13:36:14 +0800 Subject: [PATCH 05/10] add model api --- spectrumlab/models/deepseek_vl.py | 75 +++++++++++++++ spectrumlab/models/doubao_api.py | 146 ++++++++++++++++++++++++++++++ spectrumlab/models/llama_api.py | 146 ++++++++++++++++++++++++++++++ spectrumlab/models/qwen_vl_api.py | 143 +++++++++++++++++++++++++++++ 4 files changed, 510 insertions(+) create mode 100644 spectrumlab/models/deepseek_vl.py create mode 100644 spectrumlab/models/doubao_api.py create mode 100644 spectrumlab/models/llama_api.py diff --git a/spectrumlab/models/deepseek_vl.py b/spectrumlab/models/deepseek_vl.py new file mode 100644 index 0000000..da2fae4 --- /dev/null +++ b/spectrumlab/models/deepseek_vl.py @@ -0,0 +1,75 @@ +from typing import Optional, Union, Dict, Any +from .base_api import BaseAPIModel +from spectrumlab.config import Config +from openai import OpenAI + + +class 
DeepSeek_VL2(BaseAPIModel): + def __init__( + self, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + model_name: Optional[str] = None, + **kwargs, + ): + config = Config() + + # Use provided parameters or fall back to config + self.api_key = api_key or config.deepseek_vl_2_api_key + self.base_url = base_url or config.deepseek_vl_2_base_url + self.model_name = model_name or config.deepseek_vl_2_model_name + + # Validate that we have required configuration + if not self.api_key: + raise ValueError( + "InternVL API key not found. Please set INTERNVL_API_KEY in your .env file " + "or provide api_key parameter." + ) + + self.client = OpenAI( + api_key=self.api_key, + base_url=self.base_url, + ) + + # Initialize parent class + super().__init__(model_name=self.model_name, **kwargs) + + def generate( + self, prompt: Union[str, Dict[str, Any]], max_tokens: int = 512 + ) -> str: + """ + Generate response supporting both text and multimodal input. + + Args: + prompt: Either text string or multimodal dict + max_tokens: Maximum tokens to generate + + Returns: + Generated response string + """ + + # Link: https://internlm.intern-ai.org.cn/api/document + messages = [] + + if isinstance(prompt, dict) and "images" in prompt: + content = [] + + content.append({"type": "text", "text": prompt["text"]}) + + for image_data in prompt["images"]: + content.append(image_data) + + messages.append({"role": "user", "content": content}) + else: + text_content = prompt if isinstance(prompt, str) else prompt.get("text", "") + messages.append({"role": "user", "content": text_content}) + + try: + response = self.client.chat.completions.create( + model=self.model_name, + messages=messages, + max_tokens=max_tokens, + ) + return response.choices[0].message.content + except Exception as e: + raise RuntimeError(f"InternVL API call failed: {e}") diff --git a/spectrumlab/models/doubao_api.py b/spectrumlab/models/doubao_api.py new file mode 100644 index 0000000..940e75e --- /dev/null 
+++ b/spectrumlab/models/doubao_api.py @@ -0,0 +1,146 @@ +from typing import Optional, Union, Dict, Any +from .base_api import BaseAPIModel +from spectrumlab.config import Config +from openai import OpenAI + + +class Doubao_1_5_Vision_Pro(BaseAPIModel): + def __init__( + self, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + model_name: Optional[str] = None, + **kwargs, + ): + config = Config() + + # Use provided parameters or fall back to config + self.api_key = api_key or config.doubao_1_5_vision_pro_api_key + self.base_url = base_url or config.doubao_1_5_vision_pro_base_url + self.model_name = model_name or config.doubao_1_5_vision_pro_model_name + + # Validate that we have required configuration + if not self.api_key: + raise ValueError( + "Doubao-1.5-Vision-Pro API key not found. Please set DOUBAO_1_5_VISION_PRO_API_KEY in your .env file " + "or provide api_key parameter." + ) + + self.client = OpenAI( + api_key=self.api_key, + base_url=self.base_url, + ) + + # Initialize parent class + super().__init__(model_name=self.model_name, **kwargs) + + def generate( + self, prompt: Union[str, Dict[str, Any]], max_tokens: int = 512 + ) -> str: + """ + Generate response supporting both text and multimodal input. 
+ + Args: + prompt: Either text string or multimodal dict + max_tokens: Maximum tokens to generate + + Returns: + Generated response string + """ + + # Link: https://internlm.intern-ai.org.cn/api/document + messages = [] + + if isinstance(prompt, dict) and "images" in prompt: + content = [] + + content.append({"type": "text", "text": prompt["text"]}) + + for image_data in prompt["images"]: + content.append(image_data) + + messages.append({"role": "user", "content": content}) + else: + text_content = prompt if isinstance(prompt, str) else prompt.get("text", "") + messages.append({"role": "user", "content": text_content}) + + try: + response = self.client.chat.completions.create( + model=self.model_name, + messages=messages, + max_tokens=max_tokens, + ) + return response.choices[0].message.content + except Exception as e: + raise RuntimeError(f"Doubao-1.5-Vision-Pro API call failed: {e}") + + +class Doubao_1_5_Vision_Pro_Thinking(BaseAPIModel): + def __init__( + self, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + model_name: Optional[str] = None, + **kwargs, + ): + config = Config() + + # Use provided parameters or fall back to config + self.api_key = api_key or config.doubao_1_5_vision_pro_thinking_api_key + self.base_url = base_url or config.doubao_1_5_vision_pro_thinking_base_url + self.model_name = model_name or config.doubao_1_5_vision_pro_thinking_model_name + + # Validate that we have required configuration + if not self.api_key: + raise ValueError( + "Doubao-1.5-Vision-Pro-Thinking API key not found. Please set DOUBAO_1_5_VISION_PRO_THINKING_API_KEY in your .env file " + "or provide api_key parameter." + ) + + self.client = OpenAI( + api_key=self.api_key, + base_url=self.base_url, + ) + + # Initialize parent class + super().__init__(model_name=self.model_name, **kwargs) + + def generate( + self, prompt: Union[str, Dict[str, Any]], max_tokens: int = 512 + ) -> str: + """ + Generate response supporting both text and multimodal input. 
+ + Args: + prompt: Either text string or multimodal dict + max_tokens: Maximum tokens to generate + + Returns: + Generated response string + """ + + # Link: https://internlm.intern-ai.org.cn/api/document + messages = [] + + if isinstance(prompt, dict) and "images" in prompt: + content = [] + + content.append({"type": "text", "text": prompt["text"]}) + + for image_data in prompt["images"]: + content.append(image_data) + + messages.append({"role": "user", "content": content}) + else: + text_content = prompt if isinstance(prompt, str) else prompt.get("text", "") + messages.append({"role": "user", "content": text_content}) + + try: + response = self.client.chat.completions.create( + model=self.model_name, + messages=messages, + max_tokens=max_tokens, + ) + return response.choices[0].message.content + except Exception as e: + raise RuntimeError(f"Doubao-1.5-Vision-Pro-Thinking API call failed: {e}") diff --git a/spectrumlab/models/llama_api.py b/spectrumlab/models/llama_api.py new file mode 100644 index 0000000..2de27f1 --- /dev/null +++ b/spectrumlab/models/llama_api.py @@ -0,0 +1,146 @@ +from typing import Optional, Union, Dict, Any +from .base_api import BaseAPIModel +from spectrumlab.config import Config +from openai import OpenAI + + +class Llama_Vision_11B(BaseAPIModel): + def __init__( + self, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + model_name: Optional[str] = None, + **kwargs, + ): + config = Config() + + # Use provided parameters or fall back to config + self.api_key = api_key or config.llama_vision_11b_api_key + self.base_url = base_url or config.llama_vision_11b_base_url + self.model_name = model_name or config.llama_vision_11b_model_name + + # Validate that we have required configuration + if not self.api_key: + raise ValueError( + "Llama-Vision-11B API key not found. Please set LLAMA_VISION_11B_API_KEY in your .env file " + "or provide api_key parameter." 
+ ) + + self.client = OpenAI( + api_key=self.api_key, + base_url=self.base_url, + ) + + # Initialize parent class + super().__init__(model_name=self.model_name, **kwargs) + + def generate( + self, prompt: Union[str, Dict[str, Any]], max_tokens: int = 512 + ) -> str: + """ + Generate response supporting both text and multimodal input. + + Args: + prompt: Either text string or multimodal dict + max_tokens: Maximum tokens to generate + + Returns: + Generated response string + """ + + # Link: https://internlm.intern-ai.org.cn/api/document + messages = [] + + if isinstance(prompt, dict) and "images" in prompt: + content = [] + + content.append({"type": "text", "text": prompt["text"]}) + + for image_data in prompt["images"]: + content.append(image_data) + + messages.append({"role": "user", "content": content}) + else: + text_content = prompt if isinstance(prompt, str) else prompt.get("text", "") + messages.append({"role": "user", "content": text_content}) + + try: + response = self.client.chat.completions.create( + model=self.model_name, + messages=messages, + max_tokens=max_tokens, + ) + return response.choices[0].message.content + except Exception as e: + raise RuntimeError(f"Llama-Vision-11B API call failed: {e}") + + +class Llama_Vision_90B(BaseAPIModel): + def __init__( + self, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + model_name: Optional[str] = None, + **kwargs, + ): + config = Config() + + # Use provided parameters or fall back to config + self.api_key = api_key or config.llama_vision_90b_api_key + self.base_url = base_url or config.llama_vision_90b_base_url + self.model_name = model_name or config.llama_vision_90b_model_name + + # Validate that we have required configuration + if not self.api_key: + raise ValueError( + "Llama-Vision-90B API key not found. Please set LLAMA_VISION_90B_API_KEY in your .env file " + "or provide api_key parameter." 
+ ) + + self.client = OpenAI( + api_key=self.api_key, + base_url=self.base_url, + ) + + # Initialize parent class + super().__init__(model_name=self.model_name, **kwargs) + + def generate( + self, prompt: Union[str, Dict[str, Any]], max_tokens: int = 512 + ) -> str: + """ + Generate response supporting both text and multimodal input. + + Args: + prompt: Either text string or multimodal dict + max_tokens: Maximum tokens to generate + + Returns: + Generated response string + """ + + # Link: https://internlm.intern-ai.org.cn/api/document + messages = [] + + if isinstance(prompt, dict) and "images" in prompt: + content = [] + + content.append({"type": "text", "text": prompt["text"]}) + + for image_data in prompt["images"]: + content.append(image_data) + + messages.append({"role": "user", "content": content}) + else: + text_content = prompt if isinstance(prompt, str) else prompt.get("text", "") + messages.append({"role": "user", "content": text_content}) + + try: + response = self.client.chat.completions.create( + model=self.model_name, + messages=messages, + max_tokens=max_tokens, + ) + return response.choices[0].message.content + except Exception as e: + raise RuntimeError(f"Llama-Vision-90B API call failed: {e}") \ No newline at end of file diff --git a/spectrumlab/models/qwen_vl_api.py b/spectrumlab/models/qwen_vl_api.py index d4c6817..36c4d90 100644 --- a/spectrumlab/models/qwen_vl_api.py +++ b/spectrumlab/models/qwen_vl_api.py @@ -74,3 +74,146 @@ def generate( return response.choices[0].message.content except Exception as e: raise RuntimeError(f"Qwen-VL-Max API call failed: {e}") + +class Qwen_2_5_VL_32B(BaseAPIModel): + def __init__( + self, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + model_name: Optional[str] = None, + **kwargs, + ): + config = Config() + + # Use provided parameters or fall back to config + self.api_key = api_key or config.qwen_2_5_vl_32b_api_key + self.base_url = base_url or config.qwen_2_5_vl_32b_base_url + 
self.model_name = model_name or config.qwen_2_5_vl_32b_model_name + + # Validate that we have required configuration + if not self.api_key: + raise ValueError( + "Qwen-2.5-VL-32B API key not found. Please set QWEN_2_5_VL_32B_API_KEY in your .env file " + "or provide api_key parameter." + ) + + self.client = OpenAI( + api_key=self.api_key, + base_url=self.base_url, + ) + + # Initialize parent class + super().__init__(model_name=self.model_name, **kwargs) + + def generate( + self, prompt: Union[str, Dict[str, Any]], max_tokens: int = 512 + ) -> str: + """ + Generate response supporting both text and multimodal input. + + Args: + prompt: Either text string or multimodal dict + max_tokens: Maximum tokens to generate + + Returns: + Generated response string + """ + messages = [] + + # Handle multimodal vs text-only prompts + if isinstance(prompt, dict) and "images" in prompt: + # Multimodal prompt + content = [] + + content.append({"type": "text", "text": prompt["text"]}) + + for image_data in prompt["images"]: + content.append(image_data) + + messages.append({"role": "user", "content": content}) + else: + # Text-only prompt + text_content = prompt if isinstance(prompt, str) else prompt.get("text", "") + messages.append({"role": "user", "content": text_content}) + + try: + response = self.client.chat.completions.create( + model=self.model_name, + messages=messages, + max_tokens=max_tokens, + ) + return response.choices[0].message.content + except Exception as e: + raise RuntimeError(f"Qwen-2.5-VL-32B API call failed: {e}") + +class Qwen_2_5_VL_72B(BaseAPIModel): + def __init__( + self, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + model_name: Optional[str] = None, + **kwargs, + ): + config = Config() + + # Use provided parameters or fall back to config + self.api_key = api_key or config.qwen_2_5_vl_72b_api_key + self.base_url = base_url or config.qwen_2_5_vl_72b_base_url + self.model_name = model_name or config.qwen_2_5_vl_72b_model_name + + # 
Validate that we have required configuration + if not self.api_key: + raise ValueError( + "Qwen-2.5-VL-72B API key not found. Please set QWEN_2_5_VL_72B_API_KEY in your .env file " + "or provide api_key parameter." + ) + + self.client = OpenAI( + api_key=self.api_key, + base_url=self.base_url, + ) + + # Initialize parent class + super().__init__(model_name=self.model_name, **kwargs) + + def generate( + self, prompt: Union[str, Dict[str, Any]], max_tokens: int = 512 + ) -> str: + """ + Generate response supporting both text and multimodal input. + + Args: + prompt: Either text string or multimodal dict + max_tokens: Maximum tokens to generate + + Returns: + Generated response string + """ + messages = [] + + # Handle multimodal vs text-only prompts + if isinstance(prompt, dict) and "images" in prompt: + # Multimodal prompt + content = [] + + content.append({"type": "text", "text": prompt["text"]}) + + for image_data in prompt["images"]: + content.append(image_data) + + messages.append({"role": "user", "content": content}) + else: + # Text-only prompt + text_content = prompt if isinstance(prompt, str) else prompt.get("text", "") + messages.append({"role": "user", "content": text_content}) + + try: + response = self.client.chat.completions.create( + model=self.model_name, + messages=messages, + max_tokens=max_tokens, + ) + return response.choices[0].message.content + except Exception as e: + raise RuntimeError(f"Qwen-2.5-VL-72B API call failed: {e}") + From 56d258ded9d9b6d2e2737e2215e3a281b507c3a8 Mon Sep 17 00:00:00 2001 From: JiaX-TCS Date: Wed, 23 Jul 2025 13:37:04 +0800 Subject: [PATCH 06/10] Update base_config.py --- spectrumlab/config/base_config.py | 35 +++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/spectrumlab/config/base_config.py b/spectrumlab/config/base_config.py index c0640c9..1b5198e 100644 --- a/spectrumlab/config/base_config.py +++ b/spectrumlab/config/base_config.py @@ -55,3 +55,38 @@ class Config: qwen_vl_api_key: str = 
BOYUE_API_KEY qwen_vl_base_url: str = BOYUE_BASE_URL qwen_vl_model_name: str = os.getenv("QWEN_VL") + + # DeepSeek-VL-2 + deepseek_vl_2_api_key: str = BOYUE_API_KEY + deepseek_vl_2_base_url: str = BOYUE_BASE_URL + deepseek_vl_2_model_name: str = os.getenv("DEEPSEEK_VL_2") + + # Qwen-2.5-VL-32B + qwen_2_5_vl_32b_api_key: str = BOYUE_API_KEY + qwen_2_5_vl_32b_base_url: str = BOYUE_BASE_URL + qwen_2_5_vl_32b_model_name: str = os.getenv("QWEN_2_5_VL_32B") + + # Qwen-2.5-VL-72B + qwen_2_5_vl_72b_api_key: str = BOYUE_API_KEY + qwen_2_5_vl_72b_base_url: str = BOYUE_BASE_URL + qwen_2_5_vl_72b_model_name: str = os.getenv("QWEN_2_5_VL_72B") + + # Llama-Vision-11B + llama_vision_11b_api_key: str = BOYUE_API_KEY + llama_vision_11b_base_url: str = BOYUE_BASE_URL + llama_vision_11b_model_name: str = os.getenv("LLAMA_VISION_11B") + + # Llama-Vision-90B + llama_vision_90b_api_key: str = BOYUE_API_KEY + llama_vision_90b_base_url: str = BOYUE_BASE_URL + llama_vision_90b_model_name: str = os.getenv("LLAMA_VISION_90B") + + # Doubao-1.5-Vision-Pro + doubao_1_5_vision_pro_api_key: str = BOYUE_API_KEY + doubao_1_5_vision_pro_base_url: str = BOYUE_BASE_URL + doubao_1_5_vision_pro_model_name: str = os.getenv("DOUBAO_1_5_VISION_PRO") + + # Doubao-1.5-Vision-Pro-Thinking + doubao_1_5_vision_pro_thinking_api_key: str = BOYUE_API_KEY + doubao_1_5_vision_pro_thinking_base_url: str = BOYUE_BASE_URL + doubao_1_5_vision_pro_thinking_model_name: str = os.getenv("DOUBAO_1_5_VISION_PRO_THINKING") \ No newline at end of file From 2889e413ed1d359732ed9f94bd868555396c2bd5 Mon Sep 17 00:00:00 2001 From: JiaX-TCS Date: Wed, 23 Jul 2025 13:37:20 +0800 Subject: [PATCH 07/10] add deepseek vl model --- tests/models/test_deepseek_vl_2.py | 40 ++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 tests/models/test_deepseek_vl_2.py diff --git a/tests/models/test_deepseek_vl_2.py b/tests/models/test_deepseek_vl_2.py new file mode 100644 index 0000000..79870de --- /dev/null +++ 
b/tests/models/test_deepseek_vl_2.py @@ -0,0 +1,40 @@ +from spectrumlab.models import DeepSeek_VL2 +from spectrumlab.utils.image_utils import encode_image_to_base64 +from spectrumlab.benchmark.signal_group import SignalGroup +from spectrumlab.evaluator.choice_evaluator import ChoiceEvaluator + + +def test_deepseek_vl_2_text_generation(): + model = DeepSeek_VL2() + prompt = "What is spectroscopy?" + response = model.generate(prompt) + assert isinstance(response, str) + assert len(response) > 0 + + +def test_deepseek_vl_2_multimodal_generation(): + model = DeepSeek_VL2() + image_path = "playground/models/test.jpg" + image_base64 = encode_image_to_base64(image_path) + prompt = { + "text": "Please explain this spectroscopy image.", + "images": [ + { + "type": "image_url", + "image_url": {"url": f"data:image/jpg;base64,{image_base64}"}, + } + ], + } + response = model.generate(prompt) + assert isinstance(response, str) + assert len(response) > 0 + + +def test_deepseek_vl_2_signalgroup_evaluation(): + model = DeepSeek_VL2() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] From 332310846e1d750f51c91aeceb286a93d3171193 Mon Sep 17 00:00:00 2001 From: JiaX-TCS Date: Wed, 23 Jul 2025 13:37:41 +0800 Subject: [PATCH 08/10] Update __init__.py --- spectrumlab/models/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/spectrumlab/models/__init__.py b/spectrumlab/models/__init__.py index 0750ae3..dfde2a2 100644 --- a/spectrumlab/models/__init__.py +++ b/spectrumlab/models/__init__.py @@ -4,7 +4,11 @@ from .claude_api import Claude_Sonnet_3_5, Claude_Opus_4, Claude_Haiku_3_5, Claude_Sonnet_4 from .gpt4_v_api import GPT4_1, GPT4_Vision from .grok_api import Grok_2_Vision -from .qwen_vl_api import 
Qwen_VL_Max +from .deepseek_vl import DeepSeek_VL2 +from .qwen_vl_api import Qwen_VL_Max, Qwen_2_5_VL_32B, Qwen_2_5_VL_72B +from .llama_api import Llama_Vision_11B, Llama_Vision_90B +from .doubao_api import Doubao_1_5_Vision_Pro, Doubao_1_5_Vision_Pro_Thinking __all__ = ["DeepSeek", "GPT4o", "InternVL", "Claude_Sonnet_3_5", "Claude_Opus_4", - "Claude_Haiku_3_5", "Claude_Sonnet_4", "GPT4_1", "GPT4_Vision", "Grok_2_Vision", "Qwen_VL_Max"] + "Claude_Haiku_3_5", "Claude_Sonnet_4", "GPT4_1", "GPT4_Vision", "Grok_2_Vision", "Qwen_VL_Max", + "DeepSeek_VL2", "Qwen_2_5_VL_32B", "Qwen_2_5_VL_72B", "Llama_Vision_11B", "Llama_Vision_90B", "Doubao_1_5_Vision_Pro", "Doubao_1_5_Vision_Pro_Thinking" ] From 1f83daa87d7a08dfc205a73d68bc03edfeded993 Mon Sep 17 00:00:00 2001 From: JiaX-TCS Date: Wed, 23 Jul 2025 14:07:25 +0800 Subject: [PATCH 09/10] add parallel processing to speed up --- spectrumlab/evaluator/base.py | 154 +++++++++++++++++++++++++++++++++- 1 file changed, 151 insertions(+), 3 deletions(-) diff --git a/spectrumlab/evaluator/base.py b/spectrumlab/evaluator/base.py index 080a850..62acb78 100644 --- a/spectrumlab/evaluator/base.py +++ b/spectrumlab/evaluator/base.py @@ -96,9 +96,157 @@ def evaluate( "total_items": len(data_items), } - def evaluate_many(): - # TODO - pass + def evaluate_many( + self, + data_items: List[Dict], + model, + max_out_len: int = 512, + batch_size: Optional[int] = None, + save_path: str = "./eval_results", + n_jobs: int = -1, + ) -> Dict: + """ + Evaluate a single model on data_items with parallel processing. 
+ + Args: + data_items: List of data items to evaluate + model: Model instance to evaluate + max_out_len: Maximum output length for model generation + batch_size: Batch size for processing (if None, will be auto-calculated) + save_path: Base path to save results + n_jobs: Number of parallel jobs (-1 for all available cores) + + Returns: + Dictionary containing evaluation results + """ + import multiprocessing as mp + from concurrent.futures import ThreadPoolExecutor, as_completed + import math + + if not data_items: + print("❌ No data items provided") + return {"error": "No data items provided"} + + # Set number of jobs + if n_jobs == -1: + n_jobs = mp.cpu_count() + + # Calculate batch size if not provided + if batch_size is None: + batch_size = max(1, math.ceil(len(data_items) / n_jobs)) + + print(f"πŸ”„ Starting parallel evaluation on {len(data_items)} items...") + print(f"πŸ“ Model: {type(model).__name__}") + print(f"⚑ Using {n_jobs} parallel workers with batch size {batch_size}") + + # Split data into batches + batches = [ + data_items[i:i + batch_size] + for i in range(0, len(data_items), batch_size) + ] + + print(f"πŸ“¦ Split into {len(batches)} batches") + + # Build prompts for all items + print("πŸ“ Building prompts...") + all_prompts = [self._build_prompt(item) for item in data_items] + + # Split prompts into batches + prompt_batches = [ + all_prompts[i:i + batch_size] + for i in range(0, len(all_prompts), batch_size) + ] + + def process_batch(batch_data): + """Process a batch of prompts and return responses.""" + batch_prompts, batch_indices = batch_data + batch_responses = [] + + for i, prompt in enumerate(batch_prompts): + try: + response = model.generate(prompt, max_out_len) + batch_responses.append(response) + except Exception as e: + # 保持与evaluateζ–Ήζ³•δΈ€θ‡΄ηš„ι”™θ――ε€„η† + original_index = batch_indices[i] + print(f"\n⚠️ Error on item {original_index + 1}: {e}") + batch_responses.append(f"Error: {str(e)}") + + return batch_indices, 
batch_responses + + # Prepare batch data with indices + batch_data_list = [] + for i, prompt_batch in enumerate(prompt_batches): + start_idx = i * batch_size + end_idx = min(start_idx + batch_size, len(data_items)) + batch_indices = list(range(start_idx, end_idx)) + batch_data_list.append((prompt_batch, batch_indices)) + + # Execute parallel processing + all_responses = [None] * len(data_items) + + with ThreadPoolExecutor(max_workers=n_jobs) as executor: + # Submit all batch tasks + future_to_batch = { + executor.submit(process_batch, batch_data): batch_data[1][0] + for batch_data in batch_data_list + } + + # Collect results as they complete + for future in tqdm( + as_completed(future_to_batch), + total=len(future_to_batch), + desc="Processing batches", + unit="batch" + ): + try: + batch_indices, batch_responses = future.result() + for idx, response in zip(batch_indices, batch_responses): + all_responses[idx] = response + except Exception as e: + print(f"❌ Error processing batch: {e}") + + # Process responses and calculate results + print("πŸ” Processing responses...") + processed_items = [] + for item, response in tqdm( + zip(data_items, all_responses), + desc="Processing responses", + total=len(data_items), + unit="item", + ): + item_copy = item.copy() + prediction = self._extract_prediction(response, item) + item_copy[self.prediction_key] = prediction + item_copy["model_response"] = response + + answer = item.get("answer", "") + is_correct = self._calculate_accuracy(answer, prediction, item) + item_copy["pass"] = is_correct + + processed_items.append(item_copy) + + # Save results + saved_files = self._save_results(processed_items, save_path) + print(f"πŸ’Ύ Results saved to: {saved_files}") + + # Calculate metrics + print("πŸ“Š Calculating metrics...") + metrics = self._calculate_metrics(processed_items) + + # Print results + self._print_results(metrics) + + return { + "metrics": metrics, + "saved_files": saved_files, + "total_items": len(data_items), + 
"parallel_info": { + "n_jobs": n_jobs, + "batch_size": batch_size, + "n_batches": len(batches) + } + } def _save_results(self, results_data: List[Dict], save_path: str) -> List[str]: """Save results grouped by subcategory. If save_path is None, do not save.""" From e57c7c944a9e54aeaeea63622405a799da6aeb62 Mon Sep 17 00:00:00 2001 From: JiaX-TCS Date: Wed, 23 Jul 2025 14:13:42 +0800 Subject: [PATCH 10/10] add parallel tests --- tests/models/test_claude_haiku_3_5.py | 10 ++++++++++ tests/models/test_claude_opus_4.py | 10 ++++++++++ tests/models/test_claude_sonnet_3_5.py | 10 ++++++++++ tests/models/test_claude_sonnet_4.py | 10 ++++++++++ tests/models/test_deepseek_vl_2.py | 10 ++++++++++ tests/models/test_doubao_1_5_vision_pro.py | 11 +++++++++++ tests/models/test_doubao_1_5_vision_pro_thinking.py | 11 +++++++++++ tests/models/test_gpt4o.py | 11 +++++++++++ tests/models/test_gpt_4_1.py | 11 +++++++++++ tests/models/test_gpt_4_v.py | 11 +++++++++++ tests/models/test_grok_2_v.py | 11 +++++++++++ tests/models/test_internvl.py | 11 +++++++++++ tests/models/test_llama_3_2_vision_11b.py | 11 +++++++++++ tests/models/test_llama_3_2_vision_90b.py | 11 +++++++++++ tests/models/test_qwen_vl.py | 11 +++++++++++ tests/models/test_qwen_vl_2_5_32b.py | 10 ++++++++++ tests/models/test_qwen_vl_2_5_72b.py | 11 +++++++++++ 17 files changed, 181 insertions(+) diff --git a/tests/models/test_claude_haiku_3_5.py b/tests/models/test_claude_haiku_3_5.py index 16ea288..e1b5826 100644 --- a/tests/models/test_claude_haiku_3_5.py +++ b/tests/models/test_claude_haiku_3_5.py @@ -38,3 +38,13 @@ def test_claude_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def test_claude_signalgroup_evaluation_parallel(): + model = Claude_Haiku_3_5() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = 
ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] diff --git a/tests/models/test_claude_opus_4.py b/tests/models/test_claude_opus_4.py index 66b383b..d2c0d54 100644 --- a/tests/models/test_claude_opus_4.py +++ b/tests/models/test_claude_opus_4.py @@ -38,3 +38,13 @@ def test_claude_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def test_claude_signalgroup_evaluation_parallel(): + model = Claude_Opus_4() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] \ No newline at end of file diff --git a/tests/models/test_claude_sonnet_3_5.py b/tests/models/test_claude_sonnet_3_5.py index 818470a..cecba2a 100644 --- a/tests/models/test_claude_sonnet_3_5.py +++ b/tests/models/test_claude_sonnet_3_5.py @@ -38,3 +38,13 @@ def test_claude_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def test_claude_signalgroup_evaluation_parallel(): + model = Claude_Sonnet_3_5() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] \ No newline at end of file diff --git a/tests/models/test_claude_sonnet_4.py b/tests/models/test_claude_sonnet_4.py index 1d284b3..0432c3d 100644 --- a/tests/models/test_claude_sonnet_4.py +++ 
b/tests/models/test_claude_sonnet_4.py @@ -38,3 +38,13 @@ def test_claude_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def test_claude_signalgroup_evaluation_parallel(): + model = Claude_Sonnet_4() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] \ No newline at end of file diff --git a/tests/models/test_deepseek_vl_2.py b/tests/models/test_deepseek_vl_2.py index 79870de..11c74ee 100644 --- a/tests/models/test_deepseek_vl_2.py +++ b/tests/models/test_deepseek_vl_2.py @@ -38,3 +38,13 @@ def test_deepseek_vl_2_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def test_deepseek_vl_2_signalgroup_evaluation_parallel(): + model = DeepSeek_VL2() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] \ No newline at end of file diff --git a/tests/models/test_doubao_1_5_vision_pro.py b/tests/models/test_doubao_1_5_vision_pro.py index c104264..b649b40 100644 --- a/tests/models/test_doubao_1_5_vision_pro.py +++ b/tests/models/test_doubao_1_5_vision_pro.py @@ -38,3 +38,14 @@ def test_doubao_1_5_vision_pro_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def 
test_doubao_1_5_vision_pro_signalgroup_evaluation_parallel(): + model = Doubao_1_5_Vision_Pro() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] + \ No newline at end of file diff --git a/tests/models/test_doubao_1_5_vision_pro_thinking.py b/tests/models/test_doubao_1_5_vision_pro_thinking.py index bc4395e..6552ca1 100644 --- a/tests/models/test_doubao_1_5_vision_pro_thinking.py +++ b/tests/models/test_doubao_1_5_vision_pro_thinking.py @@ -38,3 +38,14 @@ def test_doubao_1_5_vision_pro_thinking_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def test_doubao_1_5_vision_pro_thinking_signalgroup_evaluation_parallel(): + model = Doubao_1_5_Vision_Pro_Thinking() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] + \ No newline at end of file diff --git a/tests/models/test_gpt4o.py b/tests/models/test_gpt4o.py index 96bf2cd..e98e74e 100644 --- a/tests/models/test_gpt4o.py +++ b/tests/models/test_gpt4o.py @@ -39,3 +39,14 @@ def test_gpt4o_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def test_gpt4o_signalgroup_evaluation_parallel(): + model = GPT4o() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = 
evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] + \ No newline at end of file diff --git a/tests/models/test_gpt_4_1.py b/tests/models/test_gpt_4_1.py index e11a894..26c131a 100644 --- a/tests/models/test_gpt_4_1.py +++ b/tests/models/test_gpt_4_1.py @@ -38,3 +38,14 @@ def test_gpt_4_1_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def test_gpt_4_1_signalgroup_evaluation_parallel(): + model = GPT4_1() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] + \ No newline at end of file diff --git a/tests/models/test_gpt_4_v.py b/tests/models/test_gpt_4_v.py index 59e87be..302375e 100644 --- a/tests/models/test_gpt_4_v.py +++ b/tests/models/test_gpt_4_v.py @@ -38,3 +38,14 @@ def test_gpt_4_vision_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def test_gpt_4_vision_signalgroup_evaluation_parallel(): + model = GPT4_Vision() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] + \ No newline at end of file diff --git a/tests/models/test_grok_2_v.py b/tests/models/test_grok_2_v.py index 9fc1c6c..cfdc986 100644 --- a/tests/models/test_grok_2_v.py +++ b/tests/models/test_grok_2_v.py @@ -38,3 +38,14 @@ def 
test_grok_2_vision_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def test_grok_2_vision_signalgroup_evaluation_parallel(): + model = Grok_2_Vision() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] + \ No newline at end of file diff --git a/tests/models/test_internvl.py b/tests/models/test_internvl.py index f469c90..79f68ad 100644 --- a/tests/models/test_internvl.py +++ b/tests/models/test_internvl.py @@ -38,3 +38,14 @@ def test_internvl_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def test_internvl_signalgroup_evaluation_parallel(): + model = InternVL() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] + \ No newline at end of file diff --git a/tests/models/test_llama_3_2_vision_11b.py b/tests/models/test_llama_3_2_vision_11b.py index 340e8ab..968e099 100644 --- a/tests/models/test_llama_3_2_vision_11b.py +++ b/tests/models/test_llama_3_2_vision_11b.py @@ -38,3 +38,14 @@ def test_llama_vision_11b_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def test_llama_vision_11b_signalgroup_evaluation_parallel(): + model = Llama_Vision_11B() + signal_group = SignalGroup("data") + data = 
signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] + \ No newline at end of file diff --git a/tests/models/test_llama_3_2_vision_90b.py b/tests/models/test_llama_3_2_vision_90b.py index 06139c3..7ef0568 100644 --- a/tests/models/test_llama_3_2_vision_90b.py +++ b/tests/models/test_llama_3_2_vision_90b.py @@ -38,3 +38,14 @@ def test_llama_vision_90b_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def test_llama_vision_90b_signalgroup_evaluation_parallel(): + model = Llama_Vision_90B() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] + \ No newline at end of file diff --git a/tests/models/test_qwen_vl.py b/tests/models/test_qwen_vl.py index 7b7c256..0ec2921 100644 --- a/tests/models/test_qwen_vl.py +++ b/tests/models/test_qwen_vl.py @@ -38,3 +38,14 @@ def test_qwen_vl_max_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def test_qwen_vl_max_signalgroup_evaluation_parallel(): + model = Qwen_VL_Max() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] + \ No newline at end of file diff --git 
a/tests/models/test_qwen_vl_2_5_32b.py b/tests/models/test_qwen_vl_2_5_32b.py index e77c3d0..df8d83d 100644 --- a/tests/models/test_qwen_vl_2_5_32b.py +++ b/tests/models/test_qwen_vl_2_5_32b.py @@ -38,3 +38,13 @@ def test_qwen_2_5_vl_32b_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + +def test_qwen_2_5_vl_32b_signalgroup_evaluation_parallel(): + model = Qwen_2_5_VL_32B() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] + \ No newline at end of file diff --git a/tests/models/test_qwen_vl_2_5_72b.py b/tests/models/test_qwen_vl_2_5_72b.py index c12351b..3ad1529 100644 --- a/tests/models/test_qwen_vl_2_5_72b.py +++ b/tests/models/test_qwen_vl_2_5_72b.py @@ -38,3 +38,14 @@ def test_qwen_2_5_vl_72b_signalgroup_evaluation(): results = evaluator.evaluate(data_items=data, model=model, save_path=None) assert "metrics" in results assert "overall" in results["metrics"] + + +def test_qwen_2_5_vl_72b_signalgroup_evaluation_parallel(): + model = Qwen_2_5_VL_72B() + signal_group = SignalGroup("data") + data = signal_group.get_data_by_subcategories(["Spectrum Type Classification"]) + evaluator = ChoiceEvaluator() + results = evaluator.evaluate_many(data_items=data, model=model, save_path=None) + assert "metrics" in results + assert "overall" in results["metrics"] + \ No newline at end of file