feat: run benchmark on gpt-4o & llama 3.1 (#497)
* Run benchmark on gpt-4o & llama 3.1

* update link
you-n-g authored Nov 26, 2024
1 parent fb97455 commit 64af0b5
Showing 7 changed files with 389 additions and 64 deletions.
332 changes: 332 additions & 0 deletions docs/_static/RD2bench.json

Large diffs are not rendered by default.

51 changes: 21 additions & 30 deletions docs/research/benchmark.rst
@@ -5,56 +5,44 @@ Benchmark
Introduction
=============


Benchmarking the capabilities of the R&D is a very important research problem of the research area.

Currently we are continuously exploring how to benchmark them.

The current benchmarks are listed in this page

Benchmarking the capabilities of R&D is a crucial research problem in this area. We are continuously exploring methods to benchmark these capabilities. The current benchmarks are listed on this page.

Development Capability Benchmarking
===================================


Benchmark is used to evaluate the effectiveness of factors with fixed data.

It mainly includes the following steps:
Benchmarking is used to evaluate the effectiveness of factors with fixed data. It mainly includes the following steps:

1. :ref:`read and prepare the eval_data <data>`

2. :ref:`declare the method to be tested and pass the arguments <config>`

3. :ref:`declare the eval method and pass the arguments <config>`

4. :ref:`run the eval <run>`
4. :ref:`run the eval <run>`

5. :ref:`save and show the result <show>`
5. :ref:`save and show the result <show>`

Configuration
Configuration
-------------
.. _config:

.. autopydantic_settings:: rdagent.components.benchmark.conf.BenchmarkSettings

Example
++++++++
+++++++
.. _example:

The default value for ``bench_test_round`` is 10, and it will take about 2 hours to run 10 rounds.
To modify it from ``10`` to ``2`` you can adjust this by adding environment variables in the .env file as shown below.
The default value for ``bench_test_round`` is 10, which takes about 2 hours to run. To modify it from ``10`` to ``2``, adjust the environment variables in the .env file as shown below.

.. code-block:: Properties
BENCHMARK_BENCH_TEST_ROUND=1
BENCHMARK_BENCH_TEST_ROUND=2
Data Format
-------------
.. _data:

The sample data in ``bench_data_path`` is a dictionary where each key represents a factor name.

The value associated with each key is factor data containing the following information:
The sample data in ``bench_data_path`` is a dictionary where each key represents a factor name. The value associated with each key is factor data containing the following information:

- **description**: A textual description of the factor.
- **formulation**: A LaTeX formula representing the model's formulation.
@@ -63,41 +51,41 @@ The value associated with each key is factor data containing the following information:
- **Difficulty**: The difficulty level of implementing or understanding the factor.
- **gt_code**: A piece of code associated with the factor.

Here is the example of this data format:
Here is an example of this data format:

.. literalinclude:: ../../rdagent/components/benchmark/example.json
:language: json

Ensure the data is placed in ``FACTOR_COSTEER_SETTINGS.data_folder_debug``. The data files should be in ``.h5`` or ``.md`` format and must not be stored in any subfolders. LLM-Agents will review the file content and implement the tasks.

.. TODO: Add a script to automatically generate the data in the `rdagent/app/quant_factor_benchmark/data` folder.
Run Benchmark
-------------
.. _run:

Start benchmark after finishing the :doc:`../installation_and_configuration`.
Start the benchmark after completing the :doc:`../installation_and_configuration`.

.. code-block:: Properties
python rdagent/app/quant_factor_benchmark/eval.py
dotenv run -- python rdagent/app/benchmark/factor/eval.py
Once completed, a pkl file will be generated, and its path will be printed on the last line of the console.

Show Result
-------------
.. _show:

The ``analysis.py`` script is used to read data from pkl and convert it to an image.
Modify the python code in ``rdagent/app/quant_factor_benchmark/analysis.py`` to specify the path to the pkl file and the output path for the png file.
The ``analysis.py`` script reads data from the pkl file and converts it to an image. Modify the Python code in ``rdagent/app/benchmark/factor/analysis.py`` to specify the path to the pkl file and the output path for the png file.

.. code-block:: Properties
python rdagent/app/quant_factor_benchmark/analysis.py
dotenv run -- python rdagent/app/benchmark/factor/analysis.py <log/path to.pkl>
A png file will be saved to the designated path as shown below.

.. image:: ../_static/benchmark.png


Related Paper
-------------

@@ -116,3 +104,6 @@ Related Paper
}
.. image:: https://github.com/user-attachments/assets/494f55d3-de9e-4e73-ba3d-a787e8f9e841

To replicate the benchmark detailed in the paper, please consult the factors listed in the following file: `RD2bench.json <../_static/RD2bench.json>`_.
Please note that ``only_correct_format=False`` should be used when evaluating the results.
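
Assuming the standard ``BENCHMARK_`` environment prefix defined in ``BenchmarkSettings`` (see ``rdagent/components/benchmark/conf.py`` further down in this commit), pointing the benchmark at this file could be done with an ``.env`` entry such as the following; the variable name is derived from the prefix plus the ``bench_data_path`` field and is given here only as an illustration.

.. code-block:: Properties

    BENCHMARK_BENCH_DATA_PATH=docs/_static/RD2bench.json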
24 changes: 17 additions & 7 deletions rdagent/app/benchmark/factor/analysis.py
@@ -13,9 +13,10 @@


class BenchmarkAnalyzer:
def __init__(self, settings):
def __init__(self, settings, only_correct_format=False):
self.settings = settings
self.index_map = self.load_index_map()
self.only_correct_format = only_correct_format

def load_index_map(self):
index_map = {}
@@ -119,11 +120,13 @@ def analyze_data(self, sum_df):
format_succ_rate_f = self.reformat_index(format_succ_rate)

corr = sum_df_clean["FactorCorrelationEvaluator"].fillna(0.0)
corr = corr.unstack().T.mean(axis=0).to_frame("corr(only success)")
corr_res = self.reformat_index(corr)
corr_max = sum_df_clean["FactorCorrelationEvaluator"]
if self.only_correct_format:
corr = corr.loc[format_issue == 1.0]

corr_max = corr_max.unstack().T.max(axis=0).to_frame("corr(only success)")
corr_res = corr.unstack().T.mean(axis=0).to_frame("corr(only success)")
corr_res = self.reformat_index(corr_res)

corr_max = corr.unstack().T.max(axis=0).to_frame("corr(only success)")
corr_max_res = self.reformat_index(corr_max)

value_max = sum_df_clean["FactorEqualValueRatioEvaluator"]
@@ -150,9 +153,15 @@
axis=1,
)

df = result_all.sort_index(axis=1, key=self.result_all_key_order)
df = result_all.sort_index(axis=1, key=self.result_all_key_order).sort_index(axis=0)
print(df)

print()
print(df.groupby("Category").mean())

print()
print(df.mean())

# Calculate the mean of each column
mean_values = df.fillna(0.0).mean()
mean_df = pd.DataFrame(mean_values).T
@@ -196,9 +205,10 @@ def main(
path="git_ignore_folder/eval_results/res_promptV220240724-060037.pkl",
round=1,
title="Comparison of Different Methods",
only_correct_format=False,
):
settings = BenchmarkSettings()
benchmark = BenchmarkAnalyzer(settings)
benchmark = BenchmarkAnalyzer(settings, only_correct_format=only_correct_format)
results = {
f"{round} round experiment": path,
}
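The correlation aggregation above is easier to follow on toy data. The sketch below assumes the series is indexed by ``(factor, round)``; the real index in ``analyze_data`` may carry different or additional levels, so this only illustrates the ``unstack`` / ``mean`` / ``max`` pattern, not the exact data layout.

import pandas as pd

# Toy stand-in for sum_df_clean["FactorCorrelationEvaluator"]:
# one correlation value per (factor, round) pair.
idx = pd.MultiIndex.from_product(
    [["factor_a", "factor_b"], [0, 1]], names=["factor", "round"]
)
corr = pd.Series([0.9, 0.2, 0.7, 0.6], index=idx).fillna(0.0)

wide = corr.unstack().T  # rows: round, columns: factor
mean_corr = wide.mean(axis=0).to_frame("corr(only success)")  # average over rounds
max_corr = wide.max(axis=0).to_frame("corr(only success)")    # best round per factor

print(mean_corr)  # factor_a 0.55, factor_b 0.65
print(max_corr)   # factor_a 0.90, factor_b 0.70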
11 changes: 2 additions & 9 deletions rdagent/app/benchmark/factor/eval.py
@@ -1,16 +1,9 @@
import os
import pickle
import time
from pathlib import Path
from pprint import pprint

from rdagent.app.qlib_rd_loop.conf import FACTOR_PROP_SETTING
from rdagent.components.benchmark.conf import BenchmarkSettings
from rdagent.components.benchmark.eval_method import FactorImplementEval
from rdagent.core.scenario import Scenario
from rdagent.core.utils import import_class
from rdagent.log import rdagent_logger as logger
from rdagent.scenarios.qlib.experiment.factor_experiment import QlibFactorScenario
from rdagent.scenarios.qlib.factor_experiment_loader.json_loader import (
FactorTestCaseLoaderFromJsonFile,
)
@@ -25,7 +18,7 @@
# 3.declare the method to be tested and pass the arguments.

scen: Scenario = import_class(FACTOR_PROP_SETTING.scen)()
generate_method = import_class(bs.bench_method_cls)(scen=scen)
generate_method = import_class(bs.bench_method_cls)(scen=scen, **bs.bench_method_extra_kwargs)
# 4.declare the eval method and pass the arguments.
eval_method = FactorImplementEval(
method=generate_method,
@@ -36,7 +29,7 @@
)

# 5.run the eval
res = eval_method.eval()
res = eval_method.eval(eval_method.develop())

# 6.save the result
logger.log_object(res)
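
Putting the visible fragments together, the end-to-end flow of ``eval.py`` after this change is roughly the sketch below. The test-case loader call and the ``FactorImplementEval`` keyword arguments other than ``method`` are collapsed in this diff, so they are assumptions and marked as such in the comments.

from rdagent.app.qlib_rd_loop.conf import FACTOR_PROP_SETTING
from rdagent.components.benchmark.conf import BenchmarkSettings
from rdagent.components.benchmark.eval_method import FactorImplementEval
from rdagent.core.scenario import Scenario
from rdagent.core.utils import import_class
from rdagent.log import rdagent_logger as logger
from rdagent.scenarios.qlib.factor_experiment_loader.json_loader import (
    FactorTestCaseLoaderFromJsonFile,
)

# 1-2. read and prepare the eval data
# (assumption: the loader consumes bs.bench_data_path)
bs = BenchmarkSettings()
test_cases = FactorTestCaseLoaderFromJsonFile().load(bs.bench_data_path)

# 3. declare the method to be tested and pass the arguments
scen: Scenario = import_class(FACTOR_PROP_SETTING.scen)()
generate_method = import_class(bs.bench_method_cls)(scen=scen, **bs.bench_method_extra_kwargs)

# 4. declare the eval method and pass the arguments
eval_method = FactorImplementEval(
    method=generate_method,
    test_cases=test_cases,           # assumed keyword; collapsed in this diff
    scen=scen,                       # assumed keyword; collapsed in this diff
    test_round=bs.bench_test_round,  # assumed keyword; collapsed in this diff
)

# 5. run the eval: generate the implementations first, then evaluate them
res = eval_method.eval(eval_method.develop())

# 6. save the result
logger.log_object(res)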
5 changes: 1 addition & 4 deletions rdagent/components/benchmark/conf.py
@@ -12,9 +12,6 @@ class Config:
env_prefix = "BENCHMARK_"
"""Use `BENCHMARK_` as prefix for environment variables"""

ground_truth_dir: Path = DIRNAME / "ground_truth"
"""ground truth dir"""

bench_data_path: Path = DIRNAME / "example.json"
"""data for benchmark"""

@@ -24,7 +21,7 @@
bench_test_case_n: Optional[int] = None
"""how many test cases to run; If not given, all test cases will be run"""

bench_method_cls: str = "rdagent.components.coder.CoSTEER.FactorCoSTEER"
bench_method_cls: str = "rdagent.components.coder.factor_coder.FactorCoSTEER"
"""method to be used for test cases"""

bench_method_extra_kwargs: dict = field(
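Because ``env_prefix`` is ``BENCHMARK_``, each field above can be overridden from the environment. A minimal sketch, assuming the standard pydantic-settings naming convention (prefix plus upper-cased field name):

import os

from rdagent.components.benchmark.conf import BenchmarkSettings

# Fields are read from BENCHMARK_-prefixed environment variables when the
# settings object is instantiated, e.g. limiting the run to 5 test cases:
os.environ["BENCHMARK_BENCH_TEST_CASE_N"] = "5"
bs = BenchmarkSettings()
print(bs.bench_test_case_n)  # expected: 5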
8 changes: 2 additions & 6 deletions rdagent/components/coder/factor_coder/eva_utils.py
@@ -221,15 +221,11 @@ def evaluate(
str(resp_dict["output_format_feedback"]),
resp_dict["output_format_decision"],
)

except json.JSONDecodeError as e:
raise ValueError("Failed to decode JSON response from API.") from e

except KeyError as e:
except (KeyError, json.JSONDecodeError) as e:
attempts += 1
if attempts >= max_attempts:
raise KeyError(
"Response from API is missing 'output_format_decision' or 'output_format_feedback' key after multiple attempts."
"Wrong JSON Response or missing 'output_format_decision' or 'output_format_feedback' key after multiple attempts."
) from e

return "Failed to evaluate output format after multiple attempts.", False
22 changes: 14 additions & 8 deletions rdagent/components/coder/factor_coder/evolving_strategy.py
@@ -158,14 +158,20 @@ def implement_one_task(
queried_similar_successful_knowledge_to_render = queried_similar_successful_knowledge_to_render[:-1]
elif len(queried_similar_error_knowledge_to_render) > 0:
queried_similar_error_knowledge_to_render = queried_similar_error_knowledge_to_render[:-1]
code = json.loads(
APIBackend(
use_chat_cache=FACTOR_COSTEER_SETTINGS.coder_use_cache
).build_messages_and_create_chat_completion(
user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True
)
)["code"]
return code
for _ in range(10):
try:
code = json.loads(
APIBackend(
use_chat_cache=FACTOR_COSTEER_SETTINGS.coder_use_cache
).build_messages_and_create_chat_completion(
user_prompt=user_prompt, system_prompt=system_prompt, json_mode=True
)
)["code"]
return code
except json.decoder.JSONDecodeError:
pass
else:
return "" # return empty code if failed to get code after 10 attempts

def assign_code_list_to_evo(self, code_list, evo):
for index in range(len(evo.sub_tasks)):
