```diff
--- test_files/111-original.txt	2025-03-07 19:06:22
+++ test_files/111-modified.txt	2025-03-07 19:06:22
@@ -3,12 +3,13 @@
 import random
 import re
 import time
+import traceback
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import List
 
-from botocore.exceptions import ClientError
-from langchain_aws import BedrockChat
+from langchain_aws import ChatBedrock
 from langchain_core.messages import HumanMessage
+from langchain_openai import ChatOpenAI
 from langgraph.errors import GraphRecursionError
 
 from composio_langgraph import Action
@@ -22,48 +23,118 @@
 max_retries = 5
 base_delay = 1  # in seconds
 
-bedrock_client = BedrockChat(
-    credentials_profile_name="default",
-    model_id="anthropic.claude-3-5-sonnet-20240620-v1:0",
-    region_name="us-west-2",
-    model_kwargs={"temperature": 0},
-)
 
+MODEL = "openai"
 
-def build_comparison_prompt(
-    repo_name: str, issue_desc: str, patches: List[str], tests_passed: List[str]
-) -> str:
-    patch_str = "\n".join(
-        [
-            "=" * 50
-            + f"\nPatch {i+1}:\nTESTS STATUS: {str(tests_passed[i])}\n\n{patch}"
-            for i, patch in enumerate(patches)
-        ]
-        + ["=" * 50]
-    )
+
+def retry_with_exponential_backoff(func, *args, **kwargs):
+    for attempt in range(max_retries):
+        try:
+            return func(*args, **kwargs)
+        except Exception as e:
+            if attempt == max_retries - 1:
+                raise e
+            delay = (2**attempt) * base_delay
+            time.sleep(delay)
+
+
+def get_llm_response(system_prompt: str, human_prompt: str) -> str:
+    try:
+        if MODEL == "claude":
+            client = ChatBedrock(
+                credentials_profile_name="default",
+                model_id="anthropic.claude-3-5-sonnet-20240620-v1:0",
+                region_name="us-west-2",
+                model_kwargs={"temperature": 0},
+            )
+            response = retry_with_exponential_backoff(
+                client.invoke, [("system", system_prompt), ("human", human_prompt)]
+            )
+        else:
+            client = ChatOpenAI(
+                model="o1-mini",
+                temperature=1,
+                max_completion_tokens=4096,
+                api_key="<OPENAI_API_KEY>",
+            )
+            response = retry_with_exponential_backoff(
+                client.invoke, [("human", human_prompt)]
+            )
+        return response.content
+    except Exception:
+        return f"Error while calling llm {MODEL}: \n{traceback.format_exc()}\n"
+
+
+def build_comparison_prompt(repo_name: str, issue_desc: str, patch_str: str) -> str:
     return """
-I facing the following issue in the repo {repo_name}. You have an older version of the codebase, so you your belief about the 
-codebase might be outdated.
+I am facing the following issue in the repo {repo_name}. You have an older version of the codebase, so your belief about the 
+codebase might be outdated. Some agents tried to solve the issue and generated patches. Your task is to choose the best patch that fixes the issue,
+or ask the agents to run again if you are not confident that the patches fix the issue. To help you, I have also provided a summary of the 
+run of the agent. 
 
 Issue Description:
 {issue_desc}
 
-You are given multiple patches and you need to check which one fixes the issue. 
+You are given multiple patches and details of the agent's run, and you need to check which one fixes the issue. 
 Only one of the patch will fix the issue.
 
 {patch_str}
 
 First analyse all the patches thoroughly and then choose the best patch that fixes the issue. You need to 
 consider all the edge cases very carefully. The chosen patch might be more verbose, but it should pass all the 
-possible test cases regarding the issue.
+possible test cases regarding the issue. Choose a patch if you are ABSOLUTELY SURE THAT THE PATCH SOLVES THE ISSUE.
 
+If you feel none of the patches fixes the issue, respond with "RUN AGAIN". Also give a detailed one paragraph of reasoning why you feel
+none of the patches is a correct solution to the problem. Analyse the runs of the agents as well and describe what the agents did wrong. The reasoning must focus
+on the aspects that the agent should take care of in future runs, as well as a summary of the patches generated by the agents.
+
+
 NOTE: ONLY JUDGE THE PATCHES BASED ON THE CHANGES IN THE SOURCE CODE.
 IGNORE THE CHANGES IN THE TESTS, DOCS OR OTHER FILES.
-GIVE PREFERENCE TO THE PATCHES THAT PASS THE TESTS.
+RESPOND WITH THE PATCH NUMBER AND REASONING ONLY IF YOU ARE ABSOLUTELY CONFIDENT THAT THE PATCH FIXES THE ISSUE. OTHERWISE, RESPOND WITH "RUN AGAIN" AND PROPER REASONING.
+YOU DON'T NEED TO WORRY ABOUT THE TESTS. ONLY JUDGE THE PATCHES BASED ON THE CHANGES IN SOURCE CODE.
 
-I am reiterating the issue again:
+If you are absolutely confident that one of the patches fixes the issue and decide to submit it, provide your response in the following format:
+{{
+    "patch": "The number of the patch that best fixes the issue (1, 2, 3, ...)",
+    "reasoning": "Your explanation for why the chosen patch fixes the issue",
+    "confidence": "How confident are you that the patch fixes the issue? (0-100)"
+}}
+
+If you feel that none of the patches fixes the issue and decide to reject the patches and run again, provide your response in the following format:
+{{
+    "patch": "RUN AGAIN",
+    "reasoning": "The detailed reason why none of the patches can fix the issue. Summarise the patches as well, so that the next software engineer has the whole context about the
+patches and the reasons for their failures." 
+}}
+Please adhere to the json format strictly.
+""".format(
+        repo_name=repo_name, issue_desc=issue_desc, patch_str=patch_str
+    )
+
+
+def build_comparison_prompt_hard(
+    repo_name: str, issue_desc: str, patch_str: str
+) -> str:
+
+    return """
+I am facing the following issue in the repo {repo_name}. You have an older version of the codebase, so your belief about the 
+codebase might be outdated. Some agents tried to solve the issue and generated patches. Your task is to choose the best patch that fixes the issue.
+Issue Description:
 {issue_desc}
 
+You are given multiple patches and details of the agent's run, and you need to check which one fixes the issue. 
+Only one of the patch will fix the issue.
+
+{patch_str}
+
+First analyse all the patches thoroughly and then choose the best patch that fixes the issue. You need to 
+consider all the edge cases very carefully. The chosen patch might be more verbose, but it should pass all the 
+possible test cases regarding the issue.
+
+NOTE: ONLY JUDGE THE PATCHES BASED ON THE CHANGES IN THE SOURCE CODE.
+IGNORE THE CHANGES IN THE TESTS, DOCS OR OTHER FILES.
+
 Provide your response in the following format:
 {{
     "patch": "The number of the patch that best fixes the issue (1, 2, 3, ...)",
@@ -74,296 +145,208 @@
     )
 
 
-def bench(workspace_ids: str, issue_config: IssueConfig) -> str:
-    with ThreadPoolExecutor(max_workers=3) as executor:
-        futures = [
-            executor.submit(run_agent_function, workspace_id, issue_config)
-            for workspace_id in workspace_ids
-        ]
-        results = []
-        for future in as_completed(futures):
-            try:
-                result = future.result()
-                if result:
-                    results.append(result)
-            except Exception as e:
-                print(f"Error in future: {e}")
+def choose_patch(
+    patches, issue_config: IssueConfig, run_contents: List[str], hard=False
+):
+    if not patches:
+        return "", False
 
-    patches, tests_passed = zip(*results)
-    valid_results = [
-        (patch, test_passed)
-        for patch, test_passed in results
-        if patch and patch.strip()
-    ]
-    if not valid_results:
-        return ""
+    run_summaries = []
+    for run_content in run_contents:
+        summary_response = get_llm_response(
+            system_prompt="You are an expert summarizer of agent's output.",
+            human_prompt=f"The following is the run of the agent after it tried to fix the issue. Analyse the contents and messages of the run and give a short summary of what the agent did. \n{run_content}. Provide the output in the form of 5-7 chronological points.",  # noqa: E501
+        )
+        run_summaries.append(summary_response)
 
-    sorted_results = sorted(valid_results, key=lambda x: "PASS" in x[1])
-    patches, tests_passed = zip(*sorted_results)
+    patch_str = ""
+    for i, patch in enumerate(patches):
+        run_summary = run_summaries[i]
+        patch_str += "=" * 50
+        patch_str += f"\nPatch {i+1}:\n{patch}"
+        if not hard:
+            patch_str += f"\nSummary of the agent:\n{run_summary}\n"
+    patch_str += "=" * 50
 
-    for attempt in range(max_retries):
-        try:
-            response = bedrock_client.invoke(
-                [
-                    ("system", "You are a software engineer expert at solving bugs."),
-                    (
-                        "human",
-                        build_comparison_prompt(
-                            repo_name=issue_config.repo_name.split("/")[-1],
-                            issue_desc=issue_config.issue_desc,
-                            patches=patches,
-                            tests_passed=tests_passed,
-                        ),
-                    ),
-                ]
-            )
-            break  # If successful, break out of the retry loop
-        except ClientError:
-            if attempt == max_retries - 1:  # If this was the last attempt
-                raise  # Re-raise the last exception
-            delay = (2**attempt) * base_delay  # Exponential backoff
-            time.sleep(delay)
+    if not hard:
+        response = get_llm_response(
+            system_prompt="You are a software engineer expert at solving bugs.",
+            human_prompt=build_comparison_prompt(
+                repo_name=issue_config.repo_name.split("/")[-1],
+                issue_desc=issue_config.issue_desc,
+                patch_str=patch_str,
+            ),
+        )
+    else:
+        response = get_llm_response(
+            system_prompt="You are a software engineer expert at solving bugs.",
+            human_prompt=build_comparison_prompt_hard(
+                repo_name=issue_config.repo_name.split("/")[-1],
+                issue_desc=issue_config.issue_desc,
+                patch_str=patch_str,
+            ),
+        )
+    if "RUN AGAIN" in response:
+        return response, False
+
     print("Response", response)
-    if response.content:
+    if response:
         try:
-            match = re.search(r"patch.*?(\d+)", response.content, re.IGNORECASE)
+            match = re.search(r"patch.*?(\d+)", response, re.IGNORECASE)
             if match:
                 patch_number = int(match.group(1))
                 if 1 <= patch_number <= len(patches):
-                    return patches[patch_number - 1]
+                    return patches[patch_number - 1], True
 
-            print("\n" * 10)
-            return random.choice(patches)
+            open("error.txt", "w").write(response)
+            return random.choice(patches), True
         except Exception as e:
-            print("\n" * 10)
+            open("error.txt", "w").write(response)
             print(f"Error in response: {e}")
-            return random.choice(patches)
+            return random.choice(patches), True
     else:
-        print("\n" * 10)
+        open("error.txt", "w").write(response)
         print("No response content found")
-        return random.choice(patches)
+        return random.choice(patches), True
 
 
-def run_agent_function(workspace_id: str, issue_config: IssueConfig) -> str:
-    """Run benchmark on the agent."""
+def bench(workspace_ids: str, issue_config: IssueConfig) -> str:
+    patch = ""
+    patch_list = []
+    for _ in range(3):
+        with ThreadPoolExecutor(max_workers=3) as executor:
+            futures = [
+                executor.submit(run_agent_function, workspace_id, issue_config, patch)
+                for workspace_id in workspace_ids
+            ]
+            patches = []
+            run_contents = []
+            for future in as_completed(futures):
+                try:
+                    patch, run_content = future.result()
+                    if patch:
+                        patches.append(patch)
+                    run_contents.append(run_content)
+                except Exception as e:
+                    print(f"Error in future: {e}")
+            patch_list.extend(patches)
 
-    # Set the workspace for the tools to run.
-    # import pdb; pdb.set_trace()
-    print("Running agent function")
-    test_command = issue_config.test_command
-    if "sphinx" in issue_config.repo_name:
-        eval_script = issue_config.eval_script
-        new_files = []
-        for i in range(len(eval_script.split("\n"))):
-            lines = eval_script.split("\n")[i]
-            if lines.startswith("new"):
-                new_files.append(eval_script.split("\n")[i - 1].split(" ")[-1][2:])
-        test_command = eval_script.splitlines()[-2]
-        test_command = test_command.replace("--current-env -e", "-e ")
-        for test in new_files:
-            test_command = test_command.replace(test, "")
-        test_command = " ".join(test_command.split())
-        if test_command.endswith("--"):
-            test_command = "echo PASSED"
+        patch, success = choose_patch(patches, issue_config, run_contents)
 
-    graph, composio_toolset, test_response_file = get_agent_graph(
-        repo_name=issue_config.repo_name.split("/")[-1],
-        workspace_id=workspace_id,
-        test_command=test_command,
-    )
+        if success:
+            return patch
 
-    # get the git tree
-    git_tree_response = composio_toolset.execute_action(
-        action=Action.FILETOOL_GIT_REPO_TREE,
-        params={},
-    )
-    print("Result of git tree", git_tree_response)
+    patch, success = choose_patch(patches, issue_config, run_contents, hard=True)
+    return patch
 
-    cd_response = composio_toolset.execute_action(
-        action=Action.SHELLTOOL_EXEC_COMMAND,
-        params={"cmd": f"cd ~/{issue_config.repo_name.split('/')[-1]}"},
-    )
-    print("Result of cd", cd_response)
 
-    # kick off the crew on the issue.
-
-    if "matplotlib" in issue_config.repo_name or "seaborn" in issue_config.repo_name:
-        composio_toolset.execute_action(
-            action=Action.SHELLTOOL_EXEC_COMMAND,
-            params={"cmd": "python -m pip install pytest"},
-        )
-
-    if "scikit-learn" in issue_config.repo_name:
-        composio_toolset.execute_action(
-            action=Action.SHELLTOOL_EXEC_COMMAND,
-            params={"cmd": "python -m pip install cython==0.29.33 pytest"},
-        )
-        composio_toolset.execute_action(
-            action=Action.SHELLTOOL_EXEC_COMMAND,
-            params={
-                "cmd": "python -m pip install -v --no-use-pep517 --no-build-isolation -e ."
-            },
-        )
-
-    if "matplotlib" in issue_config.repo_name:
-        composio_toolset.execute_action(
-            action=Action.SHELLTOOL_EXEC_COMMAND,
-            params={"cmd": "python -m pip install -e ."},
-        )
-
-    if "astropy" in issue_config.repo_name:
-        composio_toolset.execute_action(
-            action=Action.SHELLTOOL_EXEC_COMMAND,
-            params={
-                "cmd": 'sed -i \'s/requires = ["setuptools",/requires = ["setuptools==68.0.0",/\' pyproject.toml'
-            },
-        )
-        composio_toolset.execute_action(
-            action=Action.SHELLTOOL_EXEC_COMMAND,
-            params={"cmd": "python -m pip install -e .[test] --verbose"},
-        )
-        composio_toolset.execute_action(
-            action=Action.SHELLTOOL_EXEC_COMMAND,
-            params={"cmd": "sleep 120"},
-        )
-        composio_toolset.execute_action(
-            action=Action.SHELLTOOL_EXEC_COMMAND,
-            params={"cmd": "python -m pip install -e .[test] --verbose"},
-        )
-
-    if "pytest" in issue_config.repo_name:
-        composio_toolset.execute_action(
-            action=Action.SHELLTOOL_EXEC_COMMAND,
-            params={"cmd": "python -m pip install -e ."},
-        )
-        composio_toolset.execute_action(
-            action=Action.SHELLTOOL_EXEC_COMMAND,
-            params={
-                "cmd": "sed -i \"s/__version__ = version = '.*'/__version__ = version = '8'/\" src/_pytest/_version.py"
-            },
-        )
-        composio_toolset.execute_action(
-            action=Action.SHELLTOOL_EXEC_COMMAND,
-            params={
-                "cmd": 'sed -i "s/__version_tuple__ = version_tuple = .*/__version_tuple__ = version_tuple = (8)/" src/_pytest/_version.py'
-            },
-        )
-
-    if "sympy" in issue_config.repo_name:
-        composio_toolset.execute_action(
-            action=Action.SHELLTOOL_EXEC_COMMAND,
-            params={"cmd": "python -m pip install regex"},
-        )
-
-    if "sphinx" in issue_config.repo_name:
-        for line in issue_config.install_repo_script.split("\n"):
-            if "sed" in line:
-                composio_toolset.execute_action(
-                    action=Action.SHELLTOOL_EXEC_COMMAND,
-                    params={"cmd": line},
-                )
-        composio_toolset.execute_action(
-            action=Action.SHELLTOOL_EXEC_COMMAND,
-            params={
-                "cmd": 'git add -u && git config --global user.email "example@example.com" && git config --global user.name "composio" && git commit -m "fix: setup complete"'
-            },
-        )
-
-    test_response = composio_toolset.execute_action(
-        action=Action.SHELLTOOL_TEST_COMMAND,
-        params={},
-    )
-    print("Result of test command", test_response)
-
-    try:
-        graph.invoke(
-            {
-                "messages": [
-                    HumanMessage(
-                        content=f"{issue_config.issue_desc} in the repo: {issue_config.repo_name}. Output to git tree command {git_tree_response}"
-                    )
-                ]
-            },
-            {
-                "recursion_limit": 70,
-                "timeout": 1200,
-            },  # Added timeout of 20 minutes (1200 seconds)
-        )
-    except GraphRecursionError as e:
-        print(f"GraphRecursionError: {e}")
-    except Exception as e:
-        print(f"Error in graph.invoke: {e}")
-
-    if not os.path.exists(test_response_file):
-        print(f"Test response file {test_response_file} does not exist")
-        test_response = "NOT RUN"
-    else:
-        with open(test_response_file, "r") as f:
-            test_response = f.read()
-        os.remove(test_response_file)
-
-    cwd_response = composio_toolset.execute_action(
+def get_patch_from_response(composio_toolset, repo_name):
+    composio_toolset.execute_action(
         action=Action.FILETOOL_CHANGE_WORKING_DIRECTORY,
-        params={"path": f"/home/user/{issue_config.repo_name.split('/')[-1]}"},
+        params={"path": f"/home/user/{repo_name}"},
     )
-    print("Result of pwd", cwd_response)
+    # if "astropy" in repo_name:
+    #     composio_toolset.execute_action(
+    #         action=Action.SHELLTOOL_EXEC_COMMAND,
+    #         params={"cmd": "git config --global user.email \"you@example.com\""},
+    #     )
+    #     composio_toolset.execute_action(
+    #         action=Action.SHELLTOOL_EXEC_COMMAND,
+    #         params={"cmd": "git config --global user.name \"Your Name\""},
+    #     )
+    #     composio_toolset.execute_action(
+    #         action=Action.SHELLTOOL_EXEC_COMMAND,
+    #         params={"cmd": "git restore --staged :/"},
+    #     )
+    #     composio_toolset.execute_action(
+    #         action=Action.SHELLTOOL_EXEC_COMMAND,
+    #         params={"cmd": "chown -R root:root .git"},
+    #     )
+    #     composio_toolset.execute_action(
+    #         action=Action.SHELLTOOL_EXEC_COMMAND,
+    #         params={"cmd": "git add astropy_helpers && git commit -m \"add submodule\""},
+    #     )
+
     get_patch_resp = composio_toolset.execute_action(
         action=Action.FILETOOL_GIT_PATCH,
         params={},
     )
 
-    print(f"Get patch response: {get_patch_resp}")
     if not get_patch_resp.get("successful", False):
         error_message = get_patch_resp.get("error")
         if error_message:
             print(f"Error in get_patch: {error_message}")
-            return "", "FAIL"
+            return ""
         else:
             print("Unknown error occurred in get_patch")
-            return "", "FAIL"
+            return ""
 
     patch_data = get_patch_resp.get("data", {})
     if not patch_data:
         print("No data found in the patch response")
-        return "", "FAIL"
+        return ""
     patch = patch_data.get("patch")
     if not patch:
         error = patch_data.get("error")
         if error:
             print(f"Error in patch data: {error}")
-            return "", "FAIL"
+            return ""
         else:
             print("No patch found in the response data")
-            return "", "FAIL"
+            return ""
 
-    response_prompt = f"""
-    You are given the following test response:
-    {test_response}
+    print(f"Final Patch: {patch}")
+    return patch
 
-    You need to check if the test response is successful.
-    If the test response is successful, respond with "PASS".
-    If the test response is not successful, respond with "FAIL".
-    """
 
-    for attempt in range(max_retries):
-        try:
-            response = bedrock_client.invoke(
-                [
-                    ("system", "You are expert at reading test responses."),
-                    ("human", response_prompt),
-                ]
-            )
-            break  # If successful, break out of the retry loop
-        except ClientError:
-            if attempt == max_retries - 1:  # If this was the last attempt
-                break  # Break out of the retry loop on the last attempt
-            delay = (2**attempt) * base_delay  # Exponential backoff
-            time.sleep(delay)
+def run_agent_function(
+    workspace_id: str, issue_config: IssueConfig, previous_patch_str: str = ""
+):
+    """Run benchmark on the agent."""
 
-    print(f"Final Patch: {patch}")
-    return patch, "PASS" if "PASS" in response.content else "FAIL"
+    graph, composio_toolset, run_file = get_agent_graph(
+        repo_name=issue_config.repo_name.split("/")[-1], workspace_id=workspace_id
+    )
 
+    # get the git tree
+    git_tree_response = composio_toolset.execute_action(
+        action=Action.FILETOOL_GIT_REPO_TREE,
+        params={},
+    )
 
+    composio_toolset.execute_action(
+        action=Action.SHELLTOOL_EXEC_COMMAND,
+        params={"cmd": f"cd ~/{issue_config.repo_name.split('/')[-1]}"},
+    )
+
+    if previous_patch_str != "":
+        issue_desc = f"{issue_config.issue_desc}\n. I have already tried to solve this problem before, but failed for the following reason: \n {previous_patch_str}.\n The previous patches did not fix the issue. Now try again to fix the issue. {issue_config.issue_desc}. \n Output to git tree command {git_tree_response}. Pay attention to the reason why patch failed to solve the issue and try something different to fix the issue."  # noqa: E501
+    else:
+        issue_desc = f"{issue_config.issue_desc}.\n Output to git tree command {git_tree_response}"
+
+    try:
+        graph.invoke(
+            {"messages": [HumanMessage(content=issue_desc)]},
+            {"recursion_limit": 50},
+        )
+    except GraphRecursionError as e:
+        print(f"GraphRecursionError: {e}")
+    except Exception as e:
+        print(f"Error in graph.invoke: {e}")
+
+    patch = get_patch_from_response(
+        composio_toolset, issue_config.repo_name.split("/")[-1]
+    )
+    run_content = open(run_file, "r").read()
+    os.remove(run_file)
+    composio_toolset.execute_action(
+        action=Action.SHELLTOOL_EXEC_COMMAND,
+        params={"cmd": f"git reset --hard {issue_config.base_commit_id}"},
+    )
+
+    return patch, run_content
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Run benchmark on the agent.",
@@ -387,6 +370,18 @@
         default="temp",
         help="Run id",
     )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="princeton-nlp/SWE-bench_Verified",
+        help="Dataset",
+    )
+    parser.add_argument(
+        "--num-instances",
+        type=int,
+        default=3,
+        help="Number of instances",
+    )
     args = parser.parse_args()
 
     if args.test_instance_ids:
@@ -400,11 +395,11 @@
 
     evaluate(
         bench,
+        dataset_name=args.dataset,
         dry_run=False,
         test_range=test_range,
         include_hints=False,
         test_instance_ids=test_instance_ids_list,
         run_id=args.run_id,
-        num_instances=3,
-        # image_name="composio/composio:dev", # if you are doing local dev
+        num_instances=args.num_instances,
     )
```
