
Commit

Refactor make_run_report to work with run_evaluation and run_evaluation_modal
carlosejimenez committed Jan 11, 2025
1 parent 67a0eb3 commit 0aac6d5
Showing 5 changed files with 144 additions and 222 deletions.
136 changes: 136 additions & 0 deletions swebench/harness/reporting.py
@@ -0,0 +1,136 @@
import docker
import json
from pathlib import Path

from swebench.harness.constants import (
    KEY_INSTANCE_ID,
    KEY_MODEL,
    KEY_PREDICTION,
    RUN_EVALUATION_LOG_DIR,
    LOG_REPORT,
)
from swebench.harness.docker_utils import list_images
from swebench.harness.test_spec import make_test_spec


def make_run_report(
    predictions: dict,
    full_dataset: list,
    run_id: str,
    client: docker.DockerClient | None = None,
) -> Path:
    """
    Make a final evaluation and run report of the instances that have been run.
    Also reports on images and containers that may still be running if a client is provided.
    Args:
        predictions (dict): Predictions dict generated by the model
        full_dataset (list): List of all instances
        run_id (str): Run ID
        client (docker.DockerClient): Docker client (optional)
    Returns:
        Path to report file
    """
    # instantiate sets to store IDs of different outcomes
    completed_ids = set()
    resolved_ids = set()
    error_ids = set()
    unstopped_containers = set()
    unremoved_images = set()
    unresolved_ids = set()
    incomplete_ids = set()
    # get instances with empty patches
    empty_patch_ids = set()

    # iterate through dataset and check if the instance has been run
    for instance in full_dataset:
        instance_id = instance[KEY_INSTANCE_ID]
        if instance_id not in predictions:
            # skip instances without predictions
            incomplete_ids.add(instance_id)
            continue
        prediction = predictions[instance_id]
        if prediction.get(KEY_PREDICTION, None) in ["", None]:
            empty_patch_ids.add(instance_id)
            continue
        report_file = (
            RUN_EVALUATION_LOG_DIR
            / run_id
            / prediction[KEY_MODEL].replace("/", "__")
            / prediction[KEY_INSTANCE_ID]
            / LOG_REPORT
        )
        if report_file.exists():
            # If report file exists, then the instance has been run
            completed_ids.add(instance_id)
            report = json.loads(report_file.read_text())
            if report[instance_id]["resolved"]:
                # Record if the instance was resolved
                resolved_ids.add(instance_id)
            else:
                unresolved_ids.add(instance_id)
        else:
            # Otherwise, the instance was not run successfully
            error_ids.add(instance_id)

    if client:
        # get remaining images and containers
        images = list_images(client)
        test_specs = list(map(make_test_spec, full_dataset))
        for spec in test_specs:
            image_name = spec.instance_image_key
            if image_name in images:
                unremoved_images.add(image_name)
        containers = client.containers.list(all=True)
        for container in containers:
            if run_id in container.name:
                unstopped_containers.add(container.name)

    # print final report
    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
    print(f"Total instances: {len(full_dataset)}")
    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
    print(f"Instances completed: {len(completed_ids)}")
    print(f"Instances incomplete: {len(incomplete_ids)}")
    print(f"Instances resolved: {len(resolved_ids)}")
    print(f"Instances unresolved: {len(unresolved_ids)}")
    print(f"Instances with empty patches: {len(empty_patch_ids)}")
    print(f"Instances with errors: {len(error_ids)}")
    if client:
        print(f"Unstopped containers: {len(unstopped_containers)}")
        print(f"Unremoved images: {len(unremoved_images)}")

    # write report to file
    report = {
        "total_instances": len(full_dataset),
        "submitted_instances": len(predictions),
        "completed_instances": len(completed_ids),
        "resolved_instances": len(resolved_ids),
        "unresolved_instances": len(unresolved_ids),
        "empty_patch_instances": len(empty_patch_ids),
        "error_instances": len(error_ids),
        "completed_ids": list(sorted(completed_ids)),
        "incomplete_ids": list(sorted(incomplete_ids)),
        "empty_patch_ids": list(sorted(empty_patch_ids)),
        "submitted_ids": list(sorted(predictions.keys())),
        "resolved_ids": list(sorted(resolved_ids)),
        "unresolved_ids": list(sorted(unresolved_ids)),
        "error_ids": list(sorted(error_ids)),
        "schema_version": 2,
    }
    if client:
        # container/image fields are only collected when a Docker client was provided
        report.update({
            "unstopped_instances": len(unstopped_containers),
            "unstopped_containers": list(sorted(unstopped_containers)),
            "unremoved_images": list(sorted(unremoved_images)),
        })
    report_file = Path(
        list(predictions.values())[0][KEY_MODEL].replace("/", "__")
        + f".{run_id}"
        + ".json"
    )
    with open(report_file, "w") as f:
        print(json.dumps(report, indent=4), file=f)
    print(f"Report written to {report_file}")
    return report_file
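
For context (not part of the commit): a minimal usage sketch of the refactored make_run_report, assuming the public entry points above. Only the function signature and the constant names come from the diff; the dataset name, model name, instance ID, patch string, and run ID below are illustrative placeholders.

# Illustrative sketch only -- not part of this commit.
import docker

from swebench.harness.constants import KEY_INSTANCE_ID, KEY_MODEL, KEY_PREDICTION
from swebench.harness.reporting import make_run_report
from swebench.harness.utils import load_swebench_dataset

# Full instance records are needed because make_test_spec() is applied to
# full_dataset when a Docker client is passed (dataset name is an example).
full_dataset = load_swebench_dataset(name="princeton-nlp/SWE-bench_Lite", split="test")

# Predictions are keyed by instance ID; each value carries the model name,
# the instance ID, and the model patch (placeholder values shown here).
predictions = {
    "astropy__astropy-12907": {
        KEY_INSTANCE_ID: "astropy__astropy-12907",
        KEY_MODEL: "my-model",
        KEY_PREDICTION: "diff --git a/... b/...",
    },
}

# Local Docker evaluation (run_evaluation): pass the client so the report also
# lists unstopped containers and unremoved images for this run_id.
report_path = make_run_report(predictions, full_dataset, "my-run", docker.from_env())

# Remote evaluation (run_evaluation_modal): no local Docker daemon, so omit the
# client and only the instance-level outcome counts are reported.
report_path = make_run_report(predictions, full_dataset, "my-run")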
122 changes: 2 additions & 120 deletions swebench/harness/run_evaluation.py
@@ -48,10 +48,10 @@
from swebench.harness.grading import get_eval_report
from swebench.harness.test_spec import make_test_spec, TestSpec
from swebench.harness.utils import load_swebench_dataset, str2bool, EvaluationError
from swebench.harness.reporting import make_run_report
from swebench.harness.run_evaluation_modal import run_instances_modal



def run_instance(
    test_spec: TestSpec,
    pred: dict,
@@ -349,124 +349,6 @@ def get_dataset_from_preds(
    return dataset


def make_run_report(
    predictions: dict,
    full_dataset: list,
    client: docker.DockerClient,
    run_id: str
) -> Path:
    """
    Make a final evaluation and run report of the instances that have been run.
    Also reports on images and containers that may still be running!
    Args:
        predictions (dict): Predictions dict generated by the model
        full_dataset (list): List of all instances
        client (docker.DockerClient): Docker client
        run_id (str): Run ID
    Returns:
        Path to report file
    """
    # instantiate sets to store IDs of different outcomes
    completed_ids = set()
    resolved_ids = set()
    error_ids = set()
    unstopped_containers = set()
    unremoved_images = set()
    unresolved_ids = set()
    incomplete_ids = set()
    # get instances with empty patches
    empty_patch_ids = set()

    # iterate through dataset and check if the instance has been run
    for instance in full_dataset:
        instance_id = instance[KEY_INSTANCE_ID]
        if instance_id not in predictions:
            # skip instances without predictions
            incomplete_ids.add(instance_id)
            continue
        prediction = predictions[instance_id]
        if prediction.get(KEY_PREDICTION, None) in ["", None]:
            empty_patch_ids.add(instance_id)
            continue
        report_file = (
            RUN_EVALUATION_LOG_DIR
            / run_id
            / prediction[KEY_MODEL].replace("/", "__")
            / prediction[KEY_INSTANCE_ID]
            / LOG_REPORT
        )
        if report_file.exists():
            # If report file exists, then the instance has been run
            completed_ids.add(instance_id)
            report = json.loads(report_file.read_text())
            if report[instance_id]["resolved"]:
                # Record if the instance was resolved
                resolved_ids.add(instance_id)
            else:
                unresolved_ids.add(instance_id)
        else:
            # Otherwise, the instance was not run successfully
            error_ids.add(instance_id)

    # get remaining images and containers
    images = list_images(client)
    test_specs = list(map(make_test_spec, full_dataset))
    for spec in test_specs:
        image_name = spec.instance_image_key
        if image_name in images:
            unremoved_images.add(image_name)
    containers = client.containers.list(all=True)
    for container in containers:
        if run_id in container.name:
            unstopped_containers.add(container.name)

    # print final report
    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
    print(f"Total instances: {len(full_dataset)}")
    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
    print(f"Instances completed: {len(completed_ids)}")
    print(f"Instances incomplete: {len(incomplete_ids)}")
    print(f"Instances resolved: {len(resolved_ids)}")
    print(f"Instances unresolved: {len(unresolved_ids)}")
    print(f"Instances with empty patches: {len(empty_patch_ids)}")
    print(f"Instances with errors: {len(error_ids)}")
    print(f"Unstopped containers: {len(unstopped_containers)}")
    print(f"Unremoved images: {len(unremoved_images)}")

    # write report to file
    report = {
        "total_instances": len(full_dataset),
        "submitted_instances": len(predictions),
        "completed_instances": len(completed_ids),
        "resolved_instances": len(resolved_ids),
        "unresolved_instances": len(unresolved_ids),
        "empty_patch_instances": len(empty_patch_ids),
        "error_instances": len(error_ids),
        "unstopped_instances": len(unstopped_containers),
        "completed_ids": list(sorted(completed_ids)),
        "incomplete_ids": list(sorted(incomplete_ids)),
        "empty_patch_ids": list(sorted(empty_patch_ids)),
        "submitted_ids": list(sorted(predictions.keys())),
        "resolved_ids": list(sorted(resolved_ids)),
        "unresolved_ids": list(sorted(unresolved_ids)),
        "error_ids": list(sorted(error_ids)),
        "unstopped_containers": list(sorted(unstopped_containers)),
        "unremoved_images": list(sorted(unremoved_images)),
        "schema_version": 2,
    }
    report_file = Path(
        list(predictions.values())[0][KEY_MODEL].replace("/", "__")
        + f".{run_id}"
        + ".json"
    )
    with open(report_file, "w") as f:
        print(json.dumps(report, indent=4), file=f)
    print(f"Report written to {report_file}")
    return report_file


def get_gold_predictions(dataset_name: str, split: str):
"""
Get gold predictions for the given dataset and split.
@@ -559,7 +441,7 @@ def main(

    # clean images + make final report
    clean_images(client, existing_images, cache_level, clean)
    make_run_report(predictions, full_dataset, client, run_id)
    make_run_report(predictions, full_dataset, run_id, client)

if __name__ == "__main__":
    parser = ArgumentParser()
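
As a follow-up sketch (not part of the commit): reading back the report that main() writes via make_run_report(predictions, full_dataset, run_id, client). The "<model>.<run_id>.json" filename pattern follows the Path(...) construction in reporting.py; the model name and run ID here are placeholders.

# Illustrative sketch only -- not part of this commit.
import json
from pathlib import Path

# make_run_report writes "<model with '/' replaced by '__'>.<run_id>.json"
report = json.loads(Path("my-model.my-run.json").read_text())
print(report["resolved_instances"], "of", report["total_instances"], "resolved")
print("errors:", report["error_ids"])
# "unstopped_containers" and "unremoved_images" are present only when a
# Docker client was passed to make_run_report.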
