
Commit

Refactor make_run_report to work with run_evaluation and run_evaluation_modal
carlosejimenez committed Jan 11, 2025
1 parent 67a0eb3 commit 0aac6d5
Showing 5 changed files with 144 additions and 222 deletions.
136 changes: 136 additions & 0 deletions swebench/harness/reporting.py
@@ -0,0 +1,136 @@
import docker
import json
from pathlib import Path

from swebench.harness.constants import (
    KEY_INSTANCE_ID,
    KEY_MODEL,
    KEY_PREDICTION,
    RUN_EVALUATION_LOG_DIR,
    LOG_REPORT,
)
from swebench.harness.docker_utils import list_images
from swebench.harness.test_spec import make_test_spec


def make_run_report(
    predictions: dict,
    full_dataset: list,
    run_id: str,
    client: docker.DockerClient | None = None,
) -> Path:
    """
    Make a final evaluation and run report of the instances that have been run.
    Also reports on images and containers that may still be running if a client is provided.
    Args:
        predictions (dict): Predictions dict generated by the model
        full_dataset (list): List of all instances
        run_id (str): Run ID
        client (docker.DockerClient): Docker client (optional)
    Returns:
        Path to report file
    """
    # instantiate sets to store IDs of different outcomes
    completed_ids = set()
    resolved_ids = set()
    error_ids = set()
    unstopped_containers = set()
    unremoved_images = set()
    unresolved_ids = set()
    incomplete_ids = set()
    # get instances with empty patches
    empty_patch_ids = set()

    # iterate through dataset and check if the instance has been run
    for instance in full_dataset:
        instance_id = instance[KEY_INSTANCE_ID]
        if instance_id not in predictions:
            # skip instances without predictions
            incomplete_ids.add(instance_id)
            continue
        prediction = predictions[instance_id]
        if prediction.get(KEY_PREDICTION, None) in ["", None]:
            empty_patch_ids.add(instance_id)
            continue
        report_file = (
            RUN_EVALUATION_LOG_DIR
            / run_id
            / prediction[KEY_MODEL].replace("/", "__")
            / prediction[KEY_INSTANCE_ID]
            / LOG_REPORT
        )
        if report_file.exists():
            # If report file exists, then the instance has been run
            completed_ids.add(instance_id)
            report = json.loads(report_file.read_text())
            if report[instance_id]["resolved"]:
                # Record if the instance was resolved
                resolved_ids.add(instance_id)
            else:
                unresolved_ids.add(instance_id)
        else:
            # Otherwise, the instance was not run successfully
            error_ids.add(instance_id)

    if client:
        # get remaining images and containers
        images = list_images(client)
        test_specs = list(map(make_test_spec, full_dataset))
        for spec in test_specs:
            image_name = spec.instance_image_key
            if image_name in images:
                unremoved_images.add(image_name)
        containers = client.containers.list(all=True)
        for container in containers:
            if run_id in container.name:
                unstopped_containers.add(container.name)

    # print final report
    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
    print(f"Total instances: {len(full_dataset)}")
    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
    print(f"Instances completed: {len(completed_ids)}")
    print(f"Instances incomplete: {len(incomplete_ids)}")
    print(f"Instances resolved: {len(resolved_ids)}")
    print(f"Instances unresolved: {len(unresolved_ids)}")
    print(f"Instances with empty patches: {len(empty_patch_ids)}")
    print(f"Instances with errors: {len(error_ids)}")
    if client:
        print(f"Unstopped containers: {len(unstopped_containers)}")
        print(f"Unremoved images: {len(unremoved_images)}")

    # write report to file
    report = {
        "total_instances": len(full_dataset),
        "submitted_instances": len(predictions),
        "completed_instances": len(completed_ids),
        "resolved_instances": len(resolved_ids),
        "unresolved_instances": len(unresolved_ids),
        "empty_patch_instances": len(empty_patch_ids),
        "error_instances": len(error_ids),
        "completed_ids": list(sorted(completed_ids)),
        "incomplete_ids": list(sorted(incomplete_ids)),
        "empty_patch_ids": list(sorted(empty_patch_ids)),
        "submitted_ids": list(sorted(predictions.keys())),
        "resolved_ids": list(sorted(resolved_ids)),
        "unresolved_ids": list(sorted(unresolved_ids)),
        "error_ids": list(sorted(error_ids)),
        "schema_version": 2,
    }
    if client:
        # container/image fields are only collected when a Docker client was provided
        report.update({
            "unstopped_instances": len(unstopped_containers),
            "unstopped_containers": list(sorted(unstopped_containers)),
            "unremoved_images": list(sorted(unremoved_images)),
        })
    report_file = Path(
        list(predictions.values())[0][KEY_MODEL].replace("/", "__")
        + f".{run_id}"
        + ".json"
    )
    with open(report_file, "w") as f:
        print(json.dumps(report, indent=4), file=f)
    print(f"Report written to {report_file}")
    return report_file
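
For context (not part of the commit): a minimal usage sketch of the refactored make_run_report, assuming the public entry points above. Only the function signature and the constant names come from the diff; the dataset name, model name, instance ID, patch string, and run ID below are illustrative placeholders.

# Illustrative sketch only -- not part of this commit.
import docker

from swebench.harness.constants import KEY_INSTANCE_ID, KEY_MODEL, KEY_PREDICTION
from swebench.harness.reporting import make_run_report
from swebench.harness.utils import load_swebench_dataset

# Full instance records are needed because make_test_spec() is applied to
# full_dataset when a Docker client is passed (dataset name is an example).
full_dataset = load_swebench_dataset(name="princeton-nlp/SWE-bench_Lite", split="test")

# Predictions are keyed by instance ID; each value carries the model name,
# the instance ID, and the model patch (placeholder values shown here).
predictions = {
    "astropy__astropy-12907": {
        KEY_INSTANCE_ID: "astropy__astropy-12907",
        KEY_MODEL: "my-model",
        KEY_PREDICTION: "diff --git a/... b/...",
    },
}

# Local Docker evaluation (run_evaluation): pass the client so the report also
# lists unstopped containers and unremoved images for this run_id.
report_path = make_run_report(predictions, full_dataset, "my-run", docker.from_env())

# Remote evaluation (run_evaluation_modal): no local Docker daemon, so omit the
# client and only the instance-level outcome counts are reported.
report_path = make_run_report(predictions, full_dataset, "my-run")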
122 changes: 2 additions & 120 deletions swebench/harness/run_evaluation.py
@@ -48,10 +48,10 @@
from swebench.harness.grading import get_eval_report
from swebench.harness.test_spec import make_test_spec, TestSpec
from swebench.harness.utils import load_swebench_dataset, str2bool, EvaluationError
from swebench.harness.reporting import make_run_report
from swebench.harness.run_evaluation_modal import run_instances_modal



def run_instance(
    test_spec: TestSpec,
    pred: dict,
@@ -349,124 +349,6 @@ def get_dataset_from_preds(
    return dataset


def make_run_report(
    predictions: dict,
    full_dataset: list,
    client: docker.DockerClient,
    run_id: str
) -> Path:
    """
    Make a final evaluation and run report of the instances that have been run.
    Also reports on images and containers that may still be running!
    Args:
        predictions (dict): Predictions dict generated by the model
        full_dataset (list): List of all instances
        client (docker.DockerClient): Docker client
        run_id (str): Run ID
    Returns:
        Path to report file
    """
    # instantiate sets to store IDs of different outcomes
    completed_ids = set()
    resolved_ids = set()
    error_ids = set()
    unstopped_containers = set()
    unremoved_images = set()
    unresolved_ids = set()
    incomplete_ids = set()
    # get instances with empty patches
    empty_patch_ids = set()

    # iterate through dataset and check if the instance has been run
    for instance in full_dataset:
        instance_id = instance[KEY_INSTANCE_ID]
        if instance_id not in predictions:
            # skip instances without predictions
            incomplete_ids.add(instance_id)
            continue
        prediction = predictions[instance_id]
        if prediction.get(KEY_PREDICTION, None) in ["", None]:
            empty_patch_ids.add(instance_id)
            continue
        report_file = (
            RUN_EVALUATION_LOG_DIR
            / run_id
            / prediction[KEY_MODEL].replace("/", "__")
            / prediction[KEY_INSTANCE_ID]
            / LOG_REPORT
        )
        if report_file.exists():
            # If report file exists, then the instance has been run
            completed_ids.add(instance_id)
            report = json.loads(report_file.read_text())
            if report[instance_id]["resolved"]:
                # Record if the instance was resolved
                resolved_ids.add(instance_id)
            else:
                unresolved_ids.add(instance_id)
        else:
            # Otherwise, the instance was not run successfully
            error_ids.add(instance_id)

    # get remaining images and containers
    images = list_images(client)
    test_specs = list(map(make_test_spec, full_dataset))
    for spec in test_specs:
        image_name = spec.instance_image_key
        if image_name in images:
            unremoved_images.add(image_name)
    containers = client.containers.list(all=True)
    for container in containers:
        if run_id in container.name:
            unstopped_containers.add(container.name)

    # print final report
    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
    print(f"Total instances: {len(full_dataset)}")
    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
    print(f"Instances completed: {len(completed_ids)}")
    print(f"Instances incomplete: {len(incomplete_ids)}")
    print(f"Instances resolved: {len(resolved_ids)}")
    print(f"Instances unresolved: {len(unresolved_ids)}")
    print(f"Instances with empty patches: {len(empty_patch_ids)}")
    print(f"Instances with errors: {len(error_ids)}")
    print(f"Unstopped containers: {len(unstopped_containers)}")
    print(f"Unremoved images: {len(unremoved_images)}")

    # write report to file
    report = {
        "total_instances": len(full_dataset),
        "submitted_instances": len(predictions),
        "completed_instances": len(completed_ids),
        "resolved_instances": len(resolved_ids),
        "unresolved_instances": len(unresolved_ids),
        "empty_patch_instances": len(empty_patch_ids),
        "error_instances": len(error_ids),
        "unstopped_instances": len(unstopped_containers),
        "completed_ids": list(sorted(completed_ids)),
        "incomplete_ids": list(sorted(incomplete_ids)),
        "empty_patch_ids": list(sorted(empty_patch_ids)),
        "submitted_ids": list(sorted(predictions.keys())),
        "resolved_ids": list(sorted(resolved_ids)),
        "unresolved_ids": list(sorted(unresolved_ids)),
        "error_ids": list(sorted(error_ids)),
        "unstopped_containers": list(sorted(unstopped_containers)),
        "unremoved_images": list(sorted(unremoved_images)),
        "schema_version": 2,
    }
    report_file = Path(
        list(predictions.values())[0][KEY_MODEL].replace("/", "__")
        + f".{run_id}"
        + ".json"
    )
    with open(report_file, "w") as f:
        print(json.dumps(report, indent=4), file=f)
    print(f"Report written to {report_file}")
    return report_file


def get_gold_predictions(dataset_name: str, split: str):
"""
Get gold predictions for the given dataset and split.
@@ -559,7 +441,7 @@ def main(

    # clean images + make final report
    clean_images(client, existing_images, cache_level, clean)
    make_run_report(predictions, full_dataset, client, run_id)
    make_run_report(predictions, full_dataset, run_id, client)

if __name__ == "__main__":
    parser = ArgumentParser()
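
As a follow-up sketch (not part of the commit): reading back the report that main() writes via make_run_report(predictions, full_dataset, run_id, client). The "<model>.<run_id>.json" filename pattern follows the Path(...) construction in reporting.py; the model name and run ID here are placeholders.

# Illustrative sketch only -- not part of this commit.
import json
from pathlib import Path

# make_run_report writes "<model with '/' replaced by '__'>.<run_id>.json"
report = json.loads(Path("my-model.my-run.json").read_text())
print(report["resolved_instances"], "of", report["total_instances"], "resolved")
print("errors:", report["error_ids"])
# "unstopped_containers" and "unremoved_images" are present only when a
# Docker client was passed to make_run_report.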
