Refactor make_run_report to work with run_evaluation and run_evaluation_modal
1 parent 67a0eb3 · commit 0aac6d5
Showing 5 changed files with 144 additions and 222 deletions.
@@ -0,0 +1,136 @@
import docker
import json
from pathlib import Path

from swebench.harness.constants import (
    KEY_INSTANCE_ID,
    KEY_MODEL,
    KEY_PREDICTION,
    RUN_EVALUATION_LOG_DIR,
    LOG_REPORT,
)
from swebench.harness.docker_utils import list_images
from swebench.harness.test_spec import make_test_spec


def make_run_report(
    predictions: dict,
    full_dataset: list,
    run_id: str,
    client: docker.DockerClient | None = None,
) -> Path:
""" | ||
Make a final evaluation and run report of the instances that have been run. | ||
Also reports on images and containers that may still running if client is provided. | ||
Args: | ||
predictions (dict): Predictions dict generated by the model | ||
full_dataset (list): List of all instances | ||
run_id (str): Run ID | ||
client (docker.DockerClient): Docker client (optional) | ||
Returns: | ||
Path to report file | ||
""" | ||
    # instantiate sets to store IDs of different outcomes
    completed_ids = set()
    resolved_ids = set()
    error_ids = set()
    unstopped_containers = set()
    unremoved_images = set()
    unresolved_ids = set()
    incomplete_ids = set()
    # get instances with empty patches
    empty_patch_ids = set()

    # iterate through dataset and check if the instance has been run
    for instance in full_dataset:
        instance_id = instance[KEY_INSTANCE_ID]
        if instance_id not in predictions:
            # skip instances without predictions
            incomplete_ids.add(instance_id)
            continue
        prediction = predictions[instance_id]
        if prediction.get(KEY_PREDICTION, None) in ["", None]:
            empty_patch_ids.add(instance_id)
            continue
        report_file = (
            RUN_EVALUATION_LOG_DIR
            / run_id
            / prediction[KEY_MODEL].replace("/", "__")
            / prediction[KEY_INSTANCE_ID]
            / LOG_REPORT
        )
        if report_file.exists():
            # If report file exists, then the instance has been run
            completed_ids.add(instance_id)
            report = json.loads(report_file.read_text())
            if report[instance_id]["resolved"]:
                # Record if the instance was resolved
                resolved_ids.add(instance_id)
            else:
                unresolved_ids.add(instance_id)
        else:
            # Otherwise, the instance was not run successfully
            error_ids.add(instance_id)

    if client:
        # get remaining images and containers
        images = list_images(client)
        test_specs = list(map(make_test_spec, full_dataset))
        for spec in test_specs:
            image_name = spec.instance_image_key
            if image_name in images:
                unremoved_images.add(image_name)
        containers = client.containers.list(all=True)
        for container in containers:
            if run_id in container.name:
                unstopped_containers.add(container.name)

    # print final report
    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
    print(f"Total instances: {len(full_dataset)}")
    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
    print(f"Instances completed: {len(completed_ids)}")
    print(f"Instances incomplete: {len(incomplete_ids)}")
    print(f"Instances resolved: {len(resolved_ids)}")
    print(f"Instances unresolved: {len(unresolved_ids)}")
    print(f"Instances with empty patches: {len(empty_patch_ids)}")
    print(f"Instances with errors: {len(error_ids)}")
    if client:
        print(f"Unstopped containers: {len(unstopped_containers)}")
        print(f"Unremoved images: {len(unremoved_images)}")

    # write report to file
    report = {
        "total_instances": len(full_dataset),
        "submitted_instances": len(predictions),
        "completed_instances": len(completed_ids),
        "resolved_instances": len(resolved_ids),
        "unresolved_instances": len(unresolved_ids),
        "empty_patch_instances": len(empty_patch_ids),
        "error_instances": len(error_ids),
        "completed_ids": list(sorted(completed_ids)),
        "incomplete_ids": list(sorted(incomplete_ids)),
        "empty_patch_ids": list(sorted(empty_patch_ids)),
        "submitted_ids": list(sorted(predictions.keys())),
        "resolved_ids": list(sorted(resolved_ids)),
        "unresolved_ids": list(sorted(unresolved_ids)),
        "error_ids": list(sorted(error_ids)),
        "schema_version": 2,
    }
    if client:
        # container/image info is only collected above when a client is provided,
        # so only include it in the report in that case
        report.update({
            "unstopped_instances": len(unstopped_containers),
            "unstopped_containers": list(sorted(unstopped_containers)),
            "unremoved_images": list(sorted(unremoved_images)),
        })
    report_file = Path(
        list(predictions.values())[0][KEY_MODEL].replace("/", "__")
        + f".{run_id}"
        + ".json"
    )
    with open(report_file, "w") as f:
        print(json.dumps(report, indent=4), file=f)
    print(f"Report written to {report_file}")
    return report_file
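For context, a minimal usage sketch of how a caller such as run_evaluation (with a local Docker client) or run_evaluation_modal (without one) might invoke this function once all instances have finished. The import path, the predictions file name, the run ID, and the dataset variable below are illustrative assumptions only; they are not part of this commit.

import json

import docker

from swebench.harness.constants import KEY_INSTANCE_ID
# assumed import path for illustration; the defining module is not shown in this scrape
from swebench.harness.run_evaluation import make_run_report

run_id = "my-eval-run"  # hypothetical run identifier
full_dataset = []  # placeholder: the full list of SWE-bench instances for this run

# assumed predictions format: a JSON list of entries, keyed here by instance ID
with open("predictions.json") as f:
    predictions = {p[KEY_INSTANCE_ID]: p for p in json.load(f)}

# Pass a Docker client to also report unstopped containers and unremoved images;
# pass client=None (as a Modal-based runner would) to skip the Docker-specific checks.
client = docker.from_env()
report_path = make_run_report(predictions, full_dataset, run_id, client)
print(report_path)  # <model_name>.<run_id>.json in the current working directory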