diff --git a/index.html b/index.html index fbe3a65..57f514d 100644 --- a/index.html +++ b/index.html @@ -101,6 +101,7 @@

Leaderboard

Date
Logs
Trajs
+
Site
Verified?
Open?
@@ -108,7 +109,13 @@

Leaderboard

-

Amazon Q Developer Agent (v20240430-dev)

+ +

+ 🥇 + + Amazon Q Developer Agent (v20240430-dev) +

+

13.82

2024-05-09

@@ -123,12 +130,25 @@

Leaderboard

-

-

-

+ +

+ + 🔗 + +

+ +

✘

+

✘

-

SWE-agent + GPT 4

+ +

+ 🥈 + + SWE-agent + GPT 4 +

+

12.47

2024-04-02

@@ -141,7 +161,14 @@

Leaderboard

- 🔗 + 🔗 + +

+ + +

+ + 🔗

@@ -150,7 +177,13 @@

Leaderboard

-

SWE-agent + Claude 3 Opus

+ +

+ 🥉 + + SWE-agent + Claude 3 Opus +

+

10.51

2024-04-02

@@ -163,16 +196,26 @@

Leaderboard

- 🔗 + 🔗

+ +

+ - +

+

✓

✓

-

RAG + Claude 3 Opus

+ +

+ + RAG + Claude 3 Opus +

+

3.79

2024-04-02

@@ -187,12 +230,24 @@

Leaderboard

-

+ +

+ + 🔗 + +

+

✓

✓

-

RAG + Claude 2

+ +

+ + RAG + Claude 2 +

+

1.96

2023-10-10

@@ -207,12 +262,22 @@

Leaderboard

-

+ +

+ - +

+

✓

✓

-

RAG + GPT 4

+ +

+ + RAG + GPT 4 +

+

1.31

2024-04-02

@@ -227,12 +292,22 @@

Leaderboard

-

+ +

+ - +

+

✓

✓

-

RAG + SWE-Llama 13B

+ +

+ + RAG + SWE-Llama 13B +

+

0.70

2023-10-10

@@ -247,12 +322,22 @@

Leaderboard

-

+ +

+ - +

+

✓

✓

-

RAG + SWE-Llama 7B

+ +

+ + RAG + SWE-Llama 7B +

+

0.70

2023-10-10

@@ -267,12 +352,22 @@

Leaderboard

-

+ +

+ - +

+

✓

✓

-

RAG + ChatGPT 3.5

+ +

+ + RAG + ChatGPT 3.5 +

+

0.17

2023-10-10

@@ -287,6 +382,11 @@

Leaderboard

-

+ +

+ - +

+

✓

✓

@@ -326,6 +426,7 @@

Leaderboard (Lite)

Date
Logs
Trajs
+
Site
Verified?
Open?
@@ -333,13 +434,19 @@

Leaderboard (Lite)

-

Aider

+ +

+ 🥇 + + Aider +

+

26.33

2024-05-23

- 🔗 + 🔗

@@ -348,18 +455,31 @@

Leaderboard (Lite)

-

-

-

+ +

+ + 🔗 + +

+ +

✘

+

✓

-

OpenCSG StarShip CodeGenAgent

+ +

+ 🥈 + + OpenCSG StarShip CodeGenAgent +

+

23.67

2024-05-24

- 🔗 + 🔗

@@ -368,18 +488,31 @@

Leaderboard (Lite)

-

-

-

+ +

+ + 🔗 + +

+ +

✘

+

✘

-

Amazon Q Developer Agent (v20240430-dev)

+ +

+ 🥉 + + Amazon Q Developer Agent (v20240430-dev) +

+

20.33

2024-05-09

- 🔗 + 🔗

@@ -388,25 +521,44 @@

Leaderboard (Lite)

-

-

-

+ +

+ + 🔗 + +

+ +

✘

+

✘

-

SWE-agent + GPT 4

+ +

+ + SWE-agent + GPT 4 +

+

18.00

2024-04-02

- 🔗 + 🔗

- 🔗 + 🔗 + +

+ + +

+ + 🔗

@@ -415,35 +567,50 @@

Leaderboard (Lite)

-

SWE-agent + Claude 3 Opus

+ +

+ + SWE-agent + Claude 3 Opus +

+

11.67

2024-04-02

- 🔗 + 🔗

- 🔗 + 🔗

+ +

+ - +

+

✓

✓

-

RAG + Claude 3 Opus

+ +

+ + RAG + Claude 3 Opus +

+

4.33

2024-04-02

- 🔗 + 🔗

@@ -452,18 +619,30 @@

Leaderboard (Lite)

-

+ +

+ + 🔗 + +

+

✓

✓

-

RAG + Claude 2

+ +

+ + RAG + Claude 2 +

+

3.00

2023-10-10

- 🔗 + 🔗

@@ -472,18 +651,28 @@

Leaderboard (Lite)

-

+ +

+ - +

+

✓

✓

-

RAG + GPT 4

+ +

+ + RAG + GPT 4 +

+

2.67

2024-04-02

- 🔗 + 🔗

@@ -492,18 +681,28 @@

Leaderboard (Lite)

-

+ +

+ - +

+

✓

✓

-

RAG + SWE-Llama 7B

+ +

+ + RAG + SWE-Llama 7B +

+

1.33

2023-10-10

- 🔗 + 🔗

@@ -512,18 +711,28 @@

Leaderboard (Lite)

-

+ +

+ - +

+

✓

✓

-

RAG + SWE-Llama 13B

+ +

+ + RAG + SWE-Llama 13B +

+

1.00

2023-10-10

- 🔗 + 🔗

@@ -532,18 +741,28 @@

Leaderboard (Lite)

-

+ +

+ - +

+

✓

✓

-

RAG + ChatGPT 3.5

+ +

+ + RAG + ChatGPT 3.5 +

+

0.33

2023-10-10

- 🔗 + 🔗

@@ -552,6 +771,11 @@

Leaderboard (Lite)

-

+ +

+ - +

+

✓

✓

diff --git a/template/data.json b/template/data.json index 20ade4b..d26b9b4 100644 --- a/template/data.json +++ b/template/data.json @@ -4,14 +4,16 @@ "name": "Amazon Q Developer Agent (v20240430-dev)", "resolved": "13.82", "date": "2024-05-09", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240509_amazon-q-developer-agent-20240430-dev/logs" + "logs": "test/20240509_amazon-q-developer-agent-20240430-dev/logs", + "site": "https://aws.amazon.com/q/developer/" }, { "name": "SWE-agent + GPT 4", "resolved": "12.47", "date": "2024-04-02", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_sweagent_gpt4/logs", - "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_sweagent_gpt4/trajs", + "logs": "test/20240402_sweagent_gpt4/logs", + "trajs": "test/20240402_sweagent_gpt4/trajs", + "site": "https://swe-agent.com/", "verified": true, "oss": true }, @@ -19,8 +21,8 @@ "name": "SWE-agent + Claude 3 Opus", "resolved": "10.51", "date": "2024-04-02", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_sweagent_claude3opus/logs", - "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_sweagent_claude3opus/trajs", + "logs": "test/20240402_sweagent_claude3opus/logs", + "trajs": "test/20240402_sweagent_claude3opus/trajs", "verified": true, "oss": true }, @@ -28,7 +30,8 @@ "name": "RAG + Claude 3 Opus", "resolved": "3.79", "date": "2024-04-02", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_rag_claude3opus/logs", + "logs": "test/20240402_rag_claude3opus/logs", + "site": "https://github.com/princeton-nlp/SWE-bench/tree/main/inference", "verified": true, "oss": true }, @@ -36,7 +39,7 @@ "name": "RAG + Claude 2", "resolved": "1.96", "date": "2023-10-10", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20231010_rag_claude2/logs", + "logs": "test/20231010_rag_claude2/logs", "verified": true, "oss": true }, @@ -44,7 +47,7 @@ "name": "RAG + GPT 4", "resolved": "1.31", "date": "2024-04-02", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_rag_gpt4/logs", + "logs": "test/20240402_rag_gpt4/logs", "verified": true, "oss": true }, @@ -52,7 +55,7 @@ "name": "RAG + SWE-Llama 13B", "resolved": "0.70", "date": "2023-10-10", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20231010_rag_swellama13b/logs", + "logs": "test/20231010_rag_swellama13b/logs", "verified": true, "oss": true }, @@ -60,7 +63,7 @@ "name": "RAG + SWE-Llama 7B", "resolved": "0.70", "date": "2023-10-10", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20231010_rag_swellama7b/logs", + "logs": "test/20231010_rag_swellama7b/logs", "verified": true, "oss": true }, @@ -68,7 +71,7 @@ "name": "RAG + ChatGPT 3.5", "resolved": "0.17", "date": "2023-10-10", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20231010_rag_gpt35/logs", + "logs": "test/20231010_rag_gpt35/logs", "verified": true, "oss": true } @@ -78,34 +81,39 @@ "name": "Aider", "resolved": "26.33", "date": "2024-05-23", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240523_aider/logs" + "logs": "lite/20240523_aider/logs", + "site": "https://github.com/paul-gauthier/aider", + "oss": true }, { "name": "OpenCSG StarShip CodeGenAgent", "resolved": "23.67", "date": "2024-05-24", - "logs": "https://github.com/swe-bench/experiemnts/tree/main/evaluation/lite/20240524_20240524_opencsg_starship_gpt4/logs" + "logs": "lite/20240524_20240524_opencsg_starship_gpt4/logs", + "site": "https://opencsg.com/product?class=StarShip" }, { "name": "Amazon Q Developer Agent (v20240430-dev)", "resolved": "20.33", "date": "2024-05-09", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240509_amazon-q-developer-agent-20240430-dev/logs" + "logs": "lite/20240509_amazon-q-developer-agent-20240430-dev/logs", + "site": "https://aws.amazon.com/q/developer/" }, { "name": "SWE-agent + GPT 4", "resolved": "18.00", "date": "2024-04-02", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_sweagent_gpt4/logs", - "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_sweagent_gpt4/trajs", + "logs": "lite/20240402_sweagent_gpt4/logs", + "trajs": "lite/20240402_sweagent_gpt4/trajs", + "site": "https://swe-agent.com/", "verified": true, "oss": true }, { "name": "SWE-agent + Claude 3 Opus", "resolved": "11.67", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_sweagent_claude3opus/logs", - "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_sweagent_claude3opus/trajs", + "logs": "lite/20240402_sweagent_claude3opus/logs", + "trajs": "lite/20240402_sweagent_claude3opus/trajs", "date": "2024-04-02", "verified": true, "oss": true @@ -114,7 +122,8 @@ "name": "RAG + Claude 3 Opus", "resolved": "4.33", "date": "2024-04-02", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_rag_claude3opus/logs", + "logs": "lite/20240402_rag_claude3opus/logs", + "site": "https://github.com/princeton-nlp/SWE-bench/tree/main/inference", "verified": true, "oss": true }, @@ -122,7 +131,7 @@ "name": "RAG + Claude 2", "resolved": "3.00", "date": "2023-10-10", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20231010_rag_claude2/logs", + "logs": "lite/20231010_rag_claude2/logs", "verified": true, "oss": true }, @@ -130,7 +139,7 @@ "name": "RAG + GPT 4", "resolved": "2.67", "date": "2024-04-02", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_rag_gpt4/logs", + "logs": "lite/20240402_rag_gpt4/logs", "verified": true, "oss": true }, @@ -138,7 +147,7 @@ "name": "RAG + SWE-Llama 7B", "resolved": "1.33", "date": "2023-10-10", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20231010_rag_swellama7b/logs", + "logs": "lite/20231010_rag_swellama7b/logs", "verified": true, "oss": true }, @@ -146,7 +155,7 @@ "name": "RAG + SWE-Llama 13B", "resolved": "1.00", "date": "2023-10-10", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20231010_rag_swellama13b/logs", + "logs": "lite/20231010_rag_swellama13b/logs", "verified": true, "oss": true }, @@ -154,7 +163,7 @@ "name": "RAG + ChatGPT 3.5", "resolved": "0.33", "date": "2023-10-10", - "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20231010_rag_gpt35/logs", + "logs": "lite/20231010_rag_gpt35/logs", "verified": true, "oss": true } diff --git a/template/template.html b/template/template.html index 3689dc2..e8ef5f4 100644 --- a/template/template.html +++ b/template/template.html @@ -101,6 +101,7 @@

Leaderboard

Date
Logs
Trajs
+
Site
Verified?
Open?
@@ -108,13 +109,21 @@

Leaderboard

{% for item in test_leaderboard %} -

{{item.name}}

+ +

+ {% if loop.index == 1 %}🥇 + {% elif loop.index == 2 %}🥈 + {% elif loop.index == 3 %}🥉 + {% endif %} + {{item.name}} +

+

{{item.resolved}}

{{item.date}}

{% if item.logs is defined %} - 🔗 + 🔗 {% else %} - {% endif %}

@@ -125,8 +134,15 @@

Leaderboard

{% else %} - {% endif %}

-

{{'✓' if item.verified is defined else ' '}}

-

{{'✓' if item.oss is defined else ' '}}

+ +

+ {% if item.site is defined %} + 🔗 + {% else %} - {% endif %} +

+ +

{{'✓' if item.verified is defined else '✘'}}

+

{{'✓' if item.oss is defined else '✘'}}

{% endfor %} @@ -164,6 +180,7 @@

Leaderboard (Lite)

Date
Logs
Trajs
+
Site
Verified?
Open?
@@ -171,7 +188,15 @@

Leaderboard (Lite)

{% for item in lite_leaderboard %} -

{{item.name}}

+ +

+ {% if loop.index == 1 %}🥇 + {% elif loop.index == 2 %}🥈 + {% elif loop.index == 3 %}🥉 + {% endif %} + {{item.name}} +

+

{{item.resolved}}

{{item.date}}

@@ -188,8 +213,15 @@

Leaderboard (Lite)

{% else %} - {% endif %}

-

{{'✓' if item.verified is defined else ' '}}

-

{{'✓' if item.oss is defined else ' '}}

+ +

+ {% if item.site is defined %} + 🔗 + {% else %} - {% endif %} +

+ +

{{'✓' if item.verified is defined else '✘'}}

+

{{'✓' if item.oss is defined else '✘'}}

{% endfor %}