diff --git a/index.html b/index.html
index fbe3a65..57f514d 100644
--- a/index.html
+++ b/index.html
@@ -101,6 +101,7 @@
- Amazon Q Developer Agent (v20240430-dev) |
+
+
+ 🥇
+
+ Amazon Q Developer Agent (v20240430-dev)
+
+ |
13.82 |
2024-05-09 |
@@ -123,12 +130,25 @@ Leaderboard
-
|
- |
- |
+
+
+
+ 🔗
+
+
+ |
+ ✘ |
+ ✘ |
- SWE-agent + GPT 4 |
+
+
+ 🥈
+
+ SWE-agent + GPT 4
+
+ |
12.47 |
2024-04-02 |
@@ -141,7 +161,14 @@ Leaderboard
|
- 🔗
+ 🔗
+
+
+ |
+
+
+
+ 🔗
|
@@ -150,7 +177,13 @@ Leaderboard
- SWE-agent + Claude 3 Opus |
+
+
+ 🥉
+
+ SWE-agent + Claude 3 Opus
+
+ |
10.51 |
2024-04-02 |
@@ -163,16 +196,26 @@ Leaderboard
|
- 🔗
+ 🔗
|
+
+
+ -
+
+ |
✓ |
✓ |
- RAG + Claude 3 Opus |
+
+
+
+ RAG + Claude 3 Opus
+
+ |
3.79 |
2024-04-02 |
@@ -187,12 +230,24 @@ Leaderboard
-
|
+
+
+
+ 🔗
+
+
+ |
✓ |
✓ |
- RAG + Claude 2 |
+
+
+
+ RAG + Claude 2
+
+ |
1.96 |
2023-10-10 |
@@ -207,12 +262,22 @@ Leaderboard
-
|
+
+
+ -
+
+ |
✓ |
✓ |
- RAG + GPT 4 |
+
+
+
+ RAG + GPT 4
+
+ |
1.31 |
2024-04-02 |
@@ -227,12 +292,22 @@ Leaderboard
-
|
+
+
+ -
+
+ |
✓ |
✓ |
- RAG + SWE-Llama 13B |
+
+
+
+ RAG + SWE-Llama 13B
+
+ |
0.70 |
2023-10-10 |
@@ -247,12 +322,22 @@ Leaderboard
-
|
+
+
+ -
+
+ |
✓ |
✓ |
- RAG + SWE-Llama 7B |
+
+
+
+ RAG + SWE-Llama 7B
+
+ |
0.70 |
2023-10-10 |
@@ -267,12 +352,22 @@ Leaderboard
-
|
+
+
+ -
+
+ |
✓ |
✓ |
- RAG + ChatGPT 3.5 |
+
+
+
+ RAG + ChatGPT 3.5
+
+ |
0.17 |
2023-10-10 |
@@ -287,6 +382,11 @@ Leaderboard
-
|
+
+
+ -
+
+ |
✓ |
✓ |
@@ -326,6 +426,7 @@ Leaderboard (Lite)
|
|
|
+ |
|
|
@@ -333,13 +434,19 @@ Leaderboard (Lite)
- Aider |
+
+
+ 🥇
+
+ Aider
+
+ |
26.33 |
2024-05-23 |
- 🔗
+ 🔗
|
@@ -348,18 +455,31 @@ Leaderboard (Lite)
-
- |
- |
+
+
+
+ 🔗
+
+
+ |
+ ✘ |
+ ✓ |
- OpenCSG StarShip CodeGenAgent |
+
+
+ 🥈
+
+ OpenCSG StarShip CodeGenAgent
+
+ |
23.67 |
2024-05-24 |
- 🔗
+ 🔗
|
@@ -368,18 +488,31 @@ Leaderboard (Lite)
-
- |
- |
+
+
+
+ 🔗
+
+
+ |
+ ✘ |
+ ✘ |
- Amazon Q Developer Agent (v20240430-dev) |
+
+
+ 🥉
+
+ Amazon Q Developer Agent (v20240430-dev)
+
+ |
20.33 |
2024-05-09 |
- 🔗
+ 🔗
|
@@ -388,25 +521,44 @@ Leaderboard (Lite)
-
- |
- |
+
+
+
+ 🔗
+
+
+ |
+ ✘ |
+ ✘ |
- SWE-agent + GPT 4 |
+
+
+
+ SWE-agent + GPT 4
+
+ |
18.00 |
2024-04-02 |
- 🔗
+ 🔗
|
- 🔗
+ 🔗
+
+
+ |
+
+
+
+ 🔗
|
@@ -415,35 +567,50 @@ Leaderboard (Lite)
- SWE-agent + Claude 3 Opus |
+
+
+
+ SWE-agent + Claude 3 Opus
+
+ |
11.67 |
2024-04-02 |
- 🔗
+ 🔗
|
- 🔗
+ 🔗
|
+
+
+ -
+
+ |
✓ |
✓ |
- RAG + Claude 3 Opus |
+
+
+
+ RAG + Claude 3 Opus
+
+ |
4.33 |
2024-04-02 |
- 🔗
+ 🔗
|
@@ -452,18 +619,30 @@ Leaderboard (Lite)
-
+
+
+
+ 🔗
+
+
+ |
✓ |
✓ |
- RAG + Claude 2 |
+
+
+
+ RAG + Claude 2
+
+ |
3.00 |
2023-10-10 |
- 🔗
+ 🔗
|
@@ -472,18 +651,28 @@ Leaderboard (Lite)
-
+
+
+ -
+
+ |
✓ |
✓ |
- RAG + GPT 4 |
+
+
+
+ RAG + GPT 4
+
+ |
2.67 |
2024-04-02 |
- 🔗
+ 🔗
|
@@ -492,18 +681,28 @@ Leaderboard (Lite)
-
+
+
+ -
+
+ |
✓ |
✓ |
- RAG + SWE-Llama 7B |
+
+
+
+ RAG + SWE-Llama 7B
+
+ |
1.33 |
2023-10-10 |
- 🔗
+ 🔗
|
@@ -512,18 +711,28 @@ Leaderboard (Lite)
-
+
+
+ -
+
+ |
✓ |
✓ |
- RAG + SWE-Llama 13B |
+
+
+
+ RAG + SWE-Llama 13B
+
+ |
1.00 |
2023-10-10 |
- 🔗
+ 🔗
|
@@ -532,18 +741,28 @@ Leaderboard (Lite)
-
+
+
+ -
+
+ |
✓ |
✓ |
- RAG + ChatGPT 3.5 |
+
+
+
+ RAG + ChatGPT 3.5
+
+ |
0.33 |
2023-10-10 |
- 🔗
+ 🔗
|
@@ -552,6 +771,11 @@ Leaderboard (Lite)
-
+
+
+ -
+
+ |
✓ |
✓ |
diff --git a/template/data.json b/template/data.json
index 20ade4b..d26b9b4 100644
--- a/template/data.json
+++ b/template/data.json
@@ -4,14 +4,16 @@
"name": "Amazon Q Developer Agent (v20240430-dev)",
"resolved": "13.82",
"date": "2024-05-09",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240509_amazon-q-developer-agent-20240430-dev/logs"
+ "logs": "test/20240509_amazon-q-developer-agent-20240430-dev/logs",
+ "site": "https://aws.amazon.com/q/developer/"
},
{
"name": "SWE-agent + GPT 4",
"resolved": "12.47",
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_sweagent_gpt4/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_sweagent_gpt4/trajs",
+ "logs": "test/20240402_sweagent_gpt4/logs",
+ "trajs": "test/20240402_sweagent_gpt4/trajs",
+ "site": "https://swe-agent.com/",
"verified": true,
"oss": true
},
@@ -19,8 +21,8 @@
"name": "SWE-agent + Claude 3 Opus",
"resolved": "10.51",
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_sweagent_claude3opus/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_sweagent_claude3opus/trajs",
+ "logs": "test/20240402_sweagent_claude3opus/logs",
+ "trajs": "test/20240402_sweagent_claude3opus/trajs",
"verified": true,
"oss": true
},
@@ -28,7 +30,8 @@
"name": "RAG + Claude 3 Opus",
"resolved": "3.79",
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_rag_claude3opus/logs",
+ "logs": "test/20240402_rag_claude3opus/logs",
+ "site": "https://github.com/princeton-nlp/SWE-bench/tree/main/inference",
"verified": true,
"oss": true
},
@@ -36,7 +39,7 @@
"name": "RAG + Claude 2",
"resolved": "1.96",
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20231010_rag_claude2/logs",
+ "logs": "test/20231010_rag_claude2/logs",
"verified": true,
"oss": true
},
@@ -44,7 +47,7 @@
"name": "RAG + GPT 4",
"resolved": "1.31",
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_rag_gpt4/logs",
+ "logs": "test/20240402_rag_gpt4/logs",
"verified": true,
"oss": true
},
@@ -52,7 +55,7 @@
"name": "RAG + SWE-Llama 13B",
"resolved": "0.70",
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20231010_rag_swellama13b/logs",
+ "logs": "test/20231010_rag_swellama13b/logs",
"verified": true,
"oss": true
},
@@ -60,7 +63,7 @@
"name": "RAG + SWE-Llama 7B",
"resolved": "0.70",
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20231010_rag_swellama7b/logs",
+ "logs": "test/20231010_rag_swellama7b/logs",
"verified": true,
"oss": true
},
@@ -68,7 +71,7 @@
"name": "RAG + ChatGPT 3.5",
"resolved": "0.17",
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20231010_rag_gpt35/logs",
+ "logs": "test/20231010_rag_gpt35/logs",
"verified": true,
"oss": true
}
@@ -78,34 +81,39 @@
"name": "Aider",
"resolved": "26.33",
"date": "2024-05-23",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240523_aider/logs"
+ "logs": "lite/20240523_aider/logs",
+ "site": "https://github.com/paul-gauthier/aider",
+ "oss": true
},
{
"name": "OpenCSG StarShip CodeGenAgent",
"resolved": "23.67",
"date": "2024-05-24",
- "logs": "https://github.com/swe-bench/experiemnts/tree/main/evaluation/lite/20240524_20240524_opencsg_starship_gpt4/logs"
+ "logs": "lite/20240524_20240524_opencsg_starship_gpt4/logs",
+ "site": "https://opencsg.com/product?class=StarShip"
},
{
"name": "Amazon Q Developer Agent (v20240430-dev)",
"resolved": "20.33",
"date": "2024-05-09",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240509_amazon-q-developer-agent-20240430-dev/logs"
+ "logs": "lite/20240509_amazon-q-developer-agent-20240430-dev/logs",
+ "site": "https://aws.amazon.com/q/developer/"
},
{
"name": "SWE-agent + GPT 4",
"resolved": "18.00",
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_sweagent_gpt4/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_sweagent_gpt4/trajs",
+ "logs": "lite/20240402_sweagent_gpt4/logs",
+ "trajs": "lite/20240402_sweagent_gpt4/trajs",
+ "site": "https://swe-agent.com/",
"verified": true,
"oss": true
},
{
"name": "SWE-agent + Claude 3 Opus",
"resolved": "11.67",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_sweagent_claude3opus/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_sweagent_claude3opus/trajs",
+ "logs": "lite/20240402_sweagent_claude3opus/logs",
+ "trajs": "lite/20240402_sweagent_claude3opus/trajs",
"date": "2024-04-02",
"verified": true,
"oss": true
@@ -114,7 +122,8 @@
"name": "RAG + Claude 3 Opus",
"resolved": "4.33",
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_rag_claude3opus/logs",
+ "logs": "lite/20240402_rag_claude3opus/logs",
+ "site": "https://github.com/princeton-nlp/SWE-bench/tree/main/inference",
"verified": true,
"oss": true
},
@@ -122,7 +131,7 @@
"name": "RAG + Claude 2",
"resolved": "3.00",
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20231010_rag_claude2/logs",
+ "logs": "lite/20231010_rag_claude2/logs",
"verified": true,
"oss": true
},
@@ -130,7 +139,7 @@
"name": "RAG + GPT 4",
"resolved": "2.67",
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_rag_gpt4/logs",
+ "logs": "lite/20240402_rag_gpt4/logs",
"verified": true,
"oss": true
},
@@ -138,7 +147,7 @@
"name": "RAG + SWE-Llama 7B",
"resolved": "1.33",
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20231010_rag_swellama7b/logs",
+ "logs": "lite/20231010_rag_swellama7b/logs",
"verified": true,
"oss": true
},
@@ -146,7 +155,7 @@
"name": "RAG + SWE-Llama 13B",
"resolved": "1.00",
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20231010_rag_swellama13b/logs",
+ "logs": "lite/20231010_rag_swellama13b/logs",
"verified": true,
"oss": true
},
@@ -154,7 +163,7 @@
"name": "RAG + ChatGPT 3.5",
"resolved": "0.33",
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20231010_rag_gpt35/logs",
+ "logs": "lite/20231010_rag_gpt35/logs",
"verified": true,
"oss": true
}
diff --git a/template/template.html b/template/template.html
index 3689dc2..e8ef5f4 100644
--- a/template/template.html
+++ b/template/template.html
@@ -101,6 +101,7 @@ Leaderboard
|
|
|
+ |
|
|
@@ -108,13 +109,21 @@ Leaderboard
{% for item in lite_leaderboard %}
- {{item.name}} |
+
+
+ {% if loop.index == 1 %}🥇
+ {% elif loop.index == 2 %}🥈
+ {% elif loop.index == 3 %}🥉
+ {% endif %}
+ {{item.name}}
+
+ |
{{item.resolved}} |
{{item.date}} |
@@ -188,8 +213,15 @@ Leaderboard (Lite)
{% else %} - {% endif %}
|
- {{'✓' if item.verified is defined else ' '}} |
- {{'✓' if item.oss is defined else ' '}} |
+
+
+ {% if item.site is defined %}
+ 🔗
+ {% else %} - {% endif %}
+
+ |
+ {{'✓' if item.verified is defined else '✘'}} |
+ {{'✓' if item.oss is defined else '✘'}} |
{% endfor %}