
Reward model VLLM API upgrade #331

Merged (42 commits, Jan 28, 2025)

Changes from all commits:
- 151799d Nemotron eval map (Jul 10, 2024)
- 83fe2aa Merge branch 'main' of github.com:Kipok/NeMo-Skills into main (Jul 23, 2024)
- 2ee8067 Merge branch 'main' of github.com:Kipok/NeMo-Skills into main (Jul 25, 2024)
- 6fe71aa Merge branch 'main' of github.com:Kipok/NeMo-Skills into main (Jul 29, 2024)
- 4f125d0 Merge branch 'main' of github.com:Kipok/NeMo-Skills into main (Jul 31, 2024)
- 63dc89c Merge branch 'main' of github.com:Kipok/NeMo-Skills into main (Aug 2, 2024)
- f132827 Merging with main (Aug 3, 2024)
- 271d7c3 Merge branch 'main' of github.com:Kipok/NeMo-Skills into main (Aug 7, 2024)
- 6236437 Merge branch 'main' of github.com:Kipok/NeMo-Skills into main (Aug 8, 2024)
- c441f15 Merge branch 'main' of github.com:Kipok/NeMo-Skills into main (Aug 13, 2024)
- 6cc0c0f Merge branch 'main' of github.com:Kipok/NeMo-Skills into main (Aug 21, 2024)
- 10b06fa Merge branch 'main' of github.com:Kipok/NeMo-Skills into main (Sep 6, 2024)
- 99d23e2 Merge branch 'main' of github.com:Kipok/NeMo-Skills into main (Sep 23, 2024)
- c77a584 Merge branch 'main' of github.com:Kipok/NeMo-Skills into main (Jan 24, 2025)
- 6d4a967 Reward model updates for new VLLM API (shtoshni, Jan 24, 2025)
- 475938a Merge branch 'main' of github.com:Kipok/NeMo-Skills into main (Jan 24, 2025)
- 0d24563 Merge branch 'main' into shtoshni/vllm-upgrade (Jan 24, 2025)
- 9663b5b Add logic for ORM vs PRM (shtoshni, Jan 24, 2025)
- 2e38080 Reward model type (shtoshni, Jan 24, 2025)
- 293c384 Fixes (shtoshni, Jan 24, 2025)
- 4b28a7f Merge branch 'main' into shtoshni/vllm-upgrade (shtoshni, Jan 25, 2025)
- efe0aff Tests (shtoshni, Jan 25, 2025)
- 4aefaf8 Fixed test (shtoshni, Jan 25, 2025)
- feac055 Fixed minor error (shtoshni, Jan 25, 2025)
- 77678bd Fixing RM api (shtoshni, Jan 27, 2025)
- f7f533d Test change (shtoshni, Jan 27, 2025)
- c5e9ed0 Testing (shtoshni, Jan 27, 2025)
- 70ed6dd Testing (shtoshni, Jan 27, 2025)
- 9ca9dd7 Testing (shtoshni, Jan 27, 2025)
- 90d7c25 RM testing (shtoshni, Jan 27, 2025)
- 8fbb652 RM testing (shtoshni, Jan 27, 2025)
- 64710cf RM testing (shtoshni, Jan 27, 2025)
- 71ccce6 Reward model test update (shtoshni, Jan 27, 2025)
- c33ce27 Fixing test (shtoshni, Jan 27, 2025)
- 45bf8a8 Merge branch 'main' into shtoshni/vllm-upgrade (shtoshni, Jan 27, 2025)
- c7c453f Removing logging (shtoshni, Jan 27, 2025)
- 6a4ce67 Update tests/gpu-tests/test-local.yaml (shtoshni, Jan 28, 2025)
- 53f0d39 Update tests/gpu-tests/test-local.yaml (shtoshni, Jan 28, 2025)
- cee9618 Update tests/gpu-tests/test_reward.py (shtoshni, Jan 28, 2025)
- baecdb5 Merge branch 'main' into shtoshni/vllm-upgrade (shtoshni, Jan 28, 2025)
- 6413fe3 Merge branch 'main' into shtoshni/vllm-upgrade (shtoshni, Jan 28, 2025)
- d233bf3 Adding attention heads to avoid division error (shtoshni, Jan 28, 2025)
7 changes: 5 additions & 2 deletions nemo_skills/inference/reward_model.py
@@ -16,9 +16,11 @@
import logging
import sys
from dataclasses import field
from enum import Enum
from pathlib import Path

import hydra
import typer
from tqdm import tqdm

from nemo_skills.inference.server.code_execution_model import server_params
@@ -57,7 +59,8 @@ class RewardModelConfig:
# if > 0, will skip this many samples from the beginning of the data file.
# Useful if need to run multiple slurm jobs on the same data file
offset: int = 0

# Default reward model type
reward_model_type: str = "orm"
reward_model_score_key: str = "reward_model_score"

# can add this flag to just print the first prompt instead of running generation
@@ -93,7 +96,7 @@ def generate(cfg: RewardModelConfig):
cfg = RewardModelConfig(_init_nested=True, **cfg)

LOG.info("Config used: %s", cfg)
llm = get_reward_model(**cfg.server)
llm = get_reward_model(model_type=cfg.reward_model_type, **cfg.server)

# making sure output dir exists
Path(cfg.output_file).absolute().parent.mkdir(parents=True, exist_ok=True)
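For orientation, the plumbing added in this file is small: the Hydra config gains a reward_model_type field (defaulting to "orm"), and generate() forwards it into the server factory. A hypothetical way to drive that factory directly is sketched below; the server_type key and the host/port values are assumptions for illustration, not values taken from this PR.

from nemo_skills.inference.server.reward_model import get_reward_model

# Sketch only: "vllm" as the registry key and the host/port below are assumptions.
reward_model = get_reward_model(
    server_type="vllm",   # assumed key into the models registry
    model_type="prm",     # new argument introduced by this PR; "orm" is the default
    host="127.0.0.1",
    port="5000",
)
scores = reward_model.score(["<question and candidate solution rendered by the prompt template>"])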
54 changes: 39 additions & 15 deletions nemo_skills/inference/server/reward_model.py
@@ -13,21 +13,25 @@
# limitations under the License.

import abc
import logging
import math
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from enum import Enum

import httpx
import openai
import requests
from openai import DefaultHttpxClient, OpenAI, BadRequestError
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
from openai import BadRequestError, DefaultHttpxClient, OpenAI

LOG = logging.getLogger(__file__)


class BaseModel(abc.ABC):
"""Base model class for handling requests to the inference server.
"""Base model class for handling requests to the reward model inference server.

Args:
model_type: Reward model type
host: Optional[str] = '127.0.0.1' - Host of the inference server.
port: Optional[str] = '5000' - Port of the inference server.
Only required if handle_code_execution is True.
@@ -40,11 +44,13 @@ class BaseModel(abc.ABC):

def __init__(
self,
model_type: str,
host: str = '127.0.0.1',
port: str = '5000',
ssh_server: str | None = None,
ssh_key_path: str | None = None,
):
self.model_type = model_type
self.server_host = host
self.server_port = port
self.ssh_server = ssh_server
@@ -104,18 +110,34 @@ def __init__(self, **kwargs):
http_client=http_client,
)

# Reward models are accessed via the "pooling" interface
# https://docs.vllm.ai/en/latest/models/pooling_models.html
self.request_url = f"http://{self.server_host}:{self.server_port}/pooling"

model_list = self.oai_client.models.list()
self.model = model_list.data[0].id

def _score_single_prompt(self, prompt):
response = self.oai_client.embeddings.create(input=[prompt], model=self.model)
raw_score = response.data[0].embedding[-1]
score = 1 / (1 + math.exp(-raw_score))
response = requests.post(self.request_url, json={"input": prompt, "model": self.model})
per_token_scores = response.json()['data'][0]['data']
last_token_score = per_token_scores[-1]

score = None
if self.model_type == "orm":
# Last token's score
if isinstance(last_token_score, list):
logit_score = last_token_score[0]
else:
logit_score = last_token_score
# Normalize the score
score = 1 / (1 + math.exp(-logit_score))
elif self.model_type == "prm":
# Last token's score, a 2-entry array where the second entry is the probability of being correct
score = last_token_score[1]

return {"reward_model_score": score}

def score(self, prompts: list[str]) -> list[float]:
# TODO: The current VLLM support for Qwen-RM uses a hack of using embedding APIs.
# Once VLLM officially adds the support, change the API.

outputs = [None] * len(prompts) # Pre-allocate a list to store results in correct order
futures = {}

@@ -128,11 +150,13 @@ def score(self, prompts: list[str]) -> list[float]:
try:
outputs[idx] = future.result()
except BadRequestError as e:
error_details = e.body
error_details = e.body
error_message = error_details.get("message", "No message found")
error_code = error_details.get("code", "No code found")
error_code = error_details.get("code", "No code found")
if error_code == 400 and 'maximum context length' in error_message:
outputs[idx] = {"reward_model_score": 0} # Default value set as 0 if we have request over maximum context length
outputs[idx] = {
"reward_model_score": 0
} # Default value set as 0 if we have request over maximum context length
LOG.warning("Maximum context length exceeded, setting reward score as 0")
else:
raise
@@ -145,7 +169,7 @@
}


def get_reward_model(server_type, **kwargs):
def get_reward_model(server_type, model_type, **kwargs):
"""A helper function to make it easier to set server through cmd."""
model_class = models[server_type.lower()]
return model_class(**kwargs)
return model_class(model_type=model_type, **kwargs)
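The heart of the change is in _score_single_prompt: rather than going through the embeddings API hack noted in the removed TODO, the client now posts to vLLM's pooling endpoint and interprets the last token's output, squashing a single logit through a sigmoid for an outcome reward model (ORM) or reading the second entry of a two-way probability for a process reward model (PRM). Below is a self-contained sketch of that path for reference; it assumes a vLLM reward model server is already running and that the response layout matches the diff above (host, port, and model name are placeholders, not values fixed by this PR).

import math

import requests


def score_prompt(prompt: str, model: str, model_type: str = "orm",
                 host: str = "127.0.0.1", port: int = 5000) -> float:
    """Sketch: score one prompt via vLLM's pooling API, mirroring _score_single_prompt above."""
    response = requests.post(
        f"http://{host}:{port}/pooling",
        json={"input": prompt, "model": model},
    )
    per_token_scores = response.json()["data"][0]["data"]
    last_token_score = per_token_scores[-1]

    if model_type == "orm":
        # ORM: a single logit for the last token; normalize it with a sigmoid.
        logit = last_token_score[0] if isinstance(last_token_score, list) else last_token_score
        return 1 / (1 + math.exp(-logit))
    if model_type == "prm":
        # PRM: a 2-entry array whose second entry is the probability of being correct.
        return last_token_score[1]
    raise ValueError(f"Unknown reward model type: {model_type}")

Compared with the previous embeddings-based route, the pooling endpoint exposes per-token outputs, which is what the ORM and PRM branches above read from.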
26 changes: 16 additions & 10 deletions tests/gpu-tests/make_tiny_llm.py
@@ -19,7 +19,7 @@
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer

parser = argparse.ArgumentParser(description="Create a tiny model for testing.")
parser.add_argument("--model_type", type=str, required=True, choices=("qwen", "llama", "mistral_emb"))
parser.add_argument("--model_type", type=str, required=True, choices=("qwen", "llama", "qwen_orm"))
args = parser.parse_args()

if args.model_type == 'qwen':
@@ -28,38 +28,44 @@
hidden_dim = 56
head_dim = 2
max_position_embeddings = 256
elif args.model_type == 'mistral_emb':
model_name = "intfloat/e5-mistral-7b-instruct"
output_dir = "/tmp/nemo-skills-tests/mistral_emb/tiny-model-hf"
hidden_dim = 128
head_dim = 64
num_attention_heads = 8
elif args.model_type == 'qwen_orm':
# vLLM requires a minimum head dimension size of 32, so we use a larger value here
model_name = "Qwen/Qwen2.5-Math-RM-72B"
output_dir = "/tmp/nemo-skills-tests/qwen_orm/tiny-model-hf"
hidden_dim = 256
head_dim = 32
num_attention_heads = 8
max_position_embeddings = 2048
else:
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
output_dir = "/tmp/nemo-skills-tests/llama/tiny-model-hf"
hidden_dim = 64
head_dim = 2
max_position_embeddings = 256
num_attention_heads = 8

config = AutoConfig.from_pretrained(model_name)

config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
config.update(
dict(
hidden_size=hidden_dim,
head_dim=head_dim,
intermediate_size=hidden_dim,
num_hidden_layers=2,
max_position_embeddings=max_position_embeddings,
num_attention_heads=num_attention_heads,
)
)
print("new config", config)

if args.model_type == 'mistral_emb':
tiny_model = AutoModel.from_config(config)
if args.model_type == 'qwen_orm':
tiny_model = AutoModel.from_config(config, trust_remote_code=True)
else:
# create a tiny random model
tiny_model = AutoModelForCausalLM.from_config(config)

print(f"num of params {tiny_model.num_parameters()}")
print(f"# of params: {tiny_model.num_parameters() / 1_000_000:.1f}M")

# shrink it more and save
tiny_model.bfloat16() # half-size
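One detail from the qwen_orm branch above is worth spelling out: with hidden_size=256 and num_attention_heads=8, the per-head dimension works out to 256 / 8 = 32, which matches the explicit head_dim, meets the minimum vLLM accepts, and is presumably what the final commit ("Adding attention heads to avoid division error") guards against. A trivial check, with values copied from the script:

# Values mirror the qwen_orm settings in make_tiny_llm.py above.
hidden_dim = 256
num_attention_heads = 8
assert hidden_dim % num_attention_heads == 0    # heads split evenly, no division error
assert hidden_dim // num_attention_heads == 32  # vLLM's minimum head dimension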
8 changes: 5 additions & 3 deletions tests/gpu-tests/run_rm.sh
@@ -2,7 +2,8 @@
# also need to define HF_TOKEN for some of the tests
set -e

export NEMO_SKILLS_TEST_MODEL_TYPE=mistral_emb
# Qwen Outcome RM
export NEMO_SKILLS_TEST_MODEL_TYPE=qwen_orm;

docker run --rm \
-e HF_TOKEN=$HF_TOKEN \
@@ -11,5 +12,6 @@ docker run --rm \
igitman/nemo-skills-nemo:0.5.0 \
python /nemo_run/code/tests/gpu-tests/make_tiny_llm.py --model_type $NEMO_SKILLS_TEST_MODEL_TYPE;

export NEMO_SKILLS_TEST_HF_MODEL=/tmp/nemo-skills-tests/$NEMO_SKILLS_TEST_MODEL_TYPE/tiny-model-hf
pytest tests/gpu-tests/test_reward.py -s -x
export NEMO_SKILLS_TEST_HF_MODEL=/tmp/nemo-skills-tests/$NEMO_SKILLS_TEST_MODEL_TYPE/tiny-model-hf;
pytest tests/gpu-tests/test_reward.py -s -x;
