import difflib

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType
from trl import GRPOTrainer, GRPOConfig
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Model setup with LoRA for efficient training
def setup_model(model_name="Qwen/Qwen2.5-7B-Instruct"):
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
        padding_side="left",  # Critical for generation
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model with mixed precision
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )

    # Add LoRA adapters for efficient training
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()  # Typically <1% of total
    return model, tokenizer
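
# Optional: a minimal smoke-test sketch, assuming a GPU with enough memory for
# the 7B model. It generates one untrained completion so you can confirm the
# model/tokenizer pairing and the "Customer:/Assistant:" prompt format before
# committing to a full GRPO run.
def smoke_test(model, tokenizer, prompt="Customer: Where is my order?\nAssistant:"):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=64, do_sample=False)
    completion_ids = output_ids[0][inputs["input_ids"].shape[1]:]
    print(tokenizer.decode(completion_ids, skip_special_tokens=True))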

# Define comprehensive reward function
class CustomerServiceReward:
    def __init__(self, expected_responses=None):
        self.empathy_keywords = ['sorry', 'understand', 'help', 'happy', 'glad', 'assist']
        self.action_keywords = ['visit', 'email', 'click', 'check', 'provide']
        self.expected_responses = expected_responses or {}

    def compute_reward(self, query, response, expected=None):
        response_lower = response.lower()

        # Similarity to expected response (if available)
        similarity_reward = 0.0
        if expected:
            similarity_reward = difflib.SequenceMatcher(
                None, response_lower, expected.lower()
            ).ratio()

        # Empathy score
        empathy_score = sum(1 for word in self.empathy_keywords if word in response_lower) / len(self.empathy_keywords)

        # Action-oriented score
        action_score = sum(1 for word in self.action_keywords if word in response_lower) / len(self.action_keywords)

        # Length penalty (concise but complete)
        word_count = len(response.split())
        if word_count < 10:
            length_penalty = -0.5
        elif word_count > 100:
            length_penalty = -0.3
        else:
            length_penalty = 0.0

        # Weighted combination
        total_reward = (
            0.4 * similarity_reward +
            0.3 * empathy_score +
            0.2 * action_score +
            0.1 * (1.0 + length_penalty)
        )

        return {
            "total": total_reward,
            "similarity": similarity_reward,
            "empathy": empathy_score,
            "action": action_score,
            "length_penalty": length_penalty,
        }
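
# Quick sanity check of the reward shaping above (illustrative only; the scores
# depend entirely on the keyword lists and on whether an expected response is given).
demo_scores = CustomerServiceReward().compute_reward(
    query="Where is my order?",
    response="I'm sorry for the delay. Please check your email for the tracking link, and I'm happy to help further.",
)
print(demo_scores)  # empathy/action keywords hit, no length penalty, similarity 0.0 (no expected response)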

# Configure GRPO training
training_config = GRPOConfig(
    output_dir="./checkpoints/grpo",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # Effective batch size = 8
    learning_rate=1e-5,
    num_train_epochs=1,
    max_grad_norm=0.5,
    warmup_steps=50,
    fp16=True,  # Mixed precision training
    logging_steps=10,
    save_steps=100,
    eval_strategy="steps",  # Needed so eval_steps actually triggers evaluation
    eval_steps=50,
    # GRPO-specific parameters
    beta=0.0,  # KL penalty coefficient (0 disables the KL term)
    num_generations=4,  # Responses sampled per prompt; the generation batch must be divisible by this
    num_iterations=1,
    # Generation parameters
    max_prompt_length=128,
    max_completion_length=128,
    temperature=0.7,
    top_p=0.9,
    # Reproducibility
    seed=42,
)

# Train with proper data splitting
def train_customer_service_model(data, eval_split=0.1):
    # Load the base model with LoRA adapters and its tokenizer
    model, tokenizer = setup_model()

    # Split data
    train_data, eval_data = train_test_split(
        data,
        test_size=eval_split,
        random_state=42,
        stratify=[d['task_type'] for d in data],  # Maintain task distribution
    )

    # Create prompt-only datasets; GRPO samples the completions itself
    train_dataset = Dataset.from_list([
        {"prompt": f"Customer: {d['query']}\nAssistant:"}
        for d in train_data
    ])
    eval_dataset = Dataset.from_list([
        {"prompt": f"Customer: {d['query']}\nAssistant:"}
        for d in eval_data
    ])

    # Setup reward function with expected responses, keyed by prompt
    reward_model = CustomerServiceReward({
        f"Customer: {d['query']}\nAssistant:": d['expected_response']
        for d in train_data
    })

    def reward_fn(prompts, completions, **kwargs):
        # TRL passes prompts/completions (plus any extra dataset columns) as keyword arguments
        rewards = []
        for prompt, completion in zip(prompts, completions):
            expected = reward_model.expected_responses.get(prompt)
            reward_dict = reward_model.compute_reward(
                prompt.split("Customer: ")[1].split("\nAssistant:")[0],
                completion,
                expected,
            )
            rewards.append(reward_dict["total"])
        return rewards

    # Initialize trainer
    trainer = GRPOTrainer(
        model=model,
        args=training_config,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        reward_funcs=reward_fn,
        processing_class=tokenizer,  # Reuse the tokenizer configured in setup_model
    )

    # Train
    trainer.train()
    return trainer.model
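
# Example invocation with illustrative records. The field names ('query',
# 'expected_response', 'task_type') are the ones assumed by the helpers above;
# in practice you would load a few hundred real support transcripts, since the
# stratified split needs multiple examples per task type.
if __name__ == "__main__":
    support_data = [
        {
            "query": "Where is my order?",
            "expected_response": "I'm sorry for the delay! Please check your email for the tracking link.",
            "task_type": "shipping",
        },
        {
            "query": "How do I reset my password?",
            "expected_response": "Happy to help! Visit the account page and click 'Forgot password'.",
            "task_type": "account",
        },
        # ... more records per task type ...
    ]
    trained_model = train_customer_service_model(support_data)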