# upload_model/Inference.py
import argparse
import datetime
import json
import time

import torch
from vllm import LLM, SamplingParams
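
# Example invocations (illustrative; the model path and prompt file are placeholders, not part of this repo):
#   python Inference.py --model_path /path/to/finetuned-qwen --mode single --prompt "Hello"
#   python Inference.py --model_path /path/to/finetuned-qwen --mode chat --gpu_count 2
#   python Inference.py --model_path /path/to/finetuned-qwen --mode batch --prompt_file prompts.json
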
def setup_model(model_path, tensor_parallel_size=None, dtype="bfloat16", gpu_memory_utilization=0.85):
"""
Initialize the fine-tuned Qwen-2.5-7B model from a local path with explicit GPU configuration.
Args:
model_path: Path to the directory containing the trained model
tensor_parallel_size: Number of GPUs to use for tensor parallelism (None means auto-detect)
dtype: Data type for model weights (bfloat16, float16, or float32)
gpu_memory_utilization: Fraction of GPU memory to use (0.0 to 1.0)
"""
print(f"Loading fine-tuned Qwen model from: {model_path}")
print(f"GPU configuration: tensor_parallel_size={tensor_parallel_size}, dtype={dtype}, "
f"gpu_memory_utilization={gpu_memory_utilization}")
# Initialize the model with VLLM using GPU settings
llm = LLM(
model=model_path,
trust_remote_code=True,
tensor_parallel_size=tensor_parallel_size, # Number of GPUs to use
dtype=dtype, # Data type for model weights
gpu_memory_utilization=gpu_memory_utilization, # Memory usage per GPU
enforce_eager=False, # Set to True if you encounter CUDA issues
# max_model_len=8192, # Uncomment if you need longer context
)
print("Model loaded successfully!")
return llm
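
# Example call (illustrative; the path and GPU count are placeholders):
#   llm = setup_model("/path/to/finetuned-qwen", tensor_parallel_size=2, gpu_memory_utilization=0.8)
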
def generate_response(llm, prompt, temperature=0.7, max_tokens=512, top_p=0.9):
"""Generate a response for a given prompt."""
sampling_params = SamplingParams(
temperature=temperature,
top_p=top_p,
max_tokens=max_tokens
)
outputs = llm.generate([prompt], sampling_params)
return outputs[0].outputs[0].text
def chat_completion(llm, messages, temperature=0.7, max_tokens=512):
"""Generate a chat completion from messages."""
sampling_params = SamplingParams(
temperature=temperature,
top_p=0.9,
max_tokens=max_tokens
)
# Convert messages to a prompt using the model's chat template
tokenizer = llm.get_tokenizer()
if hasattr(tokenizer, "apply_chat_template"):
# For newer transformers versions
prompt = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
else:
# Fallback for models without chat template
prompt = format_messages_manually(messages)
outputs = llm.generate([prompt], sampling_params)
return outputs[0].outputs[0].text
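
# Minimal usage sketch (illustrative): messages follow the role/content schema consumed above.
#   reply = chat_completion(llm, [{"role": "user", "content": "Hello"}])
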
def format_messages_manually(messages):
"""Format messages manually if chat template is not available."""
formatted_prompt = ""
for message in messages:
role = message["role"]
content = message["content"]
if role == "system":
formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
elif role == "user":
formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
elif role == "assistant":
formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
formatted_prompt += "<|im_start|>assistant\n"
return formatted_prompt
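
# For example, [{"role": "user", "content": "Hi"}] renders in Qwen's ChatML style as:
#   <|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n
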
def batch_inference(llm, prompts, temperature=0.7, max_tokens=512):
"""Run batch inference on multiple prompts."""
sampling_params = SamplingParams(
temperature=temperature,
top_p=0.9,
max_tokens=max_tokens
)
outputs = llm.generate(prompts, sampling_params)
return [output.outputs[0].text for output in outputs]
def save_to_json(data, output_path=None):
"""Save results to a JSON file."""
if not output_path:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = f"qwen_inference_results_{timestamp}.json"
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
print(f"Results saved to: {output_path}")
return output_path
def main():
    parser = argparse.ArgumentParser(description="GPU inference with a fine-tuned Qwen model, saving results to JSON")
parser.add_argument("--model_path", required=True, help="Path to the fine-tuned model directory")
parser.add_argument("--mode", choices=["single", "chat", "batch"], default="single", help="Inference mode")
parser.add_argument("--prompt", help="Prompt for single inference mode")
parser.add_argument("--prompt_file", help="File containing prompts for batch mode (one per line)")
parser.add_argument("--output_file", help="Path to save JSON results (default: auto-generated)")
parser.add_argument("--max_tokens", type=int, default=512, help="Maximum tokens in response")
parser.add_argument("--temperature", type=float, default=0.7, help="Temperature for sampling")
parser.add_argument("--gpu_count", type=int, help="Number of GPUs to use (default: all available)")
parser.add_argument("--dtype", choices=["float16", "bfloat16", "float32"], default="bfloat16", help="Data type for weights")
parser.add_argument("--gpu_memory_utilization", type=float, default=0.85, help="GPU memory utilization (0.0-1.0)")
args = parser.parse_args()
# Initialize the model with specified GPU settings
llm = setup_model(
model_path=args.model_path,
tensor_parallel_size=args.gpu_count,
dtype=args.dtype,
gpu_memory_utilization=args.gpu_memory_utilization
)
results = {}
if args.mode == "single":
if not args.prompt:
args.prompt = input("Enter your prompt: ")
print("\nGenerating response...")
start_time = time.time()
response = generate_response(
llm,
args.prompt,
temperature=args.temperature,
max_tokens=args.max_tokens
)
end_time = time.time()
print(f"\nResponse:\n{response}")
results = {
"mode": "single",
"timestamp": datetime.datetime.now().isoformat(),
"input": args.prompt,
"output": response,
"parameters": {
"temperature": args.temperature,
"max_tokens": args.max_tokens
},
"performance": {
"time_seconds": end_time - start_time
}
}
elif args.mode == "chat":
# For chat mode, we'll save the entire conversation history
messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
results = {
"mode": "chat",
"timestamp": datetime.datetime.now().isoformat(),
"conversation": []
}
print("\nChat mode. Type 'exit' or 'quit' to end the conversation and save to JSON.\n")
while True:
user_input = input("\nYou: ")
if user_input.lower() in ["exit", "quit"]:
print("Ending conversation and saving results...")
break
messages.append({"role": "user", "content": user_input})
start_time = time.time()
response = chat_completion(
llm,
messages,
temperature=args.temperature,
max_tokens=args.max_tokens
)
end_time = time.time()
print(f"\nAssistant: {response}")
messages.append({"role": "assistant", "content": response})
# Add this exchange to results
results["conversation"].append({
"user": user_input,
"assistant": response,
"time_seconds": end_time - start_time
})
elif args.mode == "batch":
if not args.prompt_file:
print("Error: --prompt_file required for batch mode")
return
with open(args.prompt_file, 'r', encoding='utf-8') as f:
prompts = json.load(f)
print(f"Running batch inference on {len(prompts)} prompts...")
inference_results = batch_inference(
llm,
prompts,
temperature=args.temperature,
max_tokens=args.max_tokens
)
with open(args.output_file, "w") as final:
json.dump(inference_results, final)
if __name__ == "__main__":
main()