"""GPU inference with a fine-tuned Qwen-2.5-7B model using vLLM, with JSON output."""

import argparse
import datetime
import json
import time

import torch
from vllm import LLM, SamplingParams


def setup_model(model_path, tensor_parallel_size=None, dtype="bfloat16", gpu_memory_utilization=0.85):
    """
    Initialize the fine-tuned Qwen-2.5-7B model from a local path with explicit GPU configuration.

    Args:
        model_path: Path to the directory containing the trained model.
        tensor_parallel_size: Number of GPUs to use for tensor parallelism (None means use all visible GPUs).
        dtype: Data type for model weights (bfloat16, float16, or float32).
        gpu_memory_utilization: Fraction of GPU memory to use (0.0 to 1.0).
    """
    if tensor_parallel_size is None:
        # Auto-detect: use every visible GPU, falling back to 1 if none are reported.
        tensor_parallel_size = max(torch.cuda.device_count(), 1)

    print(f"Loading fine-tuned Qwen model from: {model_path}")
    print(f"GPU configuration: tensor_parallel_size={tensor_parallel_size}, dtype={dtype}, "
          f"gpu_memory_utilization={gpu_memory_utilization}")

    # Initialize the model with vLLM using the GPU settings.
    llm = LLM(
        model=model_path,
        trust_remote_code=True,
        tensor_parallel_size=tensor_parallel_size,      # Number of GPUs to use
        dtype=dtype,                                    # Data type for model weights
        gpu_memory_utilization=gpu_memory_utilization,  # Memory usage per GPU
        enforce_eager=False,                            # Set to True if you encounter CUDA issues
        # max_model_len=8192,                           # Uncomment if you need longer context
    )
    print("Model loaded successfully!")
    return llm


def generate_response(llm, prompt, temperature=0.7, max_tokens=512, top_p=0.9):
    """Generate a response for a given prompt."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    outputs = llm.generate([prompt], sampling_params)
    return outputs[0].outputs[0].text


def chat_completion(llm, messages, temperature=0.7, max_tokens=512):
    """Generate a chat completion from a list of messages."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=max_tokens,
    )

    # Convert the messages to a prompt using the model's chat template.
    tokenizer = llm.get_tokenizer()
    if hasattr(tokenizer, "apply_chat_template"):
        # For newer transformers versions
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        # Fallback for tokenizers without a chat template
        prompt = format_messages_manually(messages)

    outputs = llm.generate([prompt], sampling_params)
    return outputs[0].outputs[0].text


def format_messages_manually(messages):
    """Format messages with Qwen's ChatML markers when no chat template is available."""
    formatted_prompt = ""
    for message in messages:
        role = message["role"]
        content = message["content"]
        if role == "system":
            formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
        elif role == "user":
            formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
        elif role == "assistant":
            formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
    formatted_prompt += "<|im_start|>assistant\n"
    return formatted_prompt


def batch_inference(llm, prompts, temperature=0.7, max_tokens=512):
    """Run batch inference on multiple prompts."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=max_tokens,
    )
    outputs = llm.generate(prompts, sampling_params)
    return [output.outputs[0].text for output in outputs]


def save_to_json(data, output_path=None):
    """Save results to a JSON file, auto-generating a filename if none is given."""
    if not output_path:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = f"qwen_inference_results_{timestamp}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"Results saved to: {output_path}")
    return output_path


def main():
    parser = argparse.ArgumentParser(
        description="GPU inference with a fine-tuned Qwen model, with JSON output"
    )
    parser.add_argument("--model_path", required=True, help="Path to the fine-tuned model directory")
    parser.add_argument("--mode", choices=["single", "chat", "batch"], default="single", help="Inference mode")
    parser.add_argument("--prompt", help="Prompt for single inference mode")
    parser.add_argument("--prompt_file", help="File containing prompts for batch mode (one per line)")
    parser.add_argument("--output_file", help="Path to save JSON results (default: auto-generated)")
    parser.add_argument("--max_tokens", type=int, default=512, help="Maximum tokens in response")
    parser.add_argument("--temperature", type=float, default=0.7, help="Temperature for sampling")
    parser.add_argument("--gpu_count", type=int, help="Number of GPUs to use (default: all available)")
    parser.add_argument("--dtype", choices=["float16", "bfloat16", "float32"], default="bfloat16",
                        help="Data type for weights")
    parser.add_argument("--gpu_memory_utilization", type=float, default=0.85,
                        help="GPU memory utilization (0.0-1.0)")
    args = parser.parse_args()

    # Initialize the model with the specified GPU settings.
    llm = setup_model(
        model_path=args.model_path,
        tensor_parallel_size=args.gpu_count,
        dtype=args.dtype,
        gpu_memory_utilization=args.gpu_memory_utilization,
    )

    results = {}

    if args.mode == "single":
        if not args.prompt:
            args.prompt = input("Enter your prompt: ")

        print("\nGenerating response...")
        start_time = time.time()
        response = generate_response(
            llm,
            args.prompt,
            temperature=args.temperature,
            max_tokens=args.max_tokens,
        )
        end_time = time.time()

        print(f"\nResponse:\n{response}")
        results = {
            "mode": "single",
            "timestamp": datetime.datetime.now().isoformat(),
            "input": args.prompt,
            "output": response,
            "parameters": {
                "temperature": args.temperature,
                "max_tokens": args.max_tokens,
            },
            "performance": {
                "time_seconds": end_time - start_time,
            },
        }

    elif args.mode == "chat":
        # For chat mode, save the entire conversation history.
        messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
        results = {
            "mode": "chat",
            "timestamp": datetime.datetime.now().isoformat(),
            "conversation": [],
        }

        print("\nChat mode. Type 'exit' or 'quit' to end the conversation and save to JSON.\n")
        while True:
            user_input = input("\nYou: ")
            if user_input.lower() in ["exit", "quit"]:
                print("Ending conversation and saving results...")
                break

            messages.append({"role": "user", "content": user_input})
            start_time = time.time()
            response = chat_completion(
                llm,
                messages,
                temperature=args.temperature,
                max_tokens=args.max_tokens,
            )
            end_time = time.time()

            print(f"\nAssistant: {response}")
            messages.append({"role": "assistant", "content": response})

            # Record this exchange in the results.
            results["conversation"].append({
                "user": user_input,
                "assistant": response,
                "time_seconds": end_time - start_time,
            })

    elif args.mode == "batch":
        if not args.prompt_file:
            print("Error: --prompt_file required for batch mode")
            return

        # Read one prompt per line, skipping blank lines.
        with open(args.prompt_file, "r", encoding="utf-8") as f:
            prompts = [line.strip() for line in f if line.strip()]

        print(f"Running batch inference on {len(prompts)} prompts...")
        start_time = time.time()
        responses = batch_inference(
            llm,
            prompts,
            temperature=args.temperature,
            max_tokens=args.max_tokens,
        )
        end_time = time.time()

        results = {
            "mode": "batch",
            "timestamp": datetime.datetime.now().isoformat(),
            "parameters": {
                "temperature": args.temperature,
                "max_tokens": args.max_tokens,
            },
            "performance": {
                "time_seconds": end_time - start_time,
            },
            "results": [
                {"input": prompt, "output": response}
                for prompt, response in zip(prompts, responses)
            ],
        }

    # Save the results from any mode (auto-generates a filename if --output_file is omitted).
    save_to_json(results, args.output_file)


if __name__ == "__main__":
    main()
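
# Example invocations, assuming the script is saved as qwen_vllm_inference.py and the
# fine-tuned weights live under ./qwen2.5-7b-finetuned (both names are placeholders,
# not paths from the original project):
#
#   # Single prompt on 2 GPUs
#   python qwen_vllm_inference.py --model_path ./qwen2.5-7b-finetuned \
#       --mode single --prompt "Summarize the benefits of tensor parallelism." --gpu_count 2
#
#   # Interactive chat, saving the conversation to a chosen JSON file
#   python qwen_vllm_inference.py --model_path ./qwen2.5-7b-finetuned \
#       --mode chat --output_file chat_session.json
#
#   # Batch inference over a text file with one prompt per line
#   python qwen_vllm_inference.py --model_path ./qwen2.5-7b-finetuned \
#       --mode batch --prompt_file prompts.txt --max_tokens 256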