# upload_model/Inference.py
import argparse
import datetime
import json
import time

import torch
from vllm import LLM, SamplingParams
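
# Example invocations (illustrative; the model path and prompt file are placeholders, not part of this repo):
#   python Inference.py --model_path /path/to/finetuned-qwen --mode single --prompt "Hello"
#   python Inference.py --model_path /path/to/finetuned-qwen --mode chat --gpu_count 2
#   python Inference.py --model_path /path/to/finetuned-qwen --mode batch --prompt_file prompts.json
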
def setup_model(model_path, tensor_parallel_size=None, dtype="bfloat16", gpu_memory_utilization=0.85):
"""
Initialize the fine-tuned Qwen-2.5-7B model from a local path with explicit GPU configuration.
Args:
model_path: Path to the directory containing the trained model
tensor_parallel_size: Number of GPUs to use for tensor parallelism (None means auto-detect)
dtype: Data type for model weights (bfloat16, float16, or float32)
gpu_memory_utilization: Fraction of GPU memory to use (0.0 to 1.0)
"""
print(f"Loading fine-tuned Qwen model from: {model_path}")
print(f"GPU configuration: tensor_parallel_size={tensor_parallel_size}, dtype={dtype}, "
f"gpu_memory_utilization={gpu_memory_utilization}")
# Initialize the model with VLLM using GPU settings
llm = LLM(
model=model_path,
trust_remote_code=True,
tensor_parallel_size=tensor_parallel_size, # Number of GPUs to use
dtype=dtype, # Data type for model weights
gpu_memory_utilization=gpu_memory_utilization, # Memory usage per GPU
enforce_eager=False, # Set to True if you encounter CUDA issues
# max_model_len=8192, # Uncomment if you need longer context
)
print("Model loaded successfully!")
return llm
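
# Example call (illustrative; the path and GPU count are placeholders):
#   llm = setup_model("/path/to/finetuned-qwen", tensor_parallel_size=2, gpu_memory_utilization=0.8)
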
def generate_response(llm, prompt, temperature=0.7, max_tokens=512, top_p=0.9):
"""Generate a response for a given prompt."""
sampling_params = SamplingParams(
temperature=temperature,
top_p=top_p,
max_tokens=max_tokens
)
outputs = llm.generate([prompt], sampling_params)
return outputs[0].outputs[0].text
def chat_completion(llm, messages, temperature=0.7, max_tokens=512):
"""Generate a chat completion from messages."""
sampling_params = SamplingParams(
temperature=temperature,
top_p=0.9,
max_tokens=max_tokens
)
# Convert messages to a prompt using the model's chat template
tokenizer = llm.get_tokenizer()
if hasattr(tokenizer, "apply_chat_template"):
# For newer transformers versions
prompt = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
else:
# Fallback for models without chat template
prompt = format_messages_manually(messages)
outputs = llm.generate([prompt], sampling_params)
return outputs[0].outputs[0].text
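
# Minimal usage sketch (illustrative): messages follow the role/content schema consumed above.
#   reply = chat_completion(llm, [{"role": "user", "content": "Hello"}])
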
def format_messages_manually(messages):
"""Format messages manually if chat template is not available."""
formatted_prompt = ""
for message in messages:
role = message["role"]
content = message["content"]
if role == "system":
formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
elif role == "user":
formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
elif role == "assistant":
formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
formatted_prompt += "<|im_start|>assistant\n"
return formatted_prompt
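
# For example, [{"role": "user", "content": "Hi"}] renders in Qwen's ChatML style as:
#   <|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n
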
def batch_inference(llm, prompts, temperature=0.7, max_tokens=512):
"""Run batch inference on multiple prompts."""
sampling_params = SamplingParams(
temperature=temperature,
top_p=0.9,
max_tokens=max_tokens
)
outputs = llm.generate(prompts, sampling_params)
return [output.outputs[0].text for output in outputs]
def save_to_json(data, output_path=None):
"""Save results to a JSON file."""
if not output_path:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = f"qwen_inference_results_{timestamp}.json"
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
print(f"Results saved to: {output_path}")
return output_path
def main():
    parser = argparse.ArgumentParser(description="GPU inference with a fine-tuned Qwen model, saving results to JSON")
parser.add_argument("--model_path", required=True, help="Path to the fine-tuned model directory")
parser.add_argument("--mode", choices=["single", "chat", "batch"], default="single", help="Inference mode")
parser.add_argument("--prompt", help="Prompt for single inference mode")
parser.add_argument("--prompt_file", help="File containing prompts for batch mode (one per line)")
parser.add_argument("--output_file", help="Path to save JSON results (default: auto-generated)")
parser.add_argument("--max_tokens", type=int, default=512, help="Maximum tokens in response")
parser.add_argument("--temperature", type=float, default=0.7, help="Temperature for sampling")
parser.add_argument("--gpu_count", type=int, help="Number of GPUs to use (default: all available)")
parser.add_argument("--dtype", choices=["float16", "bfloat16", "float32"], default="bfloat16", help="Data type for weights")
parser.add_argument("--gpu_memory_utilization", type=float, default=0.85, help="GPU memory utilization (0.0-1.0)")
args = parser.parse_args()
# Initialize the model with specified GPU settings
llm = setup_model(
model_path=args.model_path,
tensor_parallel_size=args.gpu_count,
dtype=args.dtype,
gpu_memory_utilization=args.gpu_memory_utilization
)
results = {}
if args.mode == "single":
if not args.prompt:
args.prompt = input("Enter your prompt: ")
print("\nGenerating response...")
start_time = time.time()
response = generate_response(
llm,
args.prompt,
temperature=args.temperature,
max_tokens=args.max_tokens
)
end_time = time.time()
print(f"\nResponse:\n{response}")
results = {
"mode": "single",
"timestamp": datetime.datetime.now().isoformat(),
"input": args.prompt,
"output": response,
"parameters": {
"temperature": args.temperature,
"max_tokens": args.max_tokens
},
"performance": {
"time_seconds": end_time - start_time
}
}
elif args.mode == "chat":
# For chat mode, we'll save the entire conversation history
messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
results = {
"mode": "chat",
"timestamp": datetime.datetime.now().isoformat(),
"conversation": []
}
print("\nChat mode. Type 'exit' or 'quit' to end the conversation and save to JSON.\n")
while True:
user_input = input("\nYou: ")
if user_input.lower() in ["exit", "quit"]:
print("Ending conversation and saving results...")
break
messages.append({"role": "user", "content": user_input})
start_time = time.time()
response = chat_completion(
llm,
messages,
temperature=args.temperature,
max_tokens=args.max_tokens
)
end_time = time.time()
print(f"\nAssistant: {response}")
messages.append({"role": "assistant", "content": response})
# Add this exchange to results
results["conversation"].append({
"user": user_input,
"assistant": response,
"time_seconds": end_time - start_time
})
elif args.mode == "batch":
if not args.prompt_file:
print("Error: --prompt_file required for batch mode")
return
with open(args.prompt_file, 'r', encoding='utf-8') as f:
prompts = json.load(f)
print(f"Running batch inference on {len(prompts)} prompts...")
inference_results = batch_inference(
llm,
prompts,
temperature=args.temperature,
max_tokens=args.max_tokens
)
with open(args.output_file, "w") as final:
json.dump(inference_results, final)
if __name__ == "__main__":
main()