"""GPU inference with a fine-tuned Qwen-2.5-7B model using vLLM, with JSON output."""

import argparse
import datetime
import json
import time

import torch
from vllm import LLM, SamplingParams


def setup_model(model_path, tensor_parallel_size=None, dtype="bfloat16", gpu_memory_utilization=0.85):
    """
    Initialize the fine-tuned Qwen-2.5-7B model from a local path with explicit GPU configuration.

    Args:
        model_path: Path to the directory containing the trained model.
        tensor_parallel_size: Number of GPUs to use for tensor parallelism (None means use all visible GPUs).
        dtype: Data type for model weights (bfloat16, float16, or float32).
        gpu_memory_utilization: Fraction of GPU memory to use (0.0 to 1.0).
    """
    if tensor_parallel_size is None:
        # Auto-detect: use every visible GPU, falling back to 1 if none are reported.
        tensor_parallel_size = max(torch.cuda.device_count(), 1)

    print(f"Loading fine-tuned Qwen model from: {model_path}")
    print(f"GPU configuration: tensor_parallel_size={tensor_parallel_size}, dtype={dtype}, "
          f"gpu_memory_utilization={gpu_memory_utilization}")

    # Initialize the model with vLLM using the GPU settings.
    llm = LLM(
        model=model_path,
        trust_remote_code=True,
        tensor_parallel_size=tensor_parallel_size,      # Number of GPUs to use
        dtype=dtype,                                    # Data type for model weights
        gpu_memory_utilization=gpu_memory_utilization,  # Memory usage per GPU
        enforce_eager=False,                            # Set to True if you encounter CUDA issues
        # max_model_len=8192,                           # Uncomment if you need longer context
    )
    print("Model loaded successfully!")
    return llm


def generate_response(llm, prompt, temperature=0.7, max_tokens=512, top_p=0.9):
    """Generate a response for a given prompt."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    outputs = llm.generate([prompt], sampling_params)
    return outputs[0].outputs[0].text


def chat_completion(llm, messages, temperature=0.7, max_tokens=512):
    """Generate a chat completion from a list of messages."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=max_tokens,
    )

    # Convert the messages to a prompt using the model's chat template.
    tokenizer = llm.get_tokenizer()
    if hasattr(tokenizer, "apply_chat_template"):
        # For newer transformers versions
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        # Fallback for tokenizers without a chat template
        prompt = format_messages_manually(messages)

    outputs = llm.generate([prompt], sampling_params)
    return outputs[0].outputs[0].text


def format_messages_manually(messages):
    """Format messages with Qwen's ChatML markers when no chat template is available."""
    formatted_prompt = ""
    for message in messages:
        role = message["role"]
        content = message["content"]
        if role == "system":
            formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
        elif role == "user":
            formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
        elif role == "assistant":
            formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
    formatted_prompt += "<|im_start|>assistant\n"
    return formatted_prompt


def batch_inference(llm, prompts, temperature=0.7, max_tokens=512):
    """Run batch inference on multiple prompts."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=max_tokens,
    )
    outputs = llm.generate(prompts, sampling_params)
    return [output.outputs[0].text for output in outputs]


def save_to_json(data, output_path=None):
    """Save results to a JSON file, auto-generating a filename if none is given."""
    if not output_path:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = f"qwen_inference_results_{timestamp}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"Results saved to: {output_path}")
    return output_path


def main():
    parser = argparse.ArgumentParser(
        description="GPU inference with a fine-tuned Qwen model, with JSON output"
    )
    parser.add_argument("--model_path", required=True, help="Path to the fine-tuned model directory")
    parser.add_argument("--mode", choices=["single", "chat", "batch"], default="single", help="Inference mode")
    parser.add_argument("--prompt", help="Prompt for single inference mode")
    parser.add_argument("--prompt_file", help="File containing prompts for batch mode (one per line)")
    parser.add_argument("--output_file", help="Path to save JSON results (default: auto-generated)")
    parser.add_argument("--max_tokens", type=int, default=512, help="Maximum tokens in response")
    parser.add_argument("--temperature", type=float, default=0.7, help="Temperature for sampling")
    parser.add_argument("--gpu_count", type=int, help="Number of GPUs to use (default: all available)")
    parser.add_argument("--dtype", choices=["float16", "bfloat16", "float32"], default="bfloat16",
                        help="Data type for weights")
    parser.add_argument("--gpu_memory_utilization", type=float, default=0.85,
                        help="GPU memory utilization (0.0-1.0)")
    args = parser.parse_args()

    # Initialize the model with the specified GPU settings.
    llm = setup_model(
        model_path=args.model_path,
        tensor_parallel_size=args.gpu_count,
        dtype=args.dtype,
        gpu_memory_utilization=args.gpu_memory_utilization,
    )

    results = {}

    if args.mode == "single":
        if not args.prompt:
            args.prompt = input("Enter your prompt: ")

        print("\nGenerating response...")
        start_time = time.time()
        response = generate_response(
            llm,
            args.prompt,
            temperature=args.temperature,
            max_tokens=args.max_tokens,
        )
        end_time = time.time()

        print(f"\nResponse:\n{response}")
        results = {
            "mode": "single",
            "timestamp": datetime.datetime.now().isoformat(),
            "input": args.prompt,
            "output": response,
            "parameters": {
                "temperature": args.temperature,
                "max_tokens": args.max_tokens,
            },
            "performance": {
                "time_seconds": end_time - start_time,
            },
        }

    elif args.mode == "chat":
        # For chat mode, save the entire conversation history.
        messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
        results = {
            "mode": "chat",
            "timestamp": datetime.datetime.now().isoformat(),
            "conversation": [],
        }

        print("\nChat mode. Type 'exit' or 'quit' to end the conversation and save to JSON.\n")
        while True:
            user_input = input("\nYou: ")
            if user_input.lower() in ["exit", "quit"]:
                print("Ending conversation and saving results...")
                break

            messages.append({"role": "user", "content": user_input})
            start_time = time.time()
            response = chat_completion(
                llm,
                messages,
                temperature=args.temperature,
                max_tokens=args.max_tokens,
            )
            end_time = time.time()

            print(f"\nAssistant: {response}")
            messages.append({"role": "assistant", "content": response})

            # Record this exchange in the results.
            results["conversation"].append({
                "user": user_input,
                "assistant": response,
                "time_seconds": end_time - start_time,
            })

    elif args.mode == "batch":
        if not args.prompt_file:
            print("Error: --prompt_file required for batch mode")
            return

        # Read one prompt per line, skipping blank lines.
        with open(args.prompt_file, "r", encoding="utf-8") as f:
            prompts = [line.strip() for line in f if line.strip()]

        print(f"Running batch inference on {len(prompts)} prompts...")
        start_time = time.time()
        responses = batch_inference(
            llm,
            prompts,
            temperature=args.temperature,
            max_tokens=args.max_tokens,
        )
        end_time = time.time()

        results = {
            "mode": "batch",
            "timestamp": datetime.datetime.now().isoformat(),
            "parameters": {
                "temperature": args.temperature,
                "max_tokens": args.max_tokens,
            },
            "performance": {
                "time_seconds": end_time - start_time,
            },
            "results": [
                {"input": prompt, "output": response}
                for prompt, response in zip(prompts, responses)
            ],
        }

    # Save the results from any mode (auto-generates a filename if --output_file is omitted).
    save_to_json(results, args.output_file)


if __name__ == "__main__":
    main()
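
# Example invocations, assuming the script is saved as qwen_vllm_inference.py and the
# fine-tuned weights live under ./qwen2.5-7b-finetuned (both names are placeholders,
# not paths from the original project):
#
#   # Single prompt on 2 GPUs
#   python qwen_vllm_inference.py --model_path ./qwen2.5-7b-finetuned \
#       --mode single --prompt "Summarize the benefits of tensor parallelism." --gpu_count 2
#
#   # Interactive chat, saving the conversation to a chosen JSON file
#   python qwen_vllm_inference.py --model_path ./qwen2.5-7b-finetuned \
#       --mode chat --output_file chat_session.json
#
#   # Batch inference over a text file with one prompt per line
#   python qwen_vllm_inference.py --model_path ./qwen2.5-7b-finetuned \
#       --mode batch --prompt_file prompts.txt --max_tokens 256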