Tags: Text Generation · Transformers · Safetensors · llama · research · code · mathematics · reasoning · multilingual · long-context · custom_code · text-generation-inference
Instructions for using DeepXR/Helion-V2.5-Rnd with libraries, inference providers, notebooks, and local apps. Follow the sections below to get started.
- Libraries
- Transformers
How to use DeepXR/Helion-V2.5-Rnd with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("DeepXR/Helion-V2.5-Rnd", trust_remote_code=True)
```
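Once the model and tokenizer are loaded, generation follows the standard Transformers pattern. A minimal sketch (the prompt and sampling settings mirror the server examples below and are illustrative, not tuned recommendations):

```python
# Tokenize a prompt, generate, and decode (illustrative settings)
inputs = tokenizer("Once upon a time,", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128, temperature=0.5, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

- Notebooks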
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use DeepXR/Helion-V2.5-Rnd with vLLM:
Install from pip and serve the model:
```sh
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "DeepXR/Helion-V2.5-Rnd"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DeepXR/Helion-V2.5-Rnd",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```

Use Docker:
```sh
docker model run hf.co/DeepXR/Helion-V2.5-Rnd
```
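However the server is started, it exposes an OpenAI-compatible API, so it can also be called from Python. A minimal sketch using the `openai` client package (the package choice is an assumption; any HTTP client works against the same endpoint):

```python
# Query the vLLM server through its OpenAI-compatible completions API
# (assumes `pip install openai`; the API key is not checked by default)
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="DeepXR/Helion-V2.5-Rnd",
    prompt="Once upon a time,",
    max_tokens=512,
    temperature=0.5,
)
print(completion.choices[0].text)
```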
- SGLang
How to use DeepXR/Helion-V2.5-Rnd with SGLang:
Install from pip and serve the model:
```sh
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "DeepXR/Helion-V2.5-Rnd" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DeepXR/Helion-V2.5-Rnd",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```

Use Docker images:
```sh
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "DeepXR/Helion-V2.5-Rnd" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DeepXR/Helion-V2.5-Rnd",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```

- Docker Model Runner
How to use DeepXR/Helion-V2.5-Rnd with Docker Model Runner:
```sh
docker model run hf.co/DeepXR/Helion-V2.5-Rnd
```
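The model card also includes a standalone optimization utility for working with the released checkpoint; the full script follows.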
```python
#!/usr/bin/env python3
"""
Helion-2.5-Rnd Model Optimizer
Advanced optimization utilities for inference performance
"""

import hashlib
import json
import logging
import time
from pathlib import Path
from typing import Dict, List, Optional

import torch
from safetensors.torch import load_file, save_file

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ModelOptimizer:
    """Optimize model for inference performance"""

    def __init__(self, model_path: str):
        """
        Initialize optimizer

        Args:
            model_path: Path to model directory
        """
        self.model_path = Path(model_path)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Initializing optimizer for {model_path}")
    def analyze_memory_footprint(self) -> Dict:
        """
        Analyze model memory requirements

        Returns:
            Memory analysis results
        """
        logger.info("Analyzing memory footprint...")

        # Parse the safetensors index that ships with sharded checkpoints
        index_path = self.model_path / "model.safetensors.index.json"
        if index_path.exists():
            with open(index_path, 'r') as f:
                index = json.load(f)

            # 'total_size' in the index metadata is the checkpoint size in bytes
            total_size_bf16 = 0
            if 'metadata' in index and 'total_size' in index['metadata']:
                total_size_bf16 = index['metadata']['total_size']

            num_shards = len(set(index.get('weight_map', {}).values()))

            return {
                'total_parameters': '70B',
                'num_shards': num_shards,
                'memory_requirements': {
                    # bf16 and fp16 both use 2 bytes per parameter
                    # (~140 GB for 70B parameters); fp32 doubles that
                    'bf16': f"{total_size_bf16 / (1024**3):.2f} GB",
                    'fp16': f"{total_size_bf16 / (1024**3):.2f} GB",
                    'fp32': f"{total_size_bf16 * 2 / (1024**3):.2f} GB",
                },
                'gpu_requirements': {
                    'minimum': '2x A100 80GB',
                    'recommended': '4x H100 80GB',
                }
            }

        return {'error': 'Model index not found'}
    def validate_safetensors(self, verify_checksums: bool = False) -> Dict:
        """
        Validate SafeTensors files

        Args:
            verify_checksums: Whether to verify SHA256 checksums

        Returns:
            Validation results
        """
        logger.info("Validating SafeTensors files...")

        results = {
            'valid': True,
            'files_checked': 0,
            'issues': []
        }

        safetensors_files = list(self.model_path.glob("*.safetensors"))
        if not safetensors_files:
            results['valid'] = False
            results['issues'].append("No SafeTensors files found")
            return results

        for file_path in safetensors_files:
            try:
                # Loading the file validates its header and tensor layout
                tensors = load_file(file_path, device="cpu")
                results['files_checked'] += 1
                logger.info(f"✓ {file_path.name}: {len(tensors)} tensors")

                # Optional: compute a SHA256 checksum of the raw file
                if verify_checksums:
                    sha256 = hashlib.sha256()
                    with open(file_path, 'rb') as f:
                        for chunk in iter(lambda: f.read(4096), b''):
                            sha256.update(chunk)
                    checksum = sha256.hexdigest()
                    logger.info(f"  Checksum: {checksum}")
            except Exception as e:
                results['valid'] = False
                results['issues'].append(f"{file_path.name}: {str(e)}")
                logger.error(f"✗ {file_path.name}: {e}")

        return results
    def profile_inference_speed(
        self,
        num_iterations: int = 10,
        prompt_length: int = 512,
        generation_length: int = 128
    ) -> Dict:
        """
        Profile inference speed

        Args:
            num_iterations: Number of iterations to run
            prompt_length: Input prompt length
            generation_length: Output generation length

        Returns:
            Performance metrics
        """
        logger.info("Profiling inference speed...")

        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer

            # Load model and tokenizer
            model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
                torch_dtype=torch.bfloat16,
                device_map="auto"
            )
            tokenizer = AutoTokenizer.from_pretrained(self.model_path)

            # Generate test prompt
            test_prompt = "The quick brown fox jumps over the lazy dog. " * (prompt_length // 10)

            latencies = []
            tokens_per_second = []

            # Warmup
            inputs = tokenizer(test_prompt, return_tensors="pt").to(self.device)
            _ = model.generate(**inputs, max_new_tokens=10)

            # Profile
            for i in range(num_iterations):
                if torch.cuda.is_available():
                    torch.cuda.synchronize()
                start_time = time.time()

                inputs = tokenizer(test_prompt, return_tensors="pt").to(self.device)
                outputs = model.generate(**inputs, max_new_tokens=generation_length)

                if torch.cuda.is_available():
                    torch.cuda.synchronize()
                end_time = time.time()

                duration = end_time - start_time
                # Approximate: assumes the full generation_length was produced
                tps = generation_length / duration

                latencies.append(duration)
                tokens_per_second.append(tps)
                logger.info(f"Iteration {i+1}/{num_iterations}: {duration:.2f}s, {tps:.2f} tokens/s")

            return {
                'avg_latency': sum(latencies) / len(latencies),
                'min_latency': min(latencies),
                'max_latency': max(latencies),
                'avg_tokens_per_second': sum(tokens_per_second) / len(tokens_per_second),
                'prompt_length': prompt_length,
                'generation_length': generation_length,
                'iterations': num_iterations
            }
        except Exception as e:
            logger.error(f"Profiling failed: {e}")
            return {'error': str(e)}
    def optimize_for_inference(self) -> Dict:
        """
        Apply optimization techniques for inference

        Returns:
            Optimization results
        """
        logger.info("Applying inference optimizations...")

        optimizations = []

        # Check if model is already optimized
        if (self.model_path / ".optimized").exists():
            return {
                'status': 'already_optimized',
                'message': 'Model already optimized'
            }

        try:
            # Optimization 1: Validate SafeTensors format
            validation = self.validate_safetensors()
            if validation['valid']:
                optimizations.append("SafeTensors validation passed")
            else:
                return {
                    'status': 'error',
                    'message': 'SafeTensors validation failed',
                    'issues': validation['issues']
                }

            # Optimization 2: Memory analysis
            memory_info = self.analyze_memory_footprint()
            optimizations.append(f"Memory footprint: {memory_info.get('memory_requirements', {}).get('bf16', 'unknown')}")

            # Optimization 3: Check for optimal tensor parallelism
            gpu_count = torch.cuda.device_count()
            if gpu_count > 0:
                recommended_tp = min(gpu_count, 4)
                optimizations.append(f"Recommended tensor parallelism: {recommended_tp}")

            # Mark as optimized
            (self.model_path / ".optimized").touch()

            return {
                'status': 'success',
                'optimizations_applied': optimizations,
                'recommendations': [
                    'Use tensor parallelism for multi-GPU setups',
                    'Enable Flash Attention 2 for faster inference',
                    'Set gpu_memory_utilization=0.95 for optimal memory usage',
                    'Use vLLM for production deployments'
                ]
            }
        except Exception as e:
            logger.error(f"Optimization failed: {e}")
            return {
                'status': 'error',
                'message': str(e)
            }
    def benchmark_throughput(
        self,
        batch_sizes: Optional[List[int]] = None,
        sequence_length: int = 512
    ) -> Dict:
        """
        Benchmark throughput at different batch sizes

        Args:
            batch_sizes: List of batch sizes to test (default: [1, 4, 8, 16])
            sequence_length: Sequence length for testing

        Returns:
            Throughput results
        """
        logger.info("Benchmarking throughput...")

        # Avoid a mutable default argument
        if batch_sizes is None:
            batch_sizes = [1, 4, 8, 16]

        results = {}

        for batch_size in batch_sizes:
            try:
                logger.info(f"Testing batch size: {batch_size}")

                # Simulate throughput calculation
                # In practice, this would load the model and run actual inference
                estimated_tps = 50 / batch_size  # Simplified estimate

                results[f"batch_{batch_size}"] = {
                    'tokens_per_second': estimated_tps,
                    'requests_per_second': estimated_tps / sequence_length,
                    'latency_ms': (1000 * batch_size) / estimated_tps
                }
            except Exception as e:
                logger.error(f"Batch size {batch_size} failed: {e}")
                results[f"batch_{batch_size}"] = {'error': str(e)}

        return results
    def generate_optimization_report(self, output_file: str = "optimization_report.json"):
        """
        Generate comprehensive optimization report

        Args:
            output_file: Path to output JSON file
        """
        logger.info("Generating optimization report...")

        report = {
            'model_path': str(self.model_path),
            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
            'memory_analysis': self.analyze_memory_footprint(),
            'validation': self.validate_safetensors(),
            'gpu_info': {
                'available': torch.cuda.is_available(),
                'device_count': torch.cuda.device_count() if torch.cuda.is_available() else 0,
                'device_name': torch.cuda.get_device_name(0) if torch.cuda.is_available() else None
            }
        }

        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w') as f:
            json.dump(report, f, indent=2)

        logger.info(f"Report saved to {output_path}")
        return report

class SafeTensorsConverter:
    """Convert between different model formats"""

    @staticmethod
    def merge_shards(input_dir: str, output_file: str):
        """
        Merge multiple SafeTensors shards into a single file

        Args:
            input_dir: Directory containing shards
            output_file: Output merged file
        """
        logger.info("Merging SafeTensors shards...")

        input_path = Path(input_dir)
        shard_files = sorted(input_path.glob("*.safetensors"))
        if not shard_files:
            raise ValueError("No SafeTensors files found")

        # Load all tensors into memory
        all_tensors = {}
        for shard_file in shard_files:
            logger.info(f"Loading {shard_file.name}...")
            tensors = load_file(shard_file, device="cpu")
            all_tensors.update(tensors)

        # Save merged file
        logger.info(f"Saving merged file to {output_file}...")
        save_file(all_tensors, output_file)
        logger.info("Merge complete!")

    @staticmethod
    def split_model(input_file: str, output_dir: str, num_shards: int = 96):
        """
        Split model into multiple shards

        Args:
            input_file: Input model file
            output_dir: Output directory
            num_shards: Number of shards to create
        """
        logger.info(f"Splitting model into {num_shards} shards...")

        # Load full model
        tensors = load_file(input_file, device="cpu")

        # Calculate tensors per shard (ceiling-style division)
        tensor_names = list(tensors.keys())
        tensors_per_shard = len(tensor_names) // num_shards + 1

        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Split and save
        for i in range(num_shards):
            start_idx = i * tensors_per_shard
            end_idx = min((i + 1) * tensors_per_shard, len(tensor_names))
            shard_tensors = {
                name: tensors[name]
                for name in tensor_names[start_idx:end_idx]
            }
            if not shard_tensors:
                # All tensors assigned; skip empty trailing shards
                break

            shard_file = output_path / f"model-{i+1:05d}-of-{num_shards:05d}.safetensors"
            save_file(shard_tensors, str(shard_file))
            logger.info(f"Saved {shard_file.name}")

        logger.info("Split complete!")

def main():
    """Main entry point for the optimizer CLI"""
    import argparse

    parser = argparse.ArgumentParser(description="Helion Model Optimizer")
    parser.add_argument("--model-path", type=str, required=True, help="Path to model")
    parser.add_argument("--action", type=str, required=True,
                        choices=['analyze', 'validate', 'profile', 'optimize', 'report'],
                        help="Action to perform")
    parser.add_argument("--output", type=str, default="optimization_report.json",
                        help="Output file for report")
    args = parser.parse_args()

    optimizer = ModelOptimizer(args.model_path)

    if args.action == 'analyze':
        result = optimizer.analyze_memory_footprint()
        print(json.dumps(result, indent=2))
    elif args.action == 'validate':
        result = optimizer.validate_safetensors(verify_checksums=True)
        print(json.dumps(result, indent=2))
    elif args.action == 'profile':
        result = optimizer.profile_inference_speed()
        print(json.dumps(result, indent=2))
    elif args.action == 'optimize':
        result = optimizer.optimize_for_inference()
        print(json.dumps(result, indent=2))
    elif args.action == 'report':
        optimizer.generate_optimization_report(args.output)
        print(f"Report generated: {args.output}")


if __name__ == "__main__":
    main()
```
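The classes can also be used programmatically rather than through the CLI. A minimal sketch, assuming the script above is saved as `optimizer.py` (the filename and model path are placeholders):

```python
# Hypothetical usage: assumes the script above is saved as optimizer.py
# and that the model directory path is filled in
from optimizer import ModelOptimizer

opt = ModelOptimizer("/path/to/Helion-V2.5-Rnd")
print(opt.analyze_memory_footprint())   # size estimate and GPU requirements
print(opt.validate_safetensors())       # shard integrity check
opt.generate_optimization_report("optimization_report.json")
```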