
Getting Started

1. Get Your API Key

Subscribe to the API on RapidAPI to obtain your personal API key.

2. Authentication Headers

Include both of these headers with every request:

X-RapidAPI-Key: YOUR_RAPIDAPI_KEY
X-RapidAPI-Host: llm-tool.p.rapidapi.com
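
In Python, the same headers can be attached once to a session and reused across requests. A minimal sketch, assuming the requests library and a YOUR_RAPIDAPI_KEY placeholder:

import requests

session = requests.Session()
session.headers.update({
    "X-RapidAPI-Key": "YOUR_RAPIDAPI_KEY",  # replace with your own key
    "X-RapidAPI-Host": "llm-tool.p.rapidapi.com",
})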

3. Your First Request

POST the Hugging Face model ID you want to size to the /v1/calculate endpoint:

curl -X POST "https://llm-tool.p.rapidapi.com/v1/calculate" \
  -H "X-RapidAPI-Key: YOUR_KEY" \
  -H "X-RapidAPI-Host: llm-tool.p.rapidapi.com" \
  -H "Content-Type: application/json" \
  -d '{"hf_model": "meta-llama/Llama-3.1-8B"}'

4. Result

The API responds with the estimated hardware requirements and a recommended vLLM parameter set:

{
  "model_family": "llama",
  "model_architecture": "LlamaForCausalLM",
  "estimated_parameters": "8B",
  "is_mixture_of_experts": false,
  "gpu_requirements": {
    "recommended_gpu_count": 1,
    "gpu_type": "L40",
    "memory_per_gpu_gb": 48,
    "total_vram_required_gb": 15,
    "cpu_cores": 12,
    "ram_gb": 128
  },
  "vllm_parameters": {
    "core": {
      "model": "meta-llama/Llama-3.1-8B",
      "tokenizer": "meta-llama/Llama-3.1-8B",
      "reasoning_parser": null,
      "dtype": "auto",
      "load_format": "auto",
      "quantization": null
    },
    "parallel": {
      "tensor_parallel_size": 1,
      "pipeline_parallel_size": 1
    },
    "memory": {
      "gpu_memory_utilization": 0.85,
      "max_model_len": 2048,
      "max_num_seqs": 8,
      "max_num_batched_tokens": 4096,
      "block_size": 16,
      "cpu_offload_gb": 0
    },
    "performance": {
      "enforce_eager": false,
      "enable_prefix_caching": false,
      "enable_chunked_prefill": false,
      "disable_frontend_multiprocessing": false,
      "disable_custom_all_reduce": false
    },
    "features": {
      "trust_remote_code": false,
      "multimodal": false,
      "enable_lora": true,
      "max_loras": 1,
      "max_lora_rank": 16,
      "max_cpu_loras": 2,
      "allow_runtime_lora_updating": false,
      "lora_modules": null,
      "chat_template": null
    }
  },
  "deployment_configuration": {
    "container_disk_gb": 50,
    "deployment_complexity": "low"
  },
  "notes": [
    "Using empirical VRAM measurement: 14.5GB",
    "Selected L40 with 1x GPU configuration for optimal cost/performance"
  ],
  "confidence": "high",
  "error": null
}
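
How you consume the response is up to your deployment tooling, but as a hypothetical sketch, the vllm_parameters block maps naturally onto vLLM's serve flags. The build_vllm_command helper below is illustrative and not part of this API; the flag names follow vLLM's CLI conventions:

def build_vllm_command(result: dict) -> str:
    """Assemble a `vllm serve` command line from the API response (illustrative)."""
    core = result["vllm_parameters"]["core"]
    parallel = result["vllm_parameters"]["parallel"]
    memory = result["vllm_parameters"]["memory"]
    args = [
        "vllm", "serve", core["model"],
        "--dtype", core["dtype"],
        "--load-format", core["load_format"],
        "--tensor-parallel-size", str(parallel["tensor_parallel_size"]),
        "--pipeline-parallel-size", str(parallel["pipeline_parallel_size"]),
        "--gpu-memory-utilization", str(memory["gpu_memory_utilization"]),
        "--max-model-len", str(memory["max_model_len"]),
        "--max-num-seqs", str(memory["max_num_seqs"]),
    ]
    if core.get("quantization"):  # only add the flag when a method is set
        args += ["--quantization", core["quantization"]]
    return " ".join(args)

# With `result` parsed from the response above:
# print(build_vllm_command(result))
# -> vllm serve meta-llama/Llama-3.1-8B --dtype auto --load-format auto ...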