🔥 TurboModel
The unified model class for loading, generating, fine-tuning, and exporting.
Class Overview
class TurboModel:
"""Ultra-fast LLM with auto-configuration."""
model: PreTrainedModel # The underlying HuggingFace model
tokenizer: PreTrainedTokenizer # The tokenizer
config: SmartConfig # Auto-detected configuration
Class Methods
from_pretrained()
Load a model from HuggingFace Hub or local path.
@classmethod
def from_pretrained(
cls,
model_name: str,
config: Optional[SmartConfig] = None,
quantize: bool = True,
verbose: bool = True,
**kwargs
) -> "TurboModel"
Example:
from quantllm import TurboModel, SmartConfig
# With auto-config
model = TurboModel.from_pretrained("meta-llama/Llama-3.2-3B")
# With custom config
config = SmartConfig.detect("meta-llama/Llama-3.2-3B", bits=4)
model = TurboModel.from_pretrained("meta-llama/Llama-3.2-3B", config=config)
from_gguf()
Load a GGUF model from HuggingFace or local file.
@classmethod
def from_gguf(
cls,
repo_id_or_path: str,
filename: Optional[str] = None,
**kwargs
) -> "TurboModel"
Example:
# From HuggingFace
model = TurboModel.from_gguf(
"TheBloke/Llama-2-7B-Chat-GGUF",
filename="llama-2-7b-chat.Q4_K_M.gguf"
)
# From local file
model = TurboModel.from_gguf("./models/my-model.gguf")
list_gguf_files()
List available GGUF files in a HuggingFace repository.
@staticmethod
def list_gguf_files(repo_id: str) -> List[str]
Example:
files = TurboModel.list_gguf_files("TheBloke/Llama-2-7B-Chat-GGUF")
print(files)
# ['llama-2-7b-chat.Q2_K.gguf', 'llama-2-7b-chat.Q4_K_M.gguf', ...]
Instance Methods
generate()
Generate text from a prompt.
def generate(
self,
prompt: str,
max_new_tokens: int = 256,
temperature: float = 0.7,
top_p: float = 0.9,
top_k: int = 50,
repetition_penalty: float = 1.0,
do_sample: bool = True,
stream: bool = False,
stop_strings: Optional[List[str]] = None,
**kwargs
) -> Union[str, Generator[str, None, None]]
| Parameter | Type | Default | Description |
|---|---|---|---|
| `prompt` | str | required | Input text |
| `max_new_tokens` | int | 256 | Maximum tokens to generate |
| `temperature` | float | 0.7 | Sampling temperature (0.0-2.0) |
| `top_p` | float | 0.9 | Nucleus sampling threshold |
| `top_k` | int | 50 | Top-k sampling |
| `repetition_penalty` | float | 1.0 | Repetition penalty (1.0-1.5) |
| `stream` | bool | False | Stream tokens as generated |
| `stop_strings` | list | None | Stop generation at these strings |
Example:
# Basic generation
response = model.generate("What is AI?")
# With parameters
response = model.generate(
"Write a poem:",
max_new_tokens=200,
temperature=0.8,
top_p=0.95,
)
# Streaming
for token in model.generate("Count to 10:", stream=True):
print(token, end="", flush=True)
chat()
Chat with the model using messages format.
def chat(
self,
messages: List[Dict[str, str]],
max_new_tokens: int = 256,
stream: bool = False,
**kwargs
) -> Union[str, Generator[str, None, None]]
Messages format:
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Hello!"},
{"role": "assistant", "content": "Hi there!"},
{"role": "user", "content": "How are you?"},
]
Example:
messages = [
{"role": "system", "content": "You are a coding expert."},
{"role": "user", "content": "How do I read a file in Python?"},
]
response = model.chat(messages)
print(response)
finetune()
Fine-tune the model with LoRA.
def finetune(
self,
data: Union[str, List[Dict], Dataset],
epochs: int = 3,
batch_size: int = 4,
learning_rate: float = 2e-4,
lora_r: int = 8,
lora_alpha: int = 16,
lora_dropout: float = 0.1,
output_dir: Optional[str] = None,
hub_manager: Optional[QuantLLMHubManager] = None,
**kwargs
) -> Dict[str, Any]
| Parameter | Type | Default | Description |
|---|---|---|---|
| `data` | str/list/Dataset | required | Training data |
| `epochs` | int | 3 | Training epochs |
| `batch_size` | int | 4 | Batch size |
| `learning_rate` | float | 2e-4 | Learning rate |
| `lora_r` | int | 8 | LoRA rank |
| `lora_alpha` | int | 16 | LoRA alpha |
| `output_dir` | str | None | Save directory |
Returns: Dictionary with train_loss, epochs, output_dir.
Example:
# Simple training
result = model.finetune("data.json", epochs=3)
# Advanced
result = model.finetune(
"data.json",
epochs=5,
learning_rate=2e-4,
lora_r=16,
lora_alpha=32,
batch_size=4,
)
export()
Export the model to various formats.
def export(
self,
format: Optional[str] = None,
output_path: Optional[str] = None,
quantization: Optional[str] = None,
**kwargs
) -> str
| Parameter | Type | Description |
|---|---|---|
| `format` | str | "gguf", "onnx", "mlx", "safetensors" (optional, uses shared config) |
| `output_path` | str | Output file or directory (optional) |
| `quantization` | str | Quantization type (format-specific) |
Examples:
# GGUF
model = turbo(
"meta-llama/Llama-3.2-3B",
config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
)
model.export()
# ONNX
model.export("onnx", "./model-onnx/")
# MLX
model.export("mlx", "./model-mlx/", quantization="4bit")
# SafeTensors
model.export("safetensors", "./model-hf/")
push() / push_to_hub()
Push model to HuggingFace Hub.
def push(
self,
repo_id: str,
token: Optional[str] = None,
format: Optional[str] = None,
quantization: Optional[str] = None,
license: str = "apache-2.0",
commit_message: str = "Upload model via QuantLLM",
**kwargs
)
Example:
# Push as GGUF
model.push(
"your-username/my-model"
)
# Push as MLX
model.push(
"your-username/my-model-mlx",
format="mlx",
quantization="4bit"
)
SmartConfig
Auto-detected configuration for optimal performance.
@dataclass
class SmartConfig:
bits: int = 4
quant_type: str = "nf4"
use_flash_attention: bool = True
gradient_checkpointing: bool = False
cpu_offload: bool = False
compile_model: bool = False
batch_size: int = 4
max_seq_length: int = 4096
device: torch.device = "cuda"
dtype: torch.dtype = torch.float16
SmartConfig.detect()
Auto-detect optimal configuration.
@classmethod
def detect(
cls,
model_name: str,
bits: Optional[int] = None,
training: bool = False,
) -> SmartConfig
Example:
from quantllm import SmartConfig
config = SmartConfig.detect("meta-llama/Llama-3.2-3B")
print(f"Bits: {config.bits}")
print(f"Flash Attention: {config.use_flash_attention}")
print_summary()
Print configuration summary.
config.print_summary()
Output:
╔════════════════════════════════════════════════════╗
║               QUANTLLM CONFIGURATION               ║
╠════════════════════════════════════════════════════╣
║  📦 Quantization: 4-bit (nf4)                      ║
║  💾 Memory: CPU Offload Disabled                   ║
║  ⚡ Speed: Flash Attention Enabled                 ║
╚════════════════════════════════════════════════════╝