🔥 TurboModel

The unified model class for loading models, generating text, fine-tuning, and exporting.

Class Overview

class TurboModel:
    """Ultra-fast LLM with auto-configuration."""
    
    model: PreTrainedModel           # The underlying HuggingFace model
    tokenizer: PreTrainedTokenizer   # The tokenizer
    config: SmartConfig              # Auto-detected configuration

Class Methods

from_pretrained()

Load a model from the HuggingFace Hub or a local path.

@classmethod
def from_pretrained(
    cls,
    model_name: str,
    config: Optional[SmartConfig] = None,
    quantize: bool = True,
    verbose: bool = True,
    **kwargs
) -> "TurboModel"

Example:

from quantllm import TurboModel, SmartConfig

# With auto-config
model = TurboModel.from_pretrained("meta-llama/Llama-3.2-3B")

# With custom config
config = SmartConfig.detect("meta-llama/Llama-3.2-3B", bits=4)
model = TurboModel.from_pretrained("meta-llama/Llama-3.2-3B", config=config)

from_gguf()

Load a GGUF model from the HuggingFace Hub or a local file.

@classmethod
def from_gguf(
    cls,
    repo_id_or_path: str,
    filename: Optional[str] = None,
    **kwargs
) -> "TurboModel"

Example:

# From HuggingFace
model = TurboModel.from_gguf(
    "TheBloke/Llama-2-7B-Chat-GGUF",
    filename="llama-2-7b-chat.Q4_K_M.gguf"
)

# From local file
model = TurboModel.from_gguf("./models/my-model.gguf")

list_gguf_files()

List available GGUF files in a HuggingFace repository.

@staticmethod
def list_gguf_files(repo_id: str) -> List[str]

Example:

files = TurboModel.list_gguf_files("TheBloke/Llama-2-7B-Chat-GGUF")
print(files)
# ['llama-2-7b-chat.Q2_K.gguf', 'llama-2-7b-chat.Q4_K_M.gguf', ...]

Instance Methods

generate()

Generate text from a prompt.

def generate(
    self,
    prompt: str,
    max_new_tokens: int = 256,
    temperature: float = 0.7,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.0,
    do_sample: bool = True,
    stream: bool = False,
    stop_strings: Optional[List[str]] = None,
    **kwargs
) -> Union[str, Generator[str, None, None]]

Parameter	Type	Default	Description
`prompt`	str	required	Input text
`max_new_tokens`	int	256	Maximum tokens to generate
`temperature`	float	0.7	Sampling temperature (0.0-2.0)
`top_p`	float	0.9	Nucleus sampling threshold
`top_k`	int	50	Top-k sampling
`repetition_penalty`	float	1.0	Repetition penalty (1.0-1.5)
`do_sample`	bool	True	Use sampling instead of greedy decoding
`stream`	bool	False	Stream tokens as generated
`stop_strings`	list	None	Stop generation at these strings

Example:

# Basic generation
response = model.generate("What is AI?")

# With parameters
response = model.generate(
    "Write a poem:",
    max_new_tokens=200,
    temperature=0.8,
    top_p=0.95,
)

# Streaming
for token in model.generate("Count to 10:", stream=True):
    print(token, end="", flush=True)

chat()

Chat with the model using the messages format: a list of dictionaries, each with a `role` and a `content` key.

def chat(
    self,
    messages: List[Dict[str, str]],
    max_new_tokens: int = 256,
    stream: bool = False,
    **kwargs
) -> Union[str, Generator[str, None, None]]

Messages format:

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi there!"},
    {"role": "user", "content": "How are you?"},
]

Example:

messages = [
    {"role": "system", "content": "You are a coding expert."},
    {"role": "user", "content": "How do I read a file in Python?"},
]

response = model.chat(messages)
print(response)

finetune()

Fine-tune the model with LoRA.

def finetune(
    self,
    data: Union[str, List[Dict], Dataset],
    epochs: int = 3,
    batch_size: int = 4,
    learning_rate: float = 2e-4,
    lora_r: int = 8,
    lora_alpha: int = 16,
    lora_dropout: float = 0.1,
    output_dir: Optional[str] = None,
    hub_manager: Optional[QuantLLMHubManager] = None,
    **kwargs
) -> Dict[str, Any]

Parameter	Type	Default	Description
`data`	str/list/Dataset	required	Training data
`epochs`	int	3	Training epochs
`batch_size`	int	4	Batch size
`learning_rate`	float	2e-4	Learning rate
`lora_r`	int	8	LoRA rank
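`lora_alpha`	int	16	LoRA alpha
`lora_dropout`	float	0.1	LoRA dropout
`output_dir`	str	None	Save directory
`hub_manager`	QuantLLMHubManager	None	Hub manager instance (optional)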

Returns: A dictionary with the keys `train_loss`, `epochs`, and `output_dir`.

Example:

# Simple training
result = model.finetune("data.json", epochs=3)

# Advanced
result = model.finetune(
    "data.json",
    epochs=5,
    learning_rate=2e-4,
    lora_r=16,
    lora_alpha=32,
    batch_size=4,
)

export()

Export the model to various formats.

def export(
    self,
    format: Optional[str] = None,
    output_path: Optional[str] = None,
    quantization: Optional[str] = None,
    **kwargs
) -> str

Parameter	Type	Description
`format`	str	"gguf", "onnx", "mlx", or "safetensors" (optional; when omitted, the format from the shared config is used)
`output_path`	str	Output file or directory (optional)
`quantization`	str	Quantization type (format-specific)

Examples:

# GGUF (format and quantization are taken from the shared config)
model = turbo(
    "meta-llama/Llama-3.2-3B",
    config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
)
model.export()

# ONNX
model.export("onnx", "./model-onnx/")

# MLX
model.export("mlx", "./model-mlx/", quantization="4bit")

# SafeTensors
model.export("safetensors", "./model-hf/")

push() / push_to_hub()

Push the model to the HuggingFace Hub.

def push(
    self,
    repo_id: str,
    token: Optional[str] = None,
    format: Optional[str] = None,
    quantization: Optional[str] = None,
    license: str = "apache-2.0",
    commit_message: str = "Upload model via QuantLLM",
    **kwargs
)

Example:

# Push as GGUF (no format passed; assumes the shared config selects GGUF)
model.push(
    "your-username/my-model"
)

# Push as MLX
model.push(
    "your-username/my-model-mlx",
    format="mlx",
    quantization="4bit"
)

SmartConfig

Auto-detected configuration for optimal performance.

@dataclass
class SmartConfig:
    bits: int = 4
    quant_type: str = "nf4"
    use_flash_attention: bool = True
    gradient_checkpointing: bool = False
    cpu_offload: bool = False
    compile_model: bool = False
    batch_size: int = 4
    max_seq_length: int = 4096
    device: torch.device = "cuda"
    dtype: torch.dtype = torch.float16

SmartConfig.detect()

Auto-detect optimal configuration.

@classmethod
def detect(
    cls,
    model_name: str,
    bits: Optional[int] = None,
    training: bool = False,
) -> SmartConfig

Example:

from quantllm import SmartConfig

config = SmartConfig.detect("meta-llama/Llama-3.2-3B")
print(f"Bits: {config.bits}")
print(f"Flash Attention: {config.use_flash_attention}")

print_summary()

Print configuration summary.

config.print_summary()

Output:

╔════════════════════════════════════════════════════╗
║          QUANTLLM CONFIGURATION                    ║
╠════════════════════════════════════════════════════╣
║ 📦 Quantization: 4-bit (nf4)                       ║
║ 💾 Memory: CPU Offload Disabled                    ║
║ ⚡ Speed: Flash Attention Enabled                  ║
╚════════════════════════════════════════════════════╝

🔥 TurboModel

Class Overview

Class Methods

from_pretrained()

from_gguf()

list_gguf_files()

Instance Methods

generate()

chat()

finetune()

export()

push() / push_to_hub()

SmartConfig

SmartConfig.detect()

print_summary()

See Also