Choosing the right data types and column ordering for your tables is crucial for ClickHouse performance and storage efficiency. Poor schema design can lead to 10-100x slower queries and 2-5x larger storage requirements.
Keep the following best practices in mind when defining your column types:
Nullable columns in ClickHouse have significant performance overhead.
Instead of using ? or | undefined (TypeScript) or | None or Optional[type] (Python), add the & ClickHouseDefault<"..."> (TypeScript) or (Python) to your column type.
Annotated[type, clickhouse_default("...")]from moose_lib import OlapTablefrom pydantic import BaseModel, Field # ❌ Bad: Using nullable columnsclass UserEvent(BaseModel): id: str user_id: str event_type: str description: str | None = None # Nullable metadata: dict | None = None # Nullable created_at: Date # ✅ Good: Use default values insteadclass UserEvent(BaseModel): id: str user_id: str event_type: str description: str = "" # Default empty string metadata: dict = Field(default_factory=dict) # Default empty dict created_at: Date user_events_table = OlapTable[UserEvent]("user_events", { "orderByFields": ["id", "created_at"]})LowCardinality is ClickHouse's most efficient string type for columns with limited unique values.
from moose_lib import OlapTable, LowCardinalityfrom pydantic import BaseModelfrom typing import Literal class UserEvent(BaseModel): id: str user_id: str event_type: Annotated[str, "LowCardinality"] # ✅ Good for limited values status: Literal["active", "inactive", "pending"]# ✅ Literals become LowCardinality automatically country: Annotated[str, "LowCardinality"] # ✅ Good for country codes user_agent: str # ❌ Keep as String for high cardinality created_at: Date user_events_table = OlapTable[UserEvent]("user_events", { "orderByFields": ["id", "created_at"]})Choose the smallest integer type that fits your data range to save storage and improve performance.
from typing import Annotatedfrom moose_lib import OlapTable, UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64from pydantic import BaseModel class UserEvent(BaseModel): id: str user_id: str age: Annotated[int, "uint8"] # ✅ 0-255 (1 byte) score: Annotated[int, "int16"] # ✅ -32,768 to 32,767 (2 bytes) view_count: Annotated[int, "int32"] # ✅ 0 to ~4 billion (4 bytes) timestamp: Annotated[int, "int64"] # ✅ Unix timestamp (8 bytes) event_type: str created_at: Date # Integer type ranges:# UInt8: 0 to 255# UInt16: 0 to 65,535# UInt32: 0 to 4,294,967,295# UInt64: 0 to 18,446,744,073,709,551,615# Int8: -128 to 127# Int16: -32,768 to 32,767# Int32: -2,147,483,648 to 2,147,483,647# Int64: -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807Choose appropriate DateTime precision based on your use case to balance storage and precision.
from moose_lib import OlapTable, DateTime, DateTime64from pydantic import BaseModelfrom datetime import datetime class UserEvent(BaseModel): id: str user_id: str event_type: str created_at: datetime # ✅ Second precision (default) updated_at: clickhouse_datetime(3) # ✅ Millisecond precision processed_at: clickhouse_datetime(6) # ✅ Microsecond precision logged_at: clickhouse_datetime(9) # ✅ Nanosecond precision user_events_table = OlapTable[UserEvent]("user_events", { "orderByFields": ["id", "created_at"]})Use Decimal for financial and precise calculations to avoid floating-point precision issues.
from moose_lib import OlapTable, Decimalfrom pydantic import BaseModel class Order(BaseModel): id: str user_id: str amount: clickhouse_decimal(10, 2) # ✅ 10 total digits, 2 decimal places tax: clickhouse_decimal(8, 2) # ✅ 8 total digits, 2 decimal places discount: clickhouse_decimal(5, 2) # ✅ 5 total digits, 2 decimal places total: clickhouse_decimal(12, 2) # ✅ 12 total digits, 2 decimal places created_at: Date # ❌ Bad: Using float for financial dataclass BadOrder(BaseModel): id: str amount: float # Float - can cause precision issues tax: float # Float - can cause precision issues orders_table = OlapTable[Order]("orders", { "orderByFields": ["id", "created_at"]})NamedTuple is more efficient than Nested for structured data in ClickHouse.
from moose_lib import OlapTable, NamedTuplefrom pydantic import BaseModel class Location(BaseModel): latitude: float longitude: float city: str country: str class Metadata(BaseModel): version: str source: str priority: int class UserEvent(BaseModel): id: str user_id: str event_type: str location: Annotated[Location, "ClickHouseNamedTuple"] # lat, lon, city, country metadata: Annotated[Metadata, "ClickHouseNamedTuple"] # version, source, priority created_at: Date # ❌ Bad: Using nested objects (less efficient)class BadUserEvent(BaseModel): id: str location: Location # Nested - less efficient metadata: Metadata # Nested - less efficient user_events_table = OlapTable[UserEvent]("user_events", { "orderByFields": ["id", "created_at"]})Optimize your orderByFields (or orderByExpression) for your most common query patterns.
from moose_lib import OlapTablefrom pydantic import BaseModel class UserEvent(BaseModel): id: str user_id: str event_type: str status: str created_at: Date country: str # ✅ Good: Optimized for common query patternsuser_events_table = OlapTable[UserEvent]("user_events", { "orderByFields": ["user_id", "created_at", "event_type"] # Most common filters first}) # Common queries this optimizes for:# - WHERE user_id = ? AND created_at > ?# - WHERE user_id = ? AND event_type = ?# - GROUP BY user_id, event_typePlace LowCardinality columns first in your orderByFields (TypeScript) or order_by_fields (Python) (or reflect this priority in your orderByExpression (TypeScript) or order_by_expression (Python)) for better compression and query performance.
import { OlapTable, LowCardinality } from "@514labs/moose-lib"; interface UserEvent { id: string; userId: string; eventType: LowCardinality<string>; // ✅ Low cardinality status: LowCardinality<string>; // ✅ Low cardinality country: LowCardinality<string>; // ✅ Low cardinality createdAt: Date; // High cardinality sessionId: string; // High cardinality} // ✅ Good: LowCardinality columns firstconst userEventsTable = new OlapTable<UserEvent>("user_events", { orderByFields: ["eventType", "status", "country", "createdAt", "sessionId"]}); // ❌ Bad: High cardinality columns firstconst badUserEventsTable = new OlapTable<UserEvent>("user_events", { orderByFields: ["createdAt", "sessionId", "eventType", "status"] // Less efficient});from moose_lib import OlapTable, LowCardinalityfrom pydantic import BaseModel class UserEvent(BaseModel): id: str user_id: str event_type: LowCardinality[str] # ✅ Low cardinality status: LowCardinality[str] # ✅ Low cardinality country: LowCardinality[str] # ✅ Low cardinality created_at: Date # High cardinality session_id: str # High cardinality # ✅ Good: LowCardinality columns firstuser_events_table = OlapTable[UserEvent]("user_events", { "orderByFields": ["event_type", "status", "country", "created_at", "session_id"]}) # ❌ Bad: High cardinality columns firstbad_user_events_table = OlapTable[UserEvent]("user_events", { "orderByFields": ["created_at", "session_id", "event_type", "status"] # Less efficient})