← Tilbage til koncepter

Database Partitioning

Arkitektur

Opdeling af store tabeller eller databaser i mindre, mere håndterbare dele kaldet partitions for bedre performance og skalerbarhed.

Beskrivelse

Database partitioning (også kaldet sharding i distributed contexts) er teknikken at opdele en stor tabel i mindre chunks kaldet partitions. Hver partition indeholder et subset af data baseret på en partitioning key. Der findes flere partitioning strategies: Range Partitioning (opdel efter værdiområder fx dates, IDs), Hash Partitioning (brug hash function til at fordele data uniformt), List Partitioning (explicitte lister af værdier per partition), og Composite Partitioning (kombination af strategies). Partitioning giver mange fordele: queries der kun rammer én partition kører hurtigere (partition pruning), maintenance operations (backup, rebuild indexes) kan køre på individuelle partitions, og data kan organiseres efter access patterns (hot data på fast storage, cold data på slow storage). I distributed databases refererer sharding til partitioning across multiple servers hvor hver shard er en separat database instance. Vertical partitioning opdeler kolonner i stedet for rows og bruges mindre ofte.

Problem

Store tabeller bliver langsomme at query, backup, og vedligeholde. Full table scans tager lang tid, indexes bliver massive, og nogle operations låser hele tabellen. Single-server databaser kan ikke skalere beyond én maskines kapacitet.

Løsning

Partitioning opdeler data i mindre chunks. Queries kan target specific partitions (partition pruning), maintenance kan køres per partition, og data kan fordeles intelligently. I distributed settings (sharding) kan hver partition ligge på forskellige servere, hvilket giver horizontal skalering.

Eksempel

-- PostgreSQL range partitioning (by date).
-- The table itself stores no rows; every INSERT is routed to the
-- partition whose range contains order_date.
CREATE TABLE orders (
  order_id BIGSERIAL,
  customer_id INT,
  order_date DATE NOT NULL,     -- partition key; must be NOT NULL for range routing
  total DECIMAL(10,2),          -- exact decimal for money, never FLOAT
  status VARCHAR(20)
) PARTITION BY RANGE (order_date);
-- NOTE(review): no PRIMARY KEY declared. In PostgreSQL, any PK/UNIQUE
-- constraint on a partitioned table must include the partition key,
-- e.g. PRIMARY KEY (order_id, order_date) — confirm intended keys.

-- Create one partition per month.
-- Range bounds are half-open: FROM is inclusive, TO is exclusive,
-- so '2024-02-01' lands in orders_2024_02, not orders_2024_01.
CREATE TABLE orders_2024_01 PARTITION OF orders
  FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');

CREATE TABLE orders_2024_02 PARTITION OF orders
  FOR VALUES FROM ('2024-02-01') TO ('2024-03-01');

CREATE TABLE orders_2024_03 PARTITION OF orders
  FOR VALUES FROM ('2024-03-01') TO ('2024-04-01');

-- Queries are routed automatically to the matching partition(s):
-- the planner compares the WHERE predicate against partition bounds.
SELECT * FROM orders 
WHERE order_date BETWEEN '2024-02-01' AND '2024-02-15';
-- Only the orders_2024_02 partition is scanned (partition pruning).

-- MySQL hash partitioning: spreads rows uniformly when there is no
-- natural range key. NOTE(review): adding/removing partitions later
-- redistributes existing rows — plan the partition count up front.
CREATE TABLE users (
  user_id INT NOT NULL,
  username VARCHAR(50),
  email VARCHAR(100),
  created_at TIMESTAMP
)
PARTITION BY HASH(user_id)
PARTITIONS 4;
-- Rows are distributed evenly across 4 partitions based on hash(user_id)

-- MySQL list partitioning (by region): each partition owns an
-- explicit set of country codes.
CREATE TABLE customers (
  customer_id INT,
  name VARCHAR(100),
  country VARCHAR(50),
  region VARCHAR(50)
)
PARTITION BY LIST COLUMNS(country) (
  PARTITION p_nordic VALUES IN ('DK', 'SE', 'NO', 'FI'),
  PARTITION p_west VALUES IN ('DE', 'FR', 'UK', 'NL'),
  PARTITION p_south VALUES IN ('IT', 'ES', 'PT', 'GR'),
  -- MySQL has no catch-all LIST partition: inserting a country that is
  -- not listed in any partition fails (error 1526). Either enumerate
  -- every expected value, as below, or use MariaDB 10.2+, which
  -- supports `PARTITION p_other DEFAULT`.
  PARTITION p_other VALUES IN ('US', 'CA', 'AU', 'JP')
);

-- Explicit partition selection (MySQL): restrict the scan to one
-- named partition even without relying on automatic pruning.
SELECT * FROM customers PARTITION (p_nordic)
WHERE country = 'DK';

-- Composite Partitioning (Range + Hash)
-- PostgreSQL has no SUBPARTITION clause (that syntax is Oracle's).
-- Composite partitioning is expressed by declaring a partition that
-- is itself partitioned: range on log_date, then hash on user_id.
CREATE TABLE logs (
  log_id BIGSERIAL,
  log_date DATE,
  user_id INT,
  message TEXT
) PARTITION BY RANGE (log_date);

-- Each range partition is hash-subpartitioned on user_id.
CREATE TABLE logs_2024_q1 PARTITION OF logs
  FOR VALUES FROM ('2024-01-01') TO ('2024-04-01')
  PARTITION BY HASH (user_id);

-- Four hash buckets for Q1 (MODULUS = bucket count, REMAINDER = bucket).
CREATE TABLE logs_2024_q1_h0 PARTITION OF logs_2024_q1
  FOR VALUES WITH (MODULUS 4, REMAINDER 0);
CREATE TABLE logs_2024_q1_h1 PARTITION OF logs_2024_q1
  FOR VALUES WITH (MODULUS 4, REMAINDER 1);
CREATE TABLE logs_2024_q1_h2 PARTITION OF logs_2024_q1
  FOR VALUES WITH (MODULUS 4, REMAINDER 2);
CREATE TABLE logs_2024_q1_h3 PARTITION OF logs_2024_q1
  FOR VALUES WITH (MODULUS 4, REMAINDER 3);

-- Partition management
-- Drop an old partition (archive old data).
-- NOTE: `ALTER TABLE ... DROP PARTITION` is MySQL/Oracle syntax.
-- In PostgreSQL each partition is a real table: detach it from the
-- parent first, then drop (or keep it around as an archive table).
ALTER TABLE orders DETACH PARTITION orders_2023_01;
DROP TABLE orders_2023_01;

-- Create and attach a new partition.
CREATE TABLE orders_2024_04 (
  LIKE orders INCLUDING ALL
);
ALTER TABLE orders ATTACH PARTITION orders_2024_04
  FOR VALUES FROM ('2024-04-01') TO ('2024-05-01');

-- Sharding example (distributed partitioning):
-- horizontal sharding by user_id ranges, one table per server.
-- The CHECK constraints both document and enforce each shard's range.

-- Shard 1 (server 1): user_id 1-1000000
CREATE TABLE users_shard1 (
  user_id INT PRIMARY KEY CHECK (user_id BETWEEN 1 AND 1000000),
  username VARCHAR(50),
  email VARCHAR(100)
);

-- Shard 2 (server 2): user_id 1000001-2000000
CREATE TABLE users_shard2 (
  user_id INT PRIMARY KEY CHECK (user_id BETWEEN 1000001 AND 2000000),
  username VARCHAR(50),
  email VARCHAR(100)
);

// Application-side routing: pick the connection for the shard that
// owns this userId's range (ranges must match the shard tables'
// CHECK constraints; shard*Connection objects are created elsewhere).
function getShardForUser(userId) {
  if (userId <= 1000000) return shard1Connection;  // shard 1: 1 - 1,000,000
  if (userId <= 2000000) return shard2Connection;  // shard 2: 1,000,001 - 2,000,000
  return shard3Connection;                         // everything above
}

// Route first, then run the query against the selected shard.
const db = getShardForUser(userId);
const user = await db.query('SELECT * FROM users WHERE user_id = ?', [userId]);

-- Hash-baseret shard-routing (modulo) — NB: dette er IKKE consistent hashing; ændres antallet af shards, skal næsten alle keys remappes
// Hash-modulo shard selection. NOTE: plain `hash % shardCount` is NOT
// consistent hashing — changing shardCount remaps nearly every key,
// forcing a near-total rebalance. `hashFunction` is defined elsewhere;
// presumably it returns a non-negative integer — verify at the caller.
function getShardByHash(key, shardCount) {
  const hash = hashFunction(key);
  return hash % shardCount;
}

// Pick the connection for the computed shard id.
const shardId = getShardByHash(userId, 4);
const db = shardConnections[shardId];

-- Vitess eksempel (MySQL sharding)
-- VSchema definition
{
  "sharded": true,
  "vindexes": {
    "hash": {
      "type": "hash"
    }
  },
  "tables": {
    "users": {
      "column_vindexes": [
        {
          "column": "user_id",
          "name": "hash"
        }
      ]
    }
  }
}

-- MongoDB sharding
// Enable sharding for the database
sh.enableSharding("mydb");

// Shard the collection on a hashed _id: uniform distribution,
// but range queries on _id must hit every shard
sh.shardCollection(
  "mydb.users",
  { "_id": "hashed" }
);

// Or range-based sharding: efficient range queries on order_date,
// at the risk of a "hot" shard for the newest dates
sh.shardCollection(
  "mydb.orders",
  { "order_date": 1 }
);

-- Partition pruning example: the predicate uses the partition key,
-- so the planner can exclude all non-matching partitions.
EXPLAIN SELECT * FROM orders 
WHERE order_date = '2024-02-15';

-- The plan shows:
-- Seq Scan on orders_2024_02 (partition pruning worked!)
-- NOT scanning orders_2024_01, orders_2024_03, etc.

-- Cross-partition query (more expensive): the predicate does not
-- reference the partition key, so no pruning is possible.
SELECT COUNT(*) FROM orders 
WHERE total > 1000;
-- Scans ALL partitions

-- Partition-aware index: declared once on the parent table.
CREATE INDEX idx_orders_customer 
ON orders (customer_id);
-- PostgreSQL (11+) creates a matching index on each partition automatically

-- Node.js sharding example
/**
 * Routes queries to one of several database shards using a
 * hash-modulo scheme over a string key.
 */
class ShardManager {
  /**
   * @param {Array} shards - array of DB connections, one per shard
   */
  constructor(shards) {
    this.shards = shards;
  }

  /** Pick the connection at index hash(key) mod shard count. */
  getShard(key) {
    const index = this.hash(key) % this.shards.length;
    return this.shards[index];
  }

  /**
   * Simple 31x rolling hash over the key's UTF-16 code units,
   * folded to a signed 32-bit integer at each step; the absolute
   * value is returned so the modulo above stays non-negative.
   */
  hash(key) {
    const folded = key
      .split('')
      .reduce((acc, ch) => ((acc << 5) - acc + ch.charCodeAt(0)) | 0, 0);
    return Math.abs(folded);
  }

  /** Fetch a single user from the shard that owns their id. */
  async getUser(userId) {
    const connection = this.getShard(userId.toString());
    return connection.query(
      'SELECT * FROM users WHERE user_id = ?',
      [userId]
    );
  }

  /** Scatter-gather: query every shard in parallel and merge results. */
  async getAllUsers() {
    const perShard = this.shards.map((connection) =>
      connection.query('SELECT * FROM users')
    );
    return (await Promise.all(perShard)).flat();
  }
}

Fordele

  • Query performance (partition pruning)
  • Horizontal skalering muligt
  • Lettere maintenance (per-partition)
  • Data lifecycle management (drop old partitions)
  • Parallel query execution muligt

Udfordringer

  • Cross-partition queries er dyre
  • Rebalancing ved vækst er komplekst
  • Joins across partitions er svære
  • Valg af partition key er kritisk
  • Complexity i application logic

Anvendelsesområder

  • Time-series data (partitioner efter dato)
  • Multi-tenant applications (partitioner efter tenant)
  • Large tables (logs, events, transactions)
  • Geographic data (partitioner efter region)
  • High-growth applications

Eksempler fra den virkelige verden

  • Log data partitioneret efter måned (nem at droppe gamle)
  • E-commerce orders partitioneret efter dato
  • Social media (Instagram shards efter user_id)
  • SaaS applications (en partition per customer)
  • IoT sensor data (partition efter device eller tid)