专注于分布式系统架构AI辅助开发工具(Claude
Code中文周刊)

第04章:索引系统架构:高性能数据存储

智谱 GLM,支持多语言、多任务推理。从写作到代码生成,从搜索到知识问答,AI 生产力的中国解法。

第04章:索引系统架构:高性能数据存储

两个核心表支撑整个搜索系统,批量操作决定性能

📝 TL;DR (核心要点速览)
核心设计:index_tokens + index_entries = 完整反向索引
性能关键:批量操作、事务处理、索引优化
架构哲学:用数据库的强项替代内存的复杂性
扩展方案:分区、分片、读写分离的渐进式演进

1. 索引系统架构概览

1.1 从文档到索引的数据流

原始文档
    ↓ [分词处理]
Token流:(database, datab, data, base, databases)
    ↓ [去重和优化]
唯一Token:(database, datab, data, bases)
    ↓ [权重计算]
带权重Token:(database:10, datab:3, data:1, bases:8)
    ↓ [索引存储]
数据库记录:token_id, token, weight, tokenizer_type
    ↓ [反向映射]
文档映射:document_id ↔ token_ids

1.2 为什么选择数据库而非内存

传统内存索引的问题

持久性:重启丢失,需要重建
一致性:多进程同步复杂
扩展性:单机内存限制
监控:缺乏成熟的监控工具

数据库索引的优势

持久化:数据永不丢失
ACID:事务保证一致性
扩展性:支持分库分表
监控:丰富的监控和调试工具
生态:备份、恢复、优化工具成熟

2. 核心表设计详解

2.1 Token表:index_tokens

CREATE TABLE index_tokens (
    id BIGINT PRIMARY KEY AUTO_INCREMENT,
    token VARCHAR(100) NOT NULL,
    tokenizer_type ENUM('word', 'prefix', 'ngrams', 'singular') NOT NULL,
    weight INT NOT NULL DEFAULT 1,
    token_length SMALLINT NOT NULL,
    frequency BIGINT NOT NULL DEFAULT 0,  -- token出现频率
    last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    -- 关键索引设计
    UNIQUE KEY uk_token_type (token, tokenizer_type),
    KEY idx_token (token),
    KEY idx_type_weight (tokenizer_type, weight),
    KEY idx_frequency (frequency),
    KEY idx_length (token_length),

    -- 复合索引用于查询优化
    KEY idx_search_optimized (tokenizer_type, token, weight),
    KEY idx_type_freq (tokenizer_type, frequency DESC)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
PARTITION BY HASH(tokenizer_type)
PARTITIONS 4;

设计要点分析

class TokenTableDesign
{
    /**
     * 为什么用BIGINT作为主键?
     * 支持海量token:理论上可支持1840亿个唯一token
     * 分区友好:每个分区独立增长
     */
    public function explainIdChoice(): string
    {
        return "INT(21亿)可能不够,BIGINT(1840亿)更安全";
    }

    /**
     * 为什么VARCHAR(100)?
     * 覆盖99%的英文单词(最长单词:pneumonoultramicroscopicsilicovolcanoconiosis, 45字符)
     * 支持复合词和短语搜索
     * 索引效率平衡(太长影响索引性能)
     */
    public function explainTokenLength(): string
    {
        return "平衡存储效率和查询性能";
    }

    /**
     * 为什么存储frequency?
     * IDF计算:inverse document frequency
     * 查询优化:高频token可以优先处理
     * 缓存策略:热token缓存
     */
    public function explainFrequencyField(): string
    {
        return "支持高级搜索算法和性能优化";
    }
}

2.2 索引条目表:index_entries

CREATE TABLE index_entries (
    id BIGINT PRIMARY KEY AUTO_INCREMENT,
    token_id BIGINT NOT NULL,
    document_id BIGINT NOT NULL,
    field_name VARCHAR(50) NOT NULL,
    field_weight INT NOT NULL DEFAULT 1,
    position_in_field INT NOT NULL,  -- token在字段中的位置
    context_before VARCHAR(50),      -- 上下文信息(可选)
    context_after VARCHAR(50),       -- 上下文信息(可选)
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,

    -- 外键约束保证数据完整性
    FOREIGN KEY (token_id) REFERENCES index_tokens(id) ON DELETE CASCADE,
    FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE,

    -- 核心性能索引
    KEY idx_token_document (token_id, document_id),
    KEY idx_document_field (document_id, field_name),
    KEY idx_position (position_in_field),
    KEY idx_field_weight (field_name, field_weight),

    -- 复合索引用于复杂查询
    KEY idx_search_performance (token_id, field_weight, document_id),
    KEY idx_document_optimized (document_id, field_name, position_in_field),

    -- 分区策略
    UNIQUE KEY uk_unique_entry (token_id, document_id, field_name, position_in_field)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
PARTITION BY HASH(document_id)
PARTITIONS 8;

分区策略详解

class PartitioningStrategy
{
    /**
     * 为什么按document_id分区?
     * 查询模式:大部分查询按文档聚合结果
     * 删除效率:删除文档时只需清理一个分区
     * 负载均衡:文档ID通常均匀分布
     */
    public function explainDocumentPartitioning(): array
    {
        return [
            'query_optimization' => 'SELECT * FROM index_entries WHERE document_id = ?',
            'delete_efficiency' => '单个分区的DELETE操作',
            'load_balancing' => '文档ID随机分布,避免热点'
        ];
    }

    /**
     * 为什么8个分区?
     * 并发度:支持8个并行操作
     * 存储平衡:每个分区大小适中
     * 维护成本:不会过度增加复杂性
     */
    public function explainPartitionCount(): string
    {
        return "平衡并发性能和管理复杂度";
    }

    /**
     * 动态分区策略
     */
    public function getOptimalPartitionCount(int $estimatedDocuments): int
    {
        if ($estimatedDocuments < 100000) {
            return 2;   // 小数据集,简单分区
        } elseif ($estimatedDocuments < 10000000) {
            return 8;   // 中等数据集,平衡分区
        } else {
            return 16;  // 大数据集,高并发分区
        }
    }
}

2.3 文档表:documents

CREATE TABLE documents (
    id BIGINT PRIMARY KEY AUTO_INCREMENT,
    title VARCHAR(255) NOT NULL,
    content LONGTEXT,
    description TEXT,
    author_id BIGINT,
    category_id INT,
    tags JSON,  -- 存储标签数组
    length INT NOT NULL DEFAULT 0,
    status ENUM('active', 'deleted', 'draft') DEFAULT 'active',
    priority INT DEFAULT 0,  -- 搜索优先级
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,

    -- 搜索相关索引
    KEY idx_status_priority (status, priority DESC),
    KEY idx_length (length),
    KEY idx_created (created_at),
    KEY idx_author_category (author_id, category_id),
    KEY idx_priority_desc (priority DESC),

    -- 全文索引(可选,用于快速预过滤)
    FULLTEXT KEY ft_title_content (title, content)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

3. 索引构建系统实现

3.1 批量索引服务

class IndexingService
{
    private PDO $db;
    private TokenizerFactory $tokenizerFactory;
    private int $batchSize = 1000;
    private int $maxRetries = 3;

    public function __construct(PDO $db, TokenizerFactory $tokenizerFactory)
    {
        $this->db = $db;
        $this->tokenizerFactory = $tokenizerFactory;
        $this->db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    }

    /**
     * 为单个文档建立索引
     */
    public function indexDocument(array $document): bool
    {
        try {
            $this->db->beginTransaction();

            // 1. 清除旧索引
            $this->removeDocumentIndex($document['id']);

            // 2. 生成tokens
            $tokens = $this->generateTokens($document);

            // 3. 批量插入tokens
            $this->batchInsertTokens($tokens);

            // 4. 批量插入索引条目
            $this->batchInsertEntries($document['id'], $tokens);

            // 5. 更新文档长度
            $this->updateDocumentLength($document['id'], $document);

            $this->db->commit();
            return true;

        } catch (Exception $e) {
            $this->db->rollBack();
            $this->logError("Indexing failed for document {$document['id']}", $e);
            return false;
        }
    }

    /**
     * 批量索引多个文档
     */
    public function indexDocuments(array $documents): array
    {
        $results = ['success' => [], 'failed' => []];
        $batches = array_chunk($documents, $this->batchSize);

        foreach ($batches as $batch) {
            try {
                $this->db->beginTransaction();

                // 预处理:生成所有tokens
                $allTokens = [];
                foreach ($batch as $document) {
                    $allTokens[$document['id']] = $this->generateTokens($document);
                }

                // 批量插入所有tokens
                $this->batchInsertAllTokens($allTokens);

                // 批量插入所有索引条目
                $this->batchInsertAllEntries($allTokens);

                // 批量更新文档信息
                $this->batchUpdateDocuments($batch);

                $this->db->commit();
                $results['success'] = array_merge($results['success'], $batch);

            } catch (Exception $e) {
                $this->db->rollBack();
                $this->logError("Batch indexing failed", $e);

                // 降级为单文档处理
                foreach ($batch as $document) {
                    if ($this->indexDocument($document)) {
                        $results['success'][] = $document;
                    } else {
                        $results['failed'][] = $document;
                    }
                }
            }
        }

        return $results;
    }

    /**
     * 生成文档的所有tokens
     */
    private function generateTokens(array $document): array
    {
        $tokens = [];
        $config = [
            'word' => true,
            'prefix' => true,
            'ngrams' => true,
            'singular' => true
        ];

        $tokenizer = $this->tokenizerFactory->createComposite($config);

        // 处理每个字段
        $fields = ['title', 'description', 'content'];
        $fieldWeights = ['title' => 10, 'description' => 5, 'content' => 2];

        foreach ($fields as $field) {
            if (!empty($document[$field])) {
                $fieldTokens = $tokenizer->tokenize($document[$field]);

                foreach ($fieldTokens as $tokenInfo) {
                    $tokens[] = [
                        'token' => $tokenInfo['token'],
                        'tokenizer_type' => $tokenInfo['source'],
                        'weight' => $tokenInfo['weight'],
                        'field' => $field,
                        'field_weight' => $fieldWeights[$field]
                    ];
                }
            }
        }

        return $tokens;
    }

    /**
     * 批量插入tokens
     */
    private function batchInsertTokens(array $tokens): void
    {
        if (empty($tokens)) {
            return;
        }

        $tokenGroups = [];
        foreach ($tokens as $token) {
            $key = $token['token'] . '|' . $token['tokenizer_type'];
            if (!isset($tokenGroups[$key])) {
                $tokenGroups[$key] = [
                    'token' => $token['token'],
                    'tokenizer_type' => $token['tokenizer_type'],
                    'weight' => $token['weight'],
                    'token_length' => strlen($token['token']),
                    'count' => 0
                ];
            }
            $tokenGroups[$key]['count']++;
        }

        $sql = "INSERT INTO index_tokens
                (token, tokenizer_type, weight, token_length, frequency)
                VALUES (:token, :type, :weight, :length, :freq)
                ON DUPLICATE KEY UPDATE
                weight = VALUES(weight),
                frequency = frequency + VALUES(freq)";

        $stmt = $this->db->prepare($sql);

        foreach ($tokenGroups as $token) {
            $stmt->execute([
                ':token' => $token['token'],
                ':type' => $token['tokenizer_type'],
                ':weight' => $token['weight'],
                ':length' => $token['token_length'],
                ':freq' => $token['count']
            ]);
        }
    }

    /**
     * 批量插入索引条目
     */
    private function batchInsertEntries(int $documentId, array $tokens): void
    {
        if (empty($tokens)) {
            return;
        }

        $sql = "INSERT INTO index_entries
                (token_id, document_id, field_name, field_weight, position_in_field)
                VALUES ((SELECT id FROM index_tokens WHERE token = ? AND tokenizer_type = ? LIMIT 1), ?, ?, ?, ?)";

        $stmt = $this->db->prepare($sql);
        $position = 0;

        foreach ($tokens as $token) {
            $stmt->execute([
                $token['token'],
                $token['tokenizer_type'],
                $documentId,
                $token['field'],
                $token['field_weight'],
                $position++
            ]);
        }
    }

    /**
     * 删除文档索引
     */
    public function removeDocumentIndex(int $documentId): bool
    {
        try {
            $sql = "DELETE FROM index_entries WHERE document_id = ?";
            $stmt = $this->db->prepare($sql);
            $stmt->execute([$documentId]);

            // 可选:清理不再被引用的tokens
            $this->cleanupOrphanedTokens();

            return true;
        } catch (Exception $e) {
            $this->logError("Failed to remove index for document {$documentId}", $e);
            return false;
        }
    }

    /**
     * 清理孤立的tokens
     */
    private function cleanupOrphanedTokens(): void
    {
        $sql = "DELETE t FROM index_tokens t
                LEFT JOIN index_entries e ON t.id = e.token_id
                WHERE e.token_id IS NULL";

        $this->db->exec($sql);
    }

    /**
     * 更新文档长度
     */
    private function updateDocumentLength(int $documentId, array $document): void
    {
        $length = strlen($document['title'] ?? '') +
                  strlen($document['description'] ?? '') +
                  strlen($document['content'] ?? '');

        $sql = "UPDATE documents SET length = ? WHERE id = ?";
        $stmt = $this->db->prepare($sql);
        $stmt->execute([$length, $documentId]);
    }
}

3.2 增量索引更新

class IncrementalIndexer
{
    private IndexingService $indexingService;
    private int $checkpointInterval = 1000;

    public function __construct(IndexingService $indexingService)
    {
        $this->indexingService = $indexingService;
    }

    /**
     * 增量更新索引
     */
    public function updateIndexIncremental(DateTime $since, DateTime $until): array
    {
        $results = ['updated' => 0, 'failed' => 0, 'deleted' => 0];

        // 获取需要更新的文档
        $updatedDocuments = $this->getUpdatedDocuments($since, $until);
        $deletedDocuments = $this->getDeletedDocuments($since, $until);

        // 处理更新的文档
        foreach ($updatedDocuments as $document) {
            if ($this->indexingService->indexDocument($document)) {
                $results['updated']++;
            } else {
                $results['failed']++;
            }

            // 检查点:每1000个文档提交一次
            if (($results['updated'] + $results['failed']) % $this->checkpointInterval === 0) {
                $this->checkpoint();
            }
        }

        // 处理删除的文档
        foreach ($deletedDocuments as $documentId) {
            if ($this->indexingService->removeDocumentIndex($documentId)) {
                $results['deleted']++;
            }
        }

        $this->checkpoint();  // 最终检查点
        return $results;
    }

    private function getUpdatedDocuments(DateTime $since, DateTime $until): array
    {
        $sql = "SELECT * FROM documents
                WHERE updated_at BETWEEN ? AND ?
                AND status = 'active'
                ORDER BY updated_at ASC";

        $stmt = $this->indexingService->db->prepare($sql);
        $stmt->execute([
            $since->format('Y-m-d H:i:s'),
            $until->format('Y-m-d H:i:s')
        ]);

        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }

    private function getDeletedDocuments(DateTime $since, DateTime $until): array
    {
        $sql = "SELECT id FROM documents
                WHERE updated_at BETWEEN ? AND ?
                AND status = 'deleted'";

        $stmt = $this->indexingService->db->prepare($sql);
        $stmt->execute([
            $since->format('Y-m-d H:i:s'),
            $until->format('Y-m-d H:i:s')
        ]);

        return $stmt->fetchAll(PDO::FETCH_COLUMN);
    }

    private function checkpoint(): void
    {
        // 记录检查点
        $sql = "INSERT INTO index_checkpoints (checkpoint_time) VALUES (NOW())";
        $this->indexingService->db->exec($sql);
    }
}

4. 性能优化策略

4.1 查询优化索引

-- 核心搜索查询的优化索引
CREATE INDEX idx_optimized_search ON index_entries(
    token_id,
    field_weight DESC,
    document_id
) INCLUDE (field_name, position_in_field);

-- 支持字段过滤的索引
CREATE INDEX idx_field_filtered_search ON index_entries(
    field_name,
    token_id,
    document_id
) INCLUDE (field_weight);

-- 支持范围查询的索引
CREATE INDEX idx_weight_range_search ON index_entries(
    field_weight,
    token_id
);

-- 覆盖索引避免回表
CREATE INDEX idx_covering_search ON index_entries(
    token_id,
    document_id,
    field_name,
    field_weight,
    position_in_field
);

4.2 批量操作优化

class BatchOperationOptimizer
{
    private PDO $db;
    private int $optimalBatchSize = 500;  // 基于测试确定的最佳批量大小

    public function __construct(PDO $db)
    {
        $this->db = $db;
    }

    /**
     * 优化的批量插入
     */
    public function optimizedBatchInsert(string $table, array $data): int
    {
        if (empty($data)) {
            return 0;
        }

        $batches = array_chunk($data, $this->optimalBatchSize);
        $totalInserted = 0;

        foreach ($batches as $batch) {
            $inserted = $this->insertBatch($table, $batch);
            $totalInserted += $inserted;

            // 内存管理:清理大变量
            if (memory_get_usage() > 512 * 1024 * 1024) {  // 512MB
                gc_collect_cycles();
            }
        }

        return $totalInserted;
    }

    private function insertBatch(string $table, array $batch): int
    {
        $columns = array_keys($batch[0]);
        $placeholders = implode(',', array_fill(0, count($columns), '?'));
        $columnList = implode(',', $columns);

        $sql = "INSERT INTO {$table} ({$columnList}) VALUES ({$placeholders})";

        try {
            $this->db->beginTransaction();

            // 准备语句
            $stmt = $this->db->prepare($sql);

            // 批量执行
            foreach ($batch as $row) {
                $stmt->execute(array_values($row));
            }

            $inserted = $stmt->rowCount();
            $this->db->commit();

            return $inserted;

        } catch (Exception $e) {
            $this->db->rollBack();
            throw $e;
        }
    }

    /**
     * LOAD DATA INFILE 优化(适用于大数据量)
     */
    public function loadFileInfile(string $table, string $filePath, array $columns): int
    {
        $columnList = implode(',', $columns);

        $sql = "LOAD DATA LOCAL INFILE ?
                INTO TABLE {$table}
                FIELDS TERMINATED BY ','
                LINES TERMINATED BY '\n'
                ({$columnList})";

        $stmt = $this->db->prepare($sql);
        $stmt->execute([$filePath]);

        return $stmt->rowCount();
    }

    /**
     * 自适应批量大小
     */
    public function getAdaptiveBatchSize(int $recordSize, int $availableMemory): int
    {
        // 估算每个记录的内存占用(字节)
        $estimatedRecordSize = $recordSize * 4;  // 字符串的粗略估算

        // 目标:使用可用内存的70%
        $targetMemoryUsage = $availableMemory * 0.7;

        // 计算最佳批量大小
        $optimalSize = (int)($targetMemoryUsage / $estimatedRecordSize);

        // 限制在合理范围内
        return max(100, min(2000, $optimalSize));
    }
}

4.3 分区表管理

class PartitionManager
{
    private PDO $db;

    public function __construct(PDO $db)
    {
        $this->db = $db;
    }

    /**
     * 动态增加分区
     */
    public function addPartition(string $table, string $partitionName, string $value): bool
    {
        $sql = "ALTER TABLE {$table}
                ADD PARTITION (
                    PARTITION {$partitionName} VALUES IN ({$value})
                )";

        try {
            $this->db->exec($sql);
            return true;
        } catch (Exception $e) {
            $this->logError("Failed to add partition {$partitionName}", $e);
            return false;
        }
    }

    /**
     * 分区维护和优化
     */
    public function maintainPartitions(string $table): array
    {
        $results = ['optimized' => [], 'analyzed' => []];

        // 获取所有分区
        $partitions = $this->getTablePartitions($table);

        foreach ($partitions as $partition) {
            $partitionName = $partition['PARTITION_NAME'];

            // 优化分区
            try {
                $sql = "OPTIMIZE TABLE {$table} PARTITION {$partitionName}";
                $this->db->exec($sql);
                $results['optimized'][] = $partitionName;
            } catch (Exception $e) {
                $this->logError("Failed to optimize partition {$partitionName}", $e);
            }

            // 分析分区统计信息
            try {
                $sql = "ANALYZE TABLE {$table} PARTITION {$partitionName}";
                $this->db->exec($sql);
                $results['analyzed'][] = $partitionName;
            } catch (Exception $e) {
                $this->logError("Failed to analyze partition {$partitionName}", $e);
            }
        }

        return $results;
    }

    private function getTablePartitions(string $table): array
    {
        $sql = "SELECT PARTITION_NAME, TABLE_ROWS, DATA_LENGTH
                FROM information_schema.PARTITIONS
                WHERE TABLE_NAME = ? AND PARTITION_NAME IS NOT NULL";

        $stmt = $this->db->prepare($sql);
        $stmt->execute([$table]);

        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }

    /**
     * 分区重平衡
     */
    public function rebalancePartitions(string $table): bool
    {
        try {
            // 临时禁用分区
            $this->db->exec("ALTER TABLE {$table} REMOVE PARTITIONING");

            // 重新添加优化的分区配置
            $this->addOptimizedPartitioning($table);

            return true;
        } catch (Exception $e) {
            $this->logError("Failed to rebalance partitions for {$table}", $e);
            return false;
        }
    }

    private function addOptimizedPartitioning(string $table): void
    {
        $sql = "ALTER TABLE {$table}
                PARTITION BY HASH(document_id)
                PARTITIONS 16";

        $this->db->exec($sql);
    }
}

5. 监控和诊断

5.1 索引健康检查

class IndexHealthMonitor
{
    private PDO $db;
    private array $thresholds = [
        'max_partition_size' => 10 * 1024 * 1024 * 1024,  // 10GB
        'max_index_fragmentation' => 30,  // 30%
        'min_query_performance' => 100,  // ms
    ];

    public function __construct(PDO $db)
    {
        $this->db = $db;
    }

    /**
     * 全面的健康检查
     */
    public function performHealthCheck(): array
    {
        $health = [
            'overall_status' => 'healthy',
            'issues' => [],
            'recommendations' => [],
            'metrics' => []
        ];

        // 检查分区大小
        $partitionIssues = $this->checkPartitionSizes();
        if (!empty($partitionIssues)) {
            $health['issues'] = array_merge($health['issues'], $partitionIssues);
            $health['overall_status'] = 'warning';
        }

        // 检查索引碎片化
        $fragmentationIssues = $this->checkIndexFragmentation();
        if (!empty($fragmentationIssues)) {
            $health['issues'] = array_merge($health['issues'], $fragmentationIssues);
        }

        // 检查查询性能
        $performanceIssues = $this->checkQueryPerformance();
        if (!empty($performanceIssues)) {
            $health['issues'] = array_merge($health['issues'], $performanceIssues);
            if ($health['overall_status'] === 'healthy') {
                $health['overall_status'] = 'warning';
            }
        }

        // 生成建议
        $health['recommendations'] = $this->generateRecommendations($health['issues']);

        // 收集指标
        $health['metrics'] = $this->collectMetrics();

        return $health;
    }

    private function checkPartitionSizes(): array
    {
        $issues = [];
        $sql = "SELECT PARTITION_NAME, DATA_LENGTH, TABLE_ROWS
                FROM information_schema.PARTITIONS
                WHERE TABLE_NAME = 'index_entries'
                AND PARTITION_NAME IS NOT NULL";

        $stmt = $this->db->query($sql);
        $partitions = $stmt->fetchAll(PDO::FETCH_ASSOC);

        foreach ($partitions as $partition) {
            if ($partition['DATA_LENGTH'] > $this->thresholds['max_partition_size']) {
                $issues[] = [
                    'type' => 'partition_size',
                    'partition' => $partition['PARTITION_NAME'],
                    'size' => $partition['DATA_LENGTH'],
                    'severity' => 'high'
                ];
            }
        }

        return $issues;
    }

    private function checkIndexFragmentation(): array
    {
        $issues = [];

        // 检查索引使用情况
        $sql = "SELECT TABLE_NAME, INDEX_NAME, CARDINALITY
                FROM information_schema.STATISTICS
                WHERE TABLE_SCHEMA = DATABASE()
                AND TABLE_NAME IN ('index_tokens', 'index_entries')";

        $stmt = $this->db->query($sql);
        $indexes = $stmt->fetchAll(PDO::FETCH_ASSOC);

        foreach ($indexes as $index) {
            $fragmentation = $this->calculateFragmentation($index);
            if ($fragmentation > $this->thresholds['max_index_fragmentation']) {
                $issues[] = [
                    'type' => 'index_fragmentation',
                    'table' => $index['TABLE_NAME'],
                    'index' => $index['INDEX_NAME'],
                    'fragmentation' => $fragmentation,
                    'severity' => 'medium'
                ];
            }
        }

        return $issues;
    }

    private function calculateFragmentation(array $index): float
    {
        // 简化的碎片化计算
        // 实际实现应该使用更精确的算法
        return rand(10, 50);  // 占位符
    }

    private function checkQueryPerformance(): array
    {
        $issues = [];

        // 测试关键查询的性能
        $testQueries = [
            'simple_search' => "
                SELECT COUNT(*) FROM index_entries ie
                JOIN index_tokens it ON ie.token_id = it.id
                WHERE it.token = 'database'
            ",
            'complex_search' => "
                SELECT d.id, COUNT(*) as match_count
                FROM documents d
                JOIN index_entries ie ON d.id = ie.document_id
                JOIN index_tokens it ON ie.token_id = it.id
                WHERE it.token IN ('database', 'data')
                GROUP BY d.id
                ORDER BY match_count DESC
                LIMIT 10
            "
        ];

        foreach ($testQueries as $queryName => $sql) {
            $startTime = microtime(true);
            $this->db->query($sql);
            $executionTime = (microtime(true) - $startTime) * 1000;  // ms

            if ($executionTime > $this->thresholds['min_query_performance']) {
                $issues[] = [
                    'type' => 'slow_query',
                    'query' => $queryName,
                    'execution_time' => $executionTime,
                    'severity' => 'high'
                ];
            }
        }

        return $issues;
    }

    private function generateRecommendations(array $issues): array
    {
        $recommendations = [];

        foreach ($issues as $issue) {
            switch ($issue['type']) {
                case 'partition_size':
                    $recommendations[] = "Consider rebalancing partition {$issue['partition']} or splitting into smaller partitions";
                    break;

                case 'index_fragmentation':
                    $recommendations[] = "Rebuild index {$issue['index']} on table {$issue['table']}";
                    break;

                case 'slow_query':
                    $recommendations[] = "Optimize query {$issue['query']} - current execution time: {$issue['execution_time']}ms";
                    break;
            }
        }

        return array_unique($recommendations);
    }

    private function collectMetrics(): array
    {
        $metrics = [];

        // 基础指标
        $metrics['total_documents'] = $this->getScalar("SELECT COUNT(*) FROM documents WHERE status = 'active'");
        $metrics['total_tokens'] = $this->getScalar("SELECT COUNT(*) FROM index_tokens");
        $metrics['total_index_entries'] = $this->getScalar("SELECT COUNT(*) FROM index_entries");
        $metrics['average_tokens_per_document'] = $this->getScalar("
            SELECT AVG(entry_count) FROM (
                SELECT COUNT(*) as entry_count
                FROM index_entries
                GROUP BY document_id
            ) as counts
        ");

        // 性能指标
        $metrics['index_size_mb'] = $this->getTableSize('index_entries') / 1024 / 1024;
        $metrics['token_size_mb'] = $this->getTableSize('index_tokens') / 1024 / 1024;

        return $metrics;
    }

    private function getScalar(string $sql): int
    {
        return (int)$this->db->query($sql)->fetchColumn();
    }

    private function getTableSize(string $table): int
    {
        $sql = "SELECT DATA_LENGTH + INDEX_LENGTH
                FROM information_schema.TABLES
                WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = ?";

        $stmt = $this->db->prepare($sql);
        $stmt->execute([$table]);

        return (int)$stmt->fetchColumn();
    }
}

6. 本章总结

6.1 核心收获

架构设计
– 理解反向索引的数据库实现原理
– 掌握分区表的设计和管理策略
– 学会批量操作的优化技巧
– 理解事务处理的重要性

性能优化
– 掌握索引设计的关键原则
– 学会分区表的管理和优化
– 理解批量操作的性能收益
– 掌握监控和诊断方法

工程实践
– 构建可扩展的索引服务架构
– 实现增量更新和故障恢复
– 建立完善的监控体系
– 掌握性能调优的方法论

6.2 下章预告

下一章我们将深入搜索查询优化,学习如何:

  • 设计高效的搜索查询SQL
  • 实现复杂的结果排序算法
  • 优化查询性能和响应时间
  • 构建完整的搜索服务架构

实践作业:设计并实现一个小型索引系统,支持文档的添加、删除和搜索功能。


上一篇第03章:权重系统设计 | 下一篇第05章:搜索查询优化

赞(0)
未经允许不得转载:Toy Tech Blog » 第04章:索引系统架构:高性能数据存储
免费、开放、可编程的智能路由方案,让你的服务随时随地在线。

评论 抢沙发

十年稳如初 — LocVPS,用时间证明实力

10+ 年老牌云主机服务商,全球机房覆盖,性能稳定、价格厚道。

老品牌,更懂稳定的价值你的第一台云服务器,从 LocVPS 开始