第04章：索引系统架构：高性能数据存储

智谱 GLM，支持多语言、多任务推理。从写作到代码生成，从搜索到知识问答，AI 生产力的中国解法。

第04章：索引系统架构：高性能数据存储

两个核心表支撑整个搜索系统，批量操作决定性能

📝 TL;DR (核心要点速览)
– 核心设计：index_tokens + index_entries = 完整反向索引
– 性能关键：批量操作、事务处理、索引优化
– 架构哲学：用数据库的强项替代内存的复杂性
– 扩展方案：分区、分片、读写分离的渐进式演进

1. 索引系统架构概览

1.1 从文档到索引的数据流

原始文档
    ↓ [分词处理]
Token流：(database, datab, data, base, databases)
    ↓ [去重和优化]
唯一Token：(database, datab, data, bases)
    ↓ [权重计算]
带权重Token：(database:10, datab:3, data:1, bases:8)
    ↓ [索引存储]
数据库记录：token_id, token, weight, tokenizer_type
    ↓ [反向映射]
文档映射：document_id ↔ token_ids

1.2 为什么选择数据库而非内存

传统内存索引的问题：

持久性：重启丢失，需要重建
一致性：多进程同步复杂
扩展性：单机内存限制
监控：缺乏成熟的监控工具

数据库索引的优势：

持久化：数据永不丢失
ACID：事务保证一致性
扩展性：支持分库分表
监控：丰富的监控和调试工具
生态：备份、恢复、优化工具成熟

2. 核心表设计详解

2.1 Token表：index_tokens

CREATE TABLE index_tokens (
    id BIGINT PRIMARY KEY AUTO_INCREMENT,
    token VARCHAR(100) NOT NULL,
    tokenizer_type ENUM('word', 'prefix', 'ngrams', 'singular') NOT NULL,
    weight INT NOT NULL DEFAULT 1,
    token_length SMALLINT NOT NULL,
    frequency BIGINT NOT NULL DEFAULT 0,  -- token出现频率
    last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

    -- 关键索引设计
    UNIQUE KEY uk_token_type (token, tokenizer_type),
    KEY idx_token (token),
    KEY idx_type_weight (tokenizer_type, weight),
    KEY idx_frequency (frequency),
    KEY idx_length (token_length),

    -- 复合索引用于查询优化
    KEY idx_search_optimized (tokenizer_type, token, weight),
    KEY idx_type_freq (tokenizer_type, frequency DESC)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
PARTITION BY HASH(tokenizer_type)
PARTITIONS 4;

设计要点分析：

class TokenTableDesign
{
    /**
     * 为什么用BIGINT作为主键？
     * 支持海量token：理论上可支持1840亿个唯一token
     * 分区友好：每个分区独立增长
     */
    public function explainIdChoice(): string
    {
        return "INT(21亿)可能不够，BIGINT(1840亿)更安全";
    }

    /**
     * 为什么VARCHAR(100)？
     * 覆盖99%的英文单词（最长单词：pneumonoultramicroscopicsilicovolcanoconiosis, 45字符）
     * 支持复合词和短语搜索
     * 索引效率平衡（太长影响索引性能）
     */
    public function explainTokenLength(): string
    {
        return "平衡存储效率和查询性能";
    }

    /**
     * 为什么存储frequency？
     * IDF计算：inverse document frequency
     * 查询优化：高频token可以优先处理
     * 缓存策略：热token缓存
     */
    public function explainFrequencyField(): string
    {
        return "支持高级搜索算法和性能优化";
    }
}

2.2 索引条目表：index_entries

CREATE TABLE index_entries (
    id BIGINT PRIMARY KEY AUTO_INCREMENT,
    token_id BIGINT NOT NULL,
    document_id BIGINT NOT NULL,
    field_name VARCHAR(50) NOT NULL,
    field_weight INT NOT NULL DEFAULT 1,
    position_in_field INT NOT NULL,  -- token在字段中的位置
    context_before VARCHAR(50),      -- 上下文信息（可选）
    context_after VARCHAR(50),       -- 上下文信息（可选）
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,

    -- 外键约束保证数据完整性
    FOREIGN KEY (token_id) REFERENCES index_tokens(id) ON DELETE CASCADE,
    FOREIGN KEY (document_id) REFERENCES documents(id) ON DELETE CASCADE,

    -- 核心性能索引
    KEY idx_token_document (token_id, document_id),
    KEY idx_document_field (document_id, field_name),
    KEY idx_position (position_in_field),
    KEY idx_field_weight (field_name, field_weight),

    -- 复合索引用于复杂查询
    KEY idx_search_performance (token_id, field_weight, document_id),
    KEY idx_document_optimized (document_id, field_name, position_in_field),

    -- 分区策略
    UNIQUE KEY uk_unique_entry (token_id, document_id, field_name, position_in_field)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
PARTITION BY HASH(document_id)
PARTITIONS 8;

分区策略详解：

class PartitioningStrategy
{
    /**
     * 为什么按document_id分区？
     * 查询模式：大部分查询按文档聚合结果
     * 删除效率：删除文档时只需清理一个分区
     * 负载均衡：文档ID通常均匀分布
     */
    public function explainDocumentPartitioning(): array
    {
        return [
            'query_optimization' => 'SELECT * FROM index_entries WHERE document_id = ?',
            'delete_efficiency' => '单个分区的DELETE操作',
            'load_balancing' => '文档ID随机分布，避免热点'
        ];
    }

    /**
     * 为什么8个分区？
     * 并发度：支持8个并行操作
     * 存储平衡：每个分区大小适中
     * 维护成本：不会过度增加复杂性
     */
    public function explainPartitionCount(): string
    {
        return "平衡并发性能和管理复杂度";
    }

    /**
     * 动态分区策略
     */
    public function getOptimalPartitionCount(int $estimatedDocuments): int
    {
        if ($estimatedDocuments < 100000) {
            return 2;   // 小数据集，简单分区
        } elseif ($estimatedDocuments < 10000000) {
            return 8;   // 中等数据集，平衡分区
        } else {
            return 16;  // 大数据集，高并发分区
        }
    }
}

2.3 文档表：documents

CREATE TABLE documents (
    id BIGINT PRIMARY KEY AUTO_INCREMENT,
    title VARCHAR(255) NOT NULL,
    content LONGTEXT,
    description TEXT,
    author_id BIGINT,
    category_id INT,
    tags JSON,  -- 存储标签数组
    length INT NOT NULL DEFAULT 0,
    status ENUM('active', 'deleted', 'draft') DEFAULT 'active',
    priority INT DEFAULT 0,  -- 搜索优先级
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,

    -- 搜索相关索引
    KEY idx_status_priority (status, priority DESC),
    KEY idx_length (length),
    KEY idx_created (created_at),
    KEY idx_author_category (author_id, category_id),
    KEY idx_priority_desc (priority DESC),

    -- 全文索引（可选，用于快速预过滤）
    FULLTEXT KEY ft_title_content (title, content)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

3. 索引构建系统实现

3.1 批量索引服务

class IndexingService
{
    private PDO $db;
    private TokenizerFactory $tokenizerFactory;
    private int $batchSize = 1000;
    private int $maxRetries = 3;

    public function __construct(PDO $db, TokenizerFactory $tokenizerFactory)
    {
        $this->db = $db;
        $this->tokenizerFactory = $tokenizerFactory;
        $this->db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    }

    /**
     * 为单个文档建立索引
     */
    public function indexDocument(array $document): bool
    {
        try {
            $this->db->beginTransaction();

            // 1. 清除旧索引
            $this->removeDocumentIndex($document['id']);

            // 2. 生成tokens
            $tokens = $this->generateTokens($document);

            // 3. 批量插入tokens
            $this->batchInsertTokens($tokens);

            // 4. 批量插入索引条目
            $this->batchInsertEntries($document['id'], $tokens);

            // 5. 更新文档长度
            $this->updateDocumentLength($document['id'], $document);

            $this->db->commit();
            return true;

        } catch (Exception $e) {
            $this->db->rollBack();
            $this->logError("Indexing failed for document {$document['id']}", $e);
            return false;
        }
    }

    /**
     * 批量索引多个文档
     */
    public function indexDocuments(array $documents): array
    {
        $results = ['success' => [], 'failed' => []];
        $batches = array_chunk($documents, $this->batchSize);

        foreach ($batches as $batch) {
            try {
                $this->db->beginTransaction();

                // 预处理：生成所有tokens
                $allTokens = [];
                foreach ($batch as $document) {
                    $allTokens[$document['id']] = $this->generateTokens($document);
                }

                // 批量插入所有tokens
                $this->batchInsertAllTokens($allTokens);

                // 批量插入所有索引条目
                $this->batchInsertAllEntries($allTokens);

                // 批量更新文档信息
                $this->batchUpdateDocuments($batch);

                $this->db->commit();
                $results['success'] = array_merge($results['success'], $batch);

            } catch (Exception $e) {
                $this->db->rollBack();
                $this->logError("Batch indexing failed", $e);

                // 降级为单文档处理
                foreach ($batch as $document) {
                    if ($this->indexDocument($document)) {
                        $results['success'][] = $document;
                    } else {
                        $results['failed'][] = $document;
                    }
                }
            }
        }

        return $results;
    }

    /**
     * 生成文档的所有tokens
     */
    private function generateTokens(array $document): array
    {
        $tokens = [];
        $config = [
            'word' => true,
            'prefix' => true,
            'ngrams' => true,
            'singular' => true
        ];

        $tokenizer = $this->tokenizerFactory->createComposite($config);

        // 处理每个字段
        $fields = ['title', 'description', 'content'];
        $fieldWeights = ['title' => 10, 'description' => 5, 'content' => 2];

        foreach ($fields as $field) {
            if (!empty($document[$field])) {
                $fieldTokens = $tokenizer->tokenize($document[$field]);

                foreach ($fieldTokens as $tokenInfo) {
                    $tokens[] = [
                        'token' => $tokenInfo['token'],
                        'tokenizer_type' => $tokenInfo['source'],
                        'weight' => $tokenInfo['weight'],
                        'field' => $field,
                        'field_weight' => $fieldWeights[$field]
                    ];
                }
            }
        }

        return $tokens;
    }

    /**
     * 批量插入tokens
     */
    private function batchInsertTokens(array $tokens): void
    {
        if (empty($tokens)) {
            return;
        }

        $tokenGroups = [];
        foreach ($tokens as $token) {
            $key = $token['token'] . '|' . $token['tokenizer_type'];
            if (!isset($tokenGroups[$key])) {
                $tokenGroups[$key] = [
                    'token' => $token['token'],
                    'tokenizer_type' => $token['tokenizer_type'],
                    'weight' => $token['weight'],
                    'token_length' => strlen($token['token']),
                    'count' => 0
                ];
            }
            $tokenGroups[$key]['count']++;
        }

        $sql = "INSERT INTO index_tokens
                (token, tokenizer_type, weight, token_length, frequency)
                VALUES (:token, :type, :weight, :length, :freq)
                ON DUPLICATE KEY UPDATE
                weight = VALUES(weight),
                frequency = frequency + VALUES(freq)";

        $stmt = $this->db->prepare($sql);

        foreach ($tokenGroups as $token) {
            $stmt->execute([
                ':token' => $token['token'],
                ':type' => $token['tokenizer_type'],
                ':weight' => $token['weight'],
                ':length' => $token['token_length'],
                ':freq' => $token['count']
            ]);
        }
    }

    /**
     * 批量插入索引条目
     */
    private function batchInsertEntries(int $documentId, array $tokens): void
    {
        if (empty($tokens)) {
            return;
        }

        $sql = "INSERT INTO index_entries
                (token_id, document_id, field_name, field_weight, position_in_field)
                VALUES ((SELECT id FROM index_tokens WHERE token = ? AND tokenizer_type = ? LIMIT 1), ?, ?, ?, ?)";

        $stmt = $this->db->prepare($sql);
        $position = 0;

        foreach ($tokens as $token) {
            $stmt->execute([
                $token['token'],
                $token['tokenizer_type'],
                $documentId,
                $token['field'],
                $token['field_weight'],
                $position++
            ]);
        }
    }

    /**
     * 删除文档索引
     */
    public function removeDocumentIndex(int $documentId): bool
    {
        try {
            $sql = "DELETE FROM index_entries WHERE document_id = ?";
            $stmt = $this->db->prepare($sql);
            $stmt->execute([$documentId]);

            // 可选：清理不再被引用的tokens
            $this->cleanupOrphanedTokens();

            return true;
        } catch (Exception $e) {
            $this->logError("Failed to remove index for document {$documentId}", $e);
            return false;
        }
    }

    /**
     * 清理孤立的tokens
     */
    private function cleanupOrphanedTokens(): void
    {
        $sql = "DELETE t FROM index_tokens t
                LEFT JOIN index_entries e ON t.id = e.token_id
                WHERE e.token_id IS NULL";

        $this->db->exec($sql);
    }

    /**
     * 更新文档长度
     */
    private function updateDocumentLength(int $documentId, array $document): void
    {
        $length = strlen($document['title'] ?? '') +
                  strlen($document['description'] ?? '') +
                  strlen($document['content'] ?? '');

        $sql = "UPDATE documents SET length = ? WHERE id = ?";
        $stmt = $this->db->prepare($sql);
        $stmt->execute([$length, $documentId]);
    }
}

3.2 增量索引更新

class IncrementalIndexer
{
    private IndexingService $indexingService;
    private int $checkpointInterval = 1000;

    public function __construct(IndexingService $indexingService)
    {
        $this->indexingService = $indexingService;
    }

    /**
     * 增量更新索引
     */
    public function updateIndexIncremental(DateTime $since, DateTime $until): array
    {
        $results = ['updated' => 0, 'failed' => 0, 'deleted' => 0];

        // 获取需要更新的文档
        $updatedDocuments = $this->getUpdatedDocuments($since, $until);
        $deletedDocuments = $this->getDeletedDocuments($since, $until);

        // 处理更新的文档
        foreach ($updatedDocuments as $document) {
            if ($this->indexingService->indexDocument($document)) {
                $results['updated']++;
            } else {
                $results['failed']++;
            }

            // 检查点：每1000个文档提交一次
            if (($results['updated'] + $results['failed']) % $this->checkpointInterval === 0) {
                $this->checkpoint();
            }
        }

        // 处理删除的文档
        foreach ($deletedDocuments as $documentId) {
            if ($this->indexingService->removeDocumentIndex($documentId)) {
                $results['deleted']++;
            }
        }

        $this->checkpoint();  // 最终检查点
        return $results;
    }

    private function getUpdatedDocuments(DateTime $since, DateTime $until): array
    {
        $sql = "SELECT * FROM documents
                WHERE updated_at BETWEEN ? AND ?
                AND status = 'active'
                ORDER BY updated_at ASC";

        $stmt = $this->indexingService->db->prepare($sql);
        $stmt->execute([
            $since->format('Y-m-d H:i:s'),
            $until->format('Y-m-d H:i:s')
        ]);

        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }

    private function getDeletedDocuments(DateTime $since, DateTime $until): array
    {
        $sql = "SELECT id FROM documents
                WHERE updated_at BETWEEN ? AND ?
                AND status = 'deleted'";

        $stmt = $this->indexingService->db->prepare($sql);
        $stmt->execute([
            $since->format('Y-m-d H:i:s'),
            $until->format('Y-m-d H:i:s')
        ]);

        return $stmt->fetchAll(PDO::FETCH_COLUMN);
    }

    private function checkpoint(): void
    {
        // 记录检查点
        $sql = "INSERT INTO index_checkpoints (checkpoint_time) VALUES (NOW())";
        $this->indexingService->db->exec($sql);
    }
}

4. 性能优化策略

4.1 查询优化索引

-- 核心搜索查询的优化索引
CREATE INDEX idx_optimized_search ON index_entries(
    token_id,
    field_weight DESC,
    document_id
) INCLUDE (field_name, position_in_field);

-- 支持字段过滤的索引
CREATE INDEX idx_field_filtered_search ON index_entries(
    field_name,
    token_id,
    document_id
) INCLUDE (field_weight);

-- 支持范围查询的索引
CREATE INDEX idx_weight_range_search ON index_entries(
    field_weight,
    token_id
);

-- 覆盖索引避免回表
CREATE INDEX idx_covering_search ON index_entries(
    token_id,
    document_id,
    field_name,
    field_weight,
    position_in_field
);

4.2 批量操作优化

class BatchOperationOptimizer
{
    private PDO $db;
    private int $optimalBatchSize = 500;  // 基于测试确定的最佳批量大小

    public function __construct(PDO $db)
    {
        $this->db = $db;
    }

    /**
     * 优化的批量插入
     */
    public function optimizedBatchInsert(string $table, array $data): int
    {
        if (empty($data)) {
            return 0;
        }

        $batches = array_chunk($data, $this->optimalBatchSize);
        $totalInserted = 0;

        foreach ($batches as $batch) {
            $inserted = $this->insertBatch($table, $batch);
            $totalInserted += $inserted;

            // 内存管理：清理大变量
            if (memory_get_usage() > 512 * 1024 * 1024) {  // 512MB
                gc_collect_cycles();
            }
        }

        return $totalInserted;
    }

    private function insertBatch(string $table, array $batch): int
    {
        $columns = array_keys($batch[0]);
        $placeholders = implode(',', array_fill(0, count($columns), '?'));
        $columnList = implode(',', $columns);

        $sql = "INSERT INTO {$table} ({$columnList}) VALUES ({$placeholders})";

        try {
            $this->db->beginTransaction();

            // 准备语句
            $stmt = $this->db->prepare($sql);

            // 批量执行
            foreach ($batch as $row) {
                $stmt->execute(array_values($row));
            }

            $inserted = $stmt->rowCount();
            $this->db->commit();

            return $inserted;

        } catch (Exception $e) {
            $this->db->rollBack();
            throw $e;
        }
    }

    /**
     * LOAD DATA INFILE 优化（适用于大数据量）
     */
    public function loadFileInfile(string $table, string $filePath, array $columns): int
    {
        $columnList = implode(',', $columns);

        $sql = "LOAD DATA LOCAL INFILE ?
                INTO TABLE {$table}
                FIELDS TERMINATED BY ','
                LINES TERMINATED BY '\n'
                ({$columnList})";

        $stmt = $this->db->prepare($sql);
        $stmt->execute([$filePath]);

        return $stmt->rowCount();
    }

    /**
     * 自适应批量大小
     */
    public function getAdaptiveBatchSize(int $recordSize, int $availableMemory): int
    {
        // 估算每个记录的内存占用（字节）
        $estimatedRecordSize = $recordSize * 4;  // 字符串的粗略估算

        // 目标：使用可用内存的70%
        $targetMemoryUsage = $availableMemory * 0.7;

        // 计算最佳批量大小
        $optimalSize = (int)($targetMemoryUsage / $estimatedRecordSize);

        // 限制在合理范围内
        return max(100, min(2000, $optimalSize));
    }
}

4.3 分区表管理

class PartitionManager
{
    private PDO $db;

    public function __construct(PDO $db)
    {
        $this->db = $db;
    }

    /**
     * 动态增加分区
     */
    public function addPartition(string $table, string $partitionName, string $value): bool
    {
        $sql = "ALTER TABLE {$table}
                ADD PARTITION (
                    PARTITION {$partitionName} VALUES IN ({$value})
                )";

        try {
            $this->db->exec($sql);
            return true;
        } catch (Exception $e) {
            $this->logError("Failed to add partition {$partitionName}", $e);
            return false;
        }
    }

    /**
     * 分区维护和优化
     */
    public function maintainPartitions(string $table): array
    {
        $results = ['optimized' => [], 'analyzed' => []];

        // 获取所有分区
        $partitions = $this->getTablePartitions($table);

        foreach ($partitions as $partition) {
            $partitionName = $partition['PARTITION_NAME'];

            // 优化分区
            try {
                $sql = "OPTIMIZE TABLE {$table} PARTITION {$partitionName}";
                $this->db->exec($sql);
                $results['optimized'][] = $partitionName;
            } catch (Exception $e) {
                $this->logError("Failed to optimize partition {$partitionName}", $e);
            }

            // 分析分区统计信息
            try {
                $sql = "ANALYZE TABLE {$table} PARTITION {$partitionName}";
                $this->db->exec($sql);
                $results['analyzed'][] = $partitionName;
            } catch (Exception $e) {
                $this->logError("Failed to analyze partition {$partitionName}", $e);
            }
        }

        return $results;
    }

    private function getTablePartitions(string $table): array
    {
        $sql = "SELECT PARTITION_NAME, TABLE_ROWS, DATA_LENGTH
                FROM information_schema.PARTITIONS
                WHERE TABLE_NAME = ? AND PARTITION_NAME IS NOT NULL";

        $stmt = $this->db->prepare($sql);
        $stmt->execute([$table]);

        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }

    /**
     * 分区重平衡
     */
    public function rebalancePartitions(string $table): bool
    {
        try {
            // 临时禁用分区
            $this->db->exec("ALTER TABLE {$table} REMOVE PARTITIONING");

            // 重新添加优化的分区配置
            $this->addOptimizedPartitioning($table);

            return true;
        } catch (Exception $e) {
            $this->logError("Failed to rebalance partitions for {$table}", $e);
            return false;
        }
    }

    private function addOptimizedPartitioning(string $table): void
    {
        $sql = "ALTER TABLE {$table}
                PARTITION BY HASH(document_id)
                PARTITIONS 16";

        $this->db->exec($sql);
    }
}

5. 监控和诊断

5.1 索引健康检查

class IndexHealthMonitor
{
    private PDO $db;
    private array $thresholds = [
        'max_partition_size' => 10 * 1024 * 1024 * 1024,  // 10GB
        'max_index_fragmentation' => 30,  // 30%
        'min_query_performance' => 100,  // ms
    ];

    public function __construct(PDO $db)
    {
        $this->db = $db;
    }

    /**
     * 全面的健康检查
     */
    public function performHealthCheck(): array
    {
        $health = [
            'overall_status' => 'healthy',
            'issues' => [],
            'recommendations' => [],
            'metrics' => []
        ];

        // 检查分区大小
        $partitionIssues = $this->checkPartitionSizes();
        if (!empty($partitionIssues)) {
            $health['issues'] = array_merge($health['issues'], $partitionIssues);
            $health['overall_status'] = 'warning';
        }

        // 检查索引碎片化
        $fragmentationIssues = $this->checkIndexFragmentation();
        if (!empty($fragmentationIssues)) {
            $health['issues'] = array_merge($health['issues'], $fragmentationIssues);
        }

        // 检查查询性能
        $performanceIssues = $this->checkQueryPerformance();
        if (!empty($performanceIssues)) {
            $health['issues'] = array_merge($health['issues'], $performanceIssues);
            if ($health['overall_status'] === 'healthy') {
                $health['overall_status'] = 'warning';
            }
        }

        // 生成建议
        $health['recommendations'] = $this->generateRecommendations($health['issues']);

        // 收集指标
        $health['metrics'] = $this->collectMetrics();

        return $health;
    }

    private function checkPartitionSizes(): array
    {
        $issues = [];
        $sql = "SELECT PARTITION_NAME, DATA_LENGTH, TABLE_ROWS
                FROM information_schema.PARTITIONS
                WHERE TABLE_NAME = 'index_entries'
                AND PARTITION_NAME IS NOT NULL";

        $stmt = $this->db->query($sql);
        $partitions = $stmt->fetchAll(PDO::FETCH_ASSOC);

        foreach ($partitions as $partition) {
            if ($partition['DATA_LENGTH'] > $this->thresholds['max_partition_size']) {
                $issues[] = [
                    'type' => 'partition_size',
                    'partition' => $partition['PARTITION_NAME'],
                    'size' => $partition['DATA_LENGTH'],
                    'severity' => 'high'
                ];
            }
        }

        return $issues;
    }

    private function checkIndexFragmentation(): array
    {
        $issues = [];

        // 检查索引使用情况
        $sql = "SELECT TABLE_NAME, INDEX_NAME, CARDINALITY
                FROM information_schema.STATISTICS
                WHERE TABLE_SCHEMA = DATABASE()
                AND TABLE_NAME IN ('index_tokens', 'index_entries')";

        $stmt = $this->db->query($sql);
        $indexes = $stmt->fetchAll(PDO::FETCH_ASSOC);

        foreach ($indexes as $index) {
            $fragmentation = $this->calculateFragmentation($index);
            if ($fragmentation > $this->thresholds['max_index_fragmentation']) {
                $issues[] = [
                    'type' => 'index_fragmentation',
                    'table' => $index['TABLE_NAME'],
                    'index' => $index['INDEX_NAME'],
                    'fragmentation' => $fragmentation,
                    'severity' => 'medium'
                ];
            }
        }

        return $issues;
    }

    private function calculateFragmentation(array $index): float
    {
        // 简化的碎片化计算
        // 实际实现应该使用更精确的算法
        return rand(10, 50);  // 占位符
    }

    private function checkQueryPerformance(): array
    {
        $issues = [];

        // 测试关键查询的性能
        $testQueries = [
            'simple_search' => "
                SELECT COUNT(*) FROM index_entries ie
                JOIN index_tokens it ON ie.token_id = it.id
                WHERE it.token = 'database'
            ",
            'complex_search' => "
                SELECT d.id, COUNT(*) as match_count
                FROM documents d
                JOIN index_entries ie ON d.id = ie.document_id
                JOIN index_tokens it ON ie.token_id = it.id
                WHERE it.token IN ('database', 'data')
                GROUP BY d.id
                ORDER BY match_count DESC
                LIMIT 10
            "
        ];

        foreach ($testQueries as $queryName => $sql) {
            $startTime = microtime(true);
            $this->db->query($sql);
            $executionTime = (microtime(true) - $startTime) * 1000;  // ms

            if ($executionTime > $this->thresholds['min_query_performance']) {
                $issues[] = [
                    'type' => 'slow_query',
                    'query' => $queryName,
                    'execution_time' => $executionTime,
                    'severity' => 'high'
                ];
            }
        }

        return $issues;
    }

    private function generateRecommendations(array $issues): array
    {
        $recommendations = [];

        foreach ($issues as $issue) {
            switch ($issue['type']) {
                case 'partition_size':
                    $recommendations[] = "Consider rebalancing partition {$issue['partition']} or splitting into smaller partitions";
                    break;

                case 'index_fragmentation':
                    $recommendations[] = "Rebuild index {$issue['index']} on table {$issue['table']}";
                    break;

                case 'slow_query':
                    $recommendations[] = "Optimize query {$issue['query']} - current execution time: {$issue['execution_time']}ms";
                    break;
            }
        }

        return array_unique($recommendations);
    }

    private function collectMetrics(): array
    {
        $metrics = [];

        // 基础指标
        $metrics['total_documents'] = $this->getScalar("SELECT COUNT(*) FROM documents WHERE status = 'active'");
        $metrics['total_tokens'] = $this->getScalar("SELECT COUNT(*) FROM index_tokens");
        $metrics['total_index_entries'] = $this->getScalar("SELECT COUNT(*) FROM index_entries");
        $metrics['average_tokens_per_document'] = $this->getScalar("
            SELECT AVG(entry_count) FROM (
                SELECT COUNT(*) as entry_count
                FROM index_entries
                GROUP BY document_id
            ) as counts
        ");

        // 性能指标
        $metrics['index_size_mb'] = $this->getTableSize('index_entries') / 1024 / 1024;
        $metrics['token_size_mb'] = $this->getTableSize('index_tokens') / 1024 / 1024;

        return $metrics;
    }

    private function getScalar(string $sql): int
    {
        return (int)$this->db->query($sql)->fetchColumn();
    }

    private function getTableSize(string $table): int
    {
        $sql = "SELECT DATA_LENGTH + INDEX_LENGTH
                FROM information_schema.TABLES
                WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = ?";

        $stmt = $this->db->prepare($sql);
        $stmt->execute([$table]);

        return (int)$stmt->fetchColumn();
    }
}

6. 本章总结

6.1 核心收获

架构设计：
– 理解反向索引的数据库实现原理
– 掌握分区表的设计和管理策略
– 学会批量操作的优化技巧
– 理解事务处理的重要性

性能优化：
– 掌握索引设计的关键原则
– 学会分区表的管理和优化
– 理解批量操作的性能收益
– 掌握监控和诊断方法

工程实践：
– 构建可扩展的索引服务架构
– 实现增量更新和故障恢复
– 建立完善的监控体系
– 掌握性能调优的方法论

6.2 下章预告

下一章我们将深入搜索查询优化，学习如何：

设计高效的搜索查询SQL
实现复杂的结果排序算法
优化查询性能和响应时间
构建完整的搜索服务架构

实践作业：设计并实现一个小型索引系统，支持文档的添加、删除和搜索功能。

上一篇 → 第03章：权重系统设计 | 下一篇 → 第05章：搜索查询优化

第04章：索引系统架构：高性能数据存储

1. 索引系统架构概览

1.1 从文档到索引的数据流

1.2 为什么选择数据库而非内存

2. 核心表设计详解

2.1 Token表：index_tokens

2.2 索引条目表：index_entries

2.3 文档表：documents

3. 索引构建系统实现

3.1 批量索引服务

3.2 增量索引更新

4. 性能优化策略

4.1 查询优化索引

4.2 批量操作优化

4.3 分区表管理

5. 监控和诊断

5.1 索引健康检查

6. 本章总结

6.1 核心收获

6.2 下章预告

相关推荐

评论 抢沙发

作者介绍

Toy

文章目录

置顶推荐

前沿哨所

遭 AI 报告轰炸，cURL 宣布取消 Bug 赏金计划

Retain：跨平台AI对话知识库，自动提取偏好统一上下文

MMORPG《BitCraft》宣布开源服务器端代码

开源项目 Grov：为 AI 编程代理打造多人协作共享记忆

Spotify秘密胜诉，法庭下令查封影子图书馆Anna's Archive域名

AdGuard开源VPN协议TrustTunnel：抗审查且高性能

最新文章

热门专题

热门标签

网站统计

最新评论

十年稳如初 — LocVPS，用时间证明实力

10+ 年老牌云主机服务商，全球机房覆盖，性能稳定、价格厚道。

评论抢沙发