Chapter 04: Index System Architecture: High-Performance Data Storage
Two core tables power the entire search system; batch operations determine performance
📝 TL;DR (Key Takeaways)
– Core design: index_tokens + index_entries = a complete inverted index
– Performance keys: batch operations, transactions, index optimization
– Architecture philosophy: lean on the database's strengths instead of in-memory complexity
– Scaling path: progressive evolution through partitioning, sharding, and read/write splitting
1. Index System Architecture Overview
1.1 Data Flow from Document to Index
Raw document
↓ [tokenization]
Token stream: (database, datab, data, base, databases)
↓ [deduplication and optimization]
Unique tokens: (database, datab, data, base) — "databases" is reduced to "database" by the singular tokenizer
↓ [weight calculation]
Weighted tokens: (database:10, datab:3, data:1, base:8)
↓ [index storage]
Database rows: token_id, token, weight, tokenizer_type
↓ [inverted mapping]
Document mapping: document_id ↔ token_ids
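To make the flow concrete, here is a minimal sketch (in PHP, like the rest of the chapter) of what a single token from the example above might look like once it lands in the two tables defined later in sections 2.1 and 2.2; every ID and value is a hypothetical example:
// Hypothetical rows produced by the pipeline for the token "database"
// found at the start of a document's title.
$indexTokenRow = [
    'id'             => 42,          // assumed auto-increment ID
    'token'          => 'database',
    'tokenizer_type' => 'word',
    'weight'         => 10,
    'token_length'   => 8,
    'frequency'      => 1,
];

$indexEntryRow = [
    'token_id'          => 42,       // points back to $indexTokenRow['id']
    'document_id'       => 1001,     // assumed document ID
    'field_name'        => 'title',
    'field_weight'      => 10,       // title field weight
    'position_in_field' => 0,        // first token of the title
];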
1.2 Why a Database Instead of Memory
Problems with a traditional in-memory index:
Durability: lost on restart and must be rebuilt
Consistency: multi-process synchronization is complex
Scalability: limited by single-machine memory
Monitoring: few mature monitoring tools
Advantages of a database-backed index:
Durability: data is persisted and survives restarts
ACID: transactions guarantee consistency
Scalability: supports sharding across databases and tables
Monitoring: rich monitoring and debugging tools
Ecosystem: mature backup, recovery, and tuning tools
2. Core Table Design in Detail
2.1 The Token Table: index_tokens
CREATE TABLE index_tokens (
    id BIGINT NOT NULL AUTO_INCREMENT,
    token VARCHAR(100) NOT NULL,
    tokenizer_type ENUM('word', 'prefix', 'ngrams', 'singular') NOT NULL,
    weight INT NOT NULL DEFAULT 1,
    token_length SMALLINT NOT NULL,
    frequency BIGINT NOT NULL DEFAULT 0,   -- how often the token occurs
    last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- With partitioning, every unique key (including the primary key) must
    -- contain the partitioning column, hence the composite primary key.
    PRIMARY KEY (id, tokenizer_type),
    -- Key index design
    UNIQUE KEY uk_token_type (token, tokenizer_type),
    KEY idx_token (token),
    KEY idx_type_weight (tokenizer_type, weight),
    KEY idx_frequency (frequency),
    KEY idx_length (token_length),
    -- Composite indexes for query optimization
    KEY idx_search_optimized (tokenizer_type, token, weight),
    KEY idx_type_freq (tokenizer_type, frequency DESC)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
-- KEY partitioning rather than HASH, because the partitioning column is an ENUM
PARTITION BY KEY(tokenizer_type)
PARTITIONS 4;
Design highlights:
class TokenTableDesign
{
    /**
     * Why BIGINT for the primary key?
     * Room for massive token sets: a signed BIGINT tops out around 9.2 x 10^18
     * Partition friendly: each partition can grow independently
     */
    public function explainIdChoice(): string
    {
        return "INT (~2.1 billion) may not be enough; BIGINT (~9.2 x 10^18) is safer";
    }

    /**
     * Why VARCHAR(100)?
     * Covers over 99% of English words (longest: pneumonoultramicroscopicsilicovolcanoconiosis, 45 characters)
     * Supports compound words and phrase search
     * Balances index efficiency (very long keys hurt index performance)
     */
    public function explainTokenLength(): string
    {
        return "Balance storage efficiency against query performance";
    }

    /**
     * Why store frequency?
     * IDF calculation: inverse document frequency
     * Query optimization: high-frequency tokens can be processed first
     * Caching strategy: cache hot tokens
     */
    public function explainFrequencyField(): string
    {
        return "Enables advanced ranking algorithms and performance optimization";
    }
}
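As a quick illustration of what the frequency column enables, here is a hedged sketch of an IDF-style score. The table and column names follow the schema above, but the formula — and using frequency as a stand-in for document frequency — is an assumption for illustration, not the chapter's final ranking method:
// Sketch: a log-scaled IDF using index_tokens.frequency as a rough proxy
// for document frequency. Rarer tokens get higher scores.
function tokenIdf(PDO $db, string $token): float
{
    $totalDocs = (int)$db->query(
        "SELECT COUNT(*) FROM documents WHERE status = 'active'"
    )->fetchColumn();

    $stmt = $db->prepare(
        "SELECT frequency FROM index_tokens WHERE token = ? AND tokenizer_type = 'word'"
    );
    $stmt->execute([$token]);
    $frequency = (int)$stmt->fetchColumn();

    // The +1 terms guard against unseen tokens and an empty corpus.
    return log(($totalDocs + 1) / ($frequency + 1));
}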
2.2 The Index Entry Table: index_entries
CREATE TABLE index_entries (
    id BIGINT NOT NULL AUTO_INCREMENT,
    token_id BIGINT NOT NULL,
    document_id BIGINT NOT NULL,
    field_name VARCHAR(50) NOT NULL,
    field_weight INT NOT NULL DEFAULT 1,
    position_in_field INT NOT NULL,   -- position of the token within the field
    context_before VARCHAR(50),       -- optional context
    context_after VARCHAR(50),        -- optional context
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    -- The partitioning column must appear in every unique key, hence the composite primary key.
    PRIMARY KEY (id, document_id),
    -- Note: partitioned InnoDB tables do not support foreign keys, so referential
    -- integrity to index_tokens(id) and documents(id) is enforced at the application layer.
    -- Core performance indexes
    KEY idx_token_document (token_id, document_id),
    KEY idx_document_field (document_id, field_name),
    KEY idx_position (position_in_field),
    KEY idx_field_weight (field_name, field_weight),
    -- Composite indexes for complex queries
    KEY idx_search_performance (token_id, field_weight, document_id),
    KEY idx_document_optimized (document_id, field_name, position_in_field),
    -- Deduplication constraint (also a partitioning-friendly unique key)
    UNIQUE KEY uk_unique_entry (token_id, document_id, field_name, position_in_field)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
PARTITION BY HASH(document_id)
PARTITIONS 8;
Partitioning strategy in detail:
class PartitioningStrategy
{
    /**
     * Why partition by document_id?
     * Query pattern: most queries aggregate results per document
     * Delete efficiency: removing a document only touches one partition
     * Load balancing: document IDs are usually evenly distributed
     */
    public function explainDocumentPartitioning(): array
    {
        return [
            'query_optimization' => 'SELECT * FROM index_entries WHERE document_id = ?',
            'delete_efficiency' => 'DELETE hits a single partition',
            'load_balancing' => 'document IDs spread evenly, avoiding hot spots'
        ];
    }

    /**
     * Why 8 partitions?
     * Concurrency: supports 8 parallel operations
     * Storage balance: each partition stays a manageable size
     * Maintenance cost: does not add excessive complexity
     */
    public function explainPartitionCount(): string
    {
        return "Balance concurrency against management complexity";
    }

    /**
     * Dynamic partitioning strategy
     */
    public function getOptimalPartitionCount(int $estimatedDocuments): int
    {
        if ($estimatedDocuments < 100000) {
            return 2;  // small data set: keep partitioning simple
        } elseif ($estimatedDocuments < 10000000) {
            return 8;  // medium data set: balanced partitioning
        } else {
            return 16; // large data set: high-concurrency partitioning
        }
    }
}
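To verify that a per-document query really prunes down to a single partition, a quick check with EXPLAIN can help. This is a hedged sketch: $db is assumed to be the PDO connection used throughout the chapter, and the document ID is hypothetical. MySQL 5.7+ reports pruned partitions in EXPLAIN's partitions column.
// Sketch: confirm partition pruning for a document-scoped query.
$documentId = 12345; // hypothetical document ID

$stmt = $db->prepare("EXPLAIN SELECT * FROM index_entries WHERE document_id = ?");
$stmt->execute([$documentId]);
$plan = $stmt->fetch(PDO::FETCH_ASSOC);

// Expect a single partition such as "p5" here rather than the full p0..p7 list.
echo "Partitions scanned: " . ($plan['partitions'] ?? 'n/a') . PHP_EOL;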
2.3 The Document Table: documents
CREATE TABLE documents (
    id BIGINT PRIMARY KEY AUTO_INCREMENT,
    title VARCHAR(255) NOT NULL,
    content LONGTEXT,
    description TEXT,
    author_id BIGINT,
    category_id INT,
    tags JSON,                       -- array of tags
    length INT NOT NULL DEFAULT 0,
    status ENUM('active', 'deleted', 'draft') DEFAULT 'active',
    priority INT DEFAULT 0,          -- search priority
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    -- Search-related indexes
    KEY idx_status_priority (status, priority DESC),
    KEY idx_length (length),
    KEY idx_created (created_at),
    KEY idx_author_category (author_id, category_id),
    KEY idx_priority_desc (priority DESC),
    -- Optional full-text index for fast pre-filtering
    FULLTEXT KEY ft_title_content (title, content)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
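One way to use that optional full-text index, sketched here as a hedged example, is as a coarse pre-filter that narrows the candidate set before joining against index_entries. The search term is hypothetical and $db is again the chapter's PDO connection:
// Sketch: use MySQL's built-in full-text search as a cheap pre-filter,
// then let the inverted index tables do the precise ranking.
$stmt = $db->prepare(
    "SELECT id
     FROM documents
     WHERE status = 'active'
       AND MATCH(title, content) AGAINST (? IN NATURAL LANGUAGE MODE)
     LIMIT 500"
);
$stmt->execute(['database indexing']); // hypothetical query string
$candidateIds = $stmt->fetchAll(PDO::FETCH_COLUMN);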
3. Index Construction Implementation
3.1 Batch Indexing Service
class IndexingService
{
    private PDO $db;
    private TokenizerFactory $tokenizerFactory;
    private int $batchSize = 1000;
    private int $maxRetries = 3;

    public function __construct(PDO $db, TokenizerFactory $tokenizerFactory)
    {
        $this->db = $db;
        $this->tokenizerFactory = $tokenizerFactory;
        $this->db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    }

    /**
     * Index a single document
     */
    public function indexDocument(array $document): bool
    {
        try {
            $this->db->beginTransaction();
            // 1. Remove any existing index entries for this document
            $this->removeDocumentIndex($document['id']);
            // 2. Generate tokens
            $tokens = $this->generateTokens($document);
            // 3. Bulk-insert tokens
            $this->batchInsertTokens($tokens);
            // 4. Bulk-insert index entries
            $this->batchInsertEntries($document['id'], $tokens);
            // 5. Update the document length
            $this->updateDocumentLength($document['id'], $document);
            $this->db->commit();
            return true;
        } catch (Exception $e) {
            $this->db->rollBack();
            $this->logError("Indexing failed for document {$document['id']}", $e);
            return false;
        }
    }
    /**
     * Index multiple documents in batches
     */
    public function indexDocuments(array $documents): array
    {
        $results = ['success' => [], 'failed' => []];
        $batches = array_chunk($documents, $this->batchSize);

        foreach ($batches as $batch) {
            try {
                $this->db->beginTransaction();
                // Pre-process: generate tokens for every document in the batch
                $allTokens = [];
                foreach ($batch as $document) {
                    $allTokens[$document['id']] = $this->generateTokens($document);
                }
                // Bulk-insert all tokens (the *All* methods are batch-wide
                // variants of the single-document helpers below; omitted for brevity)
                $this->batchInsertAllTokens($allTokens);
                // Bulk-insert all index entries
                $this->batchInsertAllEntries($allTokens);
                // Bulk-update document metadata
                $this->batchUpdateDocuments($batch);
                $this->db->commit();
                $results['success'] = array_merge($results['success'], $batch);
            } catch (Exception $e) {
                $this->db->rollBack();
                $this->logError("Batch indexing failed", $e);
                // Fall back to per-document indexing
                foreach ($batch as $document) {
                    if ($this->indexDocument($document)) {
                        $results['success'][] = $document;
                    } else {
                        $results['failed'][] = $document;
                    }
                }
            }
        }
        return $results;
    }
    /**
     * Generate all tokens for a document
     */
    private function generateTokens(array $document): array
    {
        $tokens = [];
        $config = [
            'word' => true,
            'prefix' => true,
            'ngrams' => true,
            'singular' => true
        ];
        $tokenizer = $this->tokenizerFactory->createComposite($config);

        // Process each field
        $fields = ['title', 'description', 'content'];
        $fieldWeights = ['title' => 10, 'description' => 5, 'content' => 2];
        foreach ($fields as $field) {
            if (!empty($document[$field])) {
                $fieldTokens = $tokenizer->tokenize($document[$field]);
                foreach ($fieldTokens as $tokenInfo) {
                    $tokens[] = [
                        'token' => $tokenInfo['token'],
                        'tokenizer_type' => $tokenInfo['source'],
                        'weight' => $tokenInfo['weight'],
                        'field' => $field,
                        'field_weight' => $fieldWeights[$field]
                    ];
                }
            }
        }
        return $tokens;
    }
    /**
     * Bulk-insert tokens
     */
    private function batchInsertTokens(array $tokens): void
    {
        if (empty($tokens)) {
            return;
        }
        $tokenGroups = [];
        foreach ($tokens as $token) {
            $key = $token['token'] . '|' . $token['tokenizer_type'];
            if (!isset($tokenGroups[$key])) {
                $tokenGroups[$key] = [
                    'token' => $token['token'],
                    'tokenizer_type' => $token['tokenizer_type'],
                    'weight' => $token['weight'],
                    'token_length' => mb_strlen($token['token']),
                    'count' => 0
                ];
            }
            $tokenGroups[$key]['count']++;
        }

        $sql = "INSERT INTO index_tokens
                (token, tokenizer_type, weight, token_length, frequency)
                VALUES (:token, :type, :weight, :length, :freq)
                ON DUPLICATE KEY UPDATE
                    weight = VALUES(weight),
                    frequency = frequency + VALUES(frequency)";
        $stmt = $this->db->prepare($sql);
        foreach ($tokenGroups as $token) {
            $stmt->execute([
                ':token' => $token['token'],
                ':type' => $token['tokenizer_type'],
                ':weight' => $token['weight'],
                ':length' => $token['token_length'],
                ':freq' => $token['count']
            ]);
        }
    }
    /**
     * Bulk-insert index entries
     */
    private function batchInsertEntries(int $documentId, array $tokens): void
    {
        if (empty($tokens)) {
            return;
        }
        // The scalar subquery resolves token_id without an extra round trip per token.
        $sql = "INSERT INTO index_entries
                (token_id, document_id, field_name, field_weight, position_in_field)
                VALUES ((SELECT id FROM index_tokens WHERE token = ? AND tokenizer_type = ? LIMIT 1), ?, ?, ?, ?)";
        $stmt = $this->db->prepare($sql);

        // Track positions per field so position_in_field restarts at 0 for each field.
        $positions = [];
        foreach ($tokens as $token) {
            $field = $token['field'];
            $positions[$field] = $positions[$field] ?? 0;
            $stmt->execute([
                $token['token'],
                $token['tokenizer_type'],
                $documentId,
                $field,
                $token['field_weight'],
                $positions[$field]++
            ]);
        }
    }
    /**
     * Remove a document's index entries
     */
    public function removeDocumentIndex(int $documentId): bool
    {
        try {
            $sql = "DELETE FROM index_entries WHERE document_id = ?";
            $stmt = $this->db->prepare($sql);
            $stmt->execute([$documentId]);
            // Optional: clean up tokens that are no longer referenced
            $this->cleanupOrphanedTokens();
            return true;
        } catch (Exception $e) {
            $this->logError("Failed to remove index for document {$documentId}", $e);
            return false;
        }
    }

    /**
     * Clean up orphaned tokens
     */
    private function cleanupOrphanedTokens(): void
    {
        $sql = "DELETE t FROM index_tokens t
                LEFT JOIN index_entries e ON t.id = e.token_id
                WHERE e.token_id IS NULL";
        $this->db->exec($sql);
    }

    /**
     * Update the document length
     */
    private function updateDocumentLength(int $documentId, array $document): void
    {
        $length = mb_strlen($document['title'] ?? '') +
                  mb_strlen($document['description'] ?? '') +
                  mb_strlen($document['content'] ?? '');
        $sql = "UPDATE documents SET length = ? WHERE id = ?";
        $stmt = $this->db->prepare($sql);
        $stmt->execute([$length, $documentId]);
    }

    /**
     * Minimal error logger; swap in a real PSR-3 logger in production.
     */
    private function logError(string $message, Exception $e): void
    {
        error_log($message . ': ' . $e->getMessage());
    }
}
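A short, hedged wiring example may help show how the service is intended to be driven. The DSN, credentials, and document data below are placeholders, and TokenizerFactory is assumed to come from the earlier tokenizer chapters:
// Hypothetical bootstrap for the indexing service.
$db = new PDO('mysql:host=localhost;dbname=search;charset=utf8mb4', 'user', 'secret');
$indexer = new IndexingService($db, new TokenizerFactory());

$result = $indexer->indexDocuments([
    ['id' => 1, 'title' => 'Database indexing basics', 'description' => '...', 'content' => '...'],
    ['id' => 2, 'title' => 'Partitioning strategies',  'description' => '...', 'content' => '...'],
]);

printf("indexed: %d, failed: %d\n", count($result['success']), count($result['failed']));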
3.2 Incremental Index Updates
class IncrementalIndexer
{
    private IndexingService $indexingService;
    private PDO $db;
    private int $checkpointInterval = 1000;

    public function __construct(IndexingService $indexingService, PDO $db)
    {
        $this->indexingService = $indexingService;
        // A connection is injected directly because IndexingService keeps its PDO handle private.
        $this->db = $db;
    }

    /**
     * Update the index incrementally
     */
    public function updateIndexIncremental(DateTime $since, DateTime $until): array
    {
        $results = ['updated' => 0, 'failed' => 0, 'deleted' => 0];
        // Fetch the documents that need work
        $updatedDocuments = $this->getUpdatedDocuments($since, $until);
        $deletedDocuments = $this->getDeletedDocuments($since, $until);

        // Handle updated documents
        foreach ($updatedDocuments as $document) {
            if ($this->indexingService->indexDocument($document)) {
                $results['updated']++;
            } else {
                $results['failed']++;
            }
            // Checkpoint every 1000 documents
            if (($results['updated'] + $results['failed']) % $this->checkpointInterval === 0) {
                $this->checkpoint();
            }
        }
        // Handle deleted documents
        foreach ($deletedDocuments as $documentId) {
            if ($this->indexingService->removeDocumentIndex($documentId)) {
                $results['deleted']++;
            }
        }
        $this->checkpoint(); // final checkpoint
        return $results;
    }
    private function getUpdatedDocuments(DateTime $since, DateTime $until): array
    {
        $sql = "SELECT * FROM documents
                WHERE updated_at BETWEEN ? AND ?
                AND status = 'active'
                ORDER BY updated_at ASC";
        $stmt = $this->db->prepare($sql);
        $stmt->execute([
            $since->format('Y-m-d H:i:s'),
            $until->format('Y-m-d H:i:s')
        ]);
        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }

    private function getDeletedDocuments(DateTime $since, DateTime $until): array
    {
        $sql = "SELECT id FROM documents
                WHERE updated_at BETWEEN ? AND ?
                AND status = 'deleted'";
        $stmt = $this->db->prepare($sql);
        $stmt->execute([
            $since->format('Y-m-d H:i:s'),
            $until->format('Y-m-d H:i:s')
        ]);
        return $stmt->fetchAll(PDO::FETCH_COLUMN);
    }

    private function checkpoint(): void
    {
        // Record a checkpoint
        $sql = "INSERT INTO index_checkpoints (checkpoint_time) VALUES (NOW())";
        $this->db->exec($sql);
    }
}
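checkpoint() relies on an index_checkpoints table that the chapter does not define. The sketch below is a minimal, assumed definition of that table plus a typical incremental run, reusing the $db and $indexer handles from the earlier wiring example; the schema and time window are assumptions:
// Minimal, assumed definition of the checkpoint table used by checkpoint().
$db->exec(
    "CREATE TABLE IF NOT EXISTS index_checkpoints (
         id BIGINT PRIMARY KEY AUTO_INCREMENT,
         checkpoint_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
     ) ENGINE=InnoDB"
);

// Typical incremental run: re-index everything that changed since the last checkpoint.
$lastCheckpoint = $db->query("SELECT MAX(checkpoint_time) FROM index_checkpoints")->fetchColumn();
$since = new DateTime($lastCheckpoint ?: '1970-01-01 00:00:00');
$until = new DateTime();

$incremental = new IncrementalIndexer($indexer, $db);
$stats = $incremental->updateIndexIncremental($since, $until);
printf("updated: %d, failed: %d, deleted: %d\n", $stats['updated'], $stats['failed'], $stats['deleted']);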
4. Performance Optimization Strategies
4.1 Query-Optimized Indexes
-- Optimized index for the core search query.
-- MySQL has no INCLUDE clause (unlike SQL Server / PostgreSQL), so the
-- "included" columns are simply appended to the composite index instead.
-- DESC index ordering requires MySQL 8.0; earlier versions parse but ignore it.
CREATE INDEX idx_optimized_search ON index_entries (
    token_id,
    field_weight DESC,
    document_id,
    field_name,
    position_in_field
);
-- Index supporting field-filtered queries
CREATE INDEX idx_field_filtered_search ON index_entries (
    field_name,
    token_id,
    document_id,
    field_weight
);
-- Index supporting range queries on the weight
CREATE INDEX idx_weight_range_search ON index_entries (
    field_weight,
    token_id
);
-- Covering index to avoid lookups back into the clustered index
CREATE INDEX idx_covering_search ON index_entries (
    token_id,
    document_id,
    field_name,
    field_weight,
    position_in_field
);
4.2 Batch Operation Optimization
class BatchOperationOptimizer
{
    private PDO $db;
    private int $optimalBatchSize = 500; // best batch size determined by benchmarking

    public function __construct(PDO $db)
    {
        $this->db = $db;
    }

    /**
     * Optimized batch insert
     */
    public function optimizedBatchInsert(string $table, array $data): int
    {
        if (empty($data)) {
            return 0;
        }
        $batches = array_chunk($data, $this->optimalBatchSize);
        $totalInserted = 0;
        foreach ($batches as $batch) {
            $inserted = $this->insertBatch($table, $batch);
            $totalInserted += $inserted;
            // Memory management: collect garbage when usage gets high
            if (memory_get_usage() > 512 * 1024 * 1024) { // 512 MB
                gc_collect_cycles();
            }
        }
        return $totalInserted;
    }
    private function insertBatch(string $table, array $batch): int
    {
        $columns = array_keys($batch[0]);
        $placeholders = implode(',', array_fill(0, count($columns), '?'));
        $columnList = implode(',', $columns);
        $sql = "INSERT INTO {$table} ({$columnList}) VALUES ({$placeholders})";
        try {
            $this->db->beginTransaction();
            // Prepare once, execute per row
            $stmt = $this->db->prepare($sql);
            $inserted = 0;
            foreach ($batch as $row) {
                $stmt->execute(array_values($row));
                // rowCount() only reflects the last execute, so accumulate it per row
                $inserted += $stmt->rowCount();
            }
            $this->db->commit();
            return $inserted;
        } catch (Exception $e) {
            $this->db->rollBack();
            throw $e;
        }
    }
    /**
     * LOAD DATA LOCAL INFILE optimization (for very large imports).
     * LOAD DATA cannot be run as a server-side prepared statement, so the file
     * path is quoted and the statement executed directly; the connection must be
     * created with PDO::MYSQL_ATTR_LOCAL_INFILE => true.
     */
    public function loadFileInfile(string $table, string $filePath, array $columns): int
    {
        $columnList = implode(',', $columns);
        $quotedPath = $this->db->quote($filePath);
        $sql = "LOAD DATA LOCAL INFILE {$quotedPath}
                INTO TABLE {$table}
                FIELDS TERMINATED BY ','
                LINES TERMINATED BY '\n'
                ({$columnList})";
        return (int)$this->db->exec($sql);
    }

    /**
     * Adaptive batch size
     */
    public function getAdaptiveBatchSize(int $recordSize, int $availableMemory): int
    {
        // Rough per-record memory estimate (bytes); PHP strings carry noticeable overhead
        $estimatedRecordSize = $recordSize * 4;
        // Target: use at most 70% of the available memory
        $targetMemoryUsage = $availableMemory * 0.7;
        // Compute the optimal batch size
        $optimalSize = (int)($targetMemoryUsage / $estimatedRecordSize);
        // Clamp to a sensible range
        return max(100, min(2000, $optimalSize));
    }
}
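A quick, hedged sketch of how the adaptive sizing might be driven from PHP's memory_limit. The ini parsing is deliberately simplified (it assumes a value such as "512M"), the average row size is an assumption, and note that optimizedBatchInsert() currently uses its internal fixed batch size; wiring in the adaptive value would need a setter or constructor argument in a fuller version:
// Hypothetical driver for the adaptive batch size.
$optimizer = new BatchOperationOptimizer($db);

// Simplified parsing: assumes memory_limit is expressed in megabytes, e.g. "512M".
$memoryLimitBytes = (int)ini_get('memory_limit') * 1024 * 1024;
$avgRecordBytes = 200; // assumed average row size

$batchSize = $optimizer->getAdaptiveBatchSize($avgRecordBytes, $memoryLimitBytes);
echo "Suggested batch size: {$batchSize}\n";

// Bulk insert still uses the class's fixed internal batch size.
$rows = [
    ['token_id' => 42, 'document_id' => 1001, 'field_name' => 'title', 'field_weight' => 10, 'position_in_field' => 0],
    // ... more rows
];
$optimizer->optimizedBatchInsert('index_entries', $rows);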
4.3 Partition Table Management
class PartitionManager
{
    private PDO $db;

    public function __construct(PDO $db)
    {
        $this->db = $db;
    }

    /**
     * Dynamically add partitions.
     * The tables in this chapter use HASH partitioning, so new partitions are
     * added by count; the VALUES IN (...) form only applies to LIST partitioning.
     */
    public function addPartitions(string $table, int $count): bool
    {
        $sql = "ALTER TABLE {$table} ADD PARTITION PARTITIONS {$count}";
        try {
            $this->db->exec($sql);
            return true;
        } catch (Exception $e) {
            $this->logError("Failed to add {$count} partition(s) to {$table}", $e);
            return false;
        }
    }
    /**
     * Partition maintenance and optimization
     */
    public function maintainPartitions(string $table): array
    {
        $results = ['optimized' => [], 'analyzed' => []];
        // Fetch all partitions
        $partitions = $this->getTablePartitions($table);
        foreach ($partitions as $partition) {
            $partitionName = $partition['PARTITION_NAME'];
            // Optimize the partition (per-partition maintenance uses ALTER TABLE syntax)
            try {
                $sql = "ALTER TABLE {$table} OPTIMIZE PARTITION {$partitionName}";
                $this->db->query($sql);
                $results['optimized'][] = $partitionName;
            } catch (Exception $e) {
                $this->logError("Failed to optimize partition {$partitionName}", $e);
            }
            // Refresh partition statistics
            try {
                $sql = "ALTER TABLE {$table} ANALYZE PARTITION {$partitionName}";
                $this->db->query($sql);
                $results['analyzed'][] = $partitionName;
            } catch (Exception $e) {
                $this->logError("Failed to analyze partition {$partitionName}", $e);
            }
        }
        return $results;
    }

    private function getTablePartitions(string $table): array
    {
        $sql = "SELECT PARTITION_NAME, TABLE_ROWS, DATA_LENGTH
                FROM information_schema.PARTITIONS
                WHERE TABLE_SCHEMA = DATABASE()
                AND TABLE_NAME = ? AND PARTITION_NAME IS NOT NULL";
        $stmt = $this->db->prepare($sql);
        $stmt->execute([$table]);
        return $stmt->fetchAll(PDO::FETCH_ASSOC);
    }
    /**
     * Partition rebalancing
     */
    public function rebalancePartitions(string $table): bool
    {
        try {
            // Temporarily remove partitioning
            $this->db->exec("ALTER TABLE {$table} REMOVE PARTITIONING");
            // Re-apply the optimized partitioning layout
            $this->addOptimizedPartitioning($table);
            return true;
        } catch (Exception $e) {
            $this->logError("Failed to rebalance partitions for {$table}", $e);
            return false;
        }
    }

    private function addOptimizedPartitioning(string $table): void
    {
        // Layout tuned for index_entries; other tables need their own partitioning key.
        $sql = "ALTER TABLE {$table}
                PARTITION BY HASH(document_id)
                PARTITIONS 16";
        $this->db->exec($sql);
    }

    /**
     * Minimal error logger; swap in a real PSR-3 logger in production.
     */
    private function logError(string $message, Exception $e): void
    {
        error_log($message . ': ' . $e->getMessage());
    }
}
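A short usage sketch: this kind of maintenance is typically run off-peak, for example from a nightly cron job. $db is again assumed to be the chapter's PDO connection:
// Hypothetical nightly maintenance job: rebuild statistics and
// defragment each partition of the entry table.
$manager = new PartitionManager($db);
$report = $manager->maintainPartitions('index_entries');

printf(
    "optimized: %s | analyzed: %s\n",
    implode(',', $report['optimized']),
    implode(',', $report['analyzed'])
);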
5. Monitoring and Diagnostics
5.1 Index Health Checks
class IndexHealthMonitor
{
    private PDO $db;
    private array $thresholds = [
        'max_partition_size' => 10 * 1024 * 1024 * 1024, // 10 GB
        'max_index_fragmentation' => 30,                  // 30%
        'min_query_performance' => 100,                   // slowest acceptable query time, in ms
    ];

    public function __construct(PDO $db)
    {
        $this->db = $db;
    }
    /**
     * Comprehensive health check
     */
    public function performHealthCheck(): array
    {
        $health = [
            'overall_status' => 'healthy',
            'issues' => [],
            'recommendations' => [],
            'metrics' => []
        ];
        // Check partition sizes
        $partitionIssues = $this->checkPartitionSizes();
        if (!empty($partitionIssues)) {
            $health['issues'] = array_merge($health['issues'], $partitionIssues);
            $health['overall_status'] = 'warning';
        }
        // Check index fragmentation
        $fragmentationIssues = $this->checkIndexFragmentation();
        if (!empty($fragmentationIssues)) {
            $health['issues'] = array_merge($health['issues'], $fragmentationIssues);
        }
        // Check query performance
        $performanceIssues = $this->checkQueryPerformance();
        if (!empty($performanceIssues)) {
            $health['issues'] = array_merge($health['issues'], $performanceIssues);
            if ($health['overall_status'] === 'healthy') {
                $health['overall_status'] = 'warning';
            }
        }
        // Generate recommendations
        $health['recommendations'] = $this->generateRecommendations($health['issues']);
        // Collect metrics
        $health['metrics'] = $this->collectMetrics();
        return $health;
    }
    private function checkPartitionSizes(): array
    {
        $issues = [];
        $sql = "SELECT PARTITION_NAME, DATA_LENGTH, TABLE_ROWS
                FROM information_schema.PARTITIONS
                WHERE TABLE_NAME = 'index_entries'
                AND PARTITION_NAME IS NOT NULL";
        $stmt = $this->db->query($sql);
        $partitions = $stmt->fetchAll(PDO::FETCH_ASSOC);
        foreach ($partitions as $partition) {
            if ($partition['DATA_LENGTH'] > $this->thresholds['max_partition_size']) {
                $issues[] = [
                    'type' => 'partition_size',
                    'partition' => $partition['PARTITION_NAME'],
                    'size' => $partition['DATA_LENGTH'],
                    'severity' => 'high'
                ];
            }
        }
        return $issues;
    }
    private function checkIndexFragmentation(): array
    {
        $issues = [];
        // Inspect index statistics
        $sql = "SELECT TABLE_NAME, INDEX_NAME, CARDINALITY
                FROM information_schema.STATISTICS
                WHERE TABLE_SCHEMA = DATABASE()
                AND TABLE_NAME IN ('index_tokens', 'index_entries')";
        $stmt = $this->db->query($sql);
        $indexes = $stmt->fetchAll(PDO::FETCH_ASSOC);
        foreach ($indexes as $index) {
            $fragmentation = $this->calculateFragmentation($index);
            if ($fragmentation > $this->thresholds['max_index_fragmentation']) {
                $issues[] = [
                    'type' => 'index_fragmentation',
                    'table' => $index['TABLE_NAME'],
                    'index' => $index['INDEX_NAME'],
                    'fragmentation' => $fragmentation,
                    'severity' => 'medium'
                ];
            }
        }
        return $issues;
    }

    private function calculateFragmentation(array $index): float
    {
        // Simplified fragmentation estimate.
        // A real implementation should use a more precise measure,
        // e.g. the DATA_FREE ratio from information_schema.TABLES.
        return rand(10, 50); // placeholder
    }
    private function checkQueryPerformance(): array
    {
        $issues = [];
        // Time a few key queries
        $testQueries = [
            'simple_search' => "
                SELECT COUNT(*) FROM index_entries ie
                JOIN index_tokens it ON ie.token_id = it.id
                WHERE it.token = 'database'
            ",
            'complex_search' => "
                SELECT d.id, COUNT(*) as match_count
                FROM documents d
                JOIN index_entries ie ON d.id = ie.document_id
                JOIN index_tokens it ON ie.token_id = it.id
                WHERE it.token IN ('database', 'data')
                GROUP BY d.id
                ORDER BY match_count DESC
                LIMIT 10
            "
        ];
        foreach ($testQueries as $queryName => $sql) {
            $startTime = microtime(true);
            $this->db->query($sql);
            $executionTime = (microtime(true) - $startTime) * 1000; // ms
            if ($executionTime > $this->thresholds['min_query_performance']) {
                $issues[] = [
                    'type' => 'slow_query',
                    'query' => $queryName,
                    'execution_time' => $executionTime,
                    'severity' => 'high'
                ];
            }
        }
        return $issues;
    }
    private function generateRecommendations(array $issues): array
    {
        $recommendations = [];
        foreach ($issues as $issue) {
            switch ($issue['type']) {
                case 'partition_size':
                    $recommendations[] = "Consider rebalancing partition {$issue['partition']} or splitting into smaller partitions";
                    break;
                case 'index_fragmentation':
                    $recommendations[] = "Rebuild index {$issue['index']} on table {$issue['table']}";
                    break;
                case 'slow_query':
                    $recommendations[] = "Optimize query {$issue['query']} - current execution time: {$issue['execution_time']}ms";
                    break;
            }
        }
        return array_unique($recommendations);
    }
    private function collectMetrics(): array
    {
        $metrics = [];
        // Basic metrics
        $metrics['total_documents'] = $this->getScalar("SELECT COUNT(*) FROM documents WHERE status = 'active'");
        $metrics['total_tokens'] = $this->getScalar("SELECT COUNT(*) FROM index_tokens");
        $metrics['total_index_entries'] = $this->getScalar("SELECT COUNT(*) FROM index_entries");
        $metrics['average_tokens_per_document'] = $this->getScalar("
            SELECT AVG(entry_count) FROM (
                SELECT COUNT(*) as entry_count
                FROM index_entries
                GROUP BY document_id
            ) as counts
        ");
        // Performance metrics
        $metrics['index_size_mb'] = $this->getTableSize('index_entries') / 1024 / 1024;
        $metrics['token_size_mb'] = $this->getTableSize('index_tokens') / 1024 / 1024;
        return $metrics;
    }
    private function getScalar(string $sql): int
    {
        return (int)$this->db->query($sql)->fetchColumn();
    }

    private function getTableSize(string $table): int
    {
        $sql = "SELECT DATA_LENGTH + INDEX_LENGTH
                FROM information_schema.TABLES
                WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = ?";
        $stmt = $this->db->prepare($sql);
        $stmt->execute([$table]);
        return (int)$stmt->fetchColumn();
    }
}
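To close the loop, here is a hedged sketch of how the health check might be wired into a monitoring hook; $db is assumed to be the chapter's PDO connection and the logging target is an assumption:
// Hypothetical monitoring hook: run the health check and surface warnings.
$monitor = new IndexHealthMonitor($db);
$health = $monitor->performHealthCheck();

if ($health['overall_status'] !== 'healthy') {
    foreach ($health['recommendations'] as $recommendation) {
        error_log('[index-health] ' . $recommendation);
    }
}
printf("documents: %d, tokens: %d, entries: %d\n",
    $health['metrics']['total_documents'],
    $health['metrics']['total_tokens'],
    $health['metrics']['total_index_entries']
);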
6. Chapter Summary
6.1 Key Takeaways
Architecture design:
– Understand how an inverted index is implemented on top of a relational database
– Master the design and management of partitioned tables
– Learn batch-operation optimization techniques
– Understand why transactional handling matters
Performance optimization:
– Master the key principles of index design
– Learn how to manage and optimize partitioned tables
– Understand the performance gains of batch operations
– Master monitoring and diagnostic methods
Engineering practice:
– Build a scalable indexing service architecture
– Implement incremental updates and failure recovery
– Establish a solid monitoring setup
– Master a methodology for performance tuning
6.2 Next Chapter Preview
In the next chapter we dive into search query optimization and learn how to:
- Design efficient search SQL
- Implement sophisticated result-ranking algorithms
- Optimize query performance and response time
- Build a complete search service architecture
Homework: design and implement a small indexing system that supports adding, deleting, and searching documents.
Previous → Chapter 03: Weight System Design | Next → Chapter 05: Search Query Optimization