Chapter 03: Weight System Design: The Science of Relevance Scoring
A three-layer weight architecture gives you precise control over relevance, and that control is the heart of search quality.
📝 TL;DR (Key Takeaways at a Glance)
– Core formula: $finalWeight = $fieldWeight × $tokenizerWeight × $lengthPenalty
– Scoring mechanism: base score + diversity boost + quality weighting + length penalty
– Mathematical foundations: the applied use of logarithms, square roots, and weight matrices
– SQL implementation: complex scoring algorithms built from database aggregate functions
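Written out in full, the scoring function this chapter builds up to looks like this (a sketch that mirrors the PHP implementation in section 3; the symbol names are introduced here for readability and are not part of the code):

$$\text{finalScore} = \Big(\sum_{m \in \text{matches}} w_{\text{field}}(m)\cdot w_{\text{tok}}(m)\cdot \sqrt{|t_m|}\Big)\times \text{lengthPenalty}(L)\times \big(1 + 0.2\,\ln(1 + 10\,d)\big)$$

Here $|t_m|$ is the length of the matched token, $L$ is the document length, and $d$ is the ratio of distinct tokenizer types to total matches.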
1. Why the Weight System Matters
1.1 Why Weights Determine Search Quality
The result ordering a user expects when searching for "database":
1. "Database Management Systems"   // exact title match
2. "Advanced Database Techniques"  // partial title match
3. "Learning Database Design"      // content contains "database"
4. "Data Management Systems"       // matches only "data"
5. "Computer Systems Basics"       // matches only "systems"
Problems without a weight system:
– All relevant results come back in an essentially random order
– There is no way to tell important matches from unimportant ones
– The user experience suffers: people cannot find what they are looking for
1.2 The Three Dimensions of Weighting
Field dimension: how important each field is
Title weight: 10        // title matches matter most
Description weight: 5   // description matches come second
Content weight: 2       // content matches matter comparatively little
Tag weight: 8           // tag matches matter a lot
Tokenizer dimension: how important each matching strategy is
Exact match: 10                 // Word Tokenizer
Prefix match: 3                 // Prefix Tokenizer
Singular/plural match: 8        // Singular Tokenizer
Fuzzy (typo-tolerant) match: 1  // N-Grams Tokenizer
Length dimension: keep long documents from dominating the results
$lengthPenalty = 1 / sqrt($documentLength);
// long documents are moderately down-weighted so short documents still get a chance to surface
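To see how the three dimensions combine, here is a quick worked example (a minimal sketch; the 400-word document and the specific match are invented, the weights come from the tables above):

// Illustrative only: an exact Word-Tokenizer match on the title of a 400-word document
$fieldWeight     = 10;                 // title
$tokenizerWeight = 10;                 // Word Tokenizer (exact match)
$lengthPenalty   = 1 / sqrt(400);      // = 0.05
$finalWeight     = $fieldWeight * $tokenizerWeight * $lengthPenalty; // = 5.0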
2. The Three-Layer Weight Architecture in Detail
2.1 Layer One: Field Weight Design
The field weight matrix:
class FieldWeightManager
{
    private array $fieldWeights = [
        'title' => 10,       // title has the highest weight
        'description' => 5,  // description has a medium weight
        'content' => 2,      // content has a lower weight
        'tags' => 8,         // tags have a relatively high weight
        'category' => 6,     // category sits slightly above medium
        'author' => 3,       // author has a lower weight
    ];

    public function getWeight(string $field): int
    {
        return $this->fieldWeights[$field] ?? 1;
    }

    public function setWeight(string $field, int $weight): void
    {
        $this->fieldWeights[$field] = $weight;
    }

    public function getAllWeights(): array
    {
        return $this->fieldWeights;
    }

    /**
     * Automatically adjust the weights for a given business scenario
     */
    public function optimizeForScenario(string $scenario): void
    {
        switch ($scenario) {
            case 'ecommerce':
                $this->fieldWeights = [
                    'product_name' => 15,   // product name matters most
                    'brand' => 12,          // brand comes next
                    'description' => 5,     // description is moderately important
                    'specifications' => 3,  // specifications matter less
                ];
                break;
            case 'blog':
                $this->fieldWeights = [
                    'title' => 10,     // blog titles matter
                    'content' => 6,    // content matters too
                    'tags' => 8,       // tags matter
                    'category' => 4,   // category is moderately important
                ];
                break;
            case 'documentation':
                $this->fieldWeights = [
                    'title' => 12,     // document titles matter a lot
                    'content' => 8,    // content matters a lot
                    'section' => 6,    // section matters
                    'keywords' => 10,  // keywords matter a lot
                ];
                break;
        }
    }
}
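A short usage sketch of the class above (the scenario name is one of the cases defined in optimizeForScenario()):

$weights = new FieldWeightManager();
echo $weights->getWeight('title');              // 10 (default profile)
$weights->optimizeForScenario('documentation');
echo $weights->getWeight('keywords');           // 10
echo $weights->getWeight('author');             // 1, the fallback: this field is no longer configured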
Dynamic weight adjustment:
class DynamicFieldWeight extends FieldWeightManager
{
    private array $userBehaviorData;

    public function __construct(array $userBehaviorData = [])
    {
        // Note: FieldWeightManager declares no constructor, so there is no parent::__construct() to call
        $this->userBehaviorData = $userBehaviorData;
    }

    public function getAdaptiveWeight(string $field): float
    {
        $baseWeight = $this->getWeight($field);
        // Adjust the weight based on observed user behavior
        if (isset($this->userBehaviorData[$field])) {
            $clickRate = $this->userBehaviorData[$field]['click_rate'];
            $adjustment = 1 + ($clickRate - 0.1); // baseline click-through rate of 10%
            return $baseWeight * $adjustment;
        }
        return $baseWeight;
    }
}
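For example (the click_rate figure is invented for illustration; the array structure is the one read by getAdaptiveWeight()):

$dynamic = new DynamicFieldWeight([
    'title' => ['click_rate' => 0.25],   // users click title matches well above the 10% baseline
]);
echo $dynamic->getAdaptiveWeight('title');    // 10 * (1 + 0.15) = 11.5
echo $dynamic->getAdaptiveWeight('content');  // 2, no behavior data, so the base weight is returned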
2.2 Layer Two: Tokenizer Weight Design
Tokenizer weight calculation:
class TokenizerWeightCalculator
{
    private array $tokenizerWeights = [
        'word' => 10,     // exact matches get the highest weight
        'singular' => 8,  // singular/plural matches get a high weight
        'prefix' => 3,    // prefix matches get a medium weight
        'ngrams' => 1,    // fuzzy matches get the lowest weight
    ];

    public function calculateTokenWeight(
        string $token,
        string $tokenizerType,
        int $tokenLength
    ): float {
        $baseWeight = $this->tokenizerWeights[$tokenizerType] ?? 1;
        // Length factor: longer tokens get more weight because they are more specific
        $lengthFactor = sqrt($tokenLength);
        return $baseWeight * $lengthFactor;
    }

    public function calculateFieldTokenizerWeight(
        string $field,
        string $tokenizerType
    ): float {
        // Different fields react differently to different tokenizers
        $fieldSensitivity = $this->getFieldSensitivity($field, $tokenizerType);
        return ($this->tokenizerWeights[$tokenizerType] ?? 1) * $fieldSensitivity;
    }

    private function getFieldSensitivity(string $field, string $tokenizerType): float
    {
        $sensitivityMatrix = [
            'title' => [
                'word' => 1.2,     // titles are more sensitive to exact matches
                'singular' => 1.1, // titles are fairly sensitive to singular/plural matches
                'prefix' => 0.8,   // titles are less sensitive to prefix matches
                'ngrams' => 0.3,   // titles are least sensitive to fuzzy matches
            ],
            'content' => [
                'word' => 1.0,     // content reacts normally to exact matches
                'singular' => 1.2, // content is more sensitive to singular/plural matches
                'prefix' => 1.0,   // content reacts normally to prefix matches
                'ngrams' => 0.8,   // content is fairly sensitive to fuzzy matches
            ],
        ];
        return $sensitivityMatrix[$field][$tokenizerType] ?? 1.0;
    }
}
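Applied to the "database" query from section 1.1, the calculator behaves like this (values are easy to verify by hand):

$tokenizer = new TokenizerWeightCalculator();
echo $tokenizer->calculateTokenWeight('database', 'word', 8);    // 10 * sqrt(8) ≈ 28.3
echo $tokenizer->calculateTokenWeight('database', 'ngrams', 8);  // 1 * sqrt(8) ≈ 2.8
echo $tokenizer->calculateFieldTokenizerWeight('title', 'word'); // 10 * 1.2 = 12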
2.3 Layer Three: The Length Penalty Mechanism
Length penalty algorithm:
class LengthPenaltyCalculator
{
    private float $penaltyFactor = 0.1;
    private int $optimalLength = 100; // optimal document length

    public function calculatePenalty(int $documentLength): float
    {
        // Use a logarithm so the penalty does not grow too aggressively
        $lengthRatio = $documentLength / $this->optimalLength;
        if ($lengthRatio <= 1) {
            // short documents are not penalized
            return 1.0;
        } elseif ($lengthRatio <= 2) {
            // medium-length documents receive a mild penalty
            return 1.0 / (1.0 + $this->penaltyFactor * log($lengthRatio));
        } else {
            // long documents receive a heavy penalty
            return 1.0 / sqrt($lengthRatio);
        }
    }

    public function calculateLengthScore(int $documentLength): float
    {
        // Length bonus: documents of a moderate length are rewarded
        $optimalRange = 0.5; // allowed deviation from the optimal length
        if (abs($documentLength - $this->optimalLength) <= $this->optimalLength * $optimalRange) {
            return 1.2; // bonus for the optimal length band
        } elseif ($documentLength < $this->optimalLength * 0.3) {
            return 0.7; // penalty for documents that are too short
        } elseif ($documentLength > $this->optimalLength * 3) {
            return 0.5; // heavy penalty for documents that are far too long
        } else {
            return 1.0; // normal length
        }
    }
}
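A few sample values make the shape of the penalty curve concrete (optimal length 100 and penalty factor 0.1, as configured above):

$penalty = new LengthPenaltyCalculator();
echo $penalty->calculatePenalty(80);   // 1.0, below the optimal length, no penalty
echo $penalty->calculatePenalty(150);  // ≈ 0.96, i.e. 1 / (1 + 0.1 * ln(1.5))
echo $penalty->calculatePenalty(900);  // ≈ 0.33, i.e. 1 / sqrt(9)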
3. Implementing the Combined Scoring Algorithm
3.1 The Core Scoring Formula
class SearchScoreCalculator
{
    private FieldWeightManager $fieldWeightManager;
    private TokenizerWeightCalculator $tokenizerWeightCalculator;
    private LengthPenaltyCalculator $lengthPenaltyCalculator;

    public function __construct()
    {
        $this->fieldWeightManager = new FieldWeightManager();
        $this->tokenizerWeightCalculator = new TokenizerWeightCalculator();
        $this->lengthPenaltyCalculator = new LengthPenaltyCalculator();
    }

    /**
     * Calculate the search score for a single document
     */
    public function calculateScore(
        int $documentId,
        array $matchedTokens,
        int $documentLength
    ): float {
        if (empty($matchedTokens)) {
            return 0.0; // no matches, no score (also avoids a division by zero in the diversity bonus)
        }
        $totalScore = 0;
        foreach ($matchedTokens as $match) {
            $tokenScore = $this->calculateTokenScore($match);
            $totalScore += $tokenScore;
        }
        // Apply the length penalty
        $lengthPenalty = $this->lengthPenaltyCalculator->calculatePenalty($documentLength);
        // Apply the diversity boost
        $diversityBonus = $this->calculateDiversityBonus($matchedTokens);
        // Final score
        $finalScore = $totalScore * $lengthPenalty * $diversityBonus;
        return $finalScore;
    }

    private function calculateTokenScore(array $match): float
    {
        // Field weight
        $fieldWeight = $this->fieldWeightManager->getWeight($match['field']);
        // Tokenizer weight
        $tokenizerWeight = $this->tokenizerWeightCalculator->calculateTokenWeight(
            $match['token'],
            $match['tokenizer_type'],
            strlen($match['token'])
        );
        // Combined weight
        $combinedWeight = $fieldWeight * $tokenizerWeight;
        return $combinedWeight;
    }

    /**
     * Diversity boost: the more tokenizer types that matched, the higher the score
     */
    private function calculateDiversityBonus(array $matchedTokens): float
    {
        $uniqueTokenizers = array_unique(array_column($matchedTokens, 'tokenizer_type'));
        $diversityRatio = count($uniqueTokenizers) / count($matchedTokens);
        // Use a logarithm to smooth the diversity bonus
        return 1.0 + 0.2 * log(1 + $diversityRatio * 10);
    }

    /**
     * Quality weighting: adjust the final score by the average token weight
     */
    public function applyQualityWeighting(float $baseScore, array $matchedTokens): float
    {
        if (empty($matchedTokens)) {
            return 0;
        }
        $averageQuality = array_sum(array_map(function ($match) {
            return $match['tokenizer_weight'];
        }, $matchedTokens)) / count($matchedTokens);
        // Quality factor: high-quality matches earn an extra bonus
        $qualityFactor = 1.0 + ($averageQuality / 10) * 0.3;
        return $baseScore * $qualityFactor;
    }
}
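A minimal usage sketch (the shape of $matchedTokens, with field, token and tokenizer_type keys, is the one consumed by calculateTokenScore() above; the document id and length are made up):

$calculator = new SearchScoreCalculator();
$matchedTokens = [
    ['field' => 'title',   'token' => 'database',  'tokenizer_type' => 'word'],
    ['field' => 'content', 'token' => 'databases', 'tokenizer_type' => 'singular'],
];
$score = $calculator->calculateScore(42, $matchedTokens, 180);
// base score ≈ 10*10*sqrt(8) + 2*8*sqrt(9) ≈ 330.8,
// then multiplied by the length penalty for 180 words and the diversity bonus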
3.2 Batch Scoring Optimization
class BatchScoreCalculator
{
    private SearchScoreCalculator $scoreCalculator;
    private LengthPenaltyCalculator $lengthPenaltyCalculator;
    private array $scoreCache = [];

    public function __construct()
    {
        $this->scoreCalculator = new SearchScoreCalculator();
        $this->lengthPenaltyCalculator = new LengthPenaltyCalculator();
    }

    public function calculateBatchScores(array $documents): array
    {
        $results = [];
        // Pre-processing: extract shared data to avoid repeated work
        // (in this simplified version the pre-computed penalties are not consumed further)
        $commonData = $this->preprocessCommonData($documents);
        foreach ($documents as $document) {
            $docId = $document['id'];
            // Check the cache first
            if (isset($this->scoreCache[$docId])) {
                $results[$docId] = $this->scoreCache[$docId];
                continue;
            }
            // Compute the score
            $score = $this->scoreCalculator->calculateScore(
                $docId,
                $document['matched_tokens'],
                $document['length']
            );
            $results[$docId] = $score;
            $this->scoreCache[$docId] = $score;
        }
        // Normalize the scores to the 0-100 range
        return $this->normalizeScores($results);
    }

    private function preprocessCommonData(array $documents): array
    {
        // Pre-compute shared data such as length penalties
        $lengthPenalties = [];
        foreach ($documents as $doc) {
            $lengthPenalties[$doc['id']] = $this->lengthPenaltyCalculator->calculatePenalty($doc['length']);
        }
        return ['length_penalties' => $lengthPenalties];
    }

    private function normalizeScores(array $scores): array
    {
        if (empty($scores)) {
            return [];
        }
        $maxScore = max($scores);
        if ($maxScore == 0) {
            return $scores;
        }
        // Normalize to 0-100
        $normalized = [];
        foreach ($scores as $docId => $score) {
            $normalized[$docId] = ($score / $maxScore) * 100;
        }
        return $normalized;
    }
}
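Called like this (a sketch; the id / length / matched_tokens keys are the ones read in calculateBatchScores(), the concrete values are invented):

$batch = new BatchScoreCalculator();
$scores = $batch->calculateBatchScores([
    ['id' => 1, 'length' => 120, 'matched_tokens' => [
        ['field' => 'title', 'token' => 'database', 'tokenizer_type' => 'word'],
    ]],
    ['id' => 2, 'length' => 950, 'matched_tokens' => [
        ['field' => 'content', 'token' => 'data', 'tokenizer_type' => 'prefix'],
    ]],
]);
// e.g. [1 => 100.0, 2 => ...], with the best-scoring document normalized to 100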
4. Weighting at the SQL Level
4.1 Database Design
-- Document table: stores basic document information (created first, because index_entries references it)
CREATE TABLE documents (
    id INT PRIMARY KEY AUTO_INCREMENT,
    title VARCHAR(255) NOT NULL,
    content TEXT,
    description TEXT,
    length INT NOT NULL DEFAULT 0,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    KEY idx_length (length),
    KEY idx_created (created_at)
);

-- Index token table: stores every token and its basic information
CREATE TABLE index_tokens (
    id INT PRIMARY KEY AUTO_INCREMENT,
    token VARCHAR(100) NOT NULL,
    tokenizer_type ENUM('word', 'prefix', 'ngrams', 'singular') NOT NULL,
    weight INT NOT NULL DEFAULT 1,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE KEY unique_token_type (token, tokenizer_type),
    KEY idx_token (token),
    KEY idx_type (tokenizer_type)
);

-- Index entry table: maps tokens to documents
CREATE TABLE index_entries (
    id INT PRIMARY KEY AUTO_INCREMENT,
    token_id INT NOT NULL,
    document_id INT NOT NULL,
    field_name VARCHAR(50) NOT NULL,
    field_weight INT NOT NULL DEFAULT 1,
    position_in_field INT NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (token_id) REFERENCES index_tokens(id),
    FOREIGN KEY (document_id) REFERENCES documents(id),
    KEY idx_token_document (token_id, document_id),
    KEY idx_document_field (document_id, field_name),
    KEY idx_position (position_in_field)
);
4.2 The Complex Weight Query in SQL
-- Core search query: includes all weight calculations
SELECT
    d.id,
    d.title,
    d.description,
    d.length,
    -- 1. Base score: sum of the weights of all matched tokens
    SUM(t.weight * ie.field_weight) as base_score,
    -- 2. Diversity score: number of distinct tokenizer types matched
    COUNT(DISTINCT t.tokenizer_type) as diversity_count,
    -- 3. Quality score: average token weight
    AVG(t.weight * ie.field_weight) as avg_quality,
    -- 4. Length penalty: keep long documents from dominating
    CASE
        WHEN d.length <= 100 THEN 1.0
        WHEN d.length <= 300 THEN 0.8
        WHEN d.length <= 1000 THEN 0.6
        ELSE 0.4
    END as length_penalty,
    -- 5. Position bonus: matches near the start of a field score higher
    AVG(CASE
        WHEN ie.position_in_field <= 10 THEN 1.2
        WHEN ie.position_in_field <= 50 THEN 1.0
        ELSE 0.8
    END) as position_bonus
FROM documents d
INNER JOIN index_entries ie ON d.id = ie.document_id
INNER JOIN index_tokens t ON ie.token_id = t.id
WHERE t.token IN (
    'database', 'data', 'base', 'datab', 'datas' -- all tokens derived from the user's query
)
GROUP BY d.id, d.title, d.description, d.length
HAVING base_score > 0 -- only return documents with at least one match
-- 6. Final score calculation
ORDER BY (
    base_score *
    (1 + 0.1 * LOG(1 + diversity_count)) * -- diversity bonus
    avg_quality / 10 * -- quality weighting
    length_penalty * -- length penalty
    position_bonus -- position bonus
) DESC
LIMIT 20;
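From PHP, the same query is best run through a prepared statement so the token list stays parameterized (a sketch, assuming an existing PDO connection in $pdo and the schema from section 4.1; only the base score is selected here for brevity):

$tokens = ['database', 'data', 'base', 'datab', 'datas'];
$placeholders = implode(',', array_fill(0, count($tokens), '?'));
$sql = "SELECT d.id, d.title, SUM(t.weight * ie.field_weight) AS base_score
        FROM documents d
        INNER JOIN index_entries ie ON d.id = ie.document_id
        INNER JOIN index_tokens t ON ie.token_id = t.id
        WHERE t.token IN ($placeholders)
        GROUP BY d.id, d.title
        ORDER BY base_score DESC
        LIMIT 20";
$stmt = $pdo->prepare($sql);
$stmt->execute($tokens);
$rows = $stmt->fetchAll(PDO::FETCH_ASSOC);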
4.3 An Advanced Scoring Function
-- Create a stored procedure that computes the combined score
DELIMITER //
CREATE PROCEDURE calculate_search_score(
    IN search_tokens TEXT,
    IN limit_count INT
)
BEGIN
    -- Temporary table holding the token list
    CREATE TEMPORARY TABLE temp_tokens (
        token VARCHAR(100)
    );
    -- Insert the search tokens
    -- In practice the tokens would be split in application code; this is a simplified example
    INSERT INTO temp_tokens VALUES ('database'), ('data'), ('base');
    -- Main query
    SELECT
        d.id,
        d.title,
        d.description,
        -- Combined score calculation
        (
            SUM(t.weight * ie.field_weight) * -- base score
            (
                1 + 0.2 * LOG(1 + COUNT(DISTINCT t.tokenizer_type)) -- diversity bonus
            ) *
            (
                AVG(t.weight * ie.field_weight) / 10 -- quality factor
            ) *
            CASE -- length penalty
                WHEN d.length <= 50 THEN 1.3
                WHEN d.length <= 100 THEN 1.1
                WHEN d.length <= 500 THEN 1.0
                WHEN d.length <= 2000 THEN 0.8
                ELSE 0.5
            END
        ) as final_score,
        -- Debug information
        COUNT(*) as match_count,
        COUNT(DISTINCT t.tokenizer_type) as tokenizer_diversity,
        AVG(t.weight * ie.field_weight) as avg_token_weight,
        d.length as document_length
    FROM documents d
    INNER JOIN index_entries ie ON d.id = ie.document_id
    INNER JOIN index_tokens t ON ie.token_id = t.id
    INNER JOIN temp_tokens tt ON t.token = tt.token
    GROUP BY d.id, d.title, d.description, d.length
    HAVING final_score > 0
    ORDER BY final_score DESC
    LIMIT limit_count;
    -- Drop the temporary table
    DROP TEMPORARY TABLE temp_tokens;
END //
DELIMITER ;
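Calling the procedure from PHP could look like this (a sketch assuming a PDO connection in $pdo; note that in this simplified version the search_tokens argument is ignored, since the token list is hard-coded inside the procedure body):

$stmt = $pdo->prepare('CALL calculate_search_score(?, ?)');
$stmt->execute(['database data base', 20]);
$results = $stmt->fetchAll(PDO::FETCH_ASSOC);
$stmt->closeCursor(); // release the extra result set a CALL produces before reusing the connection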
5. Performance Optimization Strategies
5.1 Index Optimization
-- Composite index to speed up the core search query
CREATE INDEX idx_search_optimized ON index_entries(token_id, document_id, field_weight);

-- Partition the table to cope with large data volumes
-- Note: MySQL requires the partition key to appear in every unique key (including the primary key),
-- and partitioned InnoDB tables cannot have foreign keys, so the schema from 4.1 would need to be
-- adjusted accordingly before running this statement.
ALTER TABLE index_entries
PARTITION BY HASH(document_id)
PARTITIONS 8;

-- Covering index so the query can be answered without going back to the base table
CREATE INDEX idx_covering_search ON index_entries(
    token_id,
    document_id,
    field_name,
    field_weight,
    position_in_field
);
5.2 Query Caching Strategy
class QueryCacheManager
{
    private array $cache = [];
    private int $maxCacheSize = 1000;
    private int $cacheTtl = 3600; // one hour

    public function get(string $queryHash): ?array
    {
        if (isset($this->cache[$queryHash])) {
            $cached = $this->cache[$queryHash];
            // Check whether the entry has expired
            if (time() - $cached['timestamp'] < $this->cacheTtl) {
                return $cached['results'];
            } else {
                unset($this->cache[$queryHash]);
            }
        }
        return null;
    }

    public function set(string $queryHash, array $results): void
    {
        $this->cache[$queryHash] = [
            'results' => $results,
            'timestamp' => time()
        ];
        // Keep the cache size under control
        if (count($this->cache) > $this->maxCacheSize) {
            $this->evictOldest();
        }
    }

    private function evictOldest(): void
    {
        $oldestTime = time();
        $oldestKey = null;
        foreach ($this->cache as $key => $value) {
            if ($value['timestamp'] < $oldestTime) {
                $oldestTime = $value['timestamp'];
                $oldestKey = $key;
            }
        }
        if ($oldestKey !== null) {
            unset($this->cache[$oldestKey]);
        }
    }
}
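Typical usage looks like this (a sketch; hashing the normalized query string with md5 is one simple way to build a cache key, and $searchEngine stands in for whatever service actually executes the search, which is not defined in this chapter):

$cache = new QueryCacheManager();
$queryHash = md5('database|limit=20');
if (($results = $cache->get($queryHash)) === null) {
    $results = $searchEngine->search('database', 20); // assumed search service
    $cache->set($queryHash, $results);
}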
6. Practical Application: Weight Tuning
6.1 An A/B Testing Framework
class WeightABTest
{
    private array $weightVariants = [
        'control' => [
            'title' => 10,
            'content' => 2,
            'description' => 5,
        ],
        'variant_a' => [
            'title' => 15,       // raise the title weight
            'content' => 1,      // lower the content weight
            'description' => 6,
        ],
        'variant_b' => [
            'title' => 8,        // lower the title weight
            'content' => 4,      // raise the content weight
            'description' => 8,
        ]
    ];

    private object $analytics; // injected analytics client used by recordConversion()

    public function __construct(object $analytics)
    {
        $this->analytics = $analytics;
    }

    public function assignUser(string $userId): string
    {
        // Hash the user ID to assign users to groups deterministically
        $hash = crc32($userId);
        $variants = array_keys($this->weightVariants);
        return $variants[abs($hash) % count($variants)];
    }

    public function getWeights(string $variant): array
    {
        return $this->weightVariants[$variant] ?? $this->weightVariants['control'];
    }

    public function recordConversion(string $variant, string $userId, string $action): void
    {
        // Record the behavior event
        $this->analytics->track([
            'variant' => $variant,
            'user_id' => $userId,
            'action' => $action,
            'timestamp' => time()
        ]);
    }
}
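Wired into a request it might look like this (a sketch; $analyticsClient is any object exposing a track() method, as required by the constructor above):

$abTest  = new WeightABTest($analyticsClient);
$variant = $abTest->assignUser('user-12345');   // e.g. 'variant_a'
$weights = $abTest->getWeights($variant);       // field weights used for this user's searches
$abTest->recordConversion($variant, 'user-12345', 'result_click');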
6.2 Machine Learning Weight Optimization
class MLWeightOptimizer
{
    private array $trainingData = [];
    private float $learningRate = 0.01;

    public function addTrainingSample(array $features, float $targetScore): void
    {
        $this->trainingData[] = [
            'features' => $features, // [field_weights, tokenizer_weights, etc.]
            'target' => $targetScore // user satisfaction score
        ];
    }

    public function optimizeWeights(array $currentWeights): array
    {
        $optimizedWeights = $currentWeights;
        $iterations = 1000;
        for ($i = 0; $i < $iterations; $i++) {
            $gradient = $this->calculateGradient($optimizedWeights);
            // Gradient descent update
            foreach ($gradient as $key => $grad) {
                $optimizedWeights[$key] -= $this->learningRate * $grad;
            }
        }
        return $this->normalizeWeights($optimizedWeights);
    }

    private function calculateGradient(array $weights): array
    {
        $gradient = [];
        $epsilon = 0.001; // small step for numerical differentiation
        foreach ($weights as $key => $value) {
            // Forward-difference approximation of the gradient
            $weightsPlus = $weights;
            $weightsPlus[$key] += $epsilon;
            $lossPlus = $this->calculateLoss($weightsPlus);
            $loss = $this->calculateLoss($weights);
            $gradient[$key] = ($lossPlus - $loss) / $epsilon;
        }
        return $gradient;
    }

    private function calculateLoss(array $weights): float
    {
        if (empty($this->trainingData)) {
            return 0.0; // no samples yet, nothing to optimize against
        }
        $totalLoss = 0;
        foreach ($this->trainingData as $sample) {
            $predictedScore = $this->predictScore($sample['features'], $weights);
            $error = $predictedScore - $sample['target'];
            $totalLoss += $error * $error; // squared error
        }
        return $totalLoss / count($this->trainingData);
    }

    private function predictScore(array $features, array $weights): float
    {
        // Simplified prediction function: a weighted sum of the features
        $score = 0;
        foreach ($features as $key => $value) {
            $score += ($weights[$key] ?? 1) * $value;
        }
        return $score;
    }

    private function normalizeWeights(array $weights): array
    {
        $maxWeight = max($weights);
        if ($maxWeight == 0) {
            return $weights;
        }
        // Normalize into the 1-20 range
        $normalized = [];
        foreach ($weights as $key => $weight) {
            $normalized[$key] = max(1, min(20, ($weight / $maxWeight) * 20));
        }
        return $normalized;
    }
}
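A usage sketch (the feature vectors and satisfaction scores are invented; in practice the features would come from search logs and the targets from user feedback):

$optimizer = new MLWeightOptimizer();
$optimizer->addTrainingSample(['title' => 0.9, 'content' => 0.2, 'description' => 0.4], 8.5);
$optimizer->addTrainingSample(['title' => 0.1, 'content' => 0.8, 'description' => 0.3], 4.0);
$tuned = $optimizer->optimizeWeights(['title' => 10, 'content' => 2, 'description' => 5]);
// $tuned holds the adjusted weights, clamped back into the 1-20 range by normalizeWeights()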
7. Chapter Summary
7.1 Key Takeaways
Algorithmic level:
– Understand the design principles behind the three-layer weight architecture
– See how mathematical formulas are applied to search scoring
– Implement complex weight calculations in SQL
– Know the key techniques for performance optimization
Practical level:
– Weight configuration strategies for different scenarios
– Validating weight changes with A/B tests
– Tuning weight parameters with machine learning
– Monitoring and tuning in production
Design mindset:
– Weights are never fixed; they need continuous optimization
– Mathematical principles guide algorithm design
– Balancing performance against accuracy is an art
– Weight tuning should be data-driven
7.2 Next Chapter Preview
In the next chapter we dive into the index system architecture and learn how to:
- Design high-performance database table structures
- Implement batch index operations and transaction handling
- Optimize SQL query performance and index design
- Build a scalable indexing service architecture
Hands-on exercise: design a weight configuration for your current project and use an A/B test to compare the effectiveness of different weighting strategies.
Previous → Chapter 02: Core Principles of Search Engines | Next → Chapter 04: Index System Architecture