Elasticsearch 模糊查询性能优化：wildcard vs ngram 实测对比

智谱 GLM，支持多语言、多任务推理。从写作到代码生成，从搜索到知识问答，AI 生产力的中国解法。

Build Index

{
  "warmsearch" : {
    "aliases" : { },
    "mappings" : {
      "properties" : {
        "aname" : {
          "type" : "wildcard"
        },
        "sn" : {
          "type" : "text"
        },
        "title" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword"
            }
          },
          "analyzer" : "my_analyzer"
        }
      }
    },
    "settings" : {
      "index" : {
        "max_ngram_diff" : "10",
        "routing" : {
          "allocation" : {
            "include" : {
              "_tier_preference" : "data_content"
            }
          }
        },
        "number_of_shards" : "1",
        "provided_name" : "warmsearch",
        "creation_date" : "1701145536507",
        "analysis" : {
          "analyzer" : {
            "my_analyzer" : {
              "tokenizer" : "my_tokenizer"
            }
          },
          "tokenizer" : {
            "my_tokenizer" : {
              "token_chars" : [
                "letter",
                "digit"
              ],
              "min_gram" : "1",
              "type" : "ngram",
              "max_gram" : "10"
            }
          }
        },
        "number_of_replicas" : "1",
        "queries" : {
          "cache" : {
            "enabled" : "false"
          }
        },
        "uuid" : "OG0QGTrxQiejej8Hj0T9eA",
        "version" : {
          "created" : "7100299"
        }
      }
    }
  }
}

Put Data Insert

import random

import requests
import uuid

def add_document_to_es(index, document):
    """向Elasticsearch索引中添加一个文档"""
    response = requests.post(f'http://localhost:9200/{index}/_doc/', json=document)
    return response.json()

# 索引名称
index_name = 'warmsearch'

for i in range(1, 500000):
    # 生成随机数据
    random_title = uuid.uuid4()
    random_sn = uuid.uuid4()
    aname = uuid.uuid4()
    random_number = random.randint(0,1)
    # 根据随机数输出扩展名
    if random_number == 0:
        filetype = ".log"
    else:
        filetype = ".bag"
    # 构建文档
    document = {
        "title": str(random_title),
        "sn": str(random_sn),
        "aname": str(aname) + filetype
    }

    print(document)
    # 添加文档到Elasticsearch
    response = add_document_to_es(index_name, document)
    print(response)

Close ES Index

POST /warmsearch/_close

Modify ES disable search with cache


PUT warmsearch/_settings
{
  "index.queries.cache.enabled": false
}

Open Es Index

POST /warmsearch/_open

Search Bench Testing

import requests
import time

# Elasticsearch服务器的URL
base_url = 'http://localhost:9200'

# 查询请求主体，分别为match、match_phrase和title.keyword查询

# keyName = "title"
# value = "asd"

keyName = "aname"
value = "a0"

queries = [
    {
        "query": {
            "match": {
                keyName: value
            }
        }
    },
    {
        "query": {
            "match": {
                keyName + ".keyword": value
            }
        }
    },
    {
        "query": {
            "match_phrase": {
                keyName: value
            }
        }
    },

    {
        "query": {
            "match_phrase": {
                keyName + ".keyword": value
            }
        }
    },
    {
        "query": {
            "wildcard": {
                keyName: "*" + value + "*"
            }
        }
    },
    {
        "query": {
            "wildcard": {
                keyName + ".keyword": "*" + value + "*"
            }
        }
    }
]

# 执行50次查询

for query in queries:
    # print(query)
    total_time = 0
    num_queries = 50
    for i in range(num_queries):
        start_time = time.time()
        # 发送查询请求
        response = requests.post(f'{base_url}/warmsearch/_search', json=query)

        end_time = time.time()
        elapsed_time = end_time - start_time

        total_time += elapsed_time

        if (i == (num_queries - 1)):
            response_json = response.json()
            hits = response_json.get('hits', {})
            total = hits.get("total", {})
            # print(total.get("value", {}))

        # 输出每次请求的访问耗时
        # print(f'Request {i + 1}: Elapsed Time = {elapsed_time:.4f} seconds')
        # 计算平均耗时
    average_time = total_time / num_queries
    print(f'Query  {query.get("query")}, | {total["value"]}| {average_time:.4f}')

Result

965658 Logs ，50 times avg ,close cache

Query	Method	KeyName	Counts	Analysis	Avg Time (s)
{'match': {'title': '-ah'}},	match	title	10000	Ngram	0.0056
{'match': {'title.keyword': '-ah'}}	match	title.keyword	0	Ngram	0.0018
{'match_phrase': {'title': '-ah'}}	match_phrase	title	45	Ngram	0.0036
{'match_phrase': {'title.keyword': '-ah'}}	match_phrase	title.keyword	0	Ngram	0.0024
{'wildcard': {'title': '-ah'}}	wildcard	title	0	Ngram	0.8781
{'wildcard': {'title.keyword': '-ah'}}	wildcard	title.keyword	0	Ngram	0.0580
{'match': {'sn': '5c'}}	match	sn	0	Default	0.0105
{'match': {'sn.keyword': '5c'}}	match	sn.keyword	0	Default	0.0048
{'match_phrase': {'sn': '5c'}}	match_phrase	sn	0	Default	0.0068
{'match_phrase': {'sn.keyword': '5c'}}	match_phrase	sn.keyword	0	Default	0.0054
{'wildcard': {'sn': '5c'}}	wildcard	sn	10000	Default	0.1659
{'wildcard': {'sn.keyword': '5c'}}	wildcard	sn.keyword	0	Default	0.0062
{'match': {'aname': 'a0'}}	match	aname	0	wildcard	0.0138
{'match': {'aname.keyword': 'a0'}}	match	aname.keyword	0	wildcard	0.0065
{'match_phrase': {'aname': 'a0'}}	match_phrase	aname	0	wildcard	0.0061
{'match_phrase': {'aname.keyword': 'a0'}}	match_phrase	aname.keyword	0	wildcard	0.0048
{'wildcard': {'aname': 'a0'}}	wildcard	aname	1000	wildcard	0.0380
{'wildcard': {'aname.keyword': 'a0'}}	wildcard	aname.keyword	0	wildcard	0.0086

Elasticsearch 模糊查询性能优化：wildcard vs ngram 实测对比

Build Index

Put Data Insert

Close ES Index

Modify ES disable search with cache

Open Es Index

Search Bench Testing

Result

相关推荐

评论抢沙发

作者介绍

Toy

文章目录

置顶推荐

前沿哨所

斯坦福突破：发现软骨再生新法，有望终结关节炎

生成式AI正在“自噬”：摧毁其赖以生存的内容生态

Anthropic公开Claude完整“宪法”，重塑AI价值观与训练范式

从遍地开花到四面楚歌：AI开发者的现实反思

拒绝虚高！开发者用AI打造原生级Pixel网速工具

AI金融顾问Autonomous招聘，主打0%费用挑战传统

最新文章

热门专题

热门标签

网站统计

最新评论

十年稳如初 — LocVPS，用时间证明实力

10+ 年老牌云主机服务商，全球机房覆盖，性能稳定、价格厚道。

Build Index

Put Data Insert

Close ES Index

Modify ES disable search with cache

Open Es Index

Search Bench Testing

Result

相关推荐

评论 抢沙发

作者介绍

Toy

文章目录

置顶推荐

前沿哨所

斯坦福突破：发现软骨再生新法，有望终结关节炎

生成式AI正在“自噬”：摧毁其赖以生存的内容生态

Anthropic公开Claude完整“宪法”，重塑AI价值观与训练范式

从遍地开花到四面楚歌：AI开发者的现实反思

拒绝虚高！开发者用AI打造原生级Pixel网速工具

AI金融顾问Autonomous招聘，主打0%费用挑战传统

最新文章

热门专题

热门标签

网站统计

最新评论

十年稳如初 — LocVPS，用时间证明实力

10+ 年老牌云主机服务商，全球机房覆盖，性能稳定、价格厚道。

评论抢沙发