AI编程 · 架构思考 · 技术人生

Elasticsearch 模糊查询性能优化:wildcard vs ngram 实测对比

智谱 GLM,支持多语言、多任务推理。从写作到代码生成,从搜索到知识问答,AI 生产力的中国解法。

Build Index

{
  "warmsearch" : {
    "aliases" : { },
    "mappings" : {
      "properties" : {
        "aname" : {
          "type" : "wildcard"
        },
        "sn" : {
          "type" : "text"
        },
        "title" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword"
            }
          },
          "analyzer" : "my_analyzer"
        }
      }
    },
    "settings" : {
      "index" : {
        "max_ngram_diff" : "10",
        "routing" : {
          "allocation" : {
            "include" : {
              "_tier_preference" : "data_content"
            }
          }
        },
        "number_of_shards" : "1",
        "provided_name" : "warmsearch",
        "creation_date" : "1701145536507",
        "analysis" : {
          "analyzer" : {
            "my_analyzer" : {
              "tokenizer" : "my_tokenizer"
            }
          },
          "tokenizer" : {
            "my_tokenizer" : {
              "token_chars" : [
                "letter",
                "digit"
              ],
              "min_gram" : "1",
              "type" : "ngram",
              "max_gram" : "10"
            }
          }
        },
        "number_of_replicas" : "1",
        "queries" : {
          "cache" : {
            "enabled" : "false"
          }
        },
        "uuid" : "OG0QGTrxQiejej8Hj0T9eA",
        "version" : {
          "created" : "7100299"
        }
      }
    }
  }
}

Put Data Insert

import random

import requests
import uuid

def add_document_to_es(index, document):
    """向Elasticsearch索引中添加一个文档"""
    response = requests.post(f'http://localhost:9200/{index}/_doc/', json=document)
    return response.json()

# 索引名称
index_name = 'warmsearch'

for i in range(1, 500000):
    # 生成随机数据
    random_title = uuid.uuid4()
    random_sn = uuid.uuid4()
    aname = uuid.uuid4()
    random_number = random.randint(0,1)
    # 根据随机数输出扩展名
    if random_number == 0:
        filetype = ".log"
    else:
        filetype = ".bag"
    # 构建文档
    document = {
        "title": str(random_title),
        "sn": str(random_sn),
        "aname": str(aname) + filetype
    }

    print(document)
    # 添加文档到Elasticsearch
    response = add_document_to_es(index_name, document)
    print(response)

Close ES Index

POST /warmsearch/_close

Modify ES disable search with cache


PUT warmsearch/_settings
{
  "index.queries.cache.enabled": false
}

Open Es Index

POST /warmsearch/_open

Search Bench Testing

import requests
import time

# Elasticsearch服务器的URL
base_url = 'http://localhost:9200'

# 查询请求主体,分别为match、match_phrase和title.keyword查询

# keyName = "title"
# value = "asd"

keyName = "aname"
value = "a0"

queries = [
    {
        "query": {
            "match": {
                keyName: value
            }
        }
    },
    {
        "query": {
            "match": {
                keyName + ".keyword": value
            }
        }
    },
    {
        "query": {
            "match_phrase": {
                keyName: value
            }
        }
    },

    {
        "query": {
            "match_phrase": {
                keyName + ".keyword": value
            }
        }
    },
    {
        "query": {
            "wildcard": {
                keyName: "*" + value + "*"
            }
        }
    },
    {
        "query": {
            "wildcard": {
                keyName + ".keyword": "*" + value + "*"
            }
        }
    }
]

# 执行50次查询

for query in queries:
    # print(query)
    total_time = 0
    num_queries = 50
    for i in range(num_queries):
        start_time = time.time()
        # 发送查询请求
        response = requests.post(f'{base_url}/warmsearch/_search', json=query)

        end_time = time.time()
        elapsed_time = end_time - start_time

        total_time += elapsed_time

        if (i == (num_queries - 1)):
            response_json = response.json()
            hits = response_json.get('hits', {})
            total = hits.get("total", {})
            # print(total.get("value", {}))

        # 输出每次请求的访问耗时
        # print(f'Request {i + 1}: Elapsed Time = {elapsed_time:.4f} seconds')
        # 计算平均耗时
    average_time = total_time / num_queries
    print(f'Query  {query.get("query")}, | {total["value"]}| {average_time:.4f}')

Result

  1. 965658 Logs ,50 times avg ,close cache
Query Method KeyName Counts Analysis Avg Time (s)
{'match': {'title': '-ah'}}, match title 10000 Ngram 0.0056
{'match': {'title.keyword': '-ah'}} match title.keyword 0 Ngram 0.0018
{'match_phrase': {'title': '-ah'}} match_phrase title 45 Ngram 0.0036
{'match_phrase': {'title.keyword': '-ah'}} match_phrase title.keyword 0 Ngram 0.0024
{'wildcard': {'title': '-ah'}} wildcard title 0 Ngram 0.8781
{'wildcard': {'title.keyword': '-ah'}} wildcard title.keyword 0 Ngram 0.0580
{'match': {'sn': '5c'}} match sn 0 Default 0.0105
{'match': {'sn.keyword': '5c'}} match sn.keyword 0 Default 0.0048
{'match_phrase': {'sn': '5c'}} match_phrase sn 0 Default 0.0068
{'match_phrase': {'sn.keyword': '5c'}} match_phrase sn.keyword 0 Default 0.0054
{'wildcard': {'sn': '5c'}} wildcard sn 10000 Default 0.1659
{'wildcard': {'sn.keyword': '5c'}} wildcard sn.keyword 0 Default 0.0062
{'match': {'aname': 'a0'}} match aname 0 wildcard 0.0138
{'match': {'aname.keyword': 'a0'}} match aname.keyword 0 wildcard 0.0065
{'match_phrase': {'aname': 'a0'}} match_phrase aname 0 wildcard 0.0061
{'match_phrase': {'aname.keyword': 'a0'}} match_phrase aname.keyword 0 wildcard 0.0048
{'wildcard': {'aname': 'a0'}} wildcard aname 1000 wildcard 0.0380
{'wildcard': {'aname.keyword': 'a0'}} wildcard aname.keyword 0 wildcard 0.0086
赞(0)
未经允许不得转载:Toy's Tech Notes » Elasticsearch 模糊查询性能优化:wildcard vs ngram 实测对比
免费、开放、可编程的智能路由方案,让你的服务随时随地在线。

评论 抢沙发

十年稳如初 — LocVPS,用时间证明实力

10+ 年老牌云主机服务商,全球机房覆盖,性能稳定、价格厚道。

老品牌,更懂稳定的价值你的第一台云服务器,从 LocVPS 开始