Elasticsearch实现搜索推荐词
阅读原文时间:2023年07月09日阅读:4

本篇介绍的是基于Elasticsearch实现搜索推荐词,其中需要用到Elasticsearch的pinyin插件以及ik分词插件,代码的实现这里提供了java跟C#的版本方便大家参考。

1.实现的结果

①当搜索【qiy】的时候,能匹配企业、祈愿等

②当搜索【qi业】的时候,只匹配企业;如果没有企业相关的结果,则使用模糊查询,匹配祈愿。

③当搜索【q业】的时候结果同②。

④当搜索【企y】或【企ye】的时候结果同②。

⑤当搜索【qy】的时候,能匹配企业、祈愿等。

2.实现的逻辑

中文匹配前缀==》全拼匹配前缀==》拼音首字母匹配前缀==》拼音模糊匹配前缀

优先级从左到右,当前面三个有结果的时候不建议用模糊匹配,这样结果更加精确。比如需要获取8个推荐词,先获取中文的,如果足够8个将不再获取之后的匹配结果。但是当模糊匹配之前已经存在匹配结果了,即使数量没有达到8个,也不再继续获取模糊匹配结果。

3.插件准备

ik分词插件安装相对简单,网上教程也多,这里不做介绍。这里讲解下pinyin插件,官方版本的拼音插件不支持中文,处理结果只有拼音的,这样会出现同音字匹配,结果不准确。

这里感谢小伙伴分享的拼音插件修改方法:https://www.cnblogs.com/danvid/p/10691547.html

按照里面的操作处理后的插件将实现:

企业画报:{"qi","企","ye","业","hua","画","bao","报"}

拼音插件的各项具体属性参考:https://blog.csdn.net/a1148233614/article/details/80280024,里面有详细介绍。

4.Elasticsearch创建index

这里使用的ES版本为7.0.1,该版本创建索引时不再需要(也不再支持)指定mapping type(映射类型),创建代码如下:

PUT /suggest_tset
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "analysis": {
      "analyzer": {
        "prefix_pinyin_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "prefix_pinyin"
          ]
        },
        "full_pinyin_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "full_pinyin"
          ]
        },
        "like_pinyin_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "like_pinyin"
          ]
        }
      },
      "filter": {
        "_pattern": {
          "type": "pattern_capture",
          "preserve_original": true,
          "patterns": [
            "([0-9])",
            "([a-z])"
          ]
        },
        "prefix_pinyin": {
          "type": "pinyin",
          "keep_first_letter": true,
          "keep_full_pinyin": false,
          "none_chinese_pinyin_tokenize": false,
          "keep_separate_chinese": true,
          "keep_original": false
        },
        "full_pinyin": {
          "type": "pinyin",
          "keep_first_letter": false,
          "keep_full_pinyin": true,
          "keep_original": false,
          "keep_separate_chinese": true,
          "keep_none_chinese_in_first_letter": false
        },
        "like_pinyin": {
          "type": "pinyin",
          "keep_first_letter": true,
          "keep_full_pinyin": true,
          "keep_joined_full_pinyin": false,
          "keep_original": false,
          "keep_separate_chinese": false,
          "keep_none_chinese_in_first_letter": false
        }
      }
    }
  },
  "mappings": {
    "dynamic": false,
    "properties": {
      "kwsuggest": {
        "type": "text",
        "fields": {
          "suggestText": {
            "type": "completion",
            "analyzer": "standard",
            "preserve_separators": false,
            "preserve_position_increments": true,
            "max_input_length": 50
          },
          "prefix_pinyin": {
            "type": "completion",
            "analyzer": "prefix_pinyin_analyzer",
            "search_analyzer": "standard",
            "preserve_separators": false
          },
          "full_pinyin": {
            "type": "completion",
            "analyzer": "full_pinyin_analyzer",
            "search_analyzer": "standard",
            "preserve_separators": false
          },
          "like_pinyin": {
            "type": "completion",
            "analyzer": "like_pinyin_analyzer",
            "preserve_separators": false
          }
        }
      }
    }
  }
}

 这里插入几条测试数据

POST _bulk/?refresh=true
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "企业规划"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "祈愿设计 这是啥呢"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "悬崖的图片 美景"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "县衙地址 那里呢"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "悬崖风景图"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "起夜的风光 真的美"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "起夜第二个词 测试使用"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "需要一半留下一半打一字谜"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "许亚为"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "许雅非测试"}
{ "index" : { "_index" : "suggest_tset" } }
{ "kwsuggest": "徐杨是谁"}

下面为测试的查询语句

GET /suggest_tset/_search
{
  "suggest": {
    "suggestText": {
      "prefix": "qi业",
      "completion": {
        "field": "kwsuggest.suggestText",
        "skip_duplicates": true
      }
    },
    "full_pinyin": {
      "prefix": "qi业",
      "completion": {
        "field": "kwsuggest.full_pinyin",
        "skip_duplicates": true
      }
    },
    "prefix_pinyin": {
      "prefix": "qi业",
      "completion": {
        "field": "kwsuggest.prefix_pinyin",
        "skip_duplicates": true
      }
    },
    "like_pinyin": {
      "prefix": "qi业",
      "completion": {
        "field": "kwsuggest.like_pinyin",
        "skip_duplicates": true,
        "fuzzy": { "fuzziness": 1 }
      }
    }
  }
}

当输入查询条件为【qiy】的时候,结果为:

{
"took" : 17,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"suggest" : {
"full_pinyin" : [
{
"text" : "qiy",
"offset" : 0,
"length" : 3,
"options" : [
{
"text" : "起夜的风光 真的美",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "-jgnlHMBSEyTxFiDO4lU",
"_score" : 1.0,
"_source" : {
"kwsuggest" : "起夜的风光 真的美"
}
},
{
"text" : "起夜第二个词 测试使用",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "aDg3lHMBSEyTxFiDXprV",
"_score" : 1.0,
"_source" : {
"kwsuggest" : "起夜第二个词 测试使用"
}
}
]
}
],
"like_pinyin" : [
{
"text" : "qiy",
"offset" : 0,
"length" : 3,
"options" : [
{
"text" : "企业规划",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "9TgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "企业规划"
}
},
{
"text" : "祈愿设计 这是啥呢",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "9jgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "祈愿设计 这是啥呢"
}
},
{
"text" : "起夜的风光 真的美",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "-jgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜的风光 真的美"
}
},
{
"text" : "起夜第二个词 测试使用",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "aDg3lHMBSEyTxFiDXprV",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜第二个词 测试使用"
}
}
]
}
],
"prefix_pinyin" : [
{
"text" : "qiy",
"offset" : 0,
"length" : 3,
"options" : [ ]
}
],
"suggestText" : [
{
"text" : "qiy",
"offset" : 0,
"length" : 3,
"options" : [ ]
}
]
}
}  

 输入【qi业】的查询结果为

{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"suggest" : {
"full_pinyin" : [
{
"text" : "qi业",
"offset" : 0,
"length" : 3,
"options" : [
{
"text" : "企业规划",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "9TgnlHMBSEyTxFiDO4lU",
"_score" : 1.0,
"_source" : {
"kwsuggest" : "企业规划"
}
}
]
}
],
"like_pinyin" : [
{
"text" : "qi业",
"offset" : 0,
"length" : 3,
"options" : [
{
"text" : "企业规划",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "9TgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "企业规划"
}
},
{
"text" : "祈愿设计 这是啥呢",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "9jgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "祈愿设计 这是啥呢"
}
},
{
"text" : "起夜的风光 真的美",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "-jgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜的风光 真的美"
}
},
{
"text" : "起夜第二个词 测试使用",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "aDg3lHMBSEyTxFiDXprV",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜第二个词 测试使用"
}
}
]
}
],
"prefix_pinyin" : [
{
"text" : "qi业",
"offset" : 0,
"length" : 3,
"options" : [ ]
}
],
"suggestText" : [
{
"text" : "qi业",
"offset" : 0,
"length" : 3,
"options" : [ ]
}
]
}
}

  输入【qy】的结果为

{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"suggest" : {
"full_pinyin" : [
{
"text" : "qy",
"offset" : 0,
"length" : 2,
"options" : [ ]
}
],
"like_pinyin" : [
{
"text" : "qy",
"offset" : 0,
"length" : 2,
"options" : [
{
"text" : "起夜的风光 真的美",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "-jgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜的风光 真的美"
}
},
{
"text" : "起夜第二个词 测试使用",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "aDg3lHMBSEyTxFiDXprV",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜第二个词 测试使用"
}
}
]
}
],
"prefix_pinyin" : [
{
"text" : "qy",
"offset" : 0,
"length" : 2,
"options" : [
{
"text" : "起夜的风光 真的美",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "-jgnlHMBSEyTxFiDO4lU",
"_score" : 1.0,
"_source" : {
"kwsuggest" : "起夜的风光 真的美"
}
},
{
"text" : "起夜第二个词 测试使用",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "aDg3lHMBSEyTxFiDXprV",
"_score" : 1.0,
"_source" : {
"kwsuggest" : "起夜第二个词 测试使用"
}
}
]
}
],
"suggestText" : [
{
"text" : "qy",
"offset" : 0,
"length" : 2,
"options" : [ ]
}
]
}
}

5.java版本代码

这里使用elasticsearch-rest-high-level-client

application.yml添加配置

# Elasticsearch settings
elasticsearch:
  # Cluster nodes as "host:port" entries; must be nested under `elasticsearch`
  # so that the `elasticsearch` configuration prefix can bind it.
  ipAddress: ["127.0.0.1:9200"]

添加配置类

@Component
@Configuration
@ConfigurationProperties(prefix = "elasticsearch")
@Data
public class ElasticsearchRestClientConfig {
private Logger logger = LoggerFactory.getLogger(getClass());

private static final int ADDRESS\_LENGTH = 2;  
private static final String HTTP\_SCHEME = "http";

/\*\*  
 \* 使用冒号隔开ip和端口  
 \*/  
public String\[\] ipAddress;

@Bean  
public RestClientBuilder restClientBuilder() {  
    HttpHost\[\] hosts = Arrays.stream(ipAddress)  
            .map(this::makeHttpHost)  
            .filter(Objects::nonNull)  
            .toArray(HttpHost\[\]::new);  
    logger.debug("hosts:{}", Arrays.toString(hosts));  
    return RestClient.builder(hosts);  
}

@Bean(name = "highLevelClient")  
public RestHighLevelClient highLevelClient(@Autowired RestClientBuilder restClientBuilder) {  
    return new RestHighLevelClient(restClientBuilder);  
}

private HttpHost makeHttpHost(String s) {  
    assert StringUtils.isNotEmpty(s);  
    String\[\] address = s.split(":");  
    if (address.length == ADDRESS\_LENGTH) {  
        String ip = address\[0\];  
        int port = Integer.parseInt(address\[1\]);  
        return new HttpHost(ip, port, HTTP\_SCHEME);  
    } else {  
        return null;  
    }  
}  

}

实现的代码:

@Service
public class KwSuggestService implements IKwSuggest {
@Autowired
RestHighLevelClient highLevelClient;

@Override  
public List<String> GetKwSuggestList(String kw){  
    SearchRequest searchRequest = new SearchRequest("suggest\_tset");  
    SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();  
    SuggestBuilder suggestBuilder=new SuggestBuilder();  
    suggestBuilder.addSuggestion("suggestText", SuggestBuilders.completionSuggestion("kwsuggest.suggestText").prefix(kw).skipDuplicates(true).size(5));  
    suggestBuilder.addSuggestion("full\_pinyin", SuggestBuilders.completionSuggestion("kwsuggest.full\_pinyin").prefix(kw).skipDuplicates(true).size(5));  
    suggestBuilder.addSuggestion("prefix\_pinyin", SuggestBuilders.completionSuggestion("kwsuggest.prefix\_pinyin").prefix(kw).skipDuplicates(true).size(5));  
    suggestBuilder.addSuggestion("like\_pinyin", SuggestBuilders.completionSuggestion("kwsuggest.like\_pinyin").prefix(kw, Fuzziness.fromEdits(1)).skipDuplicates(true).size(5));  
    sourceBuilder.suggest(suggestBuilder);  
    sourceBuilder.timeout(new TimeValue(10, TimeUnit.SECONDS));  
    searchRequest.source(sourceBuilder);  
    List<String> result = new ArrayList<>();  
    List<String> suggestionList= Arrays.asList("suggestText","full\_pinyin","prefix\_pinyin","like\_pinyin");  
    try {  
        SearchResponse response = highLevelClient.search(searchRequest, RequestOptions.DEFAULT);  
        Suggest suggestions = response.getSuggest();  
        Integer index = 1;  
        for(String suggestionType : suggestionList){  
            CompletionSuggestion completionSuggestion = suggestions.getSuggestion(suggestionType);  
            for (CompletionSuggestion.Entry entry : completionSuggestion.getEntries()) {  
                for (CompletionSuggestion.Entry.Option option : entry) {  
                    String suggestText =  option.getHit().getSourceAsMap().get("kwsuggest").toString();  
                    result.add(suggestText);  
                }  
            }  
            // 按照中文匹配、全拼匹配、拼音首字母匹配、模糊匹配的顺序,结果大于5的时候返回结果,根据自己业务需要判断这个返回的数量  
            if(result.size()>=5){  
                break;  
            }  
            // 中文匹配,全拼匹配以及拼音首字母匹配存在结果的,不需要模糊匹配  
            if(index==3 && result.size()>0){  
                break;  
            }  
            // 超过3个字模糊匹配不准确  
            if(kw.length()>3 && result.size()==0){  
                break;  
            }  
        }  
        return result;  
    } catch (IOException e) {  
        e.printStackTrace();  
        return new ArrayList<>();  
    }  
}  

}

6.C#代码实现

C#使用的是NEST

/// <summary>
/// Elasticsearch access helpers; this part implements keyword suggestions
/// backed by the completion suggesters of the <c>suggest_tset</c> index.
/// </summary>
public partial class ElasticFactory
{
    /// <summary>
    /// Returns up to five keyword suggestions for <c>request.q</c>.
    /// Four suggesters are queried at once and merged in priority order:
    /// Chinese prefix, full pinyin, pinyin first letters, then fuzzy pinyin.
    /// Fuzzy results are only used when the first three produced nothing and
    /// the input is at most three characters long.
    /// </summary>
    /// <param name="request">The suggestion request; <c>q</c> is the typed text.</param>
    /// <returns>
    /// A response with <c>code</c> 1 and the suggestions on success,
    /// or <c>code</c> 0 and a message when the query failed or threw.
    /// </returns>
    public ExternalServiceResponse GetKeywordsSuggest(ElasticKeywordsSuggestRequest request)
    {
        var result = new ExternalServiceResponse();

        try
        {
            if (string.IsNullOrEmpty(request.q)) return result;

            // The array must be sized by its initializer: `new Uri[0]` followed by
            // `nodes[0] = ...` throws IndexOutOfRangeException on every call.
            var nodes = new[] { new Uri("http://127.0.0.1:9200") };
            var pool = new StaticConnectionPool(nodes);
            var settings = new ConnectionSettings(pool).DefaultIndex("suggest_tset");
            var client = new ElasticClient(settings);

            string[] keys = new[] { "suggestText", "full_pinyin", "prefix_pinyin", "like_pinyin" };
            SearchDescriptor<object> search = new SearchDescriptor<object>();
            search
                .Source(r => r
                    .Includes(f => f
                        // Must name the field read back in AddSuggestItems.
                        .Fields("kwsuggest")
                    )
                )
                .Suggest(s => s.Completion(keys[0], c => c.Field("kwsuggest.suggestText").Prefix(request.q).SkipDuplicates(true))
                    .Completion(keys[1], c => c.Field("kwsuggest.full_pinyin").Prefix(request.q).SkipDuplicates(true))
                    .Completion(keys[2], c => c.Field("kwsuggest.prefix_pinyin").Prefix(request.q).SkipDuplicates(true))
                    .Completion(keys[3], c => c.Field("kwsuggest.like_pinyin").Prefix(request.q).SkipDuplicates(true).Fuzzy(m => m.Fuzziness(Fuzziness.EditDistance(1)))))
                ;
            var esResult = client.Search<dynamic>(s => search);
            // A failed call is reported through IsValid rather than a null response.
            if (esResult != null && esResult.IsValid)
            {
                result.code = 1;
                result.data = new KeywordsSuggestResponseDataEntity();
                // 1. Take the Chinese prefix matches first.
                // 2. If fewer than 5, add full-pinyin matches.
                // 3. If still fewer than 5, add pinyin first-letter matches.
                // 4. Only when none of the above matched, fall back to fuzzy matches.
                if (esResult.Suggest != null)
                {
                    result.data.items = new List<KeywordsSuggestResponseItemEntity>();
                    int index = 1;
                    foreach (var key in keys)
                    {
                        AddSuggestItems(esResult.Suggest, key, result.data.items);
                        // Stages 1-3: stop as soon as 5 items are collected.
                        if (index >= 1 && index <= 3 && result.data.items.Count >= 5)
                        {
                            result.data.items = result.data.items.Take(5).ToList();
                            break;
                        }
                        // After stage 3 with at least one exact match: skip the imprecise fuzzy stage.
                        if (index == 3 && result.data.items.Count > 0)
                        {
                            break;
                        }
                        // No exact match and more than 3 characters typed: fuzzy results match too poorly.
                        if (index == 3 && request.q.Length > 3)
                        {
                            break;
                        }
                        index++;
                    }
                    result.data.num = result.data.items.Count;
                }
                else
                {
                    result.data.num = 0;
                }
            }
            else
            {
                result.code = 0;
                result.msg = "查询失败";
            }
        }
        catch (Exception ex)
        {
            result.code = 0;
            result.msg = ex.Message;
        }

        return result;
    }

    /// <summary>
    /// Appends the options of the suggester named <paramref name="key"/> to
    /// <paramref name="items"/>, skipping keywords that are already present.
    /// </summary>
    /// <param name="suggest">The suggest section of the search response.</param>
    /// <param name="key">The suggester name to read.</param>
    /// <param name="items">The list that collects the merged suggestions.</param>
    private void AddSuggestItems(ISuggestDictionary<dynamic> suggest, string key, List<KeywordsSuggestResponseItemEntity> items)
    {
        if (suggest == null || !suggest.ContainsKey(key))
        {
            return;
        }

        var entries = suggest[key];
        if (entries == null)
        {
            return;
        }

        // Only the first entry is read, as before; guard against an empty entry list.
        var firstEntry = entries.FirstOrDefault();
        if (firstEntry == null || firstEntry.Options == null)
        {
            return;
        }

        foreach (var hit in firstEntry.Options)
        {
            string kwSource = hit.Source["kwsuggest"];
            // Do not add a keyword that is already in the list.
            if (items.Any(m => m.kw == kwSource))
            {
                continue;
            }
            items.Add(new KeywordsSuggestResponseItemEntity() { kw = kwSource });
        }
    }
}

手机扫一扫

移动阅读更方便

阿里云服务器
腾讯云服务器
七牛云服务器

你可能感兴趣的文章