本篇介绍的是基于Elasticsearch实现搜索推荐词,其中需要用到Elasticsearch的pinyin插件以及ik分词插件,代码的实现这里提供了java跟C#的版本方便大家参考。
1.实现的结果
①当搜索【qiy】的时候,能匹配企业、祈愿等
②当搜索【qi业】的时候,只能匹配到企业;如果没有企业,将使用模糊查询,匹配祈愿。
③当搜索【q业】的时候结果同②。
④当搜索【企y】或【企ye】的时候结果同②。
⑤当搜索【qy】的时候,能匹配企业、祈愿等。
2.实现的逻辑
中文匹配前缀==》全拼匹配前缀==》拼音首字母匹配前缀==》拼音模糊匹配前缀
优先级从左到右,当前面三个有结果的时候不建议用模糊匹配,这样结果更加精确。比如需要获取8个推荐词,先获取中文的,如果足够8个将不再获取之后的匹配结果。但是当模糊匹配之前已经存在匹配结果了,即使数量没有达到8个,也不再继续获取模糊匹配结果。
3.插件准备
ik分词插件安装相对简单,网上教程也多,这里不做介绍。这里讲解下pinyin插件:官方版本的拼音插件不保留中文原字,分词结果只有拼音,这样会出现同音字匹配,导致结果不准确。
这里感谢小伙伴分享的拼音插件修改方法:https://www.cnblogs.com/danvid/p/10691547.html。
按照里面的操作处理后的插件将实现:
企业画报:{"qi","企","ye","业","hua","画","bao","报"}
拼音插件的各项具体属性参考:https://blog.csdn.net/a1148233614/article/details/80280024,里面有详细介绍。
4.Elasticsearch创建index
这里使用的ES版本为7.0.1,7.x已不再支持自定义的mapping type(映射类型),因此mappings下直接定义properties,创建代码如下:
PUT /suggest_tset
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"analyzer": {
"prefix_pinyin_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"prefix_pinyin"
]
},
"full_pinyin_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"full_pinyin"
]
},
"like_pinyin_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"like_pinyin"
]
}
},
"filter": {
"_pattern": {
"type": "pattern_capture",
"preserve_original": true,
"patterns": [
"([0-9])",
"([a-z])"
]
},
"prefix_pinyin": {
"type": "pinyin",
"keep_first_letter": "true",
"keep_full_pinyin": "false",
"none_chinese_pinyin_tokenize": "false",
"keep_separate_chinese": "true",
"keep_original": "false"
},
"full_pinyin": {
"type": "pinyin",
"keep_first_letter": "false",
"keep_full_pinyin": "true",
"keep_original": "false",
"keep_separate_chinese": "true",
"keep_none_chinese_in_first_letter": "false"
},
"like_pinyin": {
"type": "pinyin",
"keep_first_letter": "true",
"keep_full_pinyin": "true",
"keep_joined_full_pinyin": "false",
"keep_original": "false",
"keep_separate_chinese": "false",
"keep_none_chinese_in_first_letter": "false"
}
}
}
},
"mappings": {
"dynamic": "false",
"properties": {
"kwsuggest": {
"fields": {
"suggestText": {
"type": "completion",
"analyzer": "standard",
"preserve_separators": "false",
"preserve_position_increments": "true",
"max_input_length": 50
},
"prefix_pinyin": {
"type": "completion",
"analyzer": "prefix_pinyin_analyzer",
"search_analyzer": "standard",
"preserve_separators": "false"
},
"full_pinyin": {
"type": "completion",
"analyzer": "full_pinyin_analyzer",
"search_analyzer": "standard",
"preserve_separators": "false"
},
"like_pinyin": {
"type": "completion",
"analyzer": "like_pinyin_analyzer",
"preserve_separators": "false"
}
},
"type": "text"
}
}
}
}
这里插入几条测试数据
POST _bulk/?refresh=true
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "企业规划"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "祈愿设计 完美无瑕"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "悬崖的图片 美景"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "县衙地址 那里呢"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "悬崖风景图"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "起夜的风光 真的美"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "起夜第二个词 测试使用"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "需要一半留下一半打一字谜"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "许亚为"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "许雅非测试"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "徐杨是谁"}
下面为测试的查询语句
GET /suggest_tset/_search
{
"suggest": {
"suggestText": {
"prefix": "qi业",
"completion": {
"field": "kwsuggest.suggestText",
"skip_duplicates": true
}
},
"full_pinyin": {
"prefix": "qi业",
"completion": {
"field": "kwsuggest.full_pinyin",
"skip_duplicates": true
}
},
"prefix_pinyin": {
"prefix": "qi业",
"completion": {
"field": "kwsuggest.prefix_pinyin",
"skip_duplicates": true
}
},
"like_pinyin": {
"prefix": "qi业",
"completion": {
"field": "kwsuggest.like_pinyin",
"skip_duplicates": true,
"fuzzy": {
"fuzziness": 1
}
}
}
}
}
当输入查询条件为【qiy】的时候,结果为:
{
"took" : 17,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"suggest" : {
"full_pinyin" : [
{
"text" : "qiy",
"offset" : 0,
"length" : 3,
"options" : [
{
"text" : "起夜的风光 真的美",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "-jgnlHMBSEyTxFiDO4lU",
"_score" : 1.0,
"_source" : {
"kwsuggest" : "起夜的风光 真的美"
}
},
{
"text" : "起夜第二个词 测试使用",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "aDg3lHMBSEyTxFiDXprV",
"_score" : 1.0,
"_source" : {
"kwsuggest" : "起夜第二个词 测试使用"
}
}
]
}
],
"like_pinyin" : [
{
"text" : "qiy",
"offset" : 0,
"length" : 3,
"options" : [
{
"text" : "企业规划",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "9TgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "企业规划"
}
},
{
"text" : "祈愿设计 这是啥呢",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "9jgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "祈愿设计 这是啥呢"
}
},
{
"text" : "起夜的风光 真的美",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "-jgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜的风光 真的美"
}
},
{
"text" : "起夜第二个词 测试使用",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "aDg3lHMBSEyTxFiDXprV",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜第二个词 测试使用"
}
}
]
}
],
"prefix_pinyin" : [
{
"text" : "qiy",
"offset" : 0,
"length" : 3,
"options" : [ ]
}
],
"suggestText" : [
{
"text" : "qiy",
"offset" : 0,
"length" : 3,
"options" : [ ]
}
]
}
}
输入【qi业】的查询结果为
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"suggest" : {
"full_pinyin" : [
{
"text" : "qi业",
"offset" : 0,
"length" : 3,
"options" : [
{
"text" : "企业规划",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "9TgnlHMBSEyTxFiDO4lU",
"_score" : 1.0,
"_source" : {
"kwsuggest" : "企业规划"
}
}
]
}
],
"like_pinyin" : [
{
"text" : "qi业",
"offset" : 0,
"length" : 3,
"options" : [
{
"text" : "企业规划",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "9TgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "企业规划"
}
},
{
"text" : "祈愿设计 这是啥呢",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "9jgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "祈愿设计 这是啥呢"
}
},
{
"text" : "起夜的风光 真的美",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "-jgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜的风光 真的美"
}
},
{
"text" : "起夜第二个词 测试使用",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "aDg3lHMBSEyTxFiDXprV",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜第二个词 测试使用"
}
}
]
}
],
"prefix_pinyin" : [
{
"text" : "qi业",
"offset" : 0,
"length" : 3,
"options" : [ ]
}
],
"suggestText" : [
{
"text" : "qi业",
"offset" : 0,
"length" : 3,
"options" : [ ]
}
]
}
}
输入【qy】的结果为
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"suggest" : {
"full_pinyin" : [
{
"text" : "qy",
"offset" : 0,
"length" : 2,
"options" : [ ]
}
],
"like_pinyin" : [
{
"text" : "qy",
"offset" : 0,
"length" : 2,
"options" : [
{
"text" : "起夜的风光 真的美",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "-jgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜的风光 真的美"
}
},
{
"text" : "起夜第二个词 测试使用",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "aDg3lHMBSEyTxFiDXprV",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜第二个词 测试使用"
}
}
]
}
],
"prefix_pinyin" : [
{
"text" : "qy",
"offset" : 0,
"length" : 2,
"options" : [
{
"text" : "起夜的风光 真的美",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "-jgnlHMBSEyTxFiDO4lU",
"_score" : 1.0,
"_source" : {
"kwsuggest" : "起夜的风光 真的美"
}
},
{
"text" : "起夜第二个词 测试使用",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "aDg3lHMBSEyTxFiDXprV",
"_score" : 1.0,
"_source" : {
"kwsuggest" : "起夜第二个词 测试使用"
}
}
]
}
],
"suggestText" : [
{
"text" : "qy",
"offset" : 0,
"length" : 2,
"options" : [ ]
}
]
}
}
5.java版本代码
这里使用elasticsearch-rest-high-level-client
application.yml添加配置
# ES配置
elasticsearch:
ipAddress: [127.0.0.1:9200]
添加配置类
@Component
@Configuration
@ConfigurationProperties(prefix = "elasticsearch")
@Data
public class ElasticsearchRestClientConfig {
private Logger logger = LoggerFactory.getLogger(getClass());
private static final int ADDRESS\_LENGTH = 2;
private static final String HTTP\_SCHEME = "http";
/\*\*
\* 使用冒号隔开ip和端口
\*/
public String\[\] ipAddress;
@Bean
public RestClientBuilder restClientBuilder() {
HttpHost\[\] hosts = Arrays.stream(ipAddress)
.map(this::makeHttpHost)
.filter(Objects::nonNull)
.toArray(HttpHost\[\]::new);
logger.debug("hosts:{}", Arrays.toString(hosts));
return RestClient.builder(hosts);
}
@Bean(name = "highLevelClient")
public RestHighLevelClient highLevelClient(@Autowired RestClientBuilder restClientBuilder) {
return new RestHighLevelClient(restClientBuilder);
}
private HttpHost makeHttpHost(String s) {
assert StringUtils.isNotEmpty(s);
String\[\] address = s.split(":");
if (address.length == ADDRESS\_LENGTH) {
String ip = address\[0\];
int port = Integer.parseInt(address\[1\]);
return new HttpHost(ip, port, HTTP\_SCHEME);
} else {
return null;
}
}
}
实现的代码:
@Service
public class KwSuggestService implements IKwSuggest {
@Autowired
RestHighLevelClient highLevelClient;
@Override
public List<String> GetKwSuggestList(String kw){
SearchRequest searchRequest = new SearchRequest("suggest\_tset");
SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
SuggestBuilder suggestBuilder=new SuggestBuilder();
suggestBuilder.addSuggestion("suggestText", SuggestBuilders.completionSuggestion("kwsuggest.suggestText").prefix(kw).skipDuplicates(true).size(5));
suggestBuilder.addSuggestion("full\_pinyin", SuggestBuilders.completionSuggestion("kwsuggest.full\_pinyin").prefix(kw).skipDuplicates(true).size(5));
suggestBuilder.addSuggestion("prefix\_pinyin", SuggestBuilders.completionSuggestion("kwsuggest.prefix\_pinyin").prefix(kw).skipDuplicates(true).size(5));
suggestBuilder.addSuggestion("like\_pinyin", SuggestBuilders.completionSuggestion("kwsuggest.like\_pinyin").prefix(kw, Fuzziness.fromEdits(1)).skipDuplicates(true).size(5));
sourceBuilder.suggest(suggestBuilder);
sourceBuilder.timeout(new TimeValue(10, TimeUnit.SECONDS));
searchRequest.source(sourceBuilder);
List<String> result = new ArrayList<>();
List<String> suggestionList= Arrays.asList("suggestText","full\_pinyin","prefix\_pinyin","like\_pinyin");
try {
SearchResponse response = highLevelClient.search(searchRequest, RequestOptions.DEFAULT);
Suggest suggestions = response.getSuggest();
Integer index = 1;
for(String suggestionType : suggestionList){
CompletionSuggestion completionSuggestion = suggestions.getSuggestion(suggestionType);
for (CompletionSuggestion.Entry entry : completionSuggestion.getEntries()) {
for (CompletionSuggestion.Entry.Option option : entry) {
String suggestText = option.getHit().getSourceAsMap().get("kwsuggest").toString();
result.add(suggestText);
}
}
// 按照中文匹配、全拼匹配、拼音首字母匹配、模糊匹配的顺序,结果大于5的时候返回结果,根据自己业务需要判断这个返回的数量
if(result.size()>=5){
break;
}
// 中文匹配,全拼匹配以及拼音首字母匹配存在结果的,不需要模糊匹配
if(index==3 && result.size()>0){
break;
}
// 超过3个字模糊匹配不准确
if(kw.length()>3 && result.size()==0){
break;
}
}
return result;
} catch (IOException e) {
e.printStackTrace();
return new ArrayList<>();
}
}
}
6.C#代码实现
C#使用的是NEST
public partial class ElasticFactory
{
    // NOTE(review): the original method signature was lost in the source text (generic arguments,
    // method name and parameter list were stripped). The generic argument is taken from the
    // `result.data = new KeywordsSuggestResponseDataEntity()` assignment and the parameter name from
    // its use as `request.q`; the method name and the request type name are placeholders — confirm
    // against the real project before use.
    /// <summary>
    /// Queries four completion suggesters (Chinese prefix, full pinyin, pinyin first letters,
    /// fuzzy pinyin) and merges their options in that priority order.
    /// </summary>
    /// <param name="request">Request whose <c>q</c> property holds the text typed by the user.</param>
    /// <returns>
    /// A response with <c>code = 1</c> and the collected items on success, <c>code = 0</c> and a
    /// message on failure. When <c>q</c> is null or empty the freshly constructed response is
    /// returned untouched.
    /// </returns>
    public ExternalServiceResponse<KeywordsSuggestResponseDataEntity> GetKeywordsSuggest(KeywordsSuggestRequest request)
    {
        var result = new ExternalServiceResponse<KeywordsSuggestResponseDataEntity>();
        try
        {
            if (string.IsNullOrEmpty(request.q)) return result;

            // The original allocated `new Uri[0]` and then wrote to index 0, which always throws.
            var nodes = new[] { new Uri("http://127.0.0.1:9200") };
            var pool = new StaticConnectionPool(nodes);
            var settings = new ConnectionSettings(pool).DefaultIndex("suggest_tset");
            // NOTE(review): a client is created per call here; consider reusing a single instance.
            var client = new ElasticClient(settings);

            // Suggester names in priority order; the loop below depends on this order.
            string[] keys = new[] { "suggestText", "full_pinyin", "prefix_pinyin", "like_pinyin" };
            SearchDescriptor<object> search = new SearchDescriptor<object>();
            search
                .Source(r => r
                    .Includes(f => f
                        // Must include the field that AddSuggestItems reads from the hit source.
                        .Fields("kwsuggest")
                    )
                )
                .Suggest(s => s
                    .Completion(keys[0], c => c.Field("kwsuggest.suggestText").Prefix(request.q).SkipDuplicates())
                    .Completion(keys[1], c => c.Field("kwsuggest.full_pinyin").Prefix(request.q).SkipDuplicates())
                    .Completion(keys[2], c => c.Field("kwsuggest.prefix_pinyin").Prefix(request.q).SkipDuplicates())
                    .Completion(keys[3], c => c.Field("kwsuggest.like_pinyin").Prefix(request.q).SkipDuplicates().Fuzzy(m => m.Fuzziness(Fuzziness.EditDistance(1)))))
                ;

            var esResult = client.Search<dynamic>(s => search);
            // Check IsValid as well: a null check alone never reaches the failure branch below.
            if (esResult != null && esResult.IsValid)
            {
                result.code = 1;
                result.data = new KeywordsSuggestResponseDataEntity();
                // 1. Take the Chinese prefix matches first.
                // 2. If fewer than 5, add full-pinyin matches.
                // 3. If still fewer than 5, add pinyin-first-letter matches.
                // 4. Only when none of the above matched, fall back to fuzzy matching.
                if (esResult.Suggest != null)
                {
                    result.data.items = new List<KeywordsSuggestResponseItemEntity>();
                    int index = 1;
                    foreach (var key in keys)
                    {
                        AddSuggestItems(esResult.Suggest, key, result.data.items);
                        // Steps 1-3: return as soon as 5 items are available.
                        if (index >= 1 && index <= 3 && result.data.items.Count >= 5)
                        {
                            result.data.items = result.data.items.Take(5).ToList();
                            break;
                        }
                        // After step 3, any result at all is preferred over imprecise fuzzy matches.
                        if (index == 3 && result.data.items.Count > 0)
                        {
                            break;
                        }
                        // Inputs longer than 3 characters with no exact match: skip fuzzy matching,
                        // its results are too loose to be useful.
                        if (index == 3 && request.q.Length > 3)
                        {
                            break;
                        }
                        index++;
                    }
                    result.data.num = result.data.items.Count;
                }
                else
                {
                    result.data.num = 0;
                }
            }
            else
            {
                result.code = 0;
                result.msg = "查询失败";
            }
        }
        catch (Exception ex)
        {
            result.code = 0;
            result.msg = ex.Message;
        }
        return result;
    }

    /// <summary>
    /// Appends the options of one named suggester to <paramref name="items"/>, skipping
    /// keywords that are already present.
    /// </summary>
    /// <param name="suggest">The suggest section of the search response.</param>
    /// <param name="key">Name of the suggester to read.</param>
    /// <param name="items">Accumulated suggestions; modified in place.</param>
    private void AddSuggestItems(ISuggestDictionary<dynamic> suggest, string key, List<KeywordsSuggestResponseItemEntity> items)
    {
        var entries = suggest[key];
        // Guard against an absent or empty suggester result before indexing the first entry.
        if (entries == null || entries.Length == 0)
        {
            return;
        }
        foreach (var hit in entries[0].Options)
        {
            string kwSource = hit.Source["kwsuggest"];
            if (string.IsNullOrEmpty(kwSource))
            {
                continue;
            }
            // Do not add a keyword that is already in the list.
            if (items.Any(m => m.kw == kwSource))
            {
                continue;
            }
            items.Add(new KeywordsSuggestResponseItemEntity() { kw = kwSource });
        }
    }
}
手机扫一扫
移动阅读更方便
你可能感兴趣的文章