es全部数据下载，大数量scan或游标分页方式 scroll；es7 mapping，dense-vector向量索引创建和查询；按字段查询；嵌套对象

本文链接：https://blog.csdn.net/weixin_42357472/article/details/111849209

参考：https://blog.csdn.net/weixin_40341116/article/details/80821655

1、scan、scroll

建议用scan

import elasticsearch
import elasticsearch.helpers
# Connection settings -- placeholders, replace with the address of your cluster.
ES_HOST = "localhost"
ES_PORT = 9200

es = elasticsearch.Elasticsearch([{"host": ES_HOST, "port": ES_PORT}])

# helpers.scan is the client's abstraction over the scroll API: it returns an
# iterator over the hits, so there is no scroll id to manage by hand.
results = elasticsearch.helpers.scan(
    es,
    index="test_index",
    query={"query": {"match_all": {}}},
)

for item in results:
    print(item['_id'], item['_source']['name'])

比如某index总共4675 documents
在这里插入图片描述
注意：search 请求中的 size（此处为 2000）必须与下面翻页循环中用来计算翻页次数的每页条数保持一致。

from elasticsearch import Elasticsearch
import json

# Fetch every document of an index with the scroll API.

# Alternative connection using a host/port dict instead of a URL:
# es = Elasticsearch([{"host": "4*******9", "port": 8710}])

es = Elasticsearch(hosts="http:******0/", http_auth=('abc','dataanalysis'))

PAGE_SIZE = 2000  # number of hits requested per page

query_json = {
    "query": {
        "bool": {
            "must": [
                {"match_all": {}}
            ],
            "must_not": [],
            "should": []
        }
    },
    "sort": [],
    "aggs": {}
}

# scroll='5m' asks the server to keep the search context alive for 5 minutes
query = es.search(index='kibana_sam******rce', body=query_json, scroll='5m', size=PAGE_SIZE)
print(query)
results = query['hits']['hits']  # first page of hits
total = query['hits']['total']   # total reported by Elasticsearch
scroll_id = query['_scroll_id']  # cursor used to fetch the following pages
print(results, total, scroll_id)

# Keep scrolling until a page comes back empty. This does not depend on the
# reported total, and it always uses the scroll id from the latest response.
while True:
    # the scroll argument must be passed on every call, otherwise the request fails
    page = es.scroll(scroll_id=scroll_id, scroll='5m')
    scroll_id = page['_scroll_id']
    page_hits = page['hits']['hits']
    if not page_hits:
        break
    results += page_hits

# Release the server-side search context instead of waiting for it to expire.
es.clear_scroll(scroll_id=scroll_id)

# Flatten every hit into an "<_id>,<customer_first_name>" string.
aaa = [
    f"{res['_id']},{res['_source']['customer_first_name']}"
    for res in results
]

print('done!')

print(len(aaa))

# Save the collected hits as a NumPy .npy file
import numpy as np

hits_array = np.array(results)
np.save("all_ab***atas.npy", hits_array)

## Save as CSV and JSON
import csv
import json

with open('./event_title.csv', 'w', newline='', encoding='utf-8') as flow:
    csv_writer = csv.writer(flow)
    # Write _id and title as two separate columns and let the csv module do
    # the quoting; pre-joining them with ',' would produce a single quoted
    # field per row.
    csv_writer.writerows(
        [res['_id'], res['_source']['title']] for res in results
    )


with open('./data1.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII text readable in the output file
    json.dump(results, f, ensure_ascii=False)

在这里插入图片描述

from elasticsearch import Elasticsearch

es = Elasticsearch()

query_json = {
    "query": {
        "match_all": {}  # match every document
    }
}
page_num = 100  # number of hits fetched per page

query = es.search(index=8, body=query_json, scroll='5m', size=page_num)

results = query['hits']['hits']  # first page of hits
# Kept for reference only. The paging loop below does not use it, because
# hits.total is a plain integer on ES 6.x but an object with a "value" key on
# ES 7+, so arithmetic on it is not portable.
total = query['hits']['total']
scroll_id = query['_scroll_id']  # cursor used to fetch the following pages

# Scroll until an empty page is returned, always using the newest scroll id.
while True:
    # the scroll argument must be passed on every call, otherwise the request fails
    page = es.scroll(scroll_id=scroll_id, scroll='5m')
    scroll_id = page['_scroll_id']
    page_hits = page['hits']['hits']
    if not page_hits:
        break
    results += page_hits

# Release the server-side search context once everything has been read.
es.clear_scroll(scroll_id=scroll_id)

# Keep only the "word" field of each document.
alist = [hit["_source"]["word"] for hit in results]
print(len(alist))

es7 mapping，dense-vector向量创建及查询

1、先创建索引，然后创建mapping
代码版：

def build_index(es_client,_index):
    """Create the index ``_index`` with a fixed field mapping.

    The mapping declares ``heat`` and ``reference_time`` as ``long`` and
    ``hpicmd5`` as ``text`` with a ``keyword`` sub-field limited by
    ``ignore_above: 256``.

    Args:
        es_client: client object exposing ``indices.create(index=..., body=...)``.
        _index: name of the index to create.
    """
    print("there2")

    keyword_subfield = {"ignore_above": 256, "type": "keyword"}
    field_properties = {
        "heat": {"type": "long"},
        "hpicmd5": {
            "type": "text",
            "fields": {"keyword": keyword_subfield},
        },
        "reference_time": {"type": "long"},
    }
    # The properties sit directly under "mappings"; there is no mapping-type
    # level (such as "_doc") in between.
    index_body = {"mappings": {"properties": field_properties}}

    es_client.indices.create(index=_index, body=index_body)
  
# Name of the index to create (same name the insert example further down uses).
_index = 'search_user_log_test1'
es_client = Elasticsearch(hosts=[{"host": "*******", "port": 8710}], timeout=1000, http_auth=None)
build_index(es_client,_index)

在这里插入图片描述
2、插入数据
代码版：


def create_albume_dense_vector(i_json,es_client,_index):
    """Index one log record into ``_index`` through the bulk helper.

    Only the fields ``costTime``, ``esSearchTime``, ``host``, ``message`` and
    ``requestJson`` are copied from ``i_json``. A field that is absent from
    ``i_json`` is left out of the document instead of raising ``KeyError``.

    Args:
        i_json: dict holding the parsed log record.
        es_client: Elasticsearch client passed on to ``helpers.bulk``.
        _index: name of the target index.
    """
    # Local import so the function does not depend on a module-level
    # ``helpers`` name being in scope.
    from elasticsearch import helpers

    copied_fields = ("costTime", "esSearchTime", "host", "message", "requestJson")

    # One bulk action: "_index" is action metadata. Per the elasticsearch-py
    # helpers documentation, the remaining keys are used as the document body
    # when no "_source" key is given.
    action = {"_index": _index}
    action.update(
        {name: i_json[name] for name in copied_fields if name in i_json}
    )

    helpers.bulk(es_client, [action], request_timeout=300)
 

    
    
# Target index and client used by the insert example
_index = 'search_user_log_test1'
es_client = Elasticsearch(
    hosts=[{"host": "****", "port": 8710}],
    timeout=1000,
    http_auth=None,
)

# Sample log record as a JSON string, parsed into a dict before indexing
i="""{"costTime":"35","esSearchltCode":"0","searchPath":[1,2,5],"shortVideo":[],"tts":"熊出没"}"""
i_json = json.loads(i)

create_albume_dense_vector(i_json, es_client, _index)

put需要传入id，post自动生成id

在这里插入图片描述

requests post

import requests
import json
 
# POST to <index>/_doc lets Elasticsearch generate the document id.
host = "http://4***:8710/se****st1/_doc"

# charset is a parameter of Content-Type, not an HTTP header of its own
headers = {"Content-Type": "application/json; charset=UTF-8"}

# Sample document to index, given as a JSON string
json_string = """{"costTime":"35","esSearchltCode":"0","searchPath":[1,2,5],"shortVideo":[],"tts":"熊出没"}"""

# Parsing first rejects malformed JSON locally before anything is sent.
payload = json.loads(json_string)

# An explicit timeout keeps the call from blocking forever on a dead server.
r = requests.post(host, headers=headers, data=json.dumps(payload), timeout=30)
# response = r.json()
print (r.text)

3、向量查询
{ "query": { "script_score": { "query": { "match_all": {} }, "script": { "source": "dotProduct(params.queryVector, doc['vv'])", "params": { "queryVector": [0.004, 0.14, -0.2,0.13,0.22] } } } } }

在这里插入图片描述

按字段查询

{
    "query": {
        "bool": {
            "must": [
                {"terms": {"vendor": ["13", "15"]}},
                {"term": {"type": "1"}},
                {"term": {"status": "1"}},
                {"terms": {"channelid.keyword": ["001", "002", "003", "004", "005", "023"]}}
            ]}}
}

elasticsearch嵌套对象的映射

参考：
https://www.elastic.co/guide/cn/elasticsearch/guide/current/nested-objects.html
https://blog.csdn.net/fu_huo_1993/article/details/88350132

查询语句

{
  "query": {
    "bool": {
      "must": [
        {
          "nested": {
            "path": "requ***lu.slots",
            "query": {
              "bool": {
                "must": [
                  {
                    "match": {
                      "requ****ts.name": "FreeWord"
                    }
                  },
                  {
                    "match": {
                      "requ****8ots.value": "免费"
                    }
                  }
                ]
              }
            }
          }
        }
      ]
    }
  }
}