参考:https://blog.csdn.net/weixin_40341116/article/details/80821655
1、scan、scroll
建议用scan
import elasticsearch
import elasticsearch.helpers
# Connect to the cluster. The original note left the host empty and the port
# missing (a syntax error); replace these placeholders with your node address.
es = elasticsearch.Elasticsearch([{"host": "localhost", "port": 9200}])

# helpers.scan returns an iterable of hits for the query, so the results can
# be consumed one document at a time in the loop below.
results = elasticsearch.helpers.scan(
    es,
    index="test_index",
    query={"query": {"match_all": {}}},
)
for item in results:
    print(item['_id'], item['_source']['name'])
比如某 index 总共有 4675 条 documents:
注意:search 的 size(这里是 2000)就是每页条数,下面翻页循环里用到的每页条数必须与它保持一致
from elasticsearch import Elasticsearch
import json
# Fetch every document of an index with the scroll API.
# Alternative connection form (host/port dict):
# es = Elasticsearch([{"host": "4*******9", "port": 8710}])
# NOTE(review): credentials are hard-coded in this snippet.
es = Elasticsearch(hosts="http:******0/", http_auth=('abc','dataanalysis'))

query_json = {
    "query": {
        "bool": {
            "must": [
                {"match_all": {}},
            ],
            "must_not": [],
            "should": [],
        },
    },
    "sort": [],
    "aggs": {},
}

# Number of hits returned per page (per search/scroll round trip).
page_size = 2000

# scroll='5m' keeps the scroll context alive for five minutes between calls.
query = es.search(index='kibana_sam******rce', body=query_json, scroll='5m', size=page_size)
print(query)
results = query['hits']['hits']   # first page of hits
total = query['hits']['total']    # total-hits info reported by the search
scroll_id = query['_scroll_id']   # cursor used to request the following pages
print(results, total, scroll_id)

# Keep scrolling until a page comes back empty. This replaces the original
# fixed-count loop over int(total['value'] / 2000) + 1: that count depends on
# the reported total (which may be a capped lower bound) and it always issued
# at least one request more than needed.
while True:
    # The scroll parameter must be given on every call, otherwise it errors.
    response = es.scroll(scroll_id=scroll_id, scroll='5m')
    # Always continue from the id returned by the most recent response.
    scroll_id = response['_scroll_id']
    page = response['hits']['hits']
    if not page:
        break
    results += page

# Release the server-side scroll context instead of waiting for it to expire.
es.clear_scroll(scroll_id=scroll_id)

# One "<id>,<customer_first_name>" string per hit.
aaa = [res['_id'] + ',' + res['_source']['customer_first_name'] for res in results]
print('done!')
print(len(aaa))
# Persist the collected hits in three formats.
# Relies on `results`, the list of hit dicts built by the scroll snippet.

# Save as NumPy. A list of dicts becomes an object array, so reading it back
# needs np.load(..., allow_pickle=True).
import numpy as np
np.save("all_ab***atas.npy", np.array(results))

# Save as CSV and JSON.
import csv
import json

with open('./event_title.csv', 'w', newline='', encoding='utf-8') as flow:
    csv_writer = csv.writer(flow)
    # Pass id and title as two separate cells. The original joined them with
    # ',' first, which makes csv write the pair as one quoted cell.
    csv_writer.writerows(
        (res['_id'], res['_source']['title']) for res in results
    )

with open('./data1.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    json.dump(results, f, ensure_ascii=False)
from elasticsearch import Elasticsearch
# Collect the "word" field of every document through the scroll API.
es = Elasticsearch()
query_json = {
    "query": {
        "match_all": {},  # match every document
    },
}
page_num = 100  # hits fetched per request

# NOTE(review): index=8 is kept from the original; presumably a placeholder
# for the real index name - confirm.
query = es.search(index=8, body=query_json, scroll='5m', size=page_num)
results = query['hits']['hits']   # first page of hits
total = query['hits']['total']    # total-hits info (an int or a dict, depending on server version)
scroll_id = query['_scroll_id']   # cursor used to request the following pages

# Scroll until an empty page is returned. The original derived the number of
# iterations from int(total / page_num), which raises TypeError when `total`
# is a dict such as {"value": ..., "relation": ...}.
while True:
    # The scroll parameter must be given on every call, otherwise it errors.
    response = es.scroll(scroll_id=scroll_id, scroll='5m')
    # Always continue from the id returned by the most recent response.
    scroll_id = response['_scroll_id']
    page = response['hits']['hits']
    if not page:
        break
    results += page

# Release the server-side scroll context instead of waiting for it to expire.
es.clear_scroll(scroll_id=scroll_id)

alist = [hit["_source"]["word"] for hit in results]
print(len(alist))
es7 mapping,dense-vector 向量创建及查询
1、先创建索引,然后创建 mapping
代码版:
def build_index(es_client, _index):
    """Create the index ``_index`` with a fixed three-field mapping.

    The mapping declares ``heat`` and ``reference_time`` as ``long`` and
    ``hpicmd5`` as ``text`` with a ``keyword`` sub-field (``ignore_above``
    256).

    Args:
        es_client: Client object exposing ``indices.create(index=..., body=...)``.
        _index: Name of the index to create.

    Returns:
        Whatever ``es_client.indices.create`` returns.
    """
    # "properties" sits directly under "mappings"; the original note had the
    # intermediate "_doc" type level commented out.
    mappings = {
        "mappings": {
            "properties": {
                "heat": {"type": "long"},
                "hpicmd5": {
                    "type": "text",
                    "fields": {
                        "keyword": {"ignore_above": 256, "type": "keyword"},
                    },
                },
                "reference_time": {"type": "long"},
            },
        },
    }
    return es_client.indices.create(index=_index, body=mappings)
# Connect to the cluster (host is redacted in these notes). The original line
# ended with a dangling comma and an unclosed parenthesis.
es_client = Elasticsearch(hosts=[{"host": "*******", "port": 8710}])
# `_index` was not defined before this call; this is the index name the
# insert example in these notes uses.
_index = 'search_user_log_test1'
build_index(es_client, _index)
2、插入数据
代码版:
def create_albume_dense_vector(i_json, es_client, _index):
    """Index one record into ``_index`` through the bulk helper.

    Args:
        i_json: Mapping that must contain the keys ``costTime``,
            ``esSearchTime``, ``host``, ``message`` and ``requestJson``.
        es_client: Elasticsearch client passed to ``helpers.bulk``.
        _index: Name of the target index.

    Raises:
        KeyError: If ``i_json`` lacks one of the keys listed above.
    """
    # Imported locally so the snippet works on its own: the bare name
    # `helpers` is not bound anywhere in the visible part of these notes.
    from elasticsearch import helpers

    copied_fields = ("costTime", "esSearchTime", "host", "message", "requestJson")
    action = {"_index": _index}
    action.update((name, i_json[name]) for name in copied_fields)
    helpers.bulk(es_client, [action], request_timeout=300)
# Connect to the cluster (host is redacted in these notes).
es_client = Elasticsearch(hosts=[{"host": "****", "port": 8710}], timeout=1000, http_auth=None)
# Index that receives the sample record.
_index = 'search_user_log_test1'
# Sample log record as a JSON string.
# NOTE(review): this sample has no "esSearchTime", "host", "message" or
# "requestJson" keys, which create_albume_dense_vector reads via i_json[...];
# presumably the record was shortened for these notes - confirm.
i="""{"costTime":"35","esSearchltCode":"0","searchPath":[1,2,5],"shortVideo":[],"tts":"熊出没"}"""
# Parse the string into a dict and index it.
i_json = json.loads(i)
create_albume_dense_vector(i_json,es_client,_index)
put需要传入id,post自动生成id
requests post
import requests
import json
# Index one document over plain HTTP. Per the note above, POST lets
# Elasticsearch generate the document id, while PUT needs an explicit id.
# The URL had a link-rewriter prefix glued onto it; this is the plain form.
host = "http://4***:8710/se****st1/_doc"
# The charset belongs inside Content-Type; the original sent it as a separate
# "charset" header.
headers = {"Content-Type": "application/json; charset=UTF-8"}
# NOTE(review): json_string is not defined in this snippet; it must hold the
# document as JSON text (for example the `i` string from the insert example).
# The loads/dumps round trip makes malformed JSON fail here, before sending.
r = requests.post(host, headers=headers, data=json.dumps(json.loads(json_string)))
# r.json() would give the parsed response; the raw body is printed here.
print(r.text)
3、向量查询
{ "query": { "script_score": { "query": { "match_all": {} }, "script": { "source": "dotProduct(params.queryVector, doc['vv'])", "params": { "queryVector": [0.004, 0.14, -0.2,0.13,0.22] } } } } }
按字段查询
{
"query": {
"bool": {
"must": [
{"terms": {"vendor": ["13", "15"]}},
{"term": {"type": "1"}},
{"term": {"status": "1"}},
{"terms": {"channelid.keyword": ["001", "002", "003", "004", "005", "023"]}}
]}}
}
elasticsearch嵌套对象的映射
参考:
https://www.elastic.co/guide/cn/elasticsearch/guide/current/nested-objects.html
https://blog.csdn.net/fu_huo_1993/article/details/88350132
查询语句
{
"query": {
"bool": {
"must": [
{
"nested": {
"path": "requ***lu.slots",
"query": {
"bool": {
"must": [
{
"match": {
"requ****ts.name": "FreeWord"
}
},
{
"match": {
"requ****8ots.value": "免费"
}
}
]
}
}
}
}
]
}
}
}