Snowpipe: Continuous Data Loading in Snowflake
Loading data continuously in Snowflake can be a challenging task,
especially without the use of an external stage. This article will
demonstrate a simple approach to loading data from your local
machine using a Python program.
1. SnowPipe Architecture
2. SnowPipe components
SQL Scripts
-- You can create named stages and then list their files using the LIST command.
show stages;

-- Files in the named stage STG01.
list @STG01;

-- Files in the current user's stage (@~), filtered by a regular expression.
-- PATTERN is a regex, so a literal dot is written as [.]; a bare "." would
-- match any character (e.g. '.*.gz' also matches a name ending in "_gz").
list @~ pattern='.*test.*';
list @~ pattern='.*[.]gz';
list @~ pattern='.*[.]html';

-- Files in the table stage (@%) of customer_parquet.
list @%customer_parquet/;
-- Now let's query the staged data directly using $ notation.
-- $1 is the first column of each staged record; the :KEY path extracts a field
-- from it and ::type casts the extracted value.
-- NOTE(review): presumably the table stage of customer_parquet_ff carries a
-- Parquet file format; otherwise add (file_format => '<name>') -- confirm.
-- Explicit aliases give the result set stable, readable column names.
select
    metadata$filename               as file_name,
    metadata$file_row_number        as file_row_number,
    $1:CUSTOMER_KEY::varchar        as customer_key,
    $1:NAME::varchar                as customer_name,
    $1:ADDRESS::varchar             as customer_address,
    $1:COUNTRY_KEY::varchar         as country_key,
    $1:PHONE::varchar               as phone,
    $1:ACCT_BAL::decimal(10,2)      as acct_bal,
    $1:MKT_SEGMENT::varchar         as mkt_segment,
    $1:COMMENT::varchar             as customer_comment
from @%customer_parquet_ff;
-- ***********************************
-- Step-05
-- Load the files under the history path of stage stg_010 into my_customer.
copy into my_customer
    from @stg_010/history;
-- Create a pipe object and understand its construct.
-- IF EXISTS keeps the script re-runnable when the pipe has not been created yet.
drop pipe if exists my_pipe_10;

-- The pipe wraps the COPY statement that loads files found under the
-- delta path of stage stg_010 into my_customer.
create or replace pipe my_pipe_10
as
    copy into my_customer
    from @stg_010/delta;
Python Script
# Submit staged files to a Snowpipe pipe and poll its load history.

# Write log records at DEBUG level and above to a local log file.
logging.basicConfig(
    filename='/tmp/ingest.log',
    level=logging.DEBUG)
logger = getLogger(__name__)

# Serialize the private key as unencrypted PKCS8 PEM text for the ingest client.
# NOTE(review): private_key_obj is not defined in this listing -- presumably the
# key is loaded earlier in the script; confirm.
private_key_text = private_key_obj.private_bytes(
    Encoding.PEM, PrivateFormat.PKCS8, NoEncryption()).decode('utf-8')

# Files to hand to the pipe.
# NOTE(review): presumably paths relative to the pipe's stage location -- confirm.
file_list = ['/customer_101.csv']

# The host value must be a single string literal (it was split across two lines).
ingest_manager = SimpleIngestManager(account='<snowflake-account>',
                                     host='<ab12345.es-east.azure>.snowflakecomputing.com',
                                     user='<user-name>',
                                     pipe='<pipename>',
                                     private_key=private_key_text)

staged_file_list = [StagedFile(file_name, None) for file_name in file_list]

try:
    resp = ingest_manager.ingest_files(staged_file_list)
except HTTPError as e:
    logger.error(e)
    # SystemExit instead of exit(): the exit() helper is injected by the site
    # module and is not guaranteed to exist in every interpreter setup.
    raise SystemExit(1)

print("Section: Assert")
# Explicit check rather than `assert`, which is skipped under `python -O`.
if resp['responseCode'] != 'SUCCESS':
    logger.error('Ingest request was not accepted: %s', resp)
    raise SystemExit(1)

# Poll the pipe's history until at least one file is reported.
while True:
    history_resp = ingest_manager.get_history()
    if len(history_resp['files']) > 0:
        print('Ingest Report:\n')
        print(history_resp)
        break
    else:
        # wait for 20 seconds
        time.sleep(20)

# Fetch the load history starting one hour ago (UTC, ISO-8601 with a 'Z' suffix).
# NOTE(review): the listing lost its indentation; this is placed after the
# polling loop so history_range_resp is always defined -- confirm against the
# original script.
hour = timedelta(hours=1)
date = datetime.datetime.utcnow() - hour
history_range_resp = ingest_manager.get_history_range(date.isoformat() + 'Z')